[vspline] 11/72: new code for multithreading, currently in impulse_response.cc ... where it could be easily developed and tested. I was unhappy with 'divide_and_conquer' and wanted generalized code for multithreading over sections of manifolds. Starting out with several attempts (still in this commit) using tuples, I finally figured out that it would be best to introduce proper notions of 'range' and 'partitioning' and use them explicitly. I now have a multithreading routine which is just about as general as possible, taking any number of arguments. The downside is a bit of extra burden on the functors passed to the individual threads: they aren't fed the partitions themselves, but only the partitioning information, so that they have to apply that partitioning information to the manifolds they process.
Kay F. Jahnke
kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:38 UTC 2017
- Previous message: [vspline] 10/72: using aggregate_traits throughout, work on pv.cc I'm trying to push some more fixed types out, so now I've worked on vigra::TinyVectors, replacing them with aggregate_traits<T, N>::type. I'd like to move to code where the aggregated types 'collapse' to simple type T if N==1, but I still have trouble getting my code to work with this definition. Anyway, factoring more of the type-fixing out is a good idea. I've also had good fun with pv. I tuned it some more and by now it's running quite smoothly at 50fps, except for some sfml-specific quirks which I can't resolve: some 10 seconds after starting pv, I get a 'does not respond' false error which I have to acknowledge, sometimes resulting in a crash soon after for no apparent reason. Sometimes the false error message even appears twice. Apparently this can be fixed by switching to a newer sfml version, but the 'stable' version 2.4 I downloaded crashed instantly, and compiling from source now also results in an error (which doesn't seem to be sfml's fault but a compiler bug) - so there's an impasse here... Anyway, if pv is running it runs fine, and I've added a kaleidoscope function just for fun which modifies the warp array only, so that the resulting kaleidoscopic animation is just as fast as the plain image.
- Next message: [vspline] 12/72: factored out multithreading code into multithread.h, using it in filter.h Now that I rewrote the multithreading code, I need to make some changes to the code using multithreading as well, since I introduced the passing of ranges together with the 'whole' arguments to the single-thread code and the signatures are slightly different. In this intermediate commit, the old code in common.h (split_array_to_chunks, divide_and_conquer) is still present and used for evaluation of the spline, but the (pre)filtering code already uses the new multithreading code in multithread.h.
This is an automated email from the git hooks/post-receive script.
kfj-guest pushed a commit to branch master
in repository vspline.
commit fecd7fc31c5b401ada6224f8fd4f42e2363aa78b
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date: Sat Nov 19 12:12:22 2016 +0100
new code for multithreading, currently in impulse_response.cc
... where it could be easily developed and tested. I was unhappy with
'divide_and_conquer' and wanted generalized code for multithreading
over sections of manifolds. Starting out with several attempts (still in
this commit) using tuples, I finally figured out that it would be best to
introduce proper notions of 'range' and 'partitioning' and use them explicitly.
I now have a multithreading routine which is just about as general as possible,
taking any number of arguments. The downside is a bit of extra burden on the
functors passed to the individual threads: they aren't fed the partitions
themselves, but only the partitioning information, so that they have to
apply that partitioning information to the manifolds they process.
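
To make the new calling convention concrete, here is a minimal usage sketch,
condensed from the code this commit adds to example/impulse_response.cc below
(multithread(), shape_range_type<> and shape_type<> are the definitions
introduced there; the array length of 1000 and the '+ 42.0' operation are
made up for the example). The functor is handed the range together with the
'whole' arguments and cuts out its own subset:

#include <vigra/multi_array.hxx>
#include <functional>

// job functor: r delimits the subset of the arrays this thread processes
void job ( shape_range_type<1> r ,
           vigra::MultiArrayView < 1 , double > in ,
           vigra::MultiArrayView < 1 , double > out )
{
  // apply the partitioning information: cut out the subsets to process
  auto sub_in = in.subarray ( r[0] , r[1] ) ;
  auto sub_out = out.subarray ( r[0] , r[1] ) ;
  for ( int i = 0 ; i < sub_in.shape(0) ; i++ )
    sub_out[i] = sub_in[i] + 42.0 ;
}

int main ( int argc , char * argv[] )
{
  // set up two arrays and a range spanning the whole of them
  vigra::MultiArray < 1 , double > a ( 1000 ) , b ( 1000 ) ;
  shape_range_type<1> whole ( shape_type<1>() , a.shape() ) ;

  // wrap job in a std::function instance (passing job directly doesn't
  // compile), then have multithread() partition the range four ways and
  // run job on each subrange in its own thread
  std::function < void ( shape_range_type<1> ,
                         vigra::MultiArrayView < 1 , double > ,
                         vigra::MultiArrayView < 1 , double > ) > f ( job ) ;
  multithread ( f , 4 , whole , a , b ) ;
}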
---
brace.h | 18 +-
bspline.h | 2 +-
common.h | 360 +------------------------------
eval.h | 40 ++--
example/impulse_response.cc | 511 +++++++++++++++++++++++++++++++++++++++++++-
example/roundtrip.cc | 2 +-
filter.h | 2 +-
mapping.h | 30 +--
prefilter.h | 8 +-
remap.h | 50 ++---
10 files changed, 592 insertions(+), 431 deletions(-)
diff --git a/brace.h b/brace.h
index 3c91e08..0d6c703 100644
--- a/brace.h
+++ b/brace.h
@@ -207,7 +207,7 @@ struct bracer
/// is the section of the coeficient array the evaluation code will look at.
static shape_type target_shape ( shape_type source_shape ,
- typename aggregate_traits < bc_code , dimension > :: type bcv ,
+ vigra::TinyVector < bc_code , dimension > bcv ,
int spline_degree )
{
shape_type target_shape ;
@@ -224,14 +224,14 @@ struct bracer
// bc_code bc ,
// int spline_degree )
// {
-// typename aggregate_traits < bc_code , dimension > :: type bcv ( bc ) ;
+// vigra::TinyVector < bc_code , dimension > bcv ( bc ) ;
// return target_shape ( source_shape , bcv , spline_degree ) ;
// }
/// this method gives the left offset to the 'core' subarray (array minus bracing),
/// given the BC codes and the spline degree
- static shape_type left_corner ( typename aggregate_traits < bc_code , dimension > :: type bcv ,
+ static shape_type left_corner ( vigra::TinyVector < bc_code , dimension > bcv ,
int spline_degree )
{
shape_type target_offset ;
@@ -243,7 +243,7 @@ struct bracer
/// this method gives the right offset to the 'core' subarray (array minus bracing),
/// given the BC codes and the spline degree
- static shape_type right_corner ( typename aggregate_traits < bc_code , dimension > :: type bcv ,
+ static shape_type right_corner ( vigra::TinyVector < bc_code , dimension > bcv ,
int spline_degree )
{
shape_type target_offset ;
@@ -255,7 +255,7 @@ struct bracer
/// given a braced array, return the size of it's 'core', the array without applied bracing
static shape_type core_shape ( view_type& a ,
- typename aggregate_traits < bc_code , dimension > :: type bcv ,
+ vigra::TinyVector < bc_code , dimension > bcv ,
int spline_degree )
{
return a.subarray ( a.shape()
@@ -266,7 +266,7 @@ struct bracer
/// produce a view to the core
static view_type core_view ( view_type& a ,
- typename aggregate_traits < bc_code , dimension > :: type bc ,
+ vigra::TinyVector < bc_code , dimension > bc ,
int spline_degree )
{
return a.subarray ( left_corner ( bc , spline_degree ) ,
@@ -498,9 +498,9 @@ struct bracer
/// This variant of apply braces along all axes in one go.
static void apply ( view_type& a , ///< target array, containing the core and (empty) frame
- typename aggregate_traits < bc_code , dimension > :: type bcv , ///< boundary condition codes
- typename aggregate_traits < int , dimension > :: type left_corner , ///< sizes of left braces
- typename aggregate_traits < int , dimension > :: type right_corner ) ///< sizes of right braces
+ vigra::TinyVector < bc_code , dimension > bcv , ///< boundary condition codes
+ vigra::TinyVector < int , dimension > left_corner , ///< sizes of left braces
+ vigra::TinyVector < int , dimension > right_corner ) ///< sizes of right braces
{
for ( int dim = 0 ; dim < dimension ; dim++ )
apply ( a , bcv[dim] , left_corner[dim] , right_corner[dim] , dim ) ;
diff --git a/bspline.h b/bspline.h
index 7b2a79f..51cf5e6 100644
--- a/bspline.h
+++ b/bspline.h
@@ -166,7 +166,7 @@ struct bspline
/// multidimensional index type
typedef typename view_type::difference_type shape_type ;
/// nD type for one boundary condition per axis
- typedef typename aggregate_traits < bc_code , dimension > :: type bcv_type ;
+ typedef typename vigra::TinyVector < bc_code , dimension > bcv_type ;
/// type for pointer to a prefiltering method
typedef void (*p_solve) ( view_type& ,
diff --git a/common.h b/common.h
index a645639..b07acd9 100644
--- a/common.h
+++ b/common.h
@@ -49,18 +49,18 @@
#include <Vc/Vc>
+#endif
+
+namespace vspline {
+
+#ifdef USE_VC
+
// initially, I used Vc::SimdArray in many places, but I figured that having a central
// place to fix a simdized type would be a good idea, so here's a traits class. With this
// central definition of the simdized type, it's also easy to make more of an effort to get
// that simdized type which is most suited for the situation, so I introduced a conditional
// which picks plain Vc::Vector<T> if vsize is the right size, and SimdArray only if it is not.
-// TODO maybe add types for masks and index vectors?
-
-// TODO: I'd like to increase the default _vsize, like to twice the value used now,
-// just to see what happens, but I can't yet get this modification to trickle down
-// without a bunch of errors I can't even relate to the change...
-
/// traits class vector_traits picks the appropriate simdized type for a given
/// elementary type T and a vector length of _vsize.
/// if vsize isn't given, it is taken to be a Vc::Vector<T>'s Size.
@@ -68,316 +68,10 @@
/// is used as the simdized type, and SimdArray<T,_vsize> is only used if it can't be
/// avoided. Why so? I have the suspicion that the code generated for SimdArrays
/// may be less efficient than the code for Vc::Vectors.
-/// hmmm... the difference is, if at all, small, and the extra finessing produces
-/// problems further down, so I stick with sall SimdArrays for now, but I always
-/// use SimdArrays of twice the vector size, because this produces the fastest
-/// evaluation on my system.
-
-template < class T , int _vsize >
-struct masked_pseudo_vector ;
-
-template < class T , int _vsize >
-struct pseudo_vector: public vigra::TinyVector < T , _vsize >
-{
- typedef pseudo_vector < T , _vsize > this_type ;
- typedef vigra::TinyVector < T , _vsize > base_type ;
- typedef T value_type ;
- typedef T EntryType ;
- typedef pseudo_vector < int , _vsize > IndexType ;
- typedef pseudo_vector < bool , _vsize > MaskType ;
- typedef pseudo_vector < int , _vsize > index_type ;
- typedef pseudo_vector < bool , _vsize > mask_type ;
- typedef pseudo_vector < bool , _vsize > Mask ;
-
- enum { vsize = _vsize } ;
- enum { Size = _vsize } ;
-
- pseudo_vector()
- : base_type()
- {
- }
-
- template < class I >
- pseudo_vector ( const I & i )
- : base_type ( i )
- {} ;
-
- pseudo_vector ( const T* mem )
- {
- this->load ( mem ) ;
- }
-
- pseudo_vector ( const T* mem , const index_type & indexes )
- {
- this->gather ( mem , indexes ) ;
- }
-
- pseudo_vector < bool , _vsize > operator- () const
- {
- pseudo_vector < bool , _vsize > result ;
- for ( int e = 0 ; e < vsize ; e++ )
- result[e] = - (*this)[e] ;
- return result ;
- }
-
- pseudo_vector < bool , _vsize > operator< ( const this_type& other ) const
- {
- pseudo_vector < bool , _vsize > result ;
- for ( int e = 0 ; e < vsize ; e++ )
- result[e] = (*this)[e] < other[e] ;
- return result ;
- }
-
- pseudo_vector < bool , _vsize > operator< ( const value_type& rhs ) const
- {
- pseudo_vector < bool , _vsize > result ;
- for ( int e = 0 ; e < vsize ; e++ )
- result[e] = (*this)[e] < rhs ;
- return result ;
- }
-
- pseudo_vector < bool , _vsize > operator> ( const this_type& other ) const
- {
- pseudo_vector < bool , _vsize > result ;
- for ( int e = 0 ; e < vsize ; e++ )
- result[e] = (*this)[e] > other[e] ;
- return result ;
- }
-
- pseudo_vector < bool , _vsize > operator> ( const value_type& rhs ) const
- {
- pseudo_vector < bool , _vsize > result ;
- for ( int e = 0 ; e < vsize ; e++ )
- result[e] = (*this)[e] > rhs ;
- return result ;
- }
-
-
- pseudo_vector < bool , _vsize > operator<= ( const this_type& other ) const
- {
- pseudo_vector < bool , _vsize > result ;
- for ( int e = 0 ; e < vsize ; e++ )
- result[e] = (*this)[e] <= other[e] ;
- return result ;
- }
-
- pseudo_vector < bool , _vsize > operator<= ( const value_type& rhs ) const
- {
- pseudo_vector < bool , _vsize > result ;
- for ( int e = 0 ; e < vsize ; e++ )
- result[e] = (*this)[e] <= rhs ;
- return result ;
- }
-
- pseudo_vector < bool , _vsize > operator>= ( const this_type& other ) const
- {
- pseudo_vector < bool , _vsize > result ;
- for ( int e = 0 ; e < vsize ; e++ )
- result[e] = (*this)[e] >= other[e] ;
- return result ;
- }
-
- pseudo_vector < bool , _vsize > operator>= ( const value_type& rhs ) const
- {
- pseudo_vector < bool , _vsize > result ;
- for ( int e = 0 ; e < vsize ; e++ )
- result[e] = (*this)[e] >= rhs ;
- return result ;
- }
-
- friend pseudo_vector < bool , _vsize > asin ( const this_type & arg )
- {
- pseudo_vector < bool , _vsize > result ;
- for ( int e = 0 ; e < vsize ; e++ )
- result[e] = asin ( arg[e] ) ;
- return result ;
- }
-
- friend pseudo_vector < bool , _vsize > atan2 ( const this_type & arg1 , const this_type & arg2 )
- {
- pseudo_vector < bool , _vsize > result ;
- for ( int e = 0 ; e < vsize ; e++ )
- result[e] = atan2 ( arg1[e] , arg2[e] ) ;
- return result ;
- }
-
-// this_type & operator++ ( int i = 1 )
-// {
-// for ( int e = 0 ; e < vsize ; e++ )
-// ++(this[e]) ;
-// return *this ;
-// }
-
- friend this_type operator| ( const this_type & a , const this_type & b )
- {
- pseudo_vector < bool , _vsize > result ;
- for ( int e = 0 ; e < vsize ; e++ )
- result[e] = a[e] | b[e] ;
- return result ;
- }
-
- static this_type IndexesFromZero()
- {
- this_type result ;
- for ( int i = 0 ; i < vsize ; i++ )
- result[i] = i ;
- return result ;
- }
-
- template < typename align_flag >
- void load ( const T * const base , const align_flag f = false )
- {
- for ( int e = 0 ; e < vsize ; e++ )
- (*this)[e] = base[e] ;
- }
-
- void gather ( const T * const base , const IndexType & indexes )
- {
- for ( int e = 0 ; e < vsize ; e++ )
- (*this)[e] = base[indexes[e]] ;
- }
-
- void gather ( const T * const base , const IndexType & indexes , const MaskType & mask )
- {
- for ( int e = 0 ; e < vsize ; e++ )
- {
- if ( mask[e] )
- (*this)[e] = base[indexes[e]] ;
- }
- }
-
- template < typename align_flag >
- void store ( T * base , const align_flag f = false )
- {
- for ( int e = 0 ; e < vsize ; e++ )
- base[e] = (*this)[e] ;
- }
-
- void scatter ( T * base , const IndexType & indexes )
- {
- for ( int e = 0 ; e < vsize ; e++ )
- base[indexes[e]] = (*this)[e] ;
- }
-
- void scatter ( T * base , const IndexType & indexes , const MaskType & mask )
- {
- for ( int e = 0 ; e < vsize ; e++ )
- {
- if ( mask[e] )
- base[indexes[e]] = (*this)[e] ;
- }
- }
-
- masked_pseudo_vector < T , vsize > operator() ( const MaskType& mask )
- {
- return masked_pseudo_vector < T , _vsize > ( *this , mask ) ;
- }
-
- bool isFull() const
- {
- return this->all() ;
- }
-
-} ;
-
-template < class T , int _vsize >
-bool any_of ( const pseudo_vector < T , _vsize > & v )
-{
- return v.any() ;
-}
-
-template < class T , int _vsize >
-bool none_of ( const pseudo_vector < T , _vsize > & v )
-{
- return ! v.any() ;
-}
-
-template < class T , int _vsize >
-bool all_of ( const pseudo_vector < T , _vsize > & v )
-{
- return v.all() ;
-}
-
-template < class T , int _vsize >
-pseudo_vector < T , _vsize > floor ( const pseudo_vector < T , _vsize > & v )
-{
- pseudo_vector < T , _vsize > result ;
- for ( int e = 0 ; e < _vsize ; e++ )
- result[e] = floor ( v[e] ) ;
- return result ;
-}
-
-template < class T , int _vsize >
-struct masked_pseudo_vector
-: public pseudo_vector < T , _vsize >
-{
- typedef pseudo_vector < T , _vsize > base_type ;
- typedef masked_pseudo_vector < T , _vsize > this_type ;
-
- const typename pseudo_vector < T , _vsize > :: Mask mask ;
- base_type & target ;
-
- masked_pseudo_vector ( base_type & _target ,
- const typename base_type :: Mask & _mask )
- : target ( _target ) ,
- mask ( _mask )
- { } ;
-
- base_type & operator= ( const base_type & other )
- {
- for ( int e = 0 ; e < _vsize ; e++ )
- {
- if ( mask[e] )
- target[e] = other[e] ;
- }
- return target ;
- }
-
- base_type & operator-= ( const base_type & other )
- {
- for ( int e = 0 ; e < _vsize ; e++ )
- {
- if ( mask[e] )
- target[e] -= other[e] ;
- }
- return target ;
- }
-
- base_type & operator+= ( const base_type & other )
- {
- for ( int e = 0 ; e < _vsize ; e++ )
- {
- if ( mask[e] )
- target[e] += other[e] ;
- }
- return target ;
- }
-
- base_type & operator*= ( const base_type & other )
- {
- for ( int e = 0 ; e < _vsize ; e++ )
- {
- if ( mask[e] )
- target[e] *= other[e] ;
- }
- return target ;
- }
-
- base_type & operator/= ( const base_type & other )
- {
- for ( int e = 0 ; e < _vsize ; e++ )
- {
- if ( mask[e] )
- target[e] /= other[e] ;
- }
- return target ;
- }
-
-} ;
/// struct vector_traits is the central location to fix vecorization throughout vspline.
/// Nothe how we use a default vector width twice the size of a standard Vc vector for
-/// the given T. In my tests, this made the code faster. TODO test on other systems
+/// the given T. In my tests, this made the code faster. TODO test more, and on other systems
template < class T , int _vsize = 2 * Vc::Vector < T > :: Size >
struct vector_traits
@@ -391,51 +85,13 @@ struct vector_traits
Vc::Vector < T > ,
Vc::SimdArray < T , vsize > > :: type type ;
-// to always use Simdarrays, I use
+// to always use Simdarrays, use this definition instead:
// typedef Vc::SimdArray < T , vsize > simdized_type ;
} ;
-/// while I haven't put it to use everywhere, here is a similar struct to fix what I call
-/// aggregation: the grouping together of a fixed number (_channels) of class T, packed
-/// tightly in memory. This is similar to a SIMD vector, but represents a view to data which
-/// would, if at all, lend itself to vertical vectorization. Types fixed by this traits class
-/// would typically be things like single pixels or nD coordinates.
-
-template < class T , int _channels >
-struct aggregate_traits
-{
- enum { channels = _channels } ;
- typedef T value_type ;
-
-// I'd like to use the definition below, but so far I have trouble getting it to work
-// smoothly with the rest of my code. This is due to the fact that at times I have variants
-// of routines which only differ in the type of a parameter which may be an aggregate type.
-// if the 'aggregate of one' type specification is removed by the definition below, the signatures
-// become identical producing a 'can't overload' error.
-
-// typedef typename std::conditional < channels == 1 ,
-// T ,
-// vigra::TinyVector < T , channels > > :: type type ;
-
-// so for now, it's TinyVectors throughout. This does at least put the aggregation
-// in a single place and keeps the TinyVectors out of the remainder of the code.
-
- typedef typename vigra::TinyVector < T , channels > type ;
-} ;
-
-template < class T , int _vsize = 2 * Vc::Vector < T > :: Size >
-struct ps_vector_traits
-{
- enum { vsize = _vsize } ;
-
- typedef pseudo_vector < T , vsize > simdized_type ;
-} ;
-
#endif
-namespace vspline {
-
/// dimension-mismatch is thrown if two arrays have different dimensions
/// which should have the same dimensions.
diff --git a/eval.h b/eval.h
index f7f0fbc..3627a57 100644
--- a/eval.h
+++ b/eval.h
@@ -494,10 +494,10 @@ public:
/// types for a multidimensional real/integral coordinate
// TODO this is duplicating the typedef in mapping.h
- typedef typename aggregate_traits < rc_type , dimension > :: type nd_rc_type ;
- typedef typename aggregate_traits < ic_type , dimension > :: type nd_ic_type ;
+ typedef vigra::TinyVector < rc_type , dimension > nd_rc_type ;
+ typedef vigra::TinyVector < ic_type , dimension > nd_ic_type ;
- typedef typename aggregate_traits < int , dimension > :: type derivative_spec_type ;
+ typedef vigra::TinyVector < int , dimension > derivative_spec_type ;
/// type for a 'split' coordinate, mainly used internally in the evaluator, but
/// also in 'warp arrays' used for remapping
@@ -509,7 +509,7 @@ public:
typedef typename MultiArray < 1 , ic_type > :: iterator offset_iterator ;
typedef MultiArrayView < dimension + 1 , ele_type > component_view_type ;
typedef typename component_view_type::difference_type expanded_shape_type ;
- typedef typename aggregate_traits < ele_type* , channels > :: type component_base_type ;
+ typedef vigra::TinyVector < ele_type* , channels > component_base_type ;
#ifdef USE_VC
@@ -537,19 +537,19 @@ public:
/// multichannel value as SoA, for pixels etc.
- typedef typename aggregate_traits < ele_v , channels > :: type mc_ele_v ;
+ typedef vigra::TinyVector < ele_v , channels > mc_ele_v ;
/// SoA for nD coordinates/components
- typedef typename aggregate_traits < rc_v , dimension > :: type nd_rc_v ;
+ typedef vigra::TinyVector < rc_v , dimension > nd_rc_v ;
/// SoA for nD shapes (or multidimensional indices)
- typedef typename aggregate_traits < ic_v , dimension > :: type nd_ic_v ;
+ typedef vigra::TinyVector < ic_v , dimension > nd_ic_v ;
/// SoA for multichannel indices (used for gather/scatter operations)
- typedef typename aggregate_traits < ic_v , channels > :: type mc_ic_v ;
+ typedef vigra::TinyVector < ic_v , channels > mc_ic_v ;
#else
@@ -573,7 +573,7 @@ public:
/// we need one functor per dimension:
- typedef typename aggregate_traits < weight_functor_base_type* , dimension > :: type nd_weight_functor ;
+ typedef vigra::TinyVector < weight_functor_base_type* , dimension > nd_weight_functor ;
// while in the context of B-splines the weight functors are, of course, functors which
// calculate the weights via the B-spline basis functions, the formulation we use here
@@ -842,13 +842,11 @@ public:
/// allows calling code to provide their own weights, together with their shape, in one handy
/// packet. And in the normal sequence of delegation inside class eval, the next routine 'up'
/// can also package the weights nicely in a MultiArrayView. Note that select is now an ic_type,
- /// a single integral value representing an offset into the coefficient array. We can either use
- /// a different signature or a different name for the routine to keep it distinct, I chose to
- /// name it differently (not eval):
+ /// a single integral value representing an offset into the coefficient array.
- void eval0 ( const ic_type & select ,
- const MultiArrayView < 2 , ele_type > & weight ,
- value_type & result )
+ void eval ( const ic_type & select ,
+ const MultiArrayView < 2 , ele_type > & weight ,
+ value_type & result )
{
const value_type * base = coefficients.data() + select ;
offset_iterator ofs = offsets.begin() ; // offsets reflects the positions inside the subarray
@@ -865,7 +863,7 @@ public:
const MultiArrayView < 2 , ele_type > & weight ,
value_type & result )
{
- eval0 ( sum ( select * coefficients.stride() ) , weight , result ) ;
+ eval ( sum ( select * coefficients.stride() ) , weight , result ) ;
}
/// evaluation variant taking the components of a split coordinate, first the shape_type
@@ -1109,13 +1107,13 @@ public:
/// this is the vectorized version of the final delegate, calling _v_eval. The first
/// argument is a vector of offsets to windows of coefficients, the vectorized equivalent
- /// of the single offset in eval0, the unvectorized routine.
+ /// of the single offset in the unvectorized routine.
/// The second argument is a 2D array of vecorized weights, the third a reference to
/// the result.
- void v_eval0 ( const ic_v& select , // offsets to lower corners of the subarrays
- const MultiArrayView < 2 , ele_v > & weight , // vectorized weights
- mc_ele_v & result )
+ void v_eval ( const ic_v& select , // offsets to lower corners of the subarrays
+ const MultiArrayView < 2 , ele_v > & weight , // vectorized weights
+ mc_ele_v & result )
{
// we need an instance of this iterator because it's passed into _v_eval by reference
// and manipulated by the code there:
@@ -1144,7 +1142,7 @@ public:
// now we delegate to the final delegate
- v_eval0 ( origin , weight , result ) ;
+ v_eval ( origin , weight , result ) ;
}
// again, pretty much like the unvectorized version, now using simdized data:
diff --git a/example/impulse_response.cc b/example/impulse_response.cc
index e41e356..71fef16 100644
--- a/example/impulse_response.cc
+++ b/example/impulse_response.cc
@@ -71,9 +71,493 @@
#include <iomanip>
+#include <assert.h>
#include <vspline/vspline.h>
#include <vigra/multi_math.hxx>
#include <random>
+#include <tuple>
+
+// iterator_splitter will try to set up n ranges from a range. the partial
+// ranges are stored in a std::vector. The split may succeed producing n
+// or less ranges, and if iter_range can't be split at all, a single range
+// encompassing the whole of iter_range will be returned in the result vector.
+
+template < class _iterator_type >
+struct iterator_splitter
+{
+ typedef _iterator_type iterator_type ;
+ typedef vigra::TinyVector < iterator_type , 2 > range_type ;
+ typedef std::vector < range_type > partition_type ;
+
+ static partition_type part ( const range_type & iter_range ,
+ int n )
+ {
+ std::vector < range_type > res ;
+ assert ( n > 0 ) ;
+
+ iterator_type start = iter_range [ 0 ] ;
+ iterator_type end = iter_range [ 1 ] ;
+ int size = end - start ;
+ if ( n > size )
+ n = size ;
+
+ int chunk_size = size / n ; // will be at least 1
+
+ for ( int i = 0 ; i < n - 1 ; i++ )
+ {
+ res.push_back ( range_type ( start , start + chunk_size ) ) ;
+ start += chunk_size ;
+ }
+ res.push_back ( range_type ( start , end ) ) ;
+ return res ;
+ }
+} ;
+
+// shape_splitter will try to split a shape into n ranges by 'chopping' it
+// along the outermost axis that can be split n-ways. The additional parameter
+// 'forbid' prevents that particular axis from being split. The split may succeed
+// producing n or less ranges, and if 'shape' can't be split at all, a single range
+// encompassing the whole of 'shape' will be returned in the result vector.
+
+template < int dim >
+struct shape_splitter
+{
+ typedef typename vigra::MultiArrayShape < dim > :: type shape_type ;
+ typedef vigra::TinyVector < shape_type , 2 > range_type ;
+ typedef std::vector < range_type > partition_type ;
+
+ static partition_type part ( const shape_type & shape , ///< shape to be split n-ways
+ int n , ///< intended number of chunks
+ int forbid = -1 ) ///< axis which shouldn't be split
+ {
+ partition_type res ;
+
+ // find the outermost dimension that can be split n ways, and it's extent
+ int split_dim = -1 ;
+ int max_extent = -1 ;
+ for ( int md = dim - 1 ; md >= 0 ; md-- )
+ {
+ if ( md != forbid
+ && shape[md] > max_extent
+ && shape[md] >= n )
+ {
+ max_extent = shape[md] ;
+ split_dim = md ;
+ break ;
+ }
+ }
+
+ // if the search did not yet succeed:
+ if ( max_extent == -1 )
+ {
+ // repeat process with relaxed conditions: now the search will also succeed
+ // if there is an axis which can be split less than n ways
+ for ( int md = dim - 1 ; md >= 0 ; md-- )
+ {
+ if ( md != forbid
+ && shape[md] > 1 )
+ {
+ max_extent = shape[md] ;
+ split_dim = md ;
+ break ;
+ }
+ }
+ }
+
+ if ( split_dim == -1 )
+ {
+ // we have not found a dimension for splitting. We pass back res with
+ // a range over the whole initial shape as it's sole member
+ res.push_back ( range_type ( shape_type() , shape ) ) ;
+ }
+ else
+ {
+ // we can split the shape along split_dim
+
+ int w = shape [ split_dim ] ; // extent of the dimension we can split
+ n = std::min ( n , w ) ; // just in case, if that is smaller than n
+
+ int cut [ n ] ; // where to chop up this dimension
+
+ for ( int i = 0 ; i < n ; i++ )
+ cut[i] = ( (i+1) * w ) / n ; // roughly equal chunks, but certainly last cut == a.end()
+
+ shape_type start , end = shape ;
+
+ for ( int i = 0 ; i < n ; i++ )
+ {
+ end [ split_dim ] = cut [ i ]; // apply the cut locations
+ res.push_back ( range_type ( start , end ) ) ;
+ start [ split_dim ] = end [ split_dim ] ;
+ }
+ }
+ return res ; // return the number of chunks
+ }
+} ;
+
+// // we have a std::vector of tuples of bunch_type to which we want to assign
+// // array chunks, so that the first tuple in the vector receives the first chunks
+// // of all elements of bunch, the second tuple receives the second set of chunks
+// // and so forth. We can't iterate over tuple elements in a loop (get<K> requires
+// // a const K), but we can recurse over the elements, filling the Kth member
+// // of all tuples in the target vector in every step of the recursion:
+//
+// template < class bunch_type , int e , int dimension >
+// struct push_members
+// {
+// static void push ( std::vector < bunch_type > & target ,
+// const bunch_type & bunch ,
+// const typename shape_splitter < dimension > :: partition_type & ranges )
+// {
+// // extract member e-1 from the unsplit tuple
+// auto current = std::get<e-1> ( bunch ) ;
+//
+// // to perform the shape consistency check, we compare to the first member:
+// if ( e > 1 )
+// {
+// const auto & first = std::get<0> ( bunch ) ;
+// if ( first.shape() != current.shape() )
+// throw vspline::shape_mismatch ( "coprocess: all arrays must have the same size" ) ;
+// }
+//
+// // obtain the parts by applying 'ranges'
+// // and fill in the parts at position e-1 in every tuple in 'target'
+// for ( int k = 0 ; k < ranges.size() ; k++ )
+// std::get < e - 1 > ( target [ k ] ) = current.subarray ( ranges[k][0] , ranges[k][1] ) ;
+// // and recurse to the next-lower member
+// push_members < bunch_type , e - 1 , dimension > :: push ( target , bunch , ranges ) ;
+// }
+// } ;
+//
+// // terminate the recursion over e, the ordinal number of the tuple's element
+// // with a partial specialization for e == 0
+//
+// template < class bunch_type , int dimension >
+// struct push_members < bunch_type , 0 , dimension >
+// {
+// static void push ( ... ) { } // doesn't do anything at this level.
+// } ;
+//
+// // using C++ magic gleaned from first answer in
+// // http://stackoverflow.com/questions/7858817/unpacking-a-tuple-to-call-a-matching-function-pointer?noredirect=1&lq=1
+//
+// // seq is an empty struct with a variadic template argument list of int
+// template<int ...>
+// struct seq { };
+//
+// // level N 'gens' inherits from level N-1 'gens'
+// template<int N, int ...S>
+// struct gens : gens<N-1, N-1, S...> { };
+//
+// // level 0 'gens' is partially specialized (N==0) and typedefs 'type'
+// // this typedef is inherited by all higher-level 'gens'
+// template<int ...S>
+// struct gens<0, S...> {
+// typedef seq<S...> type;
+// };
+//
+// // why all the artistry? we have our parameter sets in tuples, but the
+// // function we want to call expects individual arguments. while we might
+// // require that the function be rewritten to accept a tuple instead of
+// // individual arguments (which is trivial), being able to use the function
+// // 'as is' is more intuitive, especially if the function's definition is
+// // somewhere else and we'd have to code a wrapper for it. We use two steps
+// // to achieve the call to func:
+// //
+// // here we have the 'inner' callFunc.
+// // the actual call to func happens here: _callFunc takes func, an argument
+// // tuple and a 'seq' type 'S' with a template parameter list of ints 0...N-1
+// // with N being the number of arguments func expects.
+// // the sequence is submitted to pack expansion with a pattern which
+// // produces the tuple's kth element for each k in S. The comma-separated
+// // list of the tuple members which results is passed as argument list to func.
+//
+// template < typename return_type , typename ...Args , int ...S >
+// return_type _callFunc ( std::function < return_type ( Args... ) > func ,
+// std::tuple < Args... > params ,
+// seq<S...> )
+// {
+// return func ( std::get < S > ( params ) ... ) ; // expand pack params and call func
+// }
+//
+// // the 'outer' callFunc receives func and a tuple of arguments.
+// // Here, the number of arguments is established (sizeof...(Args)) and used to
+// // create a 'seq' object, whose type reflects this number of arguments and
+// // which, by pack expansion, can produce individual arguments from the tuple
+// // in the previous routine _callFunc. func and params are passed on to the
+// // 'inner' _callFunc, together with the seq object. The inner _callFunc
+// // performs the actual function call.
+//
+// template < typename return_type , typename ...Args >
+// return_type callFunc ( std::function < return_type ( Args... ) > func ,
+// std::tuple < Args... > params )
+// {
+// return _callFunc ( func , params , typename gens<sizeof...(Args)>::type() ) ;
+// }
+//
+// // with the infrastructure code at hand, we can proceed to formulate a generalized
+// // multithreading routine for sets of MultiArrayViews. Here we have 'coprocess'
+// // which receives a std::function taking one or several MultiArrayViews, the
+// // desired number of threads, and as many MultiArrayViews as 'func' can process.
+// // TODO: I haven't managed yet to pass the views per reference
+// // coprocess tries to create a set of tuples of the same type as the tuple
+// // which was created from the arguments passed to coprocess (bunch_type),
+// // so that each tuple in the set contains arguments for a call to func which
+// // operates on a set of subarrays of the original arguments. With this set of tuples,
+// // representing a partition of the work, the work can be delegated to several threads.
+//
+// template<class ... Types>
+// int coprocess ( std::function < void ( Types... ) > func , int parts , Types ...args )
+// {
+// // we gather the arguments in a tuple
+// auto bunch = std::make_tuple ( args... ) ;
+// typedef decltype ( bunch ) bunch_type ;
+// const int members = std::tuple_size < bunch_type > :: value ;
+//
+// auto first = std::get<0> ( bunch ) ;
+// typedef decltype ( first ) view_type ;
+// const int dimension = view_type::actual_dimension ;
+//
+// auto shape = first.shape() ;
+// auto ranges = shape_splitter < dimension > :: part ( shape , parts ) ;
+// parts = ranges.size() ;
+//
+// // set up the set of tuples containing arguments for the partitioned job,
+//
+// std::vector < bunch_type > partition ( parts ) ;
+//
+// // fill in the tuples corresponding to the ranges
+// push_members < bunch_type , members , dimension > :: push ( partition , bunch , ranges ) ;
+//
+// // set up thread objects for the additional threads we'll use
+// int new_threads = parts - 1 ;
+// std::thread * t[new_threads] ;
+//
+// // start the additional threads, passing callFunc as the thread's functional,
+// // and func and the tuple containing the partial job's arguments. the new threads
+// // will proceed to 'explode' the tuple and feed the individual arguments to func:
+// int s ;
+// for ( s = 0 ; s < new_threads ; s++ )
+// {
+// t[s] = new std::thread ( callFunc < void , Types... > , func , partition[s] ) ;
+// }
+// // do the last partial - or only - job in this very thread:
+// callFunc ( func , partition[s] ) ;
+//
+// // finally, join the additional threads and delete their thread objects
+// for ( s = 0 ; s < new_threads ; s++ )
+// {
+// t[s]->join() ;
+// delete t[s] ;
+// }
+// return parts ;
+// }
+//
+// // the simpler task is to formulate coprocess using 'bunched' arguments in the first
+// // place. This makes the formulation of 'func' a bit more involved, as func now has
+// // to accept a tuple as it's sole argument and unpack the tuple's contents, but the
+// // processing in this variant of 'coprocess' is much more straightforward, since the
+// // artistry needed to call func with an 'exploded' tuple is not needed here:
+//
+// template < class bunch_type >
+// int coprocess ( std::function < void ( bunch_type ) > func , int parts , bunch_type bunch )
+// {
+// const int members = std::tuple_size < bunch_type > :: value ;
+//
+// auto first = std::get<0> ( bunch ) ;
+// typedef decltype ( first ) view_type ;
+// const int dimension = view_type::actual_dimension ;
+//
+// auto shape = first.shape() ;
+// auto ranges = shape_splitter < dimension > :: part ( shape , parts ) ;
+// parts = ranges.size() ;
+//
+// // set up the set of tuples containing arguments for the partitioned job
+// std::vector < bunch_type > partition ( parts ) ;
+//
+// // fill in the tuples corresponding to the ranges
+// push_members < bunch_type , members , dimension > :: push ( partition , bunch , ranges ) ;
+//
+// // set up thread objects for the additional threads we'll use (if any)
+// int new_threads = parts - 1 ;
+// std::thread * t[new_threads] ;
+//
+// // start the additional threads, passing func as the thread's functional and the tuple
+// // containing the partial job's arguments. the new threads will proceed to pass their tuple
+// // straight to func:
+// int s ;
+// for ( s = 0 ; s < new_threads ; s++ )
+// {
+// t[s] = new std::thread ( func , partition[s] ) ;
+// }
+// // do the last partial job in this very thread. this may be the only job, if push_members
+// // has returned 1, failing to partition the job
+// func ( partition[s] ) ;
+//
+// // finally, join the additional threads and delete their thread objects
+// for ( s = 0 ; s < new_threads ; s++ )
+// {
+// t[s]->join() ;
+// delete t[s] ;
+// }
+// // and return the number of threads which was used.
+// return parts ;
+// }
+//
+// // sample function to run with coprocess:
+//
+// void twosome ( vigra::MultiArrayView < 1 , double > in1 , vigra::MultiArrayView < 1 , double > in2 )
+// {
+// // std::cout << "shape 1 : " << in1 . shape() << std::endl ;
+// // std::cout << "shape 2 : " << in2 . shape() << std::endl ;
+// for ( int i = 0 ; i < in1.shape(0) ; i++ )
+// in2[i] = in1[i] + 42.0 ;
+// }
+//
+// // sample function to run with the bunched version of coprocess:
+//
+// void btwosome ( std::tuple < vigra::MultiArrayView < 1 , double > ,
+// vigra::MultiArrayView < 1 , double > > in )
+// {
+// for ( int i = 0 ; i < std::get<1>(in).shape(0) ; i++ )
+// std::get<1>(in) [ i ] += std::get<0>(in) [ i ] + 43.0 ;
+// }
+
+// given limit_type, we define range_type as a TinyVector of two limit_types,
+// the first denoting the beginning of the range and the second it's end, with
+// end being outside of the range.
+
+template < class limit_type >
+using range_type = vigra::TinyVector < limit_type , 2 > ;
+
+// given range_type, we define partition_type as a std::vector of range_type.
+// This data type is used to hold the partitioning of a range into subranges.
+
+template < class range_type >
+using partition_type = std::vector < range_type > ;
+
+// given a dimension, we define a shape_type as a TinyVector of long
+// of this dimension. This is equivalent to vigra's shape type.
+
+template < int dimension >
+using shape_type = vigra::TinyVector < long , dimension > ;
+
+// given a dimension, we define shape_range_type as a range defined by
+// two shapes of the given dimension. This definition allows us to directly
+// pass the two shapes as arguments to a call of subarray() on a MultiArrayView
+// of the given dimension
+
+template < int dimension >
+using shape_range_type = range_type < shape_type < dimension > > ;
+
+// for any dimension d, we specialize function partition() for shape_range_type<d>.
+// we use the extant class shape_splitter to perform the split
+
+template < int d >
+partition_type < shape_range_type<d> > partition ( shape_range_type<d> range , int nparts )
+{
+ // perform the split, silently assuming that the range passed in starts at origin
+ // TODO while we don't ever call this specialization of partition() with a range which
+ // doensn't start at origin, we might either accomodate such use, forbid it, or rethink
+ // the logic...
+
+ return shape_splitter < d > :: part ( range[1] , nparts ) ;
+}
+
+/// multithread is a routine to perform multithreading with functors accessing several
+/// containers, iterators etc. ('manifolds') of identical structure together.
+/// when such a functor is invoked for a specific thread, it will only process a subset
+/// of each of the original manifold objects. This 'subset' notion is codified as range_type:
+/// range_type specifies a starting point and an end point inside the manifolds defining
+/// the extent of the subset. While it is tempting to code so that the multithreading
+/// routine feeds the subsets to the individual threads, I found that passing the
+/// range to the thread, together with the original variadic arguments to multithread,
+/// produces clean and compact code with a slightly higher burden on the functor, since
+/// it has to 'cut out' the subset it is meant to process itself. Why is this better?
+/// Because it unburdens the multithreading code from knowing any specifics about the
+/// 'cutting out' procedure, and at the same time allows the functor to access the
+/// 'whole' manifold and it's metrics if this is needed.
+/// The partitioning of the incoming range is left to function 'partition' which has to
+/// be specialized for any concrete range_type used, and multithread simply relies on
+/// a specialized version being present.
+
+template < class range_type , class ...Types >
+int multithread ( std::function < void ( range_type , Types... ) > func ,
+ int nparts ,
+ range_type range ,
+ Types ...args )
+{
+ // partition the range and update nparts in case the partitioning produced
+ // less ranges than we initially asked for. Rely on the presence of a function
+ // partition() which can process range_type:
+
+ auto partitioning = partition ( range , nparts ) ;
+ nparts = partitioning.size() ;
+
+ // set up thread objects for the additional threads we'll use (if any)
+
+ int new_threads = nparts - 1 ;
+ std::thread * t[new_threads] ;
+
+ // start the additional threads (if any), passing func as the thread's functional,
+ // followed by the range over which func is meant to operate, followed by any other
+ // arguments passed to multithread as parameter pack 'args'
+
+ for ( int s = 0 ; s < new_threads ; s++ )
+ {
+ t[s] = new std::thread ( func , partitioning[s] , args... ) ;
+ }
+
+ // do the last partial job in this very thread. this may be the only job,
+ // if partition() has failed to partition the job
+
+ func ( partitioning [ new_threads ] , args... ) ;
+
+ // join the additional threads (if any) and delete their thread objects
+
+ for ( int s = 0 ; s < new_threads ; s++ )
+ {
+ t[s]->join() ;
+ delete t[s] ;
+ }
+
+ // and return the number of threads which was used.
+
+ return nparts ;
+}
+
+/// sample function to run with multithread
+/// for any dimension d, we define 'rtwosome' as a function taking a shape_range_type
+/// of this dimension (this is a TinyVector of two nD shapes) and two MultiArrayViews
+/// of this dimension, containing doubles. Depending on the concrete coding needs,
+/// the functions used with multithread might be more or less specialized.
+
+template < int dimension >
+void rtwosome ( shape_range_type<dimension> r ,
+ vigra::MultiArrayView < dimension , double > _in1 ,
+ vigra::MultiArrayView < dimension , double > _in2 )
+{
+ // first, 'cut out' the subsets of the manifolds. Here, the ends of the range
+ // are nD shapes, which can be passed directly to subarray().
+
+ const auto in1 = _in1.subarray ( r[0] , r[1] ) ;
+ auto in2 = _in2.subarray ( r[0] , r[1] ) ;
+
+ // now set up iterators over the subsets:
+
+ auto it1 = in1.begin() ;
+ auto it2 = in2.begin() ;
+ const auto end = in2.end() ;
+
+ // and perform some calculation
+
+ while ( it2 < end )
+ {
+ *it2 = *it1 + 42.0 ;
+ ++it1 ;
+ ++it2 ;
+ }
+}
int main ( int argc , char * argv[] )
{
@@ -129,6 +613,27 @@ int main ( int argc , char * argv[] )
coords[2] = 5000.1 ;
vigra::MultiArray < 1 , double > target ( 25 ) ;
+ typedef vigra::MultiArrayView < 1 , double > view_type ;
+ typedef typename view_type::difference_type shape_type ;
+
+ view_type va ( coords ) ;
+ view_type vb ( target ) ;
+ view_type vc ;
+// // we need an instance here, passing twosome directly doesn't compile
+// std::function < void ( view_type , view_type ) > fun ( twosome ) ;
+// coprocess ( fun , 4 , va , vb ) ;
+//
+// std::function < void ( std::tuple < view_type , view_type > ) > bfun ( btwosome ) ;
+// auto t = std::make_tuple ( va , vb ) ;
+// coprocess ( bfun , 4 , t ) ;
+
+ shape_range_type<1> range ( shape_type() , va.shape() ) ;
+ std::function < void ( shape_range_type<1> , view_type , view_type ) > rfun ( rtwosome<1> ) ;
+ multithread ( rfun , 4 , range , va , vb ) ;
+
+ for ( auto v : vb )
+ std::cout << v << std::endl ;
+
// typedef vspline::split_type < 1 , int , double > splt ;
// if ( degree & 1 )
// {
@@ -149,8 +654,8 @@ int main ( int argc , char * argv[] )
for ( auto v : target )
std::cout << v << std::endl ;
- typedef vector_traits < double > :: type dvs_t ;
- typedef aggregate_traits < dvs_t , 1 > :: type dv_t ;
+ typedef vspline::vector_traits < double > :: type dvs_t ;
+ typedef vigra::TinyVector < dvs_t , 1 > dv_t ;
dv_t dv ;
dv[3] = 5000.1 ;
@@ -174,6 +679,4 @@ int main ( int argc , char * argv[] )
// for ( int i = 0 ; i < 3 ; i++ )
// std::cout << less[i] << std::endl ;
// }
-
-
}
\ No newline at end of file
diff --git a/example/roundtrip.cc b/example/roundtrip.cc
index 86052c9..db5a6cb 100644
--- a/example/roundtrip.cc
+++ b/example/roundtrip.cc
@@ -115,7 +115,7 @@ void roundtrip ( view_type & data ,
#ifdef USE_VC
// const int vsize = Vc::Vector < real_type > :: Size ;
- const int vsize = vector_traits < real_type > :: vsize ;
+ const int vsize = vspline::vector_traits < real_type > :: vsize ;
#else
diff --git a/filter.h b/filter.h
index 94dac75..6fa0cb7 100644
--- a/filter.h
+++ b/filter.h
@@ -1683,7 +1683,7 @@ template < typename input_array_type , // type of array with knot point dat
typename output_array_type > // type of array for coefficients (may be the same)
void filter_nd ( input_array_type & input ,
output_array_type & output ,
- typename aggregate_traits < bc_code , input_array_type::actual_dimension > :: type bc ,
+ vigra::TinyVector<bc_code,input_array_type::actual_dimension> bc ,
int nbpoles ,
const double * pole ,
double tolerance ,
diff --git a/mapping.h b/mapping.h
index 49fb480..557733f 100644
--- a/mapping.h
+++ b/mapping.h
@@ -117,7 +117,7 @@ using namespace std ;
using namespace vigra ;
/// class split_type contains n-dimensional 'split coordinates', consisting of the
-/// integral and fracional parts of the original real coordinates, separated so that
+/// integral and fracional part of the original real coordinates, separated so that
/// they can be processed by the evaluation routine.
template < int N ,
@@ -129,9 +129,9 @@ struct split_type
typedef FT rc_type ;
typedef FT value_type ;
enum { dimension = N } ;
- typedef typename aggregate_traits < IT , N > :: type select_type ;
+ typedef vigra::TinyVector < IT , N > select_type ;
select_type select ; ///< named select because it selects the range of coefficients
- typedef typename aggregate_traits < FT , N > :: type tune_type ;
+ typedef vigra::TinyVector < FT , N > tune_type ;
tune_type tune ; ///< named tune because it is the base for the weights
} ;
@@ -1200,19 +1200,21 @@ template < typename ic_t , typename rc_t , int dimension , int vsize = 1 >
struct nd_mapping
{
typedef mapping < ic_t , rc_t , vsize > mapping_type ;
- typedef typename aggregate_traits < mapping_type , dimension > :: type nd_mapping_type ;
+ typedef typename vigra::TinyVector < mapping_type , dimension > nd_mapping_type ;
- typedef typename aggregate_traits < bc_code , dimension > :: type bcv_type ;
- typedef typename aggregate_traits < rc_t , dimension > :: type nd_rc_t ;
- typedef typename aggregate_traits < ic_t , dimension > :: type nd_ic_t ;
+ typedef typename vigra::TinyVector < bc_code , dimension > bcv_type ;
+ typedef typename vigra::MultiArrayShape < dimension > :: type shape_type ;
+
+ typedef vigra::TinyVector < rc_t , dimension > nd_rc_t ;
+ typedef vigra::TinyVector < ic_t , dimension > nd_ic_t ;
typedef split_type < dimension , ic_t , rc_t > split_t ;
#ifdef USE_VC
typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
- typedef typename aggregate_traits < ic_v , dimension > :: type nd_ic_v ;
- typedef typename aggregate_traits < rc_v , dimension > :: type nd_rc_v ;
+ typedef vigra::TinyVector < ic_v , dimension > nd_ic_v ;
+ typedef vigra::TinyVector < rc_v , dimension > nd_rc_v ;
#endif
@@ -1227,17 +1229,19 @@ struct nd_mapping
nd_mapping ( const bcv_type& _bcv ,
const int& _spline_degree ,
- const nd_ic_t& _shape )
+ const shape_type& _shape )
: bcv ( _bcv ) ,
spline_degree ( _spline_degree ) ,
shape ( _shape )
{
+ // here vigra's shape_type, which uses long ints, is explicitly cast down to int.
+ // TODO is this a problem?
for ( int d = 0 ; d < dimension ; d++ )
- map [ d ] = mapping_type ( bcv[d] , spline_degree , shape[d] ) ;
+ map [ d ] = mapping_type ( bcv[d] , spline_degree , ic_t ( shape[d] ) ) ;
}
- nd_mapping()
- { } ;
+// nd_mapping()
+// { } ;
// /// convenience variant taking a single boundary condition code, which is used for all axes
//
diff --git a/prefilter.h b/prefilter.h
index ed6179a..0b06b47 100644
--- a/prefilter.h
+++ b/prefilter.h
@@ -158,7 +158,7 @@ template < typename input_array_type , // type of array with knot point dat
typename output_array_type > // type of array for coefficients (may be the same)
void solve_vigra ( input_array_type & input ,
output_array_type & output ,
- typename aggregate_traits < bc_code , input_array_type::actual_dimension > :: type bcv ,
+ TinyVector<bc_code,input_array_type::actual_dimension> bcv ,
int degree ,
double tolerance ,
int nslices = ncores )
@@ -187,7 +187,7 @@ void solve_vigra ( input_array_type & input ,
{
filter_nd ( input ,
output ,
- typename aggregate_traits < bc_code , input_array_type::actual_dimension > :: type ( bc ) ,
+ TinyVector<bc_code,input_array_type::actual_dimension> ( bc ) ,
degree / 2 ,
precomputed_poles [ degree ] ,
tolerance ,
@@ -201,7 +201,7 @@ template < typename input_array_type , // type of array with knot point dat
typename output_array_type > // type of array for coefficients (may be the same)
void solve_vc ( input_array_type & input ,
output_array_type & output ,
- typename aggregate_traits < bc_code , input_array_type::actual_dimension > :: type bcv ,
+ TinyVector<bc_code,input_array_type::actual_dimension> bcv ,
int degree ,
double tolerance ,
int nslices = ncores )
@@ -230,7 +230,7 @@ void solve_vc ( input_array_type & input ,
{
filter_nd ( input ,
output ,
- typename aggregate_traits < bc_code , input_array_type::actual_dimension > :: type ( bc ) ,
+ TinyVector<bc_code,input_array_type::actual_dimension> ( bc ) ,
degree / 2 ,
precomputed_poles [ degree ] ,
tolerance ,
diff --git a/remap.h b/remap.h
index 9f9ee37..f32e624 100644
--- a/remap.h
+++ b/remap.h
@@ -117,9 +117,9 @@ struct coordinate_traits < vigra::TinyVector < T , N > >
// TODO: I'd like to write it like this, but it won't compile:
// template < typename T , int N >
-// struct coordinate_traits < typename aggregate_traits < T , N > :: type >
+// struct coordinate_traits < vigra::TinyVector < T , N > >
// {
-// typedef typename aggregate_traits < T , N > :: type::value_type rc_type ;
+// typedef vigra::TinyVector < T , N >::value_type rc_type ;
// } ;
/// since remap can also operate with pre-split coordinates, we need another partial
@@ -313,7 +313,7 @@ int remap ( const bspline < value_type , dim_in > & bspl ,
/// one-shot remaps where the spline isn't reused.
template < int dimension >
-using bcv_type = typename aggregate_traits < bc_code , dimension > :: type ;
+using bcv_type = vigra::TinyVector < bc_code , dimension > ;
template < class value_type , // like, float, double, complex<>, pixels, TinyVectors etc.
class coordinate_type , // usually TinyVector of float for coordinates
@@ -364,8 +364,8 @@ class warper
{
public:
- typedef typename aggregate_traits < ic_type , coordinate_dimension > :: type nd_ic_type ;
- typedef typename aggregate_traits < rc_type , coordinate_dimension > :: type nd_rc_type ;
+ typedef vigra::TinyVector < ic_type , coordinate_dimension > nd_ic_type ;
+ typedef vigra::TinyVector < rc_type , coordinate_dimension > nd_rc_type ;
vigra::MultiArray < dimension , nd_ic_type > _select ;
vigra::MultiArray < dimension , nd_rc_type > _tune ;
@@ -551,8 +551,8 @@ template < class rc_type , // elementary type for coordinates, usually flo
int dim_in , // number of dimensions of input array (and of 'warp' coordinates)
int dim_out > // number of dimensions of output array
using transform
-= std::function < void ( const typename aggregate_traits < rc_type , dim_out > :: type & ,
- typename aggregate_traits < rc_type , dim_in > :: type & ) > ;
+= std::function < void ( const vigra::TinyVector < rc_type , dim_out > & ,
+ vigra::TinyVector < rc_type , dim_in > & ) > ;
#ifdef USE_VC
@@ -576,8 +576,8 @@ template < class rc_v , // simdized type of vsize real coordinates
int dim_out > // number of dimensions of output array
using vtransform
= std::function < void
- ( const typename aggregate_traits < rc_v , dim_out > :: type & ,
- typename aggregate_traits < rc_v , dim_in > :: type & ) > ;
+ ( const vigra::TinyVector < rc_v , dim_out > & ,
+ vigra::TinyVector < rc_v , dim_in > & ) > ;
#endif
@@ -624,16 +624,16 @@ class transformation
transform_type tf ; /// functor for single-value coordinate transformation
- typedef typename aggregate_traits < rc_type , dim_in > :: type rc_in_type ;
- typedef typename aggregate_traits < rc_type , dim_out > :: type rc_out_type ;
+ typedef vigra::TinyVector < rc_type , dim_in > rc_in_type ;
+ typedef vigra::TinyVector < rc_type , dim_out > rc_out_type ;
#ifdef USE_VC
typedef rc_v < rc_type , value_type > rc_v_type ;
const int vsize = rc_v_type::Size ;
- typedef typename aggregate_traits < rc_v_type , dim_in > :: type rc_v_in_type ;
- typedef typename aggregate_traits < rc_v_type , dim_out > :: type rc_v_out_type ;
+ typedef vigra::TinyVector < rc_v_type , dim_in > rc_v_in_type ;
+ typedef vigra::TinyVector < rc_v_type , dim_out > rc_v_out_type ;
typedef vtransform < rc_v_type , dim_in , dim_out > vtransform_type ;
@@ -739,11 +739,11 @@ template < class _rc_v , int _dimension >
struct coordinate_iterator_v
{
enum { dimension = _dimension } ;
- typedef typename aggregate_traits < int , dimension > :: type shape_type ;
+ typedef vigra::TinyVector < int , dimension > shape_type ;
typedef _rc_v rc_v ;
enum { vsize = rc_v::Size } ;
- typedef typename aggregate_traits < rc_v , dimension > :: type nd_rc_v ; // vectorized n-dimensional coordinate
+ typedef vigra::TinyVector < rc_v , dimension > nd_rc_v ; // vectorized n-dimensional coordinate
shape_type start ;
shape_type end ;
@@ -845,10 +845,10 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
const int vsize = evaluator_type::vsize ;
// incoming coordinates (into output, which has dim_out dims)
- typename aggregate_traits < rc_v , dim_out > :: type c_in ;
+ vigra::TinyVector < rc_v , dim_out > c_in ;
// transformed coordinates (into input, which has dim_in dims)
- typename aggregate_traits < rc_v , dim_in > :: type c_out ;
+ vigra::TinyVector < rc_v , dim_in > c_out ;
mc_ele_v result ; // result, as struct of vectors
#endif
@@ -895,8 +895,8 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
#endif // USE_VC
- typename aggregate_traits < rc_type , dim_out > :: type cs_in ;
- typename aggregate_traits < rc_type , dim_in > :: type cs_out ;
+ vigra::TinyVector < rc_type , dim_out > cs_in ;
+ vigra::TinyVector < rc_type , dim_in > cs_out ;
// process leftovers, if any - if vc isn't used, this loop does all the processing
while ( leftover-- )
@@ -994,7 +994,7 @@ template < class value_type , // like, float, double, complex<>, pixels, Ti
int dim_out > // number of dimensions of output array
int st_make_warp_array ( transformation < value_type , rc_type , dim_in , dim_out > tf ,
MultiArrayView < dim_out ,
- typename aggregate_traits < rc_type , dim_in > :: type >
+ vigra::TinyVector < rc_type , dim_in > >
& warp_array ,
typename MultiArrayView < dim_out , value_type > :: difference_type offset ,
bool use_vc = true
@@ -1004,12 +1004,12 @@ int st_make_warp_array ( transformation < value_type , rc_type , dim_in , dim_ou
/// generated by vigra's CoupledIterator or, in the vectorized code,
/// by coordinate_iterator_v
- typedef typename aggregate_traits < rc_type , dim_out > :: type nd_rc_in ;
+ typedef vigra::TinyVector < rc_type , dim_out > nd_rc_in ;
/// transformed coordinates (into input, which has dim_in dims)
/// populating the resulting warp array
- typedef typename aggregate_traits < rc_type , dim_in > :: type nd_rc_out ;
+ typedef vigra::TinyVector < rc_type , dim_in > nd_rc_out ;
nd_rc_in cs_in ;
nd_rc_out cs_out ;
@@ -1025,10 +1025,10 @@ int st_make_warp_array ( transformation < value_type , rc_type , dim_in , dim_ou
typedef typename vector_traits < rc_type , vsize > :: type rc_v ;
/// vecorized incoming coordinates (into output, which has dim_out dims)
- typedef typename aggregate_traits < rc_v , dim_out > :: type nd_rc_in_v ;
+ typedef vigra::TinyVector < rc_v , dim_out > nd_rc_in_v ;
/// vectorized transformed coordinates (into input, which has dim_in dims)
- typedef typename aggregate_traits < rc_v , dim_in > :: type nd_rc_out_v ;
+ typedef vigra::TinyVector < rc_v , dim_in > nd_rc_out_v ;
if ( use_vc )
{
@@ -1076,7 +1076,7 @@ template < class value_type , // like, float, double, complex<>, pixels, Ti
int dim_out > // number of dimensions of output array
int make_warp_array ( transformation < value_type , rc_type , dim_in , dim_out > tf ,
MultiArrayView < dim_out ,
- typename aggregate_traits < rc_type , dim_in > :: type >
+ vigra::TinyVector < rc_type , dim_in > >
warp_array ,
bool use_vc = true ,
int nthreads = ncores
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git