[vspline] 32/72: worked on eval.h; now have specialized code for NN and linear interpolation inside class evaluator, which made my handcoded interpolators in pv obsolete. Also added a new mapping mode fo splines made with REFLECT bcs which don't actually need the reflection and can be evaluated RAW. The new mapping mode RRAW takes care of the brace which may be widened.

Sun Jul 2 09:02:40 UTC 2017

This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit 188fd4181e62fff596e4d0e6a9c42c219ff01d39
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Mon Feb 6 11:50:30 2017 +0100

    worked on eval.h; now have specialized code for NN and linear interpolation
    inside class evaluator, which made my handcoded interpolators in pv obsolete.
    Also added a new mapping mode fo splines made with REFLECT bcs which don't
    actually need the reflection and can be evaluated RAW. The new mapping mode
    RRAW takes care of the brace which may be widened.
---
 common.h  |   6 +-
 eval.h    | 204 +++++++++++++++++++++++++++++++++++++++++++++++++++-----------
 mapping.h |  50 ++++++++++++++-
 3 files changed, 220 insertions(+), 40 deletions(-)

diff --git a/common.h b/common.h
index e593982..6cfe19d 100644
--- a/common.h
+++ b/common.h
@@ -205,7 +205,8 @@ typedef enum { PERIODIC,   ///< periodic boundary conditions / periodic mapping
                TAG ,       ///< mapping mode, out-of-bounds coordinates produce delta -2.0
                TAGP ,      ///< mapping mode, like TAG, but ceiling 1 higher
                RTAG ,      ///< mapping mode, like TAG, but for widened brace
-               RAW   ,     ///< mapping mode, processes coordinates unchecked, may crash the program
+               RAW ,       ///< mapping mode, processes coordinates unchecked, may crash the program
+               RRAW
 } bc_code;
 
 /// This enumeration is used by the convenience class 'bspline' to determine the prefiltering
@@ -239,7 +240,8 @@ const std::string bc_name[] =
   "TAG" ,
   "TAGP" ,
   "RTAG" ,
-  "RAW"
+  "RAW" ,
+  "RRAW"
 } ;
 
 } ; // end of namespace vspline
diff --git a/eval.h b/eval.h
index 4886e8f..c41c7e6 100644
--- a/eval.h
+++ b/eval.h
@@ -464,7 +464,8 @@ template < int _dimension ,         ///< dimension of the spline
            class _value_type ,      ///< type of spline coefficients/result (pixels etc.)
            class _rc_type ,         ///< singular real coordinate, float or double
            class _ic_type ,         ///< singular integral coordinate, currently only int
-           bool use_tag = false >   ///< flag switching tag checking on/off
+           bool use_tag = false ,   ///< flag switching tag checking on/off
+           int specialize = -1 >
 class evaluator
 // we could inherit from class interpolator, since class evaluator satisfies the
 // interpolator interface, but there's no advantage in doing so.
@@ -842,6 +843,38 @@ public:
     }
   } ;
 
+  template < int level , class dtype >
+  struct _eval_linear
+  {
+    dtype operator() ( const dtype* & pdata ,
+                       offset_iterator & ofs ,
+                       const nd_rc_type& tune
+                     ) const
+    {
+      dtype result = ( 1.0 - tune[level] )
+                     * _eval_linear < level - 1 , dtype >() ( pdata , ofs , tune ) ;
+      result += tune[level]
+                * _eval_linear < level - 1 , dtype >() ( pdata , ofs , tune ) ;
+      return result ;
+    }
+  } ;
+
+  template < class dtype >
+  struct _eval_linear < 0 , dtype >
+  {
+    dtype operator() ( const dtype* & pdata ,
+                       offset_iterator & ofs ,
+                       const nd_rc_type& tune
+                     ) const
+    {
+      dtype result = ( 1.0 - tune[level] ) * pdata [ *ofs ] ;
+      ++ofs ;
+      result += tune[level] * pdata [ *ofs ] ;
+      ++ofs ;
+      return result ;
+    }
+  } ;
+
   /// _xeval takes an ele_type** 'weight', pointing to level ele_type*, which each
   /// point to ORDER weights. This variant is used when the per-axis components
   /// of the weights appear repeatedly,like in mesh grid processing.
@@ -948,21 +981,39 @@ public:
               const nd_rc_type& tune ,
               value_type & result ) const
   {
-    // To calculate the weights we want efficient storage, being very much inner-loop here,
-    // but at the same time we want adequate packaging for the weights as well. So we obtain
-    // the memory by creating an adequately-sized C++ array right here (cheap, no new needed)
-    // and package it in a MultiArrayView, which is also cheap.
-
-    ele_type weight_data [ workspace_size ] ; // get space for the weights
-    MultiArrayView < 2 , ele_type >           // package them in a MultiArrayView
-     weight ( Shape2 ( ORDER , dimension ) ,  // with dimension sets of ORDER weights
-              weight_data ) ;                 // using the space claimed above
-    
-    // now we call obtain_weights, which will fill in 'weight' with weights for the
-    // given set of fractional coordinate parts in 'tune'
+    if ( specialize == 0 )
+    {
+      // nearest neighbour. simply pick the coefficient
+      result = coefficients [ sum ( select * coefficients.stride() ) ] ;
+    }
+    else if ( specialize == 1 )
+    {
+      // linear interpolation. use specialized _eval_linear object
+      const value_type * base =   coefficients.data()
+                                + sum ( select * coefficients.stride() ) ;
+      offset_iterator ofs = offsets.begin() ;  // offsets reflects the positions inside the subarray
+      result = _eval_linear<level,value_type>() ( base , ofs , tune ) ;
+    }
+    else
+    {
+      // general case. this works for degree 0 and 1 as well, but is less efficient
+
+      // To calculate the weights we want efficient storage, being very much inner-loop here,
+      // but at the same time we want adequate packaging for the weights as well. So we obtain
+      // the memory by creating an adequately-sized C++ array right here (cheap, no new needed)
+      // and package it in a MultiArrayView, which is also cheap.
+
+      ele_type weight_data [ workspace_size ] ; // get space for the weights
+      MultiArrayView < 2 , ele_type >           // package them in a MultiArrayView
+      weight ( Shape2 ( ORDER , dimension ) ,  // with dimension sets of ORDER weights
+                weight_data ) ;                 // using the space claimed above
+      
+      // now we call obtain_weights, which will fill in 'weight' with weights for the
+      // given set of fractional coordinate parts in 'tune'
 
-    obtain_weights ( weight , tune ) ;
-    eval ( select , weight , result ) ;   // delegate
+      obtain_weights ( weight , tune ) ;
+      eval ( select , weight , result ) ;   // delegate
+    }
   }
 
   /// this variant of eval() takes a split coordinate. This method isn't used
@@ -1199,6 +1250,67 @@ public:
     }  
   } ;
 
+  // for linear (degree=1) interpolation we use a specialized routine
+  // which is slightly faster, bcause it directly uses the 'tune' values
+  // instead of building an array of weights first and passing that in.
+
+  template < class dtype , int level >
+  struct _v_eval_linear
+  {
+    dtype operator() ( const component_base_type& base , // base adresses of components
+                       const ic_v& origin ,              // offsets to evaluation window origins
+                       offset_iterator & ofs ,           // offsets to coefficients inside this window
+                       const nd_rc_v& tune ) const       // weights to apply
+    {
+      dtype sum ;    ///< to accumulate the result
+      dtype subsum ;
+
+      sum = _v_eval_linear < dtype , level - 1 >() ( base , origin , ofs , tune ) ;
+      for ( int ch = 0 ; ch < channels ; ch++ )
+      {
+        sum[ch] *= ( rc_type ( 1.0 ) - tune [ level ] ) ;
+      }
+      
+      subsum = _v_eval_linear < dtype , level - 1 >() ( base , origin , ofs , tune );
+      for ( int ch = 0 ; ch < channels ; ch++ )
+      {
+        sum[ch] += ( tune [ level ] ) * subsum[ch] ;
+      }
+      
+      return sum ;
+    }  
+  } ;
+
+  /// the level 0 routine terminates the recursion
+  
+  template < class dtype >
+  struct _v_eval_linear < dtype , 0 >
+  {
+    dtype operator() ( const component_base_type& base , // base adresses of components
+                       const ic_v& origin ,              // offsets to evaluation window origins
+                       offset_iterator & ofs ,           // offsets to coefficients in this window
+                       const nd_rc_v& tune ) const       // weights to apply
+    {
+      dtype sum ;
+      auto o1 = *ofs ;
+      ++ofs ;
+      auto o2 = *ofs ;
+      ++ofs ;
+      
+      for ( int ch = 0 ; ch < channels ; ch++ )
+      {
+        sum[ch] = ( rc_type ( 1.0 ) - tune [ 0 ] )
+                  * ele_v ( base[ch] , origin + o1 ) ;
+                  
+        sum[ch] += tune [ 0 ]
+                   * ele_v ( base[ch] , origin + o2 ) ; 
+      }
+      
+      
+      return sum ;
+    }  
+  } ;
+
 //   template < class dtype , int level >
 //   struct _v_xeval
 //   {
@@ -1290,31 +1402,52 @@ public:
                 const nd_rc_v& tune ,      // fractional parts of the coordinates
                 mc_ele_v & result ) const  // target
   {
-    // like in the unvectorized code, the 2D MultiArrayView to the weights is created
-    // as a view to data on the stack (weight_data) which is lightweight and fast
+    if ( specialize == 0 )
+    {
+      // nearest neighbour. here, no weights are needed and we can instantly
+      // gather the data from the coefficient array.
+
+      for ( int ch = 0 ; ch < channels ; ch++ )
+        result[ch].gather ( component_base[ch] , select ) ;
+    }
+    else if ( specialize == 1 )
+    {
+      // linear interpolation. this uses a specialized _v_eval object which
+      // directly processes 'tune' instead of creating an array of weights first.
+
+      offset_iterator ofs = component_offsets.begin() ;
     
-    // ... but clang++ does not accept this (variable sized array of non-POD type)
-    // When the second variant is used, the code runs here compiled with clang++,
-    // but not when compiled with gcc (probably due to an alignment problem),
-    // so I have this TODO ugly ifdef :( and TODO make sure the clang-specific code
-    // is safe regarding alignment
-
- #ifndef __clang__
-    ele_v weight_data [ workspace_size ] ;
-    MultiArrayView < 2 , ele_v > weight ( Shape2 ( ORDER , dimension ) , weight_data ) ;
+      result = _v_eval_linear < mc_ele_v , level >()
+               ( component_base , select , ofs , tune ) ;
+    }
+    else
+    {
+      // like in the unvectorized code, the 2D MultiArrayView to the weights is created
+      // as a view to data on the stack (weight_data) which is lightweight and fast
+      
+      // ... but clang++ does not accept this (variable sized array of non-POD type)
+      // When the second variant is used, the code runs here compiled with clang++,
+      // but not when compiled with gcc (probably due to an alignment problem),
+      // so I have this TODO ugly ifdef :( and TODO make sure the clang-specific code
+      // is safe regarding alignment
+
+#ifndef __clang__
+      ele_v weight_data [ workspace_size ] ;
+      MultiArrayView < 2 , ele_v > weight ( Shape2 ( ORDER , dimension ) , weight_data ) ;
 #else
-    ele_type weight_data [ workspace_size * vsize ] ;
-    MultiArrayView < 2 , ele_v > weight ( Shape2 ( ORDER , dimension ) , (ele_v*)weight_data ) ;
+      ele_type weight_data [ workspace_size * vsize ] ;
+      MultiArrayView < 2 , ele_v > weight ( Shape2 ( ORDER , dimension ) , (ele_v*)weight_data ) ;
 #endif
 
-    // obtain_weights is the same code as for unvectorized operation, the arguments
-    // suffice to pick the right template arguments
-  
-    obtain_weights ( weight , tune ) ;
-    
-    // having obtained the weights, we delegate to the final delegate.
+      // obtain_weights is the same code as for unvectorized operation, the arguments
+      // suffice to pick the right template arguments
     
-    v_eval ( select , weight , result ) ;
+      obtain_weights ( weight , tune ) ;
+      
+      // having obtained the weights, we delegate to the final delegate.
+      
+      v_eval ( select , weight , result ) ;
+    }
 
     if ( use_tag )
     {
@@ -1332,7 +1465,6 @@ public:
         }
       }
     }
-
   }
 
   /// here we transform incoming nD coordinates into offsets into the coefficient
diff --git a/mapping.h b/mapping.h
index d07e3be..fafe8c5 100644
--- a/mapping.h
+++ b/mapping.h
@@ -950,6 +950,37 @@ public:
 #endif
 } ;
 
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class odd_mapping_reflect_raw
+{
+  
+public:
+  
+  odd_mapping_reflect_raw ()
+    { } ;
+  
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv ) const
+  {
+    v += rc_t ( 1.0 ) ;        // shift coordinate further to use with wider brace
+    odd_mapping_raw<ic_t,rc_t,vsize>() ( v , iv , fv ) ;
+  }
+  
+#ifdef USE_VC
+
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+
+  /// this is the operator() for vectorized operation
+
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv ) const
+  {
+    v += rc_t ( 1.0 ) ;        // shift coordinate further to use with wider brace
+    odd_mapping_raw<ic_t,rc_t,vsize>() ( v , iv , fv ) ;
+  }
+
+#endif
+} ;
+
 ///  mapping for REFLECT boundary conditions
 ///
 ///  for even splines this mapping will map coordinates to [ -0.5 - M-0.5 ]. for the 
@@ -1017,7 +1048,7 @@ map_func pick_mapping ( bc_code bc ,        ///< mapping mode
       {
         // raw mapping simply splits the coordinate with no checks
         // whatsoever. So it's constructor doesn't take an argument.
-        odd_mapping_raw < ic_t , rc_t , vsize > () ;
+        return odd_mapping_raw < ic_t , rc_t , vsize > () ;
         break ;
       }
       case LIMIT :
@@ -1105,6 +1136,14 @@ map_func pick_mapping ( bc_code bc ,        ///< mapping mode
         return odd_mapping_reflect < ic_t , rc_t , vsize > ( M ) ;
         break ;
       }
+      case RRAW :
+      {
+        // like REFLECT, but doesn't apply the mirroring. the only difference
+        // to odd_mapping_raw is the shift of the input by +1, to account for the
+        // widened brace used with REFLECT boundary conditions
+        return odd_mapping_reflect_raw < ic_t , rc_t , vsize > () ;
+        break ;
+      }
       case PERIODIC :
       {
         return odd_mapping_periodic < ic_t , rc_t , vsize > ( M + 1 ) ;
@@ -1124,7 +1163,7 @@ map_func pick_mapping ( bc_code bc ,        ///< mapping mode
     {
       case RAW :
       {
-        even_mapping_raw < ic_t , rc_t , vsize > () ;
+        return even_mapping_raw < ic_t , rc_t , vsize > () ;
         break ;
       }
       case LIMIT :
@@ -1179,6 +1218,13 @@ map_func pick_mapping ( bc_code bc ,        ///< mapping mode
         return even_mapping_reflect < ic_t , rc_t , vsize > ( M ) ;
         break ;
       }
+      case RRAW :
+      {
+        // like REFLECT, but doesn't apply the mirroring. for even splines,
+        // this is the same as even_mapping_raw.
+        return even_mapping_raw < ic_t , rc_t , vsize > () ;
+        break ;
+      }
       case PERIODIC :
       {
         return even_mapping_periodic < ic_t , rc_t , vsize > ( M + 1 ) ;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git