[vspline] 60/72: cleaned up code in filter.h, documentation
Kay F. Jahnke
kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:43 UTC 2017
This is an automated email from the git hooks/post-receive script.
kfj-guest pushed a commit to branch master
in repository vspline.
commit 244a232234d6f05bc090aa9bf878b33cfc9df621
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date: Sat May 13 17:13:14 2017 +0200
cleaned up code in filter.h, documentation
---
bspline.h | 2 +-
doxy.h | 2 +-
example/examples.sh | 2 +-
example/gradient.cc | 35 ++++++++-
filter.h | 203 ++++++++++++++++++----------------------------------
prefilter.h | 4 +-
6 files changed, 105 insertions(+), 143 deletions(-)
diff --git a/bspline.h b/bspline.h
index e01b528..dfc58ac 100644
--- a/bspline.h
+++ b/bspline.h
@@ -580,7 +580,7 @@ public:
// br ( coeffs , bcv[d] , spline_degree , d ) ;
break ;
case EXPLICIT:
- // first apply bracing with BC codes passed in, then solve with BC code GUESS
+ // first create frame with BC codes passed in, then solve with BC code GUESS
// this automatically fills the brace as well, since it's part of the frame.
// TODO: the values in the frame will not come out precisely the same as they
// would by filling the brace after the coefficients have been calculated.
diff --git a/doxy.h b/doxy.h
index 1fde37a..59dd8bb 100644
--- a/doxy.h
+++ b/doxy.h
@@ -81,7 +81,7 @@ On the evaluation side I provide
- evaluation of nD arrays of coordinates (generalized remap function)
- - remap-like functions processing nD arrays of data
+ - functor based remap (aka 'transform') and 'apply' functions
\section install_sec Installation
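To give an idea of the functor-based evaluation which the remap/'transform'/'apply' functions build on, here is a minimal sketch using the evaluator type which also appears in gradient.cc below. The spline 'bspl' is assumed to be set up and prefiltered as shown there, and the sketch assumes the evaluator's operator() returns the interpolated value for a single coordinate:

  typedef vigra::TinyVector < double , 3 > coordinate_type ;
  typedef vspline::evaluator < coordinate_type , double > evaluator_type ;

  evaluator_type ev ( bspl ) ;                    // functor wrapping the b-spline
  coordinate_type c ( 1.5 , 2.5 , 3.5 ) ;
  double v = ev ( c ) ;                           // single-coordinate evaluation

The remap/transform/apply family applies such a functor to whole arrays of coordinates or data, multithreaded and, with Vc, vectorized.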
diff --git a/example/examples.sh b/example/examples.sh
index fa7919a..e4de9cc 100755
--- a/example/examples.sh
+++ b/example/examples.sh
@@ -6,5 +6,5 @@ for f in $(ls *.cc)
do
body=$(basename $f .cc)
echo compiling $body
- clang++ -O3 -std=c++11 -march=native -pthread -o$body $f -lVc -lvigraimpex
+ clang++ -O3 -std=c++11 -DUSE_VC -march=native -pthread -o$body $f -lVc -lvigraimpex
done
diff --git a/example/gradient.cc b/example/gradient.cc
index 1ecf8a7..e059cf7 100644
--- a/example/gradient.cc
+++ b/example/gradient.cc
@@ -39,7 +39,7 @@
/// This is a good test for both the precision of the evaluation and its
/// correct functioning, particularly with higher-D arrays.
///
-/// compile: clang++ -std=c++11 -pthread -o gradient gradient.cc
+/// compile: clang++ -O3 -DUSE_VC -march=native -std=c++11 -pthread -o gradient gradient.cc -lVc
#include <vspline/vspline.h>
#include <random>
@@ -53,29 +53,58 @@ int main ( int argc , char * argv[] )
typedef typename spline_type::view_type view_type ;
typedef typename spline_type::bcv_type bcv_type ;
- shape_type core_shape = { 10 , 4 , 13 } ;
- spline_type bspl ( core_shape , 3 , bcv_type ( vspline::NATURAL ) ) ; // , vspline::EXPLICIT ) ;
+ // let's have a knot point array with nicely odd shape
+
+ shape_type core_shape = { 35 , 43 , 19 } ;
+
+ // we have to use a longish call to the constructor since we want to pass
+ // 0.0 to 'tolerance' and it's way down in the argument list, so we have to
+ // explicitly pass a few arguments which usually take default values before
+ // we have a chance to pass the tolerance
+
+ spline_type bspl ( core_shape , // shape of knot point array
+ 3 , // cubic b-spline
+ bcv_type ( vspline::NATURAL ) , // natural boundary conditions
+ vspline::BRACED , // implicit scheme, bracing coeffs
+ -1 , // default, not using EXPLICIT
+ view_type() , // empty view
+ 0.0 ) ; // tolerance 0.0 for this test!
+
+ // get a view to the bspline's core, to fill it with data
+
view_type core = bspl.core ;
+ // create the gradient in each dimension
+
for ( int d = 0 ; d < bspl.dimension ; d++ )
{
for ( int c = 0 ; c < core_shape[d] ; c++ )
core.bindAt ( d , c ) += c ;
}
+ // now prefilter the spline
+
bspl.prefilter() ;
+ // set up an evaluator
+
typedef vigra::TinyVector < double , 3 > coordinate_type ;
typedef vspline::evaluator < coordinate_type , double > evaluator_type ;
evaluator_type ev ( bspl ) ;
+ // we want to bombard the evaluator with random in-range coordinates
+
std::random_device rd;
std::mt19937 gen(rd());
// std::mt19937 gen(12345); // fix starting value for reproducibility
coordinate_type c ;
+ // here comes our test, feed 100 random 3D coordinates and compare the
+ // evaluator's result with the expected value, which is precisely the
+ // sum of the coordinate's components
+
for ( int times = 0 ; times < 100 ; times++ )
{
for ( int d = 0 ; d < bspl.dimension ; d++ )
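The hunk ends here; as a hedged sketch of how the test described in the comments above can proceed (random in-range coordinate, evaluate, compare with the sum of the components - the distribution bounds and the pass/fail threshold are assumptions, not taken from the file, and <cmath>/<cassert> are assumed included):

  for ( int d = 0 ; d < bspl.dimension ; d++ )
  {
    std::uniform_real_distribution < double > dis ( 0.0 , core_shape[d] - 1.0 ) ;
    c[d] = dis ( gen ) ;                   // random in-range coordinate component
  }
  double result = ev ( c ) ;               // evaluate the spline at c
  double expected = c[0] + c[1] + c[2] ;   // the gradient planes sum up linearly
  assert ( std::fabs ( result - expected ) < 1e-10 ) ; // threshold is an assumption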
diff --git a/filter.h b/filter.h
index 4e2bb70..5bbad9a 100644
--- a/filter.h
+++ b/filter.h
@@ -766,7 +766,7 @@ filter ( int _M , ///< number of input/output elements (DataLength
// n-dimensional arrays. We use the following strategy:
// - perform the prefiltering collinear to each axis separately
// - when processing a specific axis, split the array(s) into chunks and use one job per chunk
-// - perform a traverse on each chunk, copying out 1D subsets collinear to the processing axis
+// - perform a traverse on each chunk, copying out subsets collinear to the processing axis
// to a buffer
// - perform the filter on the buffer
// - copy the filtered data to the target
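To make the buffering pattern concrete, a minimal single-line sketch (names made up; the actual code below moves many lines at once and, with Vc, vectorizes across them):

  // copy one strided line out to a contiguous buffer, filter it, copy it back
  template < typename value_type , typename filter_type >
  void filter_one_line ( value_type * line ,   // start of a 1D subset of the array
                         int stride ,          // stride between its elements
                         int n ,               // number of elements
                         filter_type & flt ,   // 1D forward/backward filter
                         value_type * buffer ) // contiguous scratch memory
  {
    for ( int i = 0 ; i < n ; i++ )
      buffer[i] = line [ i * stride ] ;        // copy out to the buffer
    flt ( buffer , n ) ;                       // filter the buffer in place
    for ( int i = 0 ; i < n ; i++ )
      line [ i * stride ] = buffer[i] ;        // copy the result back
  }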
@@ -964,16 +964,6 @@ void nonaggregating_filter ( range_type < typename source_view_type::difference_
#ifdef USE_VC
-/* currently unused
-// test if an index vector contains sequential indices
-
-template < typename IT >
-bool sequential ( const IT & indexes )
-{
- return Vc::all_of ( indexes - indexes[0] == IT::IndexesFromZero() ) ;
-}
-*/
-
/// extended gather and scatter routines taking 'extrusion parameters'
/// which handle how many times and with which stride the gather/scatter
/// operation is repeated. With these routines, strided memory can be
@@ -1003,38 +993,21 @@ gather ( const typename source_type::value_type * source ,
)
{
register source_type x ;
+ // if the mask is all-true, load the data with an unmasked gather operation
if ( mask.isFull() )
{
-// this is strange: using this code is actually slower on my system:
-//
-// if ( sequential ( indexes ) )
-// {
-// // set 'source' to proper starting point
-// source += int ( indexes[0] ) ;
-// while ( count-- )
-// {
-// x.load ( source ) ;
-// x.store ( target , Vc::Aligned ) ;
-// source += stride ;
-// target += VT::Size ;
-// }
-// }
-// else
+ while ( count-- )
{
- while ( count-- )
- {
- x.gather ( source , indexes ) ;
- auto tx = target_type ( x ) ;
- tx.store ( target , Vc::Aligned ) ;
- source += stride ;
- target += target_type::Size ;
- }
+ x.gather ( source , indexes ) ;
+ auto tx = target_type ( x ) ;
+ tx.store ( target , Vc::Aligned ) ;
+ source += stride ;
+ target += target_type::Size ;
}
}
else
{
- // for both sequential and non-sequential indexes,
- // if there is a mask, perform a gather operation
+ // if there is a partially filled mask, perform a masked gather operation
while ( count-- )
{
x.gather ( source , indexes , mask ) ;
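A hedged sketch of the 'extrusion' idea in isolation, with plain Vc 1.x calls and a signature simplified relative to the routine above:

  // repeat a (masked) gather 'count' times, advancing 'source' by 'stride'
  // and 'target' by one vector's worth of elements after every repetition
  template < typename VT >                  // a Vc vector type, e.g. Vc::float_v
  void gather_extruded ( const typename VT::EntryType * source ,
                         typename VT::EntryType * target ,
                         const typename VT::IndexType & indexes ,
                         const typename VT::MaskType & mask ,
                         int stride ,       // distance between successive gathers
                         int count )        // how often to repeat the gather
  {
    VT x ;
    while ( count-- )
    {
      x.gather ( source , indexes , mask ) ; // masked gather from strided memory
      x.store ( target , Vc::Aligned ) ;     // contiguous store to the buffer
      source += stride ;                     // advance to the next batch
      target += VT::Size ;
    }
  }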
@@ -1057,38 +1030,21 @@ scatter ( const typename source_type::value_type * source ,
)
{
register source_type x ;
+ // if the mask is all-true, store the data with an unmasked scatter operation
if ( mask.isFull() )
{
-// this is strange: using this code is actually slower on my system:
-//
-// if ( sequential ( indexes ) )
-// {
-// // set 'target' to proper starting point
-// target += int ( indexes[0] ) ;
-// while ( count-- )
-// {
-// x.load ( source , Vc::Aligned ) ;
-// x.store ( target ) ;
-// source += VT::Size ;
-// target += stride ;
-// }
-// }
-// else
+ while ( count-- )
{
- while ( count-- )
- {
- x.load ( source , Vc::Aligned ) ;
- auto tx = target_type ( x ) ;
- tx.scatter ( target , indexes ) ;
- source += source_type::Size ;
- target += stride ;
- }
+ x.load ( source , Vc::Aligned ) ;
+ auto tx = target_type ( x ) ;
+ tx.scatter ( target , indexes ) ;
+ source += source_type::Size ;
+ target += stride ;
}
}
else
{
- // for both sequential and non-sequential indexes,
- // if there is a mask, perform a scatter operation
+ // if there is a partially filled mask, perform a masked scatter operation
while ( count-- )
{
x.load ( source , Vc::Aligned ) ;
@@ -1100,8 +1056,8 @@ scatter ( const typename source_type::value_type * source ,
}
}
-/// aggregating_filter keeps a buffer of vector-aligned memory, which it fills
-/// with 1D subarrays of the source array which are collinear to the processing axis.
+/// aggregating_filter keeps a buffer of vector-aligned memory, which it fills with
+/// vsize 1D subarrays of the source array which are collinear to the processing axis.
/// The buffer is then submitted to vectorized forward-backward recursive filtering
/// and finally stored back to the corresponding memory area in target, which may
/// be the same as source, in which case the operation is seemingly performed
@@ -1224,11 +1180,13 @@ void ele_aggregating_filter ( source_view_type &source ,
if ( e < vsize )
- // have got less than vsize only? mask the operation
+ // have got less than vsize? must be the last few items.
+ // mask was all-true before, so now we limit it to the first e fields:
mask = ( simdized_math_type::IndexesFromZero() < e ) ;
- // perform extended gather with extrusion parameters
+ // perform extended gather with extrusion parameters to transport the unfiltered data
+ // to the buffer
gather < simdized_source_type , simdized_math_type >
( source_base_adress ,
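To illustrate the masking of the last batch described above: with vsize lanes and only e valid items left, comparing the lane indices against e yields a mask which is true precisely for the first e lanes, mirroring the assignment in the code:

  auto lane = simdized_math_type::IndexesFromZero() ; // 0 , 1 , ... , vsize - 1
  mask = ( lane < e ) ;   // e.g. e == 3 , vsize == 8  ->  T T T F F F F F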
@@ -1260,6 +1218,7 @@ void ele_aggregating_filter ( source_view_type &source ,
// pretty much the same as the previous operation, with the distinction that
// copying the filtered data from the buffer to the target now needs its own set of
// indexes etc., since all these may be different.
+
// TODO we might permute source_slice's strides to ascending and apply the same
// permutation to target_slice.
@@ -1335,41 +1294,29 @@ void aggregating_filter ( range_type < typename source_type::difference_type > r
auto source = p_original_source->subarray ( range[0] , range[1] ) ;
auto target = p_original_target->subarray ( range[0] , range[1] ) ;
- if ( vigra::ExpandElementResult < value_type > :: size != 1 )
- {
- // value_type is an aggregate type, but we want to operate on elementary types
- // so we element-expand the array and call ele_aggregating_filter, which works on
- // arrays with elementary types.
- auto expanded_source = source.expandElements ( 0 ) ;
- auto expanded_target = target.expandElements ( 0 ) ;
- ele_aggregating_filter < decltype ( expanded_source ) ,
- decltype ( expanded_target ) ,
- math_type >
- ( expanded_source ,
- expanded_target ,
- axis + 1 ,
- gain ,
- bc ,
- nbpoles ,
- pole ,
- tolerance ) ;
- }
- else
- {
- // this looks odd - why do we create new views here when the input
- // contains ele_type data already? Because if the input contains aggregates,
- // a direct call to ele_filter here wouldn't go away, but won't compile, since
- // in ele_aggregating_filter we try and typedef a vector of the contained
- // (aggregate) data type. Hence the acrobatics...
- vigra::MultiArrayView < source.actual_dimension , ele_type >
- es ( source.shape() , source.stride() , (ele_type*)(source.data()) ) ;
- vigra::MultiArrayView < target.actual_dimension , ele_type >
- et ( target.shape() , target.stride() , (ele_type*)(target.data()) ) ;
- ele_aggregating_filter < decltype ( es ) ,
- decltype ( es ) ,
- math_type >
- ( es , et , axis , gain , bc , nbpoles , pole , tolerance ) ;
- }
+ // value_type may be an aggregate type, but we want to operate on elementary types
+ // so we element-expand the array and call ele_aggregating_filter, which works on
+ // arrays with elementary types. If value_type is elementary already, the call to
+ // expandElements inserts a singleton dimension, but this has next to no performance
+ // impact, so contrary to my initial implementation I don't handle the 1-channel
+ // case separately any more.
+
+ auto expanded_source = source.expandElements ( 0 ) ;
+ auto expanded_target = target.expandElements ( 0 ) ;
+
+ // with the element-expanded arrays at hand, we can now delegate to ele_aggregating_filter:
+
+ ele_aggregating_filter < decltype ( expanded_source ) ,
+ decltype ( expanded_target ) ,
+ math_type >
+ ( expanded_source ,
+ expanded_target ,
+ axis + 1 ,
+ gain ,
+ bc ,
+ nbpoles ,
+ pole ,
+ tolerance ) ;
}
#endif
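For illustration, this is what expandElements ( 0 ) does in vigra terms (sizes made up):

  // a 2D array of 3-channel pixels becomes a 3D view of floats, with the
  // channel axis inserted as axis 0:
  vigra::MultiArray < 2 , vigra::TinyVector < float , 3 > >
    rgb ( vigra::Shape2 ( 640 , 480 ) ) ;
  auto xr = rgb.expandElements ( 0 ) ;   // MultiArrayView<3,float>, shape (3,640,480)

  // with an elementary value_type the inserted axis is a singleton:
  vigra::MultiArray < 2 , float > mono ( vigra::Shape2 ( 640 , 480 ) ) ;
  auto xm = mono.expandElements ( 0 ) ;  // shape (1,640,480)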
@@ -1379,7 +1326,7 @@ void aggregating_filter ( range_type < typename source_type::difference_type > r
///
/// filter_1d, which is the routine processing nD arrays along a specific axis, might as well
/// be a function. But functions can't be partially specialized (at least not with my compiler)
-/// so I use a functor instead, which, as a class, can be partially specialized. We want a
+/// so I use a functor instead, which, as a class, can be partially specialized. We'll want a
/// partial specialization for 1D arrays, where all of our usual schemes of multithreading and
/// vectorization don't intrinsically work and we have to employ a different method, see there.
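The underlying language rule in a nutshell (demo names are made up): function templates only allow full specialization, while class templates can be partially specialized, so the 1D case can get its own implementation:

  template < typename value_type , int dim >
  struct demo_filter
  {
    void operator() () { /* general nD code path */ }
  } ;

  template < typename value_type >
  struct demo_filter < value_type , 1 >   // partial specialization for 1D
  {
    void operator() () { /* special treatment for 1D arrays */ }
  } ;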
@@ -1390,7 +1337,7 @@ template < typename input_array_type , ///< type of array with knot point data
class filter_1d
{
public:
- void operator() ( input_array_type &input , ///< source data. the routine can also operate in-place
+ void operator() ( input_array_type &input , ///< source data. the routine can also operate in-place
output_array_type &output , ///< where input == output.
int axis ,
double gain ,
@@ -1401,40 +1348,24 @@ public:
int njobs = default_njobs ) ///< number of jobs to use when multithreading
{
typedef typename input_array_type::value_type value_type ;
- typedef typename input_array_type::difference_type shape_type ;
- typedef range_type < shape_type > range_t ;
- typedef partition_type < range_t > partition_t ;
- typedef typename vigra::MultiArray < 1 , value_type > ::iterator iter_type ;
-
- // multithread() expects a std::function, so here we fix the type of
- // std::function we want to pass to it, create an instance of the std::function
- // and assign the routine appropriate to the task:
-
- typedef
- std::function < void ( range_t ,
- input_array_type * ,
- output_array_type * ,
- int ,
- double ,
- bc_code ,
- int ,
- const double * ,
- double ) > filter_functor_type ;
-
- filter_functor_type filter_functor ;
-
- auto pf = & nonaggregating_filter < input_array_type ,
- output_array_type ,
- typename input_array_type::value_type > ; // for now...
-
+
+ // depending on whether Vc is used or not, we choose the appropriate (single-threaded)
+ // filtering routine, which is to be passed to multithread()
+
#ifdef USE_VC
typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
- pf = & aggregating_filter < input_array_type ,
- output_array_type ,
- ele_type > ;
+ auto pf = & aggregating_filter < input_array_type ,
+ output_array_type ,
+ ele_type > ;
+#else
+
+ auto pf = & nonaggregating_filter < input_array_type ,
+ output_array_type ,
+ value_type > ;
+
#endif
// obtain a partitioning of the data array into subranges. We do this 'manually' here
@@ -1442,10 +1373,10 @@ public:
// (by passing the 'forbid' parameter, which is taken as -1 (no effect) in the default
// partitioning of nD arrays).
- partition_t partitioning = shape_splitter < dim > :: part
- ( input.shape() ,
- njobs ,
- axis ) ;
+ auto partitioning = shape_splitter<dim>::part ( input.shape() , njobs , axis ) ;
+
+ // now use multithread() to distribute ranges of data to individual jobs which are
+ // executed by its thread pool.
multithread ( pf ,
partitioning ,
@@ -1469,6 +1400,8 @@ public:
/// With this 'cheat' we can handle 1D arrays with full multithreading and vectorization,
/// while the 'orthodox' approach would have to process the data in linear order with
/// a single thread. Cleaning up the 'dirty' margins is cheap for large arrays.
+/// The code makes guesses as to whether it's worthwhile to follow this strategy;
+/// the array has to be quite large before 'fake 2D processing' is actually applied.
template < typename input_array_type , ///< type of array with knot point data
typename output_array_type , ///< type of array for coefficients (may be the same)
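To make the 'cheat' concrete, a hedged sketch with vigra views (the lane count and sizes are invented; vigra's default layout keeps dimension 0 unstrided, so each line along axis 0 is one contiguous stretch of the original 1D data):

  int n = 1 << 20 , lanes = 16 ;           // sizes made up for the sketch
  vigra::MultiArray < 1 , float > data ( vigra::Shape1 ( n ) ) ;

  // present the same memory as 'lanes' contiguous lines of n / lanes values:
  vigra::MultiArrayView < 2 , float >
    fake_2d ( vigra::Shape2 ( n / lanes , lanes ) , data.data() ) ;

  // fake_2d can now be filtered along axis 0 with the multithreaded,
  // vectorized nD code; afterwards the stretches where one line wraps
  // into the next hold 'dirty' values and are recalculated correctly.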
diff --git a/prefilter.h b/prefilter.h
index 18fb2ea..ae49859 100644
--- a/prefilter.h
+++ b/prefilter.h
@@ -143,8 +143,8 @@ using namespace vigra ;
/// done very quickly, since no additional memory access is needed beyond a buffer's worth of data
/// already present in core memory.
///
-/// solve is just athin wrapper around filter_nd in filter.h, injecting the actual number of poles,
-/// the poles themselves and a flag wether to use vector code or not.
+/// solve is just a thin wrapper around filter_nd in filter.h, injecting the actual number of poles
+/// and the poles themselves.
///
/// Note how smoothing comes into play here: it's done simply by
/// prepending an additional pole to the filter cascade, taking a positive value between
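For reference, each pole z_i in the cascade feeds the standard pair of recursions (as in Unser's prefiltering papers; the notation is mine, not from the file):

  c^+_k = x_k + z_i \, c^+_{k-1}                    (causal pass)
  c^-_k = z_i \, ( c^-_{k+1} - c^+_k )              (anticausal pass)
  \lambda = \prod_i ( 1 - z_i ) ( 1 - 1 / z_i )     (overall gain)

A smoothing pole is simply one more factor in this cascade, so it costs one extra forward/backward pass and nothing else.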