[vspline] 04/72: broadened simdized type in prefilter, saved 1 thread two small changes with performance gains which surprised me. The first one is changing simdized_type in the vectorized prefiltering code from plain Vc::Vector<ele_type> to a SimdArray twice that size. The performance gain isn't much, but seems to be a few percent and every little helps. It's just one place in the code where the change is done and it trickles down, so it's easy to change back or try something else. The larger speedup came from a simple change in the multithreading code in common.h (divide_and_conquer, ~_2), where I changed from creating nthreads new threads for the nthreads chunks to creating only one thread less and processing the last chunk in the current thread instead of leaving it to idle until the other threads are done. Seems creating threads is reasonably expensive after all, I hadn't thougt the effect so large. Makes me wonder if it weren't better to have a thread pool and use that instead. Furthermore there were some small changes in the prefilter's type handling (mattype is now an optional template argument, removing the necessity of defining an element-expansion for simdized types) - and weeding out unneccessary 'using namespace vigra::multi_math' statements in several headers.

Kay F. Jahnke kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:37 UTC 2017


This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit 8692fea3160ba386b1563e08fd1b9948ccf3513d
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Thu Oct 20 11:37:55 2016 +0200

    broadened simdized type in prefilter, saved 1 thread
    two small changes with performance gains which surprised me.
    The first one is changing simdized_type in the vectorized prefiltering code
    from plain Vc::Vector<ele_type> to a SimdArray twice that size. The performance
    gain isn't much, but seems to be a few percent and every little helps. It's just
    one place in the code where the change is done and it trickles down, so it's
    easy to change back or try something else.
    The larger speedup came from a simple change in the multithreading code in
    common.h (divide_and_conquer, ~_2), where I changed from creating nthreads
    new threads for the nthreads chunks to creating only one thread less and processing
    the last chunk in the current thread instead of leaving it to idle until the
    other threads are done. Seems creating threads is reasonably expensive after all,
    I hadn't thougt the effect so large. Makes me wonder if it weren't better to have
    a thread pool and use that instead.
    Furthermore there were some small changes in the prefilter's type handling
    (mattype is now an optional template argument, removing the necessity of
    defining an element-expansion for simdized types) - and weeding out unneccessary
    'using namespace vigra::multi_math' statements in several headers.
---
 brace.h              |  3 --
 common.h             | 22 ++++++++----
 eval.h               |  2 --
 example/roundtrip.cc | 52 ++++++++++++++--------------
 mapping.h            |  2 --
 prefilter.h          | 96 ++++++++++++++++++++++++----------------------------
 remap.h              |  1 -
 7 files changed, 88 insertions(+), 90 deletions(-)

diff --git a/brace.h b/brace.h
index 409cae4..25895e3 100644
--- a/brace.h
+++ b/brace.h
@@ -140,13 +140,10 @@
 
 #include <vigra/multi_array.hxx>
 #include <vigra/multi_iterator.hxx>
-#include <vigra/multi_math.hxx>
 #include "common.h"
 
 namespace vspline {
 
-using namespace vigra::multi_math;
-
 /// class bracer encodes the entire bracing process. It also gives the metrics
 /// for the size of the braces expected by the evaluation code.
 
diff --git a/common.h b/common.h
index 49e7c0b..2652841 100644
--- a/common.h
+++ b/common.h
@@ -264,10 +264,13 @@ struct divide_and_conquer
 
       std::vector<view_type> dn ; // space for chunks
       nthreads = split_array_to_chunks <view_type> ( data , dn , nthreads , forbid ) ;
-      std::thread * t[nthreads] ;
+      
+      int new_threads = nthreads - 1 ;
+      std::thread * t[new_threads] ;
       shape_type offset ;
 
-      for ( int s = 0 ; s < nthreads ; s++ )
+      int s ;
+      for ( s = 0 ; s < new_threads ; s++ )
       {
         t[s] = new std::thread ( func , std::ref(dn[s]) , offset ) ;
         for ( int d = 0 ; d < dim ; d ++ )
@@ -276,7 +279,9 @@ struct divide_and_conquer
             offset[d] += dn[s].shape(d) ;
         }
       }
-      for ( int s = 0 ; s < nthreads ; s++ )
+      // and process the last chunk in this thread
+      func ( dn[s] , offset ) ;
+      for ( int s = 0 ; s < new_threads ; s++ )
       {
         t[s]->join() ;
         delete t[s] ;
@@ -500,10 +505,13 @@ struct divide_and_conquer_2
       std::vector<view_type_2> d2n ; // space for chunks of data2
       nthreads = split_array_to_chunks <view_type_2> ( data2 , d2n , nthreads , forbid ) ;
 
-      std::thread * t[nthreads] ;
+      int new_threads = nthreads - 1 ;
+      
+      std::thread * t[new_threads] ;
       shape_type offset ;
 
-      for ( int s = 0 ; s < nthreads ; s++ )
+      int s ;
+      for ( s = 0 ; s < new_threads ; s++ )
       {
         t[s] = new std::thread ( func , std::ref(dn[s]) , std::ref(d2n[s]), offset ) ;
         for ( int d = 0 ; d < dim ; d ++ )
@@ -512,7 +520,9 @@ struct divide_and_conquer_2
             offset[d] += dn[s].shape(d) ;
         }
       }
-      for ( int s = 0 ; s < nthreads ; s++ )
+      // process the last chunk in this thread
+      func ( std::ref(dn[s]) , std::ref(d2n[s]), offset ) ;
+      for ( int s = 0 ; s < new_threads ; s++ )
       {
         t[s]->join() ;
         delete t[s] ;
diff --git a/eval.h b/eval.h
index a3bc39c..e14590a 100644
--- a/eval.h
+++ b/eval.h
@@ -64,7 +64,6 @@
 
 #include <vigra/multi_array.hxx>
 #include <vigra/multi_iterator.hxx>
-#include <vigra/multi_math.hxx>
 
 #include "basis.h"
 #include "mapping.h"
@@ -74,7 +73,6 @@ namespace vspline {
 
 using namespace std ;
 using namespace vigra ;
-using namespace vigra::multi_math;
 
 /// The routine 'calculate_weight_matrix' originates from vigra. I took the original
 /// routine BSplineBase<ORDER, T>::calculateWeightMatrix() from vigra and changed it
diff --git a/example/roundtrip.cc b/example/roundtrip.cc
index bb0f034..17918ec 100644
--- a/example/roundtrip.cc
+++ b/example/roundtrip.cc
@@ -60,8 +60,6 @@ using namespace std ;
 namespace detail
 {
 using namespace vigra ;
-using namespace vigra::acc;
-using namespace vigra::multi_math;
 
 /// we use identity transformations on coordinates. Incoming coordinates will be
 /// translated straight to outgoing coordinates. If we use such a transformation
@@ -85,12 +83,17 @@ void vtf_identity ( const TinyVector < rc_v , 2 > & c_in ,
 
 #endif
 
-template < class array_type , typename real_type >
-void check_diff ( const array_type v1 )
+template < class view_type >
+void check_diff ( const view_type& a , const view_type& b )
 {
-//   array_type v1 = ( target - data ) ;
-  typedef vigra::MultiArray<2,real_type> error_array ;
-  error_array ea ( vigra::multi_math::squaredNorm ( v1 ) ) ;
+  using namespace vigra::multi_math ;
+  using namespace vigra::acc;
+  
+  typedef typename view_type::value_type value_type ;
+  typedef typename ExpandElementResult < value_type > :: type real_type ;
+  typedef MultiArray<2,real_type> error_array ;
+
+  error_array ea ( vigra::multi_math::squaredNorm ( b - a ) ) ;
   AccumulatorChain<real_type,Select< Mean, Maximum> > ac ;
   extractFeatures(ea.begin(), ea.end(), ac);
   std::cout << "warped image diff Mean: " << sqrt(get<Mean>(ac)) << std::endl;
@@ -138,7 +141,7 @@ void roundtrip ( view_type & data ,
   
 #ifdef PRINT_ELAPSED
   std::chrono::system_clock::time_point end = std::chrono::system_clock::now();
-  cout << "avg 10 x prefilter:........................ "
+  cout << "avg " << TIMES << " x prefilter:........................ "
        << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
        << " ms" << endl ;
 #endif
@@ -223,12 +226,12 @@ void roundtrip ( view_type & data ,
   
 #ifdef PRINT_ELAPSED
   end = std::chrono::system_clock::now();
-  cout << "avg 10 x remap1 from pre-split coordinates: "
-       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / 10.0
+  cout << "avg " << TIMES << " x remap1 from pre-split coordinates: "
+       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
        << " ms" << endl ;
 #endif
-  
-  check_diff<array_type,real_type> ( target - data ) ;
+       
+  check_diff<view_type> ( target , data ) ;
   
 #ifdef PRINT_ELAPSED
   start = std::chrono::system_clock::now();
@@ -241,12 +244,12 @@ void roundtrip ( view_type & data ,
   
 #ifdef PRINT_ELAPSED
   end = std::chrono::system_clock::now();
-  cout << "avg 10 x remap1 from unsplit coordinates:.. "
-       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / 10.0
+  cout << "avg " << TIMES << " x remap1 from unsplit coordinates:.. "
+       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
        << " ms" << endl ;
 #endif
        
-  check_diff<array_type,real_type> ( target - data ) ;
+  check_diff<view_type> ( target , data ) ;
   
 #ifdef PRINT_ELAPSED
   start = std::chrono::system_clock::now();
@@ -259,12 +262,12 @@ void roundtrip ( view_type & data ,
  
 #ifdef PRINT_ELAPSED
   end = std::chrono::system_clock::now();
-  cout << "avg 10 x remap with internal spline:....... "
-       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / 10.0
+  cout << "avg " << TIMES << " x remap with internal spline:....... "
+       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
        << " ms" << endl ;
 #endif
 
-  check_diff<array_type,real_type> ( target - data ) ;
+  check_diff<view_type> ( target , data ) ;
 
 #ifdef USE_VC
 
@@ -295,8 +298,8 @@ void roundtrip ( view_type & data ,
       
 #ifdef PRINT_ELAPSED
   end = std::chrono::system_clock::now();
-  cout << "avg 10 x remap with functor & internal bspl "
-       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / 10.0
+  cout << "avg " << TIMES << " x remap with functor & internal bspl "
+       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
        << " ms" << endl ;
 #endif
 
@@ -312,17 +315,16 @@ void roundtrip ( view_type & data ,
       
 #ifdef PRINT_ELAPSED
   end = std::chrono::system_clock::now();
-  cout << "avg 10 x remap with functor & external bspl "
-       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / 10.0
+  cout << "avg " << TIMES << " x remap with functor & external bspl "
+       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
        << " ms" << endl ;
 #endif
 
-  check_diff < array_type , real_type > ( target - data ) ;
+  check_diff<view_type> ( target , data ) ;
 
   cout << "difference original data/restored data:" << endl ;
   vspline::restore_from_braced<view_type> ( bsp.coeffs , bsp.coeffs , DEGREE ) ;
-  array_type diff ( cfview - data ) ;
-  check_diff<view_type,real_type> ( diff ) ;
+  check_diff<view_type> ( cfview , data ) ;
   cout << endl ;
 }
 
diff --git a/mapping.h b/mapping.h
index 7819af2..23c4379 100644
--- a/mapping.h
+++ b/mapping.h
@@ -108,7 +108,6 @@
 #define VSPLINE_MAPPING_H
  
 #include <vigra/multi_iterator.hxx>
-#include <vigra/multi_math.hxx>
 
 #include "common.h"
 
@@ -116,7 +115,6 @@ namespace vspline {
 
 using namespace std ;
 using namespace vigra ;
-using namespace vigra::multi_math;
 
 typedef int default_ic_type ;
 typedef float default_rc_type ;
diff --git a/prefilter.h b/prefilter.h
index 02db3ed..ffc64fb 100644
--- a/prefilter.h
+++ b/prefilter.h
@@ -115,19 +115,16 @@
 
 #include <vigra/multi_array.hxx>
 #include <vigra/multi_iterator.hxx>
-#include <vigra/multi_math.hxx>
 #include <vigra/navigator.hxx>
 #include <vigra/bordertreatment.hxx>
 #include <vigra/multi_convolution.hxx>
 
 #include "common.h"
-#include "basis.h"
 
 namespace vspline {
 
 using namespace std ;
 using namespace vigra ;
-using namespace vigra::multi_math;
 
 /// class solver performs the conversion of a 1D string of data to spline coefficients with
 /// the 'DSP aproach', which performs coefficient generation by means of an IIR filter.
@@ -163,9 +160,14 @@ using namespace vigra::multi_math;
 /// class solver needs two template arguments, one for the type of iterator over the incoming
 /// data, and one for the type of iterator to the resultant coefficients. These will usually be
 /// the same, but formulating the code with two separate types makes it more versatile.
+/// Optionally there's a third template argument, specifying 'mattype', the elementary type
+/// of the iterator's value_type. When the value_types are vigra aggregates (TinyVectors etc.)
+/// vigra's ExpandElementResult mechanism will provide, but at times we may wish to be explicit
+/// here, e.g. when iterating over simdized types.
 
 template < typename in_iter ,   // iterator over the knot point values
-           typename out_iter >  // iterator over the coefficient array
+           typename out_iter ,  // iterator over the coefficient array
+           typename mattype = typename ExpandElementResult < typename in_iter::value_type > :: type >
 class solver
 {
   // both iterators must define value_type and have the same value_type
@@ -175,13 +177,6 @@ class solver
   static_assert ( std::is_same < typename out_iter::value_type , value_type > :: value ,
                   "prefilter input and output iterator must have the same value_type" ) ;
   
-  // while the iterators may refer to aggregates (pixels etc.), we also need access
-  // to the fundamental type of the aggregates. Further down, in the vectorized code,
-  // a specialization for Vc::Vector objects is given, so that the element type of
-  // Vc::Vectors can be inferred as well.
-
-  typedef typename ExpandElementResult < value_type > :: type mattype ;
-
 //   // both iterators should be random access iterators.
 //   // currently not enforced, too lazy to code the traits for vector stripe iterators...
 //   typedef typename std::iterator_traits < in_iter > :: iterator_category in_cat ;
@@ -194,11 +189,12 @@ class solver
                   
   
   /// typedef the fully qualified type for brevity
-  typedef solver<in_iter,out_iter> solver_type ;
+  typedef solver<in_iter,out_iter,mattype> solver_type ;
 
 public:
   
-  ArrayVector<mattype> Pole ;        ///< Poles of the IIR filter
+  const double* Pole ;               ///< Poles of the IIR filter
+//   ArrayVector<mattype> Pole ;        ///< Poles of the IIR filter
   ArrayVector<int> Horizon ;         ///< as many 'Horizons' as Poles
   mattype  Lambda ;                  ///< (potentiated) overall gain.  
   const int NbPoles ;                ///< Number of filter poles
@@ -561,7 +557,7 @@ int solve_gain_inlined ( in_iter c , out_iter x )
   
   for (int n = 1; n < M; n++)
   {
-    x[n] = X = Lambda * c[n] + Pole[0] * X ;
+    x[n] = X = Lambda * c[n] + mattype ( Pole[0] ) * X ;
   }
   
   // now the input is used up and won't be looked at any more; all subsequent
@@ -574,7 +570,7 @@ int solve_gain_inlined ( in_iter c , out_iter x )
   /* anticausal recursion */
   for (int n = M - 2; 0 <= n; n--)
   {
-    x[n] = X = Pole[0] * ( X - x[n]);
+    x[n] = X = mattype ( Pole[0] ) * ( X - x[n]);
   }
   
   // for the remaining poles, if any, don't apply the gain
@@ -588,7 +584,7 @@ int solve_gain_inlined ( in_iter c , out_iter x )
     /* causal recursion */
     for (int n = 1; n < M; n++)
     {
-      x[n] = X = x[n] + Pole[k] * X ;
+      x[n] = X = x[n] + mattype ( Pole[k] ) * X ;
     }
     
     /* anticausal initialization */
@@ -597,7 +593,7 @@ int solve_gain_inlined ( in_iter c , out_iter x )
     /* anticausal recursion */
     for (int n = M - 2; 0 <= n; n--)
     {
-      x[n] = X = Pole[k] * ( X - x[n] );
+      x[n] = X = mattype ( Pole[k] ) * ( X - x[n] );
     }
   }
 }
@@ -619,7 +615,7 @@ int solve_no_gain ( in_iter c , out_iter x )
   /* causal recursion */
   for ( int n = 1; n < M; n++)
   {
-    x[n] = X = c[n] + Pole[0] * X ;
+    x[n] = X = c[n] + mattype ( Pole[0] ) * X ;
   }
   
   /* anticausal initialization */
@@ -628,7 +624,7 @@ int solve_no_gain ( in_iter c , out_iter x )
   /* anticausal recursion */
   for ( int n = M - 2; 0 <= n; n--)
   {
-    x[n] = X = Pole[0] * ( X - x[n]);
+    x[n] = X = mattype ( Pole[0] ) * ( X - x[n]);
   }
   
   // for the remaining poles, if any, work on the result
@@ -642,7 +638,7 @@ int solve_no_gain ( in_iter c , out_iter x )
     /* causal recursion */
     for (int n = 1; n < M; n++)
     {
-      x[n] = X = x[n] + Pole[k] * X ;
+      x[n] = X = x[n] + mattype ( Pole[k] ) * X ;
     }
     
     /* anticausal initialization */
@@ -651,7 +647,7 @@ int solve_no_gain ( in_iter c , out_iter x )
     /* anticausal recursion */
     for (int n = M - 2; 0 <= n; n--)
     {
-      x[n] = X = Pole[k] * ( X - x[n] );
+      x[n] = X = mattype ( Pole[k] ) * ( X - x[n] );
     }
   }
 }
@@ -694,7 +690,8 @@ solver ( int _M ,           ///< number of input/output elements (DataLength)
 : M ( _M ) ,
   SplineOrder ( spline_order ) ,
   SplineDegree ( spline_order - 1 ) ,
-  NbPoles ( ( spline_order - 1 ) / 2 )
+  NbPoles ( ( spline_order - 1 ) / 2 ) ,
+  Pole ( precomputed_poles [ spline_order - 1 ] )
 {
   // TODO: make tolerance a parameter
 
@@ -720,10 +717,10 @@ solver ( int _M ,           ///< number of input/output elements (DataLength)
   
   for ( int i = 0 ; i < NbPoles ; i++ )
   {
-    double pole = precomputed_poles [ SplineDegree ] [ i ] ;
-    Pole.push_back ( pole );
+//     double pole = precomputed_poles [ SplineDegree ] [ i ] ;
+//     Pole.push_back ( pole );
     if ( Tolerance )
-      Horizon.push_back ( ceil ( log ( Tolerance ) / log ( fabs ( pole ) ) ) ) ;
+      Horizon.push_back ( ceil ( log ( Tolerance ) / log ( fabs ( Pole[i] ) ) ) ) ;
     else
       Horizon.push_back ( M ) ;
   }
@@ -993,32 +990,12 @@ void restore_from_unbraced ( array &spline , array& target , BorderTreatmentMode
   separableConvolveMultiArray(spline, target, spline_kernel);
 }
 
-} ; // end of namespace vspline
-
 // the use of Vc has to be switched on with the flag USE_VC.
 // The ramainder of the code in this file provides vectorized prefiltering
 // using Vc.
 
 #ifdef USE_VC
 
-namespace vigra
-{
-  /// specializes ExpandElementResult for Vc::Vector so that
-  /// solver objects operating on Vc::Vectors can infer the type of the
-  /// elements in the Vector.
-
-template <class T>
-struct ExpandElementResult < Vc::Vector<T> >
-{
-    typedef T type;
-    enum { size = Vc::Vector<T>::Size };
-} ;
-
-} ;
-
-namespace vspline
-{
-
 // test if an idex vector contains sequential indices
 
 template < typename IT >
@@ -1172,7 +1149,20 @@ int ele_aggregating_filter ( source_type &source ,
                            )
 {
   typedef typename source_type::value_type ele_type ;    ///< elementary type in source
-  typedef Vc::Vector < ele_type > simdized_type ;        ///< vectorized type thereof
+
+  // Initially I used a plain Vc::Vector as the simdized type, but I found that using a SimdArray
+  // of twice that size performed slightly better. Her's the one place where this type can be fixed,
+  // it's choice 'trickles down' to the remainder of the code, so experimentation is cheap. vsize has
+  // to be a multiple of the genuine vector's size, though, because in gather/scatter above we explicitly
+  // use aligned load/stores to the buffer. If we use plain load/stores to the buffer there, other
+  // vsizes can be used, but performance suffers - we want to use the fastest ops possible after all.
+  // TODO: This should be tested with different systems
+  
+//   typedef Vc::Vector < ele_type > simdized_type ;        ///< vectorized type, use plain vector
+
+  typedef Vc::SimdArray < ele_type ,
+                          2 * Vc::Vector<ele_type>::Size > simdized_type ; ///< vectorized type
+
   const int vsize ( simdized_type::Size ) ;              ///< number of ele_type in a simdized_type
   typedef typename simdized_type::IndexType index_type ; ///< indexes for gather/scatter
   typedef typename simdized_type::MaskType mask_type ;   ///< and mask for masked operation
@@ -1184,8 +1174,10 @@ int ele_aggregating_filter ( source_type &source ,
   // anyway, a MultiArray with the proper allocator works just fine, and the dereferencing
   // of the iterator needed in the solver works without further ado. 
   
+  // Vc::Memory < simdized_type > buffer ( count * vsize ) ; // doesn't work for me
+
   MultiArray < 1 , simdized_type , Vc::Allocator<simdized_type> > buffer ( count ) ;
-  
+
   // avoiding being specific about the iterator's type allows us to slot in
   // any old iterator we can get by calling begin() on buffer 
   
@@ -1213,9 +1205,11 @@ int ele_aggregating_filter ( source_type &source ,
   ele_type * target_base_adress = target.data() ;
 
   // we create a solver object capable of handling the iterator producing the successive
-  // simdized_types from the buffer
+  // simdized_types from the buffer. While the unvectorized code can omit passing the third
+  // template argument (the elementary type used inside the solver) we pass it here, as we
+  // don't define an element-expansion via vigra::ExpandElementResult for simdized_type.
 
-  solver < viter_type , viter_type > // , simdized_type , ele_type >
+  solver < viter_type , viter_type , ele_type >
     slv ( count , lambda_exponent , bc , degree + 1 ) ;
 
   if ( source.stride() == target.stride() )
@@ -1464,8 +1458,8 @@ void solve_vc ( input_array_type& input ,
   }
 }
 
-} ; // namespace vspline
-
 #endif // USE_VC
 
+} ; // namespace vspline
+
 #endif // VSPLINE_PREFILTER_H
diff --git a/remap.h b/remap.h
index 61e0622..40dad3b 100644
--- a/remap.h
+++ b/remap.h
@@ -92,7 +92,6 @@ namespace vspline {
 
 using namespace std ;
 using namespace vigra ;
-using namespace vigra::multi_math;
 
 template < class value_type ,      // like, float, double, complex<>, pixels, TinyVectors etc.
            class coordinate_type , // usually TinyVector of real for coordinates

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git



More information about the debian-science-commits mailing list