[vspline] 64/72: activating code in filter.h to use IndexType for gathering if indexes fit

Kay F. Jahnke kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:43 UTC 2017


This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit 0ce0fdd4b4005a685129a2fdf37e1223a55361b4
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Wed May 17 10:28:37 2017 +0200

    activating code in filter.h to use IndexType for gathering if indexes fit
---
 bspline.h            |   2 +-
 example/roundtrip.cc |   2 +-
 filter.h             | 172 ++++++++++++++++++++++++---------------------------
 3 files changed, 84 insertions(+), 92 deletions(-)

diff --git a/bspline.h b/bspline.h
index 61bd18e..4a6bd5b 100644
--- a/bspline.h
+++ b/bspline.h
@@ -52,7 +52,7 @@
   bspline objects can be used without any knowledge of their internals,
   e.g. as parameters to the remap functions.
   
-  While 'raw' coefficient arrays with an evaluation scheme which applies
+  While using 'raw' coefficient arrays with an evaluation scheme which applies
   boundary conditions is feasible and most memory-efficient, it's not so well
   suited for very fast evaluation, since the boundary treatment needs conditionals,
   and 'breaks' uniform access, which is especially detrimental when using
diff --git a/example/roundtrip.cc b/example/roundtrip.cc
index fc79c4c..043817c 100644
--- a/example/roundtrip.cc
+++ b/example/roundtrip.cc
@@ -357,7 +357,7 @@ void process_image ( char * name )
   for ( int b = 0 ; b < 4 ; b++ )
   {
     vspline::bc_code bc = bcs[b] ;
-    for ( int spline_degree = 0 ; spline_degree < 6 ; spline_degree++ )
+    for ( int spline_degree = 2 ; spline_degree < 8 ; spline_degree++ )
     {
 #ifdef USE_VC
       cout << "testing bc code " << vspline::bc_name[bc]
diff --git a/filter.h b/filter.h
index bd467e0..6bbf0ad 100644
--- a/filter.h
+++ b/filter.h
@@ -1085,9 +1085,7 @@ scatter ( const source_type * source ,
 /// to using a vigra::TinyVector < ptrdiff_t > as gather/scatter index type, which
 /// may cause Vc to use less performant code for the gather/scatter operations but
 /// is safe.
-/// TODO: On my system, surprisingly, the variant passing TinyVector<ptrdiff_t> to
-/// the gather/scatter routine performs slightly faster than the presumedly optimal
-/// variant.
+// TODO: using different vsize for different axes might be faster.
 
 template < typename source_view_type ,
            typename target_view_type ,
@@ -1222,60 +1220,57 @@ void ele_aggregating_filter ( source_view_type &source ,
         
         mask = ( simdized_math_type::IndexesFromZero() < e ) ;
 
-// The next bit of code, which is commented out, tried to use an 'optimal' type
-// for gather/scatter indices by picking simdized_math_type::IndexType. But it
-// turned out the resulting code was slower on my system and requires testing for
-// overflow. So I leave the code in but I'm not using it:
+      // next we assign the indices (which are ptrdiff_t) to the intended type
+      // for gather/scatter indices - which is what Vc deems appropriate. This should
+      // be the optimal choice in terms of performance. Yet we can't be certain that
+      // the ptrdiff_t values actually fit into this type, which is usually composed of
+      // int only. So we test if the assigned value compares equal to the assignee.
+      // If the test fails for any of the indices, we switch to code using a
+      // vigra::TinyVector < ptrdiff_t > for the indices, which is permissible, since
+      // TinyVector offers operator[], but may be less efficient.
+      // Note: Vc hasn't implemented the gather with intrinsics for AVX2, that's why
+      // using gs_indexes_type can't yet have a speedup effect.
+      // Note: since the gathers are often from widely spaced locations, there is
+      // not too much benefit to be expected.
       
-//       // next we assign the indices (which are ptrdiff_t) to the intended type
-//       // for gather/scatter indices - which is what Vc deems appropriate. This should
-//       // be the optimal choice in terms of performance. Yet we can't be certain that
-//       // the ptrdiff_t values actually fit into this type, which is usually composed of
-//       // int only. So we test if the assigned value compares equal to the assignee.
-//       // If the test fails for any of the indices, we switch to code using a
-//       // vigra::TinyVector < ptrdiff_t > for the indices, which is permissible, since
-//       // TinyVector offers operator[], but may be less efficient. On my system I could
-//       // not detect a run-time penalty, but I leave the test and code differentiation
-//       // in nevertheless.
-//       
-//       bool fits = true ;
-//       for ( e = 0 ; e < vsize ; e++ )
-//       {
-//         source_gs_indexes[e] = source_indexes[e] ;
-//         if ( source_gs_indexes[e] != source_indexes[e] )
-//           fits = false ;
-//       }
-//       
-//       if ( fits )
-//       {
-//         // perform extended gather with extrusion parameters to transport the unfiltered data
-//         // to the buffer, passing in source_gs_indexes for best performance.
-//         
-//         gather
-//           ( first_source_adress ,
-//             buffer_base_adress ,
-//             source_gs_indexes ,
-//             mask ,
-//             source_stride ,
-//             count ) ;
-//                                 
-//         // finally (puh): apply the prefilter, using the solver in-place, iterating over
-//         // the vectors in buffer with maximum efficiency.
-//                                 
-//         solver.solve ( buffer.begin() ) ;
-//         
-//         // and perform extended scatter with extrusion parameters to write the filtered data
-//         // to the destination
-// 
-//         scatter
-//           ( buffer_base_adress ,
-//             first_target_adress ,
-//             source_gs_indexes ,
-//             mask ,
-//             source_stride ,
-//             count ) ;
-//       }
-//       else
+      bool fits = true ;
+      for ( e = 0 ; fits && ( e < vsize ) ; e++ )
+      {
+        source_gs_indexes[e] = source_indexes[e] ;
+        if ( source_gs_indexes[e] != source_indexes[e] )
+          fits = false ;
+      }
+      
+      if ( fits )
+      {
+        // perform extended gather with extrusion parameters to transport the unfiltered data
+        // to the buffer, passing in source_gs_indexes for best performance.
+        
+        gather
+          ( first_source_adress ,
+            buffer_base_adress ,
+            source_gs_indexes ,
+            mask ,
+            source_stride ,
+            count ) ;
+                                
+        // finally (puh): apply the prefilter, using the solver in-place, iterating over
+        // the vectors in buffer with maximum efficiency.
+                                
+        solver.solve ( buffer.begin() ) ;
+        
+        // and perform extended scatter with extrusion parameters to write the filtered data
+        // to the destination
+
+        scatter
+          ( buffer_base_adress ,
+            first_target_adress ,
+            source_gs_indexes ,
+            mask ,
+            source_stride ,
+            count ) ;
+      }
+      else
       {
         // Since the indices did not fit into the optimal type for gather/scatter
         // indices, we pass in a wider type, which may reduce performance, but is
@@ -1336,40 +1331,37 @@ void ele_aggregating_filter ( source_view_type &source ,
       if ( e < vsize )
         mask = ( simdized_math_type::IndexesFromZero() < e ) ;
       
-// The next bit of code, which is commented out, tried to use an 'optimal' type
-// for gather/scatter indices by picking simdized_math_type::IndexType. But it
-// turned out the resulting code was slower on my system and requires testing for
-// overflow. So I leave the code in but I'm not using it:
-      
-//       bool fits = true ;
-//       for ( e = 0 ; e < vsize ; e++ )
-//       {
-//         source_gs_indexes[e] = source_indexes[e] ;
-//         target_gs_indexes[e] = target_indexes[e] ;
-//         if (    source_gs_indexes[e] != source_indexes[e]
-//              || target_gs_indexes[e] != target_indexes[e] )
-//           fits = false ;
-//       }
-// 
-//       if ( fits )
-//       {
-//         gather
-//           ( first_source_adress ,
-//             buffer_base_adress ,
-//             source_gs_indexes ,
-//             mask ,
-//             source_stride ,
-//             count ) ;
-//         solver.solve ( buffer.begin() ) ;
-//         scatter
-//           ( buffer_base_adress ,
-//             first_target_adress ,
-//             target_gs_indexes ,
-//             mask ,
-//             target_stride ,
-//             count ) ;
-//       }
-//       else
+      // similar code here for the indexes, see notes above.
+
+      bool fits = true ;
+      for ( e = 0 ; fits && ( e < vsize ) ; e++ )
+      {
+        source_gs_indexes[e] = source_indexes[e] ;
+        target_gs_indexes[e] = target_indexes[e] ;
+        if (    source_gs_indexes[e] != source_indexes[e]
+             || target_gs_indexes[e] != target_indexes[e] )
+          fits = false ;
+      }
+
+      if ( fits )
+      {
+        gather
+          ( first_source_adress ,
+            buffer_base_adress ,
+            source_gs_indexes ,
+            mask ,
+            source_stride ,
+            count ) ;
+        solver.solve ( buffer.begin() ) ;
+        scatter
+          ( buffer_base_adress ,
+            first_target_adress ,
+            target_gs_indexes ,
+            mask ,
+            target_stride ,
+            count ) ;
+      }
+      else
       {
         gather
           ( first_source_adress ,

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git



More information about the debian-science-commits mailing list