[vspline] 72/72: modified multithread's partitioning behaviour multithread now takes a pointer to the partitioning function. In remap.h, partition_to_tiles is used by default. This routine has been modified to fall back to partition_to_stripes if the shape passed in isn't 2D or the partitioning to tiles doesn't yield the desired number of parts. This way, most use cases should be adequately covered, but specific needs for partitioning can also be addressed.

Kay F. Jahnke kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:44 UTC 2017


This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit 6c6a977d78f1f872e2e41d984b2b6f1618b2ed9c
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Sun Jun 25 11:10:12 2017 +0200

    modified multithread's partitioning behaviour
    multithread now takes a pointer to the partitioning function.
    In remap.h, partition_to_tiles is used by default. This
    routine has been modified to fall back to partition_to_stripes if
    the shape passed in isn't 2D or the partitioning to tiles doesn't yield
    the desired number of parts. This way, most use cases should be adequately
    covered, but specific needs for partitioning can also be addressed.
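
For illustration, the new calling convention looks roughly like this.
This is a minimal sketch against the signatures visible in the patch
below; the worker function 'fill_ones' and the 'main' scaffolding are
made up for illustration, and it is assumed that multithread and its
helpers live in namespace vspline, as the qualified calls in remap.h
suggest:

    #include <vigra/multi_array.hxx>
    #include "multithread.h"

    using namespace vspline ;

    // a worker function processes only the subarray of the data
    // delimited by the range it receives

    void fill_ones ( shape_range_type < 2 > range ,
                     vigra::MultiArrayView < 2 , float > * pv )
    {
      pv->subarray ( range[0] , range[1] ) . init ( 1.0f ) ;
    }

    int main()
    {
      vigra::MultiArray < 2 , float > a ( vigra::Shape2 ( 2000 , 1000 ) ) ;
      vigra::MultiArrayView < 2 , float > v ( a ) ;

      // the 'full' range, covering the whole view

      shape_range_type < 2 > range ( shape_type < 2 > () , v.shape() ) ;

      // the new second argument picks the partitioning function;
      // here the workload is split into stripes

      multithread ( &fill_ones ,
                    partition_to_stripes ,
                    default_njobs ,
                    range ,
                    &v ) ;
    }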
---
 bspline.h               |   1 +
 example/pano_extract.cc |  14 ++-
 multithread.h           | 227 ++++++++++++++++++++++++++++++++----------------
 remap.h                 |  13 ++-
 4 files changed, 178 insertions(+), 77 deletions(-)

diff --git a/bspline.h b/bspline.h
index 2849334..9cdc1bd 100644
--- a/bspline.h
+++ b/bspline.h
@@ -348,6 +348,7 @@ public:
         break ;
       }
     }
+    return core_shape ;
   }
 
   /// construct a bspline object with appropriate storage space to contain and process an array
diff --git a/example/pano_extract.cc b/example/pano_extract.cc
index b423a2d..99a26d2 100644
--- a/example/pano_extract.cc
+++ b/example/pano_extract.cc
@@ -26,7 +26,7 @@
 /// This program extracts a rectilinear image from a (full) spherical
 /// panorama with a given size, horizontal field of view, yaw, pitch and roll
 ///
-/// compile: clang++ -std=c++11 -march=native -o pano_extract -O3 -pthread -DUSE_VC=1 pano_extract.cc -lVc -lvigraimpex
+/// compile: clang++ -std=c++11 -march=native -o pano_extract -O3 -pthread -DUSE_VC pano_extract.cc -lVc -lvigraimpex
 ///
 /// invoke: pano_extract [options] imagefile
 ///
@@ -664,7 +664,11 @@ void process_image ( const char * name ,
     shape_range_type < 2 > range ( shape_type() , bv.shape() ) ;
     
 #ifdef GAMMA
-    multithread ( &rdegamma , ncores , range , &bv ) ;
+    multithread ( &rdegamma ,
+                  vspline::partition_to_stripes ,
+                  ncores ,
+                  range ,
+                  &bv ) ;
 #endif
     
     // still we need to work on floating point data
@@ -710,7 +714,11 @@ void process_image ( const char * name ,
     shape_range_type < 2 > range2 ( shape_type() , core_flat_view.shape() ) ;
     
 #ifdef GAMMA
-    multithread ( &r_v_to_l_rgb<rc_type,65535> , ncores , range2 , &core_flat_view ) ;
+    multithread ( &r_v_to_l_rgb<rc_type,65535> ,
+                  vspline::partition_to_stripes ,
+                  ncores ,
+                  range2 ,
+                  &core_flat_view ) ;
 #endif    
     component_max = 65535 ;
   }
diff --git a/multithread.h b/multithread.h
index a9e44f1..497a3f5 100644
--- a/multithread.h
+++ b/multithread.h
@@ -150,13 +150,16 @@ using partition_type = std::vector < range_type > ;
 /// vigra::MultiArrayIndex of this dimension.
 /// This is equivalent to vigra's shape type.
 
+// TODO: might instead define as: vigra::MultiArrayShape<dimension>
+
 template < int dimension >
 using shape_type = vigra::TinyVector <  vigra::MultiArrayIndex , dimension > ;
 
 /// given a dimension, we define shape_range_type as a range defined by
 /// two shapes of the given dimension. This definition allows us to directly
 /// pass the two shapes as arguments to a call of subarray() on a MultiArrayView
-/// of the given dimension
+/// of the given dimension. Note the subarray semantics: if the range is
+/// [2,2] to [4,4], it refers to elements [2,2], [3,2], [2,3], [3,3].
 
 template < int dimension >
 using shape_range_type = range_type < shape_type < dimension > > ;
@@ -275,7 +278,7 @@ struct shape_splitter
       int w = shape [ split_dim ] ;  // extent of the dimension we can split
       n = std::min ( n , w ) ;       // just in case, if that is smaller than n
       
-      int cut [ n ] ;   // where to chop up this dimension
+      int * cut = new int [ n ] ;    // where to chop up this dimension
       
       for ( int i = 0 ; i < n ; i++ )
         cut[i] = ( (i+1) * w ) / n ;   // roughly equal chunks, but certainly last cut == a.end()
@@ -288,40 +291,41 @@ struct shape_splitter
         res.push_back ( range_t ( start , end ) ) ;
         start [ split_dim ] = end [ split_dim ] ;
       }
+      delete[] cut ; // clean up
     }
     return res ;
   }
 } ;
 
-// /// for any dimension d, we specialize function partition() for shape_range_type<d>.
-// /// we use shape_splitter to perform the split. This function template
-// /// provides the default partitioning mechanism for a shape_range, which is used by
-// /// the second variant of multithread(), which is called with a range and a number
-// /// of desired threads instead of a ready-made partitioning. This currently unused
-// /// code would make shape_splitter the default splitter.
-// 
-// // template < int d >
-// // partition_type < shape_range_type<d> >
-// // partition ( shape_range_type<d> range , int nparts )
-// // {
-// //   if ( range[0].any() )
-// //   {
-// //     // the lower limit of the range is not at the origin, so get the shape
-// //     // of the region between range[0] and range[1], call shape_splitter with
-// //     // this shape, and add the offset to the lower limit of the original range
-// //     // to the partial ranges in the result
-// //     auto shape = range[1] - range[0] ;
-// //     auto res = shape_splitter < d > :: part ( shape , nparts ) ;
-// //     for ( auto & r : res )
-// //     {
-// //       r[0] += range[0] ;
-// //       r[1] += range[0] ;
-// //     }
-// //     return res ;
-// //   }
-// //   // if range[0] is at the origin, we don't have use an offset
-// //   return shape_splitter < d > :: part ( range[1] , nparts ) ;
-// // }
+/// partition a shape range into 'stripes'. This uses shape_splitter with
+/// 'forbid' left at the default of -1, resulting in a split along the
+/// outermost dimension that can be split n ways or the next best thing
+/// shape_splitter can come up with. If the intended split is merely to
+/// distribute the work load without locality considerations, this should
+/// be the split to use. When locality is an issue, consider the next variant.
+
+template < int d >
+partition_type < shape_range_type<d> >
+partition_to_stripes ( shape_range_type<d> range , int nparts )
+{
+  if ( range[0].any() )
+  {
+    // the lower limit of the range is not at the origin, so get the shape
+    // of the region between range[0] and range[1], call shape_splitter with
+    // this shape, and add the offset to the lower limit of the original range
+    // to the partial ranges in the result
+    auto shape = range[1] - range[0] ;
+    auto res = shape_splitter < d > :: part ( shape , nparts ) ;
+    for ( auto & r : res )
+    {
+      r[0] += range[0] ;
+      r[1] += range[0] ;
+    }
+    return res ;
+  }
+  // if range[0] is at the origin, we don't have to use an offset
+  return shape_splitter < d > :: part ( range[1] , nparts ) ;
+}
 
 /// alternative partitioning into tiles. For the optimal situation, where
 /// the view isn't rotated or pitched much, the partitioning into bunches
@@ -331,13 +335,28 @@ struct shape_splitter
 /// to identical locality in both cases. So currently I am using this partitioning.
 /// note that the current implementation ignores the argument 'nparts' and
 /// produces tiles 160X160.
+
 // TODO code is a bit clumsy...
 
+// TODO it may be a good idea to have smaller portions towards the end
+// of the partitioning, since they will be processed last. If the last
+// few single-threaded operations are short, there are fewer situations
+// where a long single-threaded operation has just started when all other
+// tasks are already done, leaving the remaining cores idle; or at least
+// the problem will not persist for as long.
+
 template < int d >
 partition_type < shape_range_type<d> >
-partition ( shape_range_type<d> range ,
-            int nparts = default_njobs )
+partition_to_tiles ( shape_range_type<d> range ,
+                     int nparts = default_njobs )
 {
+  // To deal with the dilemma that this function is really quite specific
+  // to images, for the time being I fall back to partition_to_stripes()
+  // for dimensions != 2
+
+  if ( d != 2 )
+    return partition_to_stripes ( range , nparts ) ;
+
   auto shape = range[1] - range[0] ;
 
 // currently disregarding incoming nparts parameter:
@@ -345,6 +364,8 @@ partition ( shape_range_type<d> range ,
 //   int ntile = nelements / nparts ;
 //   int nedge = pow ( ntile , ( 1.0 / d ) ) ;
   
+  // TODO fixing this size is system-specific!
+  
   int nedge = 160 ; // instead: heuristic, fixed size tiles
 
   auto tiled_shape = shape / nedge ;
@@ -362,7 +383,15 @@ partition ( shape_range_type<d> range ,
   for ( int a = 0 ; a < d ; a++ )
     tiled_shape[a] = stops[a].size() - 1 ;
   
-  nparts = prod ( tiled_shape ) ;
+  int k = prod ( tiled_shape ) ;
+  
+  // If this partitioning scheme fails to produce a partitioning with
+  // at least nparts components, fall back to using partition_to_stripes()
+  
+  if ( k < nparts )
+    return partition_to_stripes ( range , nparts ) ;
+  
+  nparts = k ;
   partition_type < shape_range_type<d> > res ( nparts ) ;
   
   for ( int a = 0 ; a < d ; a++ )
@@ -395,36 +424,37 @@ partition ( shape_range_type<d> range ,
   return res ;
 }
 
-/// specialization for 1D shape range
-
-template<>
-partition_type < shape_range_type<1> >
-partition ( shape_range_type<1> range ,
-            int nparts )
-{
-  auto size = range[1][0] - range[0][0] ;
-  auto part_size = size / nparts ;
-  if ( part_size < 1 )
-    part_size = size ;
-  
-  nparts = int ( size / part_size ) ;
-  if ( nparts * part_size < size )
-    nparts++ ;
-
-  partition_type < shape_range_type<1> > res ( nparts ) ;
-  
-  auto start = range[0] ;
-  auto stop = start + part_size ;
-  for ( auto & e : res )
-  {
-    e[0] = start ;
-    e[1] = stop ;
-    start = stop ;
-    stop = start + part_size ;
-  }
-  res[nparts-1][1] = size ;
-  return res ;
-}
+// /// specialization for 1D shape range. Obviously we can't make tiles
+// /// from 1D data...
+// 
+// template<>
+// partition_type < shape_range_type<1> >
+// partition_to_tiles ( shape_range_type<1> range ,
+//                      int nparts )
+// {
+//   auto size = range[1][0] - range[0][0] ;
+//   auto part_size = size / nparts ;
+//   if ( part_size < 1 )
+//     part_size = size ;
+//   
+//   nparts = int ( size / part_size ) ;
+//   if ( nparts * part_size < size )
+//     nparts++ ;
+// 
+//   partition_type < shape_range_type<1> > res ( nparts ) ;
+//   
+//   auto start = range[0] ;
+//   auto stop = start + part_size ;
+//   for ( auto & e : res )
+//   {
+//     e[0] = start ;
+//     e[1] = stop ;
+//     start = stop ;
+//     stop = start + part_size ;
+//   }
+//   res[nparts-1][1] = size ;
+//   return res ;
+// }
 
 /// action_wrapper wraps a functional into an outer function which
 /// first calls the functional and then checks if this was the last
@@ -452,10 +482,26 @@ void action_wrapper ( std::function < void() > payload ,
   // notify_all with the lock guard still in effect seemed to remove the
   // problem, but made me unsure of my logic.
   
-  std::lock_guard<std::mutex> lk ( * p_pool_mutex ) ;
-  if ( ++ ( * p_done ) == nparts ) ;
+  // 2017-06-23 after removing a misplaced semicolon after the conditional
+  // below I recoded to perform the notification after closing the lock_guard's
+  // scope, and now there doesn't seem to be any problem any more. I leave
+  // these comments in for reference in case things go wrong
+  // TODO remove this and previous comment if all is well
+  
+  bool last_one = false ;
+
+  {
+    std::lock_guard<std::mutex> lk ( * p_pool_mutex ) ;
+    if ( ++ ( * p_done ) == nparts )
+    {
+      // this was the last action originating from the coordinator
+      // so we set the flag which triggers the notification
+      last_one = true ;
+    }
+  }
+  
+  if ( last_one )
   {
-    // this was the last action originating from the coordinator
     // notify the coordinator that the joint task is now complete
     p_pool_cv->notify_one() ;
   }
@@ -546,12 +592,45 @@ int multithread ( void (*pfunc) ( range_type , Types... ) ,
   return nparts ;
 }
 
-/// this overload of multithread() takes the desired number of tasks and a
-/// range covering the 'whole' data. partition() is called with the range,
-/// resulting in a partitioning which above overload of multithread() can use.
+// /// this overload of multithread() takes the desired number of tasks and a
+// /// range covering the 'whole' data. partition() is called with the range,
+// /// resulting in a partitioning which above overload of multithread() can use.
+// 
+// template < class range_type , class ...Types >
+// int multithread ( void (*pfunc) ( range_type , Types... ) ,
+//                   int nparts ,
+//                   range_type range ,
+//                   Types ...args )
+// {
+//   if ( nparts <= 1 )
+//   {
+//     // if only one part is requested, we take a shortcut and execute
+//     // the function right here:
+//     (*pfunc) ( range , args... ) ;
+//     return 1 ;
+//   }
+// 
+//   // partition the range using partition_to_tiles()
+// 
+//   partition_type < range_type > partitioning = partition_to_tiles ( range , nparts ) ;
+//   
+//   return multithread ( pfunc , partitioning , args... ) ;
+// }
+
+/// This variant of multithread() takes a pointer to a function performing
+/// the partitioning of the incoming range. The partitioning function is
+/// invoked on the incoming range (provided nparts is greater than 1) and
+/// the resulting partitioning is used as an argument to the first variant
+/// of multithread().
+
+// TODO It might be better to code this using std::function objects.
+
+// TODO may use move semantics for forwarding instead of relying on the
+// optimizer to figure this out
 
 template < class range_type , class ...Types >
 int multithread ( void (*pfunc) ( range_type , Types... ) ,
+                  partition_type < range_type > (*partition) ( range_type , int ) ,
                   int nparts ,
                   range_type range ,
                   Types ...args )
@@ -564,10 +643,12 @@ int multithread ( void (*pfunc) ( range_type , Types... ) ,
     return 1 ;
   }
 
-  // partition the range. Rely on the presence of a function
-  // partition() which can process range_type:
+  // partition the range using the function pointed to by 'partition'
 
-  partition_type < range_type > partitioning = partition ( range , nparts ) ;
+  auto partitioning = (*partition) ( range , nparts ) ;
+  
+  // then pass pfunc, the partitioning and the remaining arguments
+  // to the variant of multithread() accepting a partitioning
   
   return multithread ( pfunc , partitioning , args... ) ;
 }
diff --git a/remap.h b/remap.h
index 02a0044..57e39ab 100644
--- a/remap.h
+++ b/remap.h
@@ -107,6 +107,11 @@
 ///
 /// Finally, this file also has a routine to restore the original knot point data from a
 /// bspline object. This is done very efficiently using grid_eval().
+///
+/// Note: Currently, the calls to multithread() are hardwired to use partition_to_tiles()
+/// as their partitioner. partition_to_tiles() falls back to partition_to_stripes() if
+/// its 'own' partitioning scheme fails to produce the desired number of parts or if
+/// the data are 3D or higher. This way, most use cases should receive adequate treatment.
 
 #ifndef VSPLINE_REMAP_H
 #define VSPLINE_REMAP_H
@@ -321,6 +326,7 @@ void fill ( generator_type & gen ,
   // that we can't pass anything on by reference and use pointers instead.
 
   multithread ( & st_fill < generator_type , dim_target > ,
+                vspline::partition_to_tiles < dim_target > ,
                 njobs ,        // desired number of partitions
                 range ,        // 'full' range which is to be partitioned
                 &gen ,         // generator_type object
@@ -1071,7 +1077,12 @@ void grid_eval ( typename evaluator_type::rc_type ** const grid_coordinate ,
 {
   shape_range_type < dim_out > range ( shape_type < dim_out > () , result.shape() ) ;
   multithread ( st_grid_eval < evaluator_type , dim_out > ,
-                ncores * 8 , range , grid_coordinate , &itp , &result ) ;
+                vspline::partition_to_tiles < dim_out > ,
+                ncores * 8 ,
+                range ,
+                grid_coordinate ,
+                &itp ,
+                &result ) ;
 }
 
 /// grid_eval allows us to code a function to restore the original knot point

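
Since the commit message notes that specific partitioning needs can also
be addressed, here is a rough sketch of what a user-supplied partitioner
might look like. The name 'partition_single' is hypothetical and not part
of the patch; the only requirement is that it matches the function pointer
signature introduced above:

    // hypothetical custom partitioner matching the expected signature
    // partition_type < range_type > (*) ( range_type , int ) ;
    // it ignores nparts and yields the incoming range as a single part

    template < int d >
    vspline::partition_type < vspline::shape_range_type < d > >
    partition_single ( vspline::shape_range_type < d > range , int nparts )
    {
      return { range } ;
    }

    // which can then be passed in just like the stock partitioners:
    //
    // multithread ( &fill_ones , partition_single , default_njobs , range , &v ) ;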
-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git


