[vspline] 13/72: multithread() from multithread.h now used for all multithreading I did the rewrite of the code for multithreading in remap.h and pv.cc. This made the 'divide_and_conquer' code in common.h obsolete. While using multithread() I found no way to pass references 'through' multithread(), so now I use pointers throughout to do the passage. Changing the single-threaded code turned out straightforward by simply adding the extra range parameter, changing all access to manifolds to pointer access but instantly on procedure entry forming the subarrays:
Kay F. Jahnke
kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:38 UTC 2017
This is an automated email from the git hooks/post-receive script.
kfj-guest pushed a commit to branch master
in repository vspline.
commit 8e7ee9df61a95c6b06202a6606cd92a84457884f
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date: Mon Nov 21 17:46:05 2016 +0100
multithread() from multithread.h now used for all multithreading
I did the rewrite of the code for multithreading in remap.h and pv.cc.
This made the 'divide_and_conquer' code in common.h obsolete. While using
multithread() I found no way to pass references 'through' multithread(),
so now I use pointers throughout to do the passage. Changing the single-threaded
code turned out straightforward by simply adding the extra range parameter,
changing all access to manifolds to pointer access but instantly on procedure
entry forming the subarrays:
void pan_v ( shape_range_type < 2 > range ,
warp_array_view_type * p_warp )
{
auto warp = p_warp->subarray ( range[0] , range[1] ) ;
auto ofs = range[0] ;
...
and in the code calling multithread():
shape_range_type < 2 > range ( shape_type < 2 > () , warp.shape() ) ;
warp_array_view_type wv ( warp ) ;
multithread ( & pan_v , ncores , range , &wv ) ;
---
common.h | 489 ----------------------------------------------------------
filter.h | 28 ++--
multithread.h | 74 ++++++++-
remap.h | 278 +++++++++++++++------------------
4 files changed, 212 insertions(+), 657 deletions(-)
diff --git a/common.h b/common.h
index aeb9768..50cb7ff 100644
--- a/common.h
+++ b/common.h
@@ -187,495 +187,6 @@ std::vector < std::string > bc_name =
const int ncores = std::thread::hardware_concurrency() ;
-/// split_array_to_chunks partitions an array. This is used for multithreading:
-/// It's possible to distribute the processing to several threads by splitting
-/// the array to chunks so that the axis which is processed is not broken up.
-/// That's what the 'forbid' parameter is for: it keeps the routine from wrongly
-/// splitting the array along a specific axis. With the default of -1, all axes
-/// are considered valid candidates for splitting.
-/// The return value indicates into how many chunks the input array was actually
-/// split. Splitting sounds like an expensive operation, but it is not. All processing
-/// is done with views, the data aren't copied.
-
-template < class view_t >
-int split_array_to_chunks ( view_t & a , ///< view to array to be split n-ways
- std::vector < view_t > & res , ///< vector to accomodate the chunks
- int n , ///< intended number of chunks
- int forbid = -1 ) ///< axis which shouldn't be split
-{
- const int dim = view_t::actual_dimension ;
- typedef typename view_t::difference_type shape_t ;
-
- // find outermost dimension which can be split n-ways
- shape_t shape = a.shape() ;
-
- // find the outermost dimension that can be split n ways, and it's extent
- int split_dim = -1 ;
- int max_extent = -1 ;
- for ( int md = dim - 1 ; md >= 0 ; md-- )
- {
- if ( md != forbid
- && shape[md] > max_extent
- && shape[md] >= n )
- {
- max_extent = shape[md] ;
- split_dim = md ;
- break ;
- }
- }
-
- // if the search did not yet succeed:
- if ( max_extent == -1 )
- {
- // repeat process with relaxed conditions: now the search will also succeed
- // if there is an axis which can be split less than n ways
- for ( int md = dim - 1 ; md >= 0 ; md-- )
- {
- if ( md != forbid
- && shape[md] > 1 )
- {
- max_extent = shape[md] ;
- split_dim = md ;
- break ;
- }
- }
- }
-
- if ( split_dim != -1 ) // we have found a dimension for splitting
- {
- int w = shape [ split_dim ] ; // extent of the dimension we can split
- n = std::min ( n , w ) ; // just in case, if that is smaller than n
-
- int cut [ n ] ; // where to chop up this dimension
-
- for ( int i = 0 ; i < n ; i++ )
- cut[i] = ( (i+1) * w ) / n ; // roughly equal chunks, but certainly last cut == a.end()
-
- shape_t start , end = shape ;
-
- for ( int i = 0 ; i < n ; i++ )
- {
- end [ split_dim ] = cut [ i ]; // apply the cut locations
- res.push_back ( a.subarray ( start , end ) ) ;
- start [ split_dim ] = end [ split_dim ] ;
- }
- }
- else // no luck, fill the vector with the initial array as it's sole member
- {
- res.push_back ( a ) ;
- n = 1 ;
- }
- return n ; // return the number of chunks
-}
-
-/// divide_and_conquer tries to split an array into chunks and processes each chunk
-/// in a separate thread with the functor passed as 'func'. This functor takes a reference
-/// to a chunk of data and it's offset in the original array. There's a variant of run()
-/// taking a point function and a variant taking a vectorized point function
-
-template < class view_type > // type of input array
-struct divide_and_conquer
-{
- enum { dim = view_type::actual_dimension } ;
- typedef typename view_type::value_type value_type ;
- enum { channels = vigra::ExpandElementResult < value_type > :: size } ;
- typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
- typedef typename view_type::difference_type shape_type ;
-
- /// callable chunk_func is used to process each individual chunk in it's
- /// separate thread. Alternatively, if splitting fails, it is used on the whole
- /// data set. The additional shape which is passed in can be used by the chunk
- /// processing function to determine it's position inside the 'whole' array.
-
- typedef std::function < void ( view_type & , // chunk of data to process
- shape_type ) // offset of chunk in 'whole' data set
- > chunk_func ;
-
- /// run tries to split 'data' into 'nthread' views. If 'forbid' refers to an axis
- /// of 'data', this axis will not be split. If splitting fails, 'func' is applied
- /// to 'data', otherwise, 'func' is applied to all chunks that resulted from splitting,
- /// concurrently, in separate threads.
-
- static void run ( view_type & data , // array of data to divide (into chunks)
- chunk_func func , // functor to apply to each chunk
- int forbid = -1 , // axis that should not be split (default: no restriction)
- int nthreads = ncores )
- {
- if ( nthreads > 1 )
- {
- // we split the array into equal chunks
- // to process them by one thread each
-
- std::vector<view_type> dn ; // space for chunks
- nthreads = split_array_to_chunks <view_type> ( data , dn , nthreads , forbid ) ;
-
- // one thread's here already
- int new_threads = nthreads - 1 ;
- std::thread * t[new_threads] ;
-
- // we'll tell the threads where in relation to the start of the data
- // 'their' chunk is located. This is needed if the coordinates are used
- // as input, like in the transform-based remap
-
- shape_type offset ;
-
- // create the threads, each with correct chunk and offset
- int s ;
- for ( s = 0 ; s < new_threads ; s++ )
- {
- t[s] = new std::thread ( func , std::ref(dn[s]) , offset ) ;
- for ( int d = 0 ; d < dim ; d ++ )
- {
- if ( dn[s].shape ( d ) != data.shape(d) )
- offset[d] += dn[s].shape(d) ;
- }
- }
- // and process the last chunk in this thread
- func ( dn[s] , offset ) ;
-
- // finally, join the threads
- for ( int s = 0 ; s < new_threads ; s++ )
- {
- t[s]->join() ;
- delete t[s] ;
- }
- }
- else // if we're single-threaded we call func in this thread
- {
- func ( data , shape_type() ) ;
- }
- }
-
- /// next we have a few methods for applying point functors to a MultiArrayView.
- /// this is done by building a suitable chunk_func from a point functor and an
- /// 'applicator' routine, and then passing this chunk_func to run() above, to
- /// have it applied in multiple threads. First we define the mechanism for
- /// point functors operating on value_type:
- ///
- /// callable point_func 'treats' a value_type passed in by reference
-
- typedef std::function < void ( value_type & ) > point_func ;
-
- /// apply is a helper function that applies 'f' to each value in 'data'.
- /// By binding it's first agument to a specific point function, we receive
- /// a functor that satisfies the chunk_func signature.
-
- static void apply ( point_func f , view_type & data , shape_type offset )
- {
- for ( auto& value : data )
- f ( value ) ;
- }
-
- /// overload of 'run' applying a point function to all values in all chunks
- /// this is where the chunk_func needed for run() above is built and used
-
- static void run ( view_type & data , // array of data to divide (into chunks)
- point_func func , // point functor to apply to each value
- int nthreads = ncores )
- {
- using namespace std::placeholders ;
-
- // we use bind to make a chunk_func by binding the point function argument
- // to apply() to 'func'. The resulting two-argument functor satisfies the
- // chunk_func signature and can be passed to run() above
-
- chunk_func treat = std::bind ( apply , // use apply to apply
- func , // this point function
- _1 , // to this chunk of data
- _2 ) ; // located at this offset inside 'data'
-
- // the resulting functor is passed to the version of run() taking a chunk_func
-
- run ( data , treat , -1 , nthreads ) ; // delegate processing to run variant above
- }
-
-#ifdef USE_VC
-
- typedef typename vector_traits < ele_type > :: type value_v ;
- enum { vsize = value_v::Size } ;
-
- /// now we repeat the pattern for vectorized operation.
- /// callable v_point_func 'treats' a value_v passed in by reference.
-
- typedef std::function < void ( value_v & ) > v_point_func ;
-
- /// again we have a helper function. We'll use bind on the first
- /// argument to generate a functor satisfying the chunk_func signature.
- /// vapply reads individual elements into a vector-sized buffer and calls
- /// the vectorized point function vf on them.
-
- static void vapply ( v_point_func vf ,
- view_type & data ,
- shape_type offset )
- {
- int aggregates = data.elementCount() / vsize ; // number of full vectors
- int leftovers = data.elementCount() - aggregates * vsize ; // and of any leftover single values
- value_v buffer ;
- auto it = data.begin() ;
-
- if ( data.isUnstrided() )
- {
- typedef typename vector_traits < int , vsize > :: type ic_v ;
- typedef typename vigra::TinyVector < ic_v , channels > :: type mc_ic_v ;
-
- // if data is unstrided, we can process it's contents by gather/scatter
- // operations to vectors which we process with the functor vf. This is
- // about as fast as we can go.
-
- mc_ic_v mc_interleave ;
- for ( int ch = 0 ; ch < channels ; ch++ )
- {
- mc_interleave[ch] = ic_v::IndexesFromZero() ;
- mc_interleave[ch] *= channels ;
- mc_interleave[ch] += ch ;
- }
-
- value_type * destination = data.data() ;
-
- for ( int a = 0 ; a < aggregates ; a++ )
- {
- for ( int ch = 0 ; ch < channels ; ch++ )
- {
- buffer.gather ( (ele_type*) destination , mc_interleave [ ch ] ) ;
- vf ( buffer ) ;
- buffer.scatter ( (ele_type*) destination , mc_interleave [ ch ] ) ;
- }
- destination += vsize ;
- }
- it += aggregates * vsize ;
- }
- else
- {
- // if the data are strided, we 'manually' fill a vector at a time
- for ( int a = 0 ; a < aggregates ; a++ )
- {
- for ( int ch = 0 ; ch < channels ; ch++ )
- {
- for ( int e = 0 ; e < vsize ; e++ ) // fill vector
- buffer[e] = it[e][ch] ;
- vf ( buffer ) ; // call functor
- for ( int e = 0 ; e < vsize ; e++ )
- it[e][ch] = buffer[e] ; // unpack vector
- }
- it += vsize ;
- }
- }
-
- // process leftovers, if any
-
- if ( leftovers )
- {
- for ( int ch = 0 ; ch < channels ; ch++ )
- {
- int e ;
- for ( e = 0 ; e < leftovers ; e++ )
- buffer[e] = it[e][ch] ; // fill in leftover values
-
- for ( ; e < vsize ; e++ ) // just in case no aggregates were processed previously:
- buffer[e] = it[0][ch] ; // fill buffer up with suitable values
-
- vf ( buffer ) ; // process buffer
-
- for ( e = 0 ; e < leftovers ; e++ )
- it[e][ch] = buffer[e] ; // write back processed values
- }
- }
- }
-
- /// overload of 'run' applying a vectorized point function to all elements in all chunks
-
- static void run ( view_type & data , // array of data to divide (into chunks)
- v_point_func vfunc , // vectorized point functor
- int nthreads = ncores )
- {
- using namespace std::placeholders ;
-
- // again we use bind to create a functor satisfying the chunk_func signature
- // by using bind - this time on vapply, where we bind the first two arguments
- // to 'vfunc' and 'func'.
-
- chunk_func treat = std::bind ( vapply , // use vapply to apply
- vfunc , // this vectorized point functor
- _1 , // to this chunk of data
- _2 ) ; // which is at this offset inside 'data'
-
- // the resulting functor is delegated to the version of run() taking a chunk_func
-
- run ( data , treat , -1 , nthreads ) ;
- }
-
-
-#endif
-
-} ;
-
-/// divide_and_conquer_2 works just like divide_and_conquer, but operates on
-/// two arrays synchronously.
-
-template < class view_type , // type of first array
- class view_type_2 > // type of second array
-struct divide_and_conquer_2
-{
- enum { dim = view_type::actual_dimension } ;
- typedef typename view_type::difference_type shape_type ;
-
- // run() takes two MultiArrayViews of the same shape, splits them
- // in the same way, and applies a functor to each resulting pair of chunks. This is
- // needed for functions like remap().
- // TODO: we might even formulate a general manifold by accepting a vector of arrays,
- // but currently there is no code in vspline which uses more than two views together.
-
- typedef std::function < void ( view_type & , // chunk of data to process
- view_type_2 & , // chunk of data2 to process
- shape_type ) // offset of chunk in 'whole' data set
- > chunk_func_2 ;
-
- static void run ( view_type & data , // array of data to divide (into chunks)
- view_type_2 & data2 , // second array to coprocess with data
- chunk_func_2 func , // functor to apply to each pair of chunks
- int forbid = -1 , // axis that should not be split (default: no restriction)
- int nthreads = ncores )
- {
- if ( view_type_2::actual_dimension != dim )
- throw ( dimension_mismatch ( "both views must have the same dimension" ) ) ;
-
- // make sure that data and data2 are shape-compatible
- // we silently assume that we can coiterate both arrays in scan order.
- if ( data.shape() != data2.shape() )
- {
- throw shape_mismatch ( "both views must have the same shape" ) ;
- }
-
- if ( nthreads > 1 )
- {
- // we split both arrays into equal chunks
- // to coprocess them by one thread each
-
- std::vector<view_type> dn ; // space for chunks of data
- nthreads = split_array_to_chunks <view_type> ( data , dn , nthreads , forbid ) ;
-
- std::vector<view_type_2> d2n ; // space for chunks of data2
- nthreads = split_array_to_chunks <view_type_2> ( data2 , d2n , nthreads , forbid ) ;
-
- int new_threads = nthreads - 1 ;
-
- std::thread * t[new_threads] ;
- shape_type offset ;
-
- int s ;
- for ( s = 0 ; s < new_threads ; s++ )
- {
- t[s] = new std::thread ( func , std::ref(dn[s]) , std::ref(d2n[s]), offset ) ;
- for ( int d = 0 ; d < dim ; d ++ )
- {
- if ( dn[s].shape ( d ) != data.shape(d) )
- offset[d] += dn[s].shape(d) ;
- }
- }
- // process the last chunk in this thread
- func ( std::ref(dn[s]) , std::ref(d2n[s]), offset ) ;
- for ( int s = 0 ; s < new_threads ; s++ )
- {
- t[s]->join() ;
- delete t[s] ;
- }
- }
- else // if we're single-threaded we call func in this thread
- {
- func ( data , data2 , shape_type() ) ;
- }
- }
-} ;
-
-/// divide_and_conquer_3 works just like divide_and_conquer_2, but operates on
-/// three arrays synchronously.
-
-template < class view_type , // type of first array
- class view_type_2 , // type of second array
- class view_type_3 >
-struct divide_and_conquer_3
-{
- enum { dim = view_type::actual_dimension } ;
- typedef typename view_type::difference_type shape_type ;
-
- // run() takes two MultiArrayViews of the same shape, splits them
- // in the same way, and applies a functor to each resulting pair of chunks. This is
- // needed for functions like remap().
- // TODO: we might even formulate a general manifold by accepting a vector of arrays,
- // but currently there is no code in vspline which uses more than two views together.
-
- typedef std::function < void ( view_type & , // chunk of data to process
- view_type_2 & , // chunk of data2 to process
- view_type_3 & , // chunk of data3 to process
- shape_type ) // offset of chunk in 'whole' data set
- > chunk_func_3 ;
-
- static void run ( view_type & data , // array of data to divide (into chunks)
- view_type_2 & data2 , // second array to coprocess with data
- view_type_3 & data3 , // third array to coprocess with data
- chunk_func_3 func , // functor to apply to each pair of chunks
- int forbid = -1 , // axis that should not be split (default: no restriction)
- int nthreads = ncores )
- {
- if ( view_type_2::actual_dimension != dim
- || view_type_3::actual_dimension != dim
- )
- throw ( dimension_mismatch ( "all views must have the same dimension" ) ) ;
-
- // make sure that data, data2 and data3 are shape-compatible
- // we silently assume that we can coiterate both arrays in scan order.
- if ( data.shape() != data2.shape()
- || data.shape() != data3.shape() )
- {
- throw shape_mismatch ( "all views must have the same shape" ) ;
- }
-
- if ( nthreads > 1 )
- {
- // we split both arrays into equal chunks
- // to coprocess them by one thread each
-
- std::vector<view_type> dn ; // space for chunks of data
- nthreads = split_array_to_chunks <view_type> ( data , dn , nthreads , forbid ) ;
-
- std::vector<view_type_2> d2n ; // space for chunks of data2
- nthreads = split_array_to_chunks <view_type_2> ( data2 , d2n , nthreads , forbid ) ;
-
- std::vector<view_type_3> d3n ; // space for chunks of data3
- nthreads = split_array_to_chunks <view_type_3> ( data3 , d3n , nthreads , forbid ) ;
-
- int new_threads = nthreads - 1 ;
-
- std::thread * t[new_threads] ;
- shape_type offset ;
-
- int s ;
- for ( s = 0 ; s < new_threads ; s++ )
- {
- t[s] = new std::thread ( func ,
- std::ref(dn[s]) , std::ref(d2n[s]), std::ref(d3n[s]) ,
- offset ) ;
- for ( int d = 0 ; d < dim ; d ++ )
- {
- if ( dn[s].shape ( d ) != data.shape(d) )
- offset[d] += dn[s].shape(d) ;
- }
- }
- // process the last chunk in this thread
- func ( std::ref(dn[s]) , std::ref(d2n[s]) , std::ref(d3n[s]), offset ) ;
- for ( int s = 0 ; s < new_threads ; s++ )
- {
- t[s]->join() ;
- delete t[s] ;
- }
- }
- else // if we're single-threaded we call func in this thread
- {
- func ( data , data2 , data3 , shape_type() ) ;
- }
- }
-} ;
-
} ; // end of namespace vspline
#endif // VSPLINE_COMMON
\ No newline at end of file
diff --git a/filter.h b/filter.h
index e784109..6200ba8 100644
--- a/filter.h
+++ b/filter.h
@@ -1359,6 +1359,7 @@ public:
// multithread() expects a std::function, so here we fix the type of
// std::function we want to pass to it, create an instance of the std::function
// and assign the routine appropriate to the task:
+ // TODO: I'd like to pass the MultiArrayViews per reference, just for consistency.
typedef
std::function < void ( range_t ,
@@ -1373,19 +1374,28 @@ public:
filter_functor_type filter_functor ;
+// #ifdef USE_VC
+//
+// if ( use_vc )
+// filter_functor = aggregating_filter < input_array_type , output_array_type > ;
+// else
+// filter_functor = nonaggregating_filter < input_array_type , output_array_type > ;
+//
+// #else
+//
+// filter_functor = nonaggregating_filter < input_array_type , output_array_type > ;
+//
+// #endif
+
+ auto pf = & nonaggregating_filter < input_array_type , output_array_type > ;
+
#ifdef USE_VC
if ( use_vc )
- filter_functor = aggregating_filter < input_array_type , output_array_type > ;
- else
- filter_functor = nonaggregating_filter < input_array_type , output_array_type > ;
+ pf = &aggregating_filter < input_array_type , output_array_type > ;
-#else
-
- filter_functor = nonaggregating_filter < input_array_type , output_array_type > ;
-
#endif
-
+
// obtain a partitioning of the data array into subranges. We do this 'manually' here
// because we must instruct shape_splitter not to chop up the current processing axis
// (by passing the 'forbid' parameter, which is taken as -1 (no effect) in the default
@@ -1393,7 +1403,7 @@ public:
partition_t partitioning = shape_splitter < dim > :: part ( input.shape() , nslices , axis ) ;
- multithread ( filter_functor ,
+ multithread ( pf ,
partitioning ,
input ,
output ,
diff --git a/multithread.h b/multithread.h
index 00bff78..96abd37 100644
--- a/multithread.h
+++ b/multithread.h
@@ -51,7 +51,7 @@
/// range_type merely captures the concept of a range, taking 'limit_type' as it's
/// template parameter, so that any type of range can be accomodated.
/// Next we define an object holding a set of ranges, modeling a partitioning of
-/// an 'original' range into subranges, which, withing the context of this code,
+/// an 'original' range into subranges, which, within the context of this code,
/// are disparate and in sequence. This object is modeled as struct partition_type,
/// taking a range_type as it's template argument.
/// With these types, we model concrete ranges and partitionings. The most important
@@ -283,9 +283,53 @@ partition_type < shape_range_type<d> > partition ( shape_range_type<d> range , i
/// process. The situation inside vspline where this is necessary is in the prefiltering
/// code, where the partitioning has to avoid splitting up the current processing axis,
/// passing the exis as additional parameter 'forbid' to class shape_splitter.
+// TODO I can't yet pass references 'through' the multithreading mechanism.
+
+// template < class range_type , class ...Types >
+// int multithread ( std::function < void ( range_type , Types... ) > func ,
+// partition_type < range_type > partitioning ,
+// Types ...args )
+// {
+// // get the number of ranges in the partitioning
+//
+// int nparts = partitioning.size() ;
+//
+// assert ( nparts > 0 ) ; // make sure, for now, that there is at least one range
+//
+// // set up thread objects for the additional threads we'll use (if any)
+//
+// int new_threads = nparts - 1 ;
+// std::thread * t[new_threads] ;
+//
+// // start the additional threads (if any), passing func as the thread's functional,
+// // followed by the range over which func is meant to operate, followed by any other
+// // arguments passed to multithread as parameter pack 'args'
+//
+// for ( int s = 0 ; s < new_threads ; s++ )
+// {
+// t[s] = new std::thread ( func , partitioning[s] , args... ) ;
+// }
+//
+// // do the last partial job in this very thread. this may be the only job,
+// // if partitioning only contains a single range
+//
+// func ( partitioning [ new_threads ] , args... ) ;
+//
+// // join the additional threads (if any) and delete their thread objects
+//
+// for ( int s = 0 ; s < new_threads ; s++ )
+// {
+// t[s]->join() ;
+// delete t[s] ;
+// }
+//
+// // and return the number of threads which was used.
+//
+// return nparts ;
+// }
template < class range_type , class ...Types >
-int multithread ( std::function < void ( range_type , Types... ) > func ,
+int multithread ( void (*pfunc) ( range_type , Types... ) ,
partition_type < range_type > partitioning ,
Types ...args )
{
@@ -306,13 +350,13 @@ int multithread ( std::function < void ( range_type , Types... ) > func ,
for ( int s = 0 ; s < new_threads ; s++ )
{
- t[s] = new std::thread ( func , partitioning[s] , args... ) ;
+ t[s] = new std::thread ( pfunc , partitioning[s] , args... ) ;
}
// do the last partial job in this very thread. this may be the only job,
// if partitioning only contains a single range
- func ( partitioning [ new_threads ] , args... ) ;
+ pfunc ( partitioning [ new_threads ] , args... ) ;
// join the additional threads (if any) and delete their thread objects
@@ -333,8 +377,24 @@ int multithread ( std::function < void ( range_type , Types... ) > func ,
/// subranges. If the calling code doesn't to influence the partitioning process, this
/// is the variant to use.
+// template < class range_type , class ...Types >
+// int multithread2 ( std::function < void ( range_type , Types... ) > func ,
+// int nparts ,
+// range_type range ,
+// Types ...args )
+// {
+// // partition the range and update nparts in case the partitioning produced
+// // less ranges than we initially asked for. Rely on the presence of a function
+// // partition() which can process range_type:
+//
+// partition_type < range_type > partitioning = partition ( range , nparts ) ;
+// nparts = partitioning.size() ;
+//
+// return multithread ( func , partitioning , args... ) ;
+// }
+
template < class range_type , class ...Types >
-int multithread ( std::function < void ( range_type , Types... ) > func ,
+int multithread ( void (*pfunc) ( range_type , Types... ) ,
int nparts ,
range_type range ,
Types ...args )
@@ -345,8 +405,8 @@ int multithread ( std::function < void ( range_type , Types... ) > func ,
partition_type < range_type > partitioning = partition ( range , nparts ) ;
nparts = partitioning.size() ;
-
- return multithread ( func , partitioning , args... ) ;
+
+ return multithread ( pfunc , partitioning , args... ) ;
}
} ; // end if namespace vspline
diff --git a/remap.h b/remap.h
index f32e624..fdb5e29 100644
--- a/remap.h
+++ b/remap.h
@@ -86,6 +86,7 @@
#ifndef VSPLINE_REMAP_H
#define VSPLINE_REMAP_H
+#include "multithread.h"
#include "eval.h"
namespace vspline {
@@ -114,14 +115,6 @@ struct coordinate_traits < vigra::TinyVector < T , N > >
typedef typename vigra::TinyVector < T , N >::value_type rc_type ;
} ;
-// TODO: I'd like to write it like this, but it won't compile:
-
-// template < typename T , int N >
-// struct coordinate_traits < vigra::TinyVector < T , N > >
-// {
-// typedef vigra::TinyVector < T , N >::value_type rc_type ;
-// } ;
-
/// since remap can also operate with pre-split coordinates, we need another partial
/// specialization of coordinate_traits for split_type, which also defines a value_type
@@ -131,29 +124,50 @@ struct coordinate_traits < split_type < N , IT , FT > >
typedef typename split_type < N , IT , FT >::value_type rc_type ;
} ;
+/// st_remap is the single-thread routine to perform a remap with a given warp array
+/// _warp (containing coordinates) into a target array _output, which takes the interpolated
+/// values at the locations the coordinates indicate, using a bspline as interpolator.
+/// While calling this function directly is entirely possible, code wanting to perform
+/// a remap would rather call the next function which partitions the task at hand and
+/// uses a separate thread for each partial job.
+/// The first parameter, range, specifies a subarray of _warp and _output which this
+/// routine is meant to process. this may be the full range, but normally it will cover
+/// only equally sized parts of these arrays, where the precise extent is determined
+/// by multithred() or it's caller in the next routine.
+/// Note how I pass in several pointers. Normally I'd pass such data by reference,
+/// but I can't do this here (or haven't found a way yet) because of my parameter-handling
+/// in multithread() which I couldn't get to work with references. The effect is the same.
+
template < class value_type , // like, float, double, complex<>, pixels, TinyVectors etc.
class coordinate_type , // usually TinyVector of real for coordinates
int dim_in , // number of dimensions of input array (and of 'warp' coordinates)
int dim_out > // number of dimensions of output array
-int st_remap ( const bspline < value_type , dim_in > & bspl ,
- MultiArrayView < dim_out , coordinate_type > warp ,
- MultiArrayView < dim_out , value_type > output ,
- bool use_vc = true
- )
+void st_remap ( shape_range_type < dim_out > range ,
+ const bspline < value_type , dim_in > *p_bspl ,
+ MultiArrayView < dim_out , coordinate_type > *p_warp ,
+ MultiArrayView < dim_out , value_type > *p_output ,
+ bool use_vc = true
+ )
{
typedef bspline < value_type , dim_in > spline_type ;
typedef typename coordinate_traits < coordinate_type > :: rc_type rc_type ;
typedef evaluator < dim_in , value_type , rc_type , int > evaluator_type ;
typedef typename evaluator_type::ele_type ele_type ;
+
+ // pick out the subarrays specified by 'range'
+ // this idiom reoccurs throughout, since the multithreading code I use does not
+ // actually partition the containers it handles, but instead only creates information
+ // on how to partition them, and then passes this information to the functionals
+ // used for starting the threads. While this results in slight bloat of the single
+ // thread code, it makes the calling side much more legible and requires no use of
+ // bind, tuples or the likes.
+
+ auto warp = p_warp->subarray ( range[0] , range[1] ) ;
+ auto output = p_output->subarray ( range[0] , range[1] ) ;
+
+ // create an evaluator from the b-spline
- // do a bit of compatibility checking
-
- if ( output.shape() != warp.shape() )
- {
- throw shape_mismatch ( "the shapes of the warp array and the output array do not match" ) ;
- }
-
- evaluator_type ev ( bspl ) ;
+ evaluator_type ev ( *p_bspl ) ;
#ifdef USE_VC
@@ -264,48 +278,47 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
++source_it ;
++target_it ;
}
-
- return 0 ;
}
/// this remap variant performs the remap with several threads. The routine above is single-threaded,
/// but all individual evaluations are independent of each other. So making it multi-threaded
-/// is trivial: just split the warp array and target array n-ways and call the single-threaded
-/// version on these subarrays. We have utility code to perform the splitting of the arrays
-/// and processing the chunks in separate threads, namely divide_and_conquer_2.
+/// is trivial: just split the warp array and target array n-ways and have each thread process
+/// a chunk. Initially I used this approach directly, but later I switched to multithreading code
+/// which doesn't actually touch the containers, but instead only passes partitioning information
+/// to the individual threads, leaving it to them to split the containers accordingly, which
+/// results in simple and transparent code with only few limitations.
template < class value_type , // like, float, double, complex<>, pixels, TinyVectors etc.
class coordinate_type , // usually float for coordinates
int dim_in , // number of dimensions of input array (and of 'warp' coordinates)
int dim_out > // number of dimensions of output array
-int remap ( const bspline < value_type , dim_in > & bspl ,
- MultiArrayView < dim_out , coordinate_type > warp ,
- MultiArrayView < dim_out , value_type > output ,
- bool use_vc = true ,
- int nthreads = ncores
+void remap ( const bspline < value_type , dim_in > & bspl ,
+ MultiArrayView < dim_out , coordinate_type > & warp ,
+ MultiArrayView < dim_out , value_type > & output ,
+ bool use_vc = true ,
+ int nthreads = ncores
)
{
- using namespace std::placeholders ;
-
- // we use bind to bind the first and last argument of the single-threaded
- // remap function, leaving two free arguments which will accept the two arrays
- // of data to coiterate over (being the chunks of warp and output which are created
- // by divide_and_conquer_2)
-
- auto chunk_func_2
- = std::bind ( st_remap < value_type , coordinate_type , dim_in , dim_out > , // single-threaded remap
- std::ref(bspl) , // bspline passed in above
- _1 , // placeholders to accept data chunks
- _2 , // from divide_and_conquer
- use_vc ) ; // use_vc flag as passed in above
-
- divide_and_conquer_2 < decltype ( warp ) , // run divide_and_conquer_2
- decltype ( output ) >
- :: run ( warp , // on the warp array
- output , // and the output array
- chunk_func_2 , // applying the functor we've made with bind
- -1 , // allowing array splitting along any axis
- nthreads ) ; // using nthreads threads
+ // check shape compatibility
+
+ if ( output.shape() != warp.shape() )
+ {
+ throw shape_mismatch ( "remap: the shapes of the warp array and the output array do not match" ) ;
+ }
+
+ // set up 'range' to cover a complete array of output's size
+ // (or, warp's size, which is the same)
+
+ shape_range_type < dim_out > range ( shape_type < dim_out > () , output.shape() ) ;
+
+ // call multithread(), specifying the single-threaded remap routine as the functor
+ // to invoke the threads, passing in 'range', which will be partitioned by multithread(),
+ // followed by all the other parameters the single-threaded remap needs, which is
+ // pretty much the set of parameters we've received here, with the subtle difference
+ // that we can't pass anything on by reference and use pointers instead.
+
+ multithread ( & st_remap < value_type , coordinate_type , dim_in , dim_out > ,
+ nthreads , range , &bspl , &warp , &output , use_vc ) ;
} ;
/// This is a variant of remap, which directly takes an array of values and remaps it,
@@ -333,16 +346,17 @@ int remap ( MultiArrayView < dim_in , value_type > input ,
if ( output.shape() != warp.shape() )
{
- throw shape_mismatch ( "the shapes of the warp array and the output array do not match" ) ;
+ throw shape_mismatch ( "remap: the shapes of the warp array and the output array do not match" ) ;
}
// create the bspline object
// TODO may want to specify tolerance here instead of using default
+
bspline < value_type , dim_in > bsp ( input.shape() , degree , bcv ) ;
- // copy in the data
- bsp.core = input ;
- // prefilter
- bsp.prefilter ( use_vc , nthreads ) ;
+
+ // prefilter, taking data in 'input' as knot point data
+
+ bsp.prefilter ( use_vc , nthreads , input ) ;
// and call the remap variant taking a bspline object,
// passing in the spline, the coordinate array and the target array
@@ -358,6 +372,7 @@ int remap ( MultiArrayView < dim_in , value_type > input ,
/// 'manually' de/interleave them. In my tests, this improved performance in unvectorized code,
/// while for vectorized code it made no noticeable difference.
// TODO: runs, but needs more testing
+// TODO: we want an equivalent processing offset/tune pairs which would be even more compact
template < class ic_type , class rc_type , int dimension , int coordinate_dimension >
class warper
@@ -377,14 +392,17 @@ public:
typedef typename vigra::MultiArrayShape < dimension > :: type shape_type ;
+ shape_type shape ;
+
public:
/// constructor taking a shape argument. This creates the containers for integral and
/// remainder parts of the coordinates internally.
- warper ( const shape_type & shape )
- : _select ( shape ) ,
- _tune ( shape )
+ warper ( const shape_type & _shape )
+ : shape ( _shape ) ,
+ _select ( _shape ) ,
+ _tune ( _shape )
{
select = _select ;
tune = _tune ;
@@ -409,10 +427,11 @@ template < class value_type , // like, float, double, complex<>, pixels, Ti
class nd_rc_type ,
int dim_in , // number of dimensions of input array (and of 'warp' coordinates)
int dim_out > // number of dimensions of output array
-int st_wremap ( const bspline < value_type , dim_in > & bspl ,
- MultiArrayView < dim_out , nd_ic_type > select ,
- MultiArrayView < dim_out , nd_rc_type > tune ,
- MultiArrayView < dim_out , value_type > output ,
+void st_wremap ( shape_range_type < dim_out > range ,
+ const bspline < value_type , dim_in > * bspl ,
+ MultiArrayView < dim_out , nd_ic_type > _select ,
+ MultiArrayView < dim_out , nd_rc_type > _tune ,
+ MultiArrayView < dim_out , value_type > _output ,
bool use_vc = true
)
{
@@ -422,7 +441,11 @@ int st_wremap ( const bspline < value_type , dim_in > & bspl ,
typedef evaluator < dim_in , value_type , rc_type , ic_type > evaluator_type ;
typedef typename evaluator_type::ele_type ele_type ;
- evaluator_type ev ( bspl ) ;
+ auto select = _select.subarray ( range[0] , range[1] ) ;
+ auto tune = _tune.subarray ( range[0] , range[1] ) ;
+ auto output = _output.subarray ( range[0] , range[1] ) ;
+
+ evaluator_type ev ( *bspl ) ;
#ifdef USE_VC
@@ -492,8 +515,6 @@ int st_wremap ( const bspline < value_type , dim_in > & bspl ,
++tune_it ;
++target_it ;
}
-
- return 0 ;
}
/// overload of remap() using presplit coordinates packaged in a warper object
@@ -503,41 +524,25 @@ template < class value_type , // like, float, double, complex<>, pixels, Ti
class rc_type , // real coordinate part
int dim_in , // number of dimensions of input array (and of 'warp' coordinates)
int dim_out > // number of dimensions of output array
-int remap ( const bspline < value_type , dim_in > & bspl ,
+void remap ( const bspline < value_type , dim_in > & bspl ,
warper < ic_type , rc_type , dim_out , dim_in > & warp ,
MultiArrayView < dim_out , value_type > output ,
bool use_vc = true ,
int nthreads = ncores
)
{
- using namespace std::placeholders ;
-
typedef warper < ic_type , rc_type , dim_out , dim_in > warper_type ;
typedef typename warper_type::nd_ic_type nd_ic_type ;
typedef typename warper_type::nd_rc_type nd_rc_type ;
- // we use bind to bind the first and last argument of the single-threaded
- // remap function, leaving two free arguments which will accept the two arrays
- // of data to coiterate over (being the chunks of warp and output which are created
- // by divide_and_conquer_2)
-
- auto chunk_func_3
- = std::bind ( st_wremap < value_type , nd_ic_type , nd_rc_type , dim_in , dim_out > ,
- std::ref(bspl) , // bspline passed in above
- _1 , // placeholders to accept data chunks
- _2 , // from divide_and_conquer
- _3 ,
- use_vc ) ; // use_vc flag as passed in above
-
- divide_and_conquer_3 < decltype ( warp.select ) ,
- decltype ( warp.tune ) ,
- decltype ( output ) >
- :: run ( warp.select , // on the warp array
- warp.tune ,
- output , // and the output array
- chunk_func_3 , // applying the functor we've made with bind
- -1 , // allowing array splitting along any axis
- nthreads ) ; // using nthreads threads
+ if ( output.shape() != warp.shape )
+ {
+ throw shape_mismatch ( "remap: the shapes of the warper and the output array do not match" ) ;
+ }
+ shape_range_type < dim_out > range ( shape_type < dim_out > () , output.shape() ) ;
+
+ multithread ( & st_wremap < value_type , nd_ic_type , nd_rc_type , dim_in , dim_out > ,
+ nthreads , range , &bspl , warp.select , warp.tune , output , use_vc ) ;
} ;
// Next we have some collateral code to get ready for the transformation-based remap
@@ -787,7 +792,6 @@ public:
c[0] ( mask ) -= shape[0] ; // fold back to range
for ( int d = 1 ; d < dimension ; d++ )
{
-// c[d] ( mask ) ++ ; // increase next-higher dimension's index
c[d] ( mask ) += 1 ; // increase next-higher dimension's index
mask = ( c[d] >= end[d] ) ; // check for range overflow
// resultant mask is either empty or the same as before
@@ -825,17 +829,20 @@ template < class value_type , // like, float, double, complex<>, pixels, Ti
class rc_type , // float or double, for coordinates
int dim_in , // number of dimensions of input array (and of 'warp' coordinates)
int dim_out > // number of dimensions of output array
-int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
- transformation < value_type , rc_type , dim_in , dim_out > tf ,
- MultiArrayView < dim_out , value_type > output ,
- typename MultiArrayView < dim_out , value_type > :: difference_type offset ,
- bool use_vc = true
- )
+void st_tf_remap ( shape_range_type < dim_out > range ,
+ const bspline < value_type , dim_in > * bspl ,
+ transformation < value_type , rc_type , dim_in , dim_out > tf ,
+ MultiArrayView < dim_out , value_type > _output ,
+ bool use_vc = true
+ )
{
typedef bspline < value_type , dim_in > spline_type ;
typedef evaluator < dim_in , value_type , rc_type , int > evaluator_type ;
typedef typename evaluator_type::ele_type ele_type ;
- evaluator_type ev ( bspl ) ;
+
+ auto output = _output.subarray ( range[0] , range[1] ) ;
+ auto offset = range[0] ;
+ evaluator_type ev ( *bspl ) ;
#ifdef USE_VC
@@ -907,8 +914,6 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
ev.eval ( cs_out , target_it.get<1>() ) ;
++target_it ;
}
-
- return 0 ;
}
/// multithreaded tf_remap routine, splitting the target array to chunks.
@@ -923,33 +928,17 @@ template < class value_type , // like, float, double, complex<>, pixels, Ti
class rc_type , // float or double for coordinates
int dim_in , // number of dimensions of input array (and of 'warp' coordinates)
int dim_out > // number of dimensions of output array
-int tf_remap ( const bspline < value_type , dim_in > & bspl ,
+void tf_remap ( const bspline < value_type , dim_in > & bspl ,
transformation < value_type , rc_type , dim_in , dim_out > tf ,
MultiArrayView < dim_out , value_type > output ,
bool use_vc = true ,
int nthreads = ncores
)
{
- using namespace std::placeholders ;
-
- // we use bind to bind the first two and last argument of the single-threaded
- // tf_remap function, leaving one free argument which will accept the array
- // of data to iterate over (being the a chunk of 'output' which is created
- // by divide_and_conquer)
-
- auto chunk_func
- = std::bind ( st_tf_remap < value_type , rc_type , dim_in , dim_out > , // single-threaded tf_remap
- std::ref(bspl) , // bspline passed in above
- tf , // transformation passed in above
- _1 , // placeholders to accept data chunk
- _2 , // and offset from divide_and_conquer
- use_vc ) ; // use_vc flag as passed in above
-
- divide_and_conquer < decltype ( output ) > // run divide_and_conquer
- :: run ( output , // on the output array
- chunk_func , // applying the functor we've made with bind
- -1 , // allowing array splitting along any axis
- nthreads ) ; // using nthreads threads
+ shape_range_type < dim_out > range ( shape_type < dim_out > () , output.shape() ) ;
+
+ multithread ( st_tf_remap < value_type , rc_type , dim_in , dim_out > ,
+ nthreads , range , &bspl , tf , output , use_vc ) ;
}
/// this highest-level transform-based remap takes an input array and creates
@@ -973,9 +962,7 @@ int tf_remap ( MultiArrayView < dim_in , value_type > input ,
// create the bspline object
bspline < value_type , dim_in > bsp ( input.shape() , degree , bcv ) ;
// copy in the data
- bsp.core = input ;
- // prefilter
- bsp.prefilter ( use_vc , nthreads ) ;
+ bsp.prefilter ( use_vc , nthreads , input ) ;
tf_remap<value_type,rc_type,dim_in,dim_out>
( bsp , tf , output , use_vc , nthreads ) ;
@@ -992,14 +979,17 @@ template < class value_type , // like, float, double, complex<>, pixels, Ti
class rc_type , // float or double, for coordinates
int dim_in , // number of dimensions of input array (and of 'warp' coordinates)
int dim_out > // number of dimensions of output array
-int st_make_warp_array ( transformation < value_type , rc_type , dim_in , dim_out > tf ,
+void st_make_warp_array ( shape_range_type < dim_out > range ,
+ transformation < value_type , rc_type , dim_in , dim_out > tf ,
MultiArrayView < dim_out ,
vigra::TinyVector < rc_type , dim_in > >
- & warp_array ,
- typename MultiArrayView < dim_out , value_type > :: difference_type offset ,
+ _warp_array ,
bool use_vc = true
)
{
+ auto warp_array = _warp_array.subarray ( range[0] , range[1] ) ;
+ auto offset = range[0] ;
+
/// incoming coordinates (into output, which has dim_out dims)
/// generated by vigra's CoupledIterator or, in the vectorized code,
/// by coordinate_iterator_v
@@ -1064,8 +1054,6 @@ int st_make_warp_array ( transformation < value_type , rc_type , dim_in , dim_ou
target_it.get<1>() = cs_out ;
++target_it ;
}
-
- return 0 ;
}
/// multithreaded code to make a warp array using a coordinate transformation.
@@ -1074,33 +1062,19 @@ template < class value_type , // like, float, double, complex<>, pixels, Ti
class rc_type , // float or double for coordinates
int dim_in , // number of dimensions of input array (and of 'warp' coordinates)
int dim_out > // number of dimensions of output array
-int make_warp_array ( transformation < value_type , rc_type , dim_in , dim_out > tf ,
+void make_warp_array ( transformation < value_type , rc_type , dim_in , dim_out > tf ,
MultiArrayView < dim_out ,
vigra::TinyVector < rc_type , dim_in > >
- warp_array ,
+ & warp_array ,
bool use_vc = true ,
int nthreads = ncores
)
{
- using namespace std::placeholders ;
-
- // we use bind to bind the first two and last arguments of the single-threaded
- // function, leaving one free argument which will accept the array
- // of data to iterate over (being the a chunk of 'output' which is created
- // by divide_and_conquer)
-
- auto chunk_func
- = std::bind ( st_make_warp_array < value_type , rc_type , dim_in , dim_out > , // single-threaded make_warp_array
- tf , // transformation passed in above
- _1 , // placeholders to accept data chunk
- _2 , // and offset from divide_and_conquer
- use_vc ) ; // use_vc flag as passed in above
-
- divide_and_conquer < decltype ( warp_array ) > // run divide_and_conquer
- :: run ( warp_array , // on the output array
- chunk_func , // applying the functor we've made with bind
- -1 , // allowing array splitting along any axis
- nthreads ) ; // using nthreads threads
+ shape_range_type < dim_out > range ( shape_type < dim_out > () , warp_array.shape() ) ;
+
+ multithread ( st_make_warp_array < value_type , rc_type , dim_in , dim_out > ,
+ nthreads , range , tf , warp_array , use_vc ) ;
+
}
} ; // end of namespace vspline
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git
More information about the debian-science-commits
mailing list