[vspline] 60/72: cleaned up code in filter.h, documentation
Kay F. Jahnke
kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:43 UTC 2017
This is an automated email from the git hooks/post-receive script.
kfj-guest pushed a commit to branch master
in repository vspline.
commit 244a232234d6f05bc090aa9bf878b33cfc9df621
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date: Sat May 13 17:13:14 2017 +0200
cleaned up code in filter.h, documentation
---
bspline.h | 2 +-
doxy.h | 2 +-
example/examples.sh | 2 +-
example/gradient.cc | 35 ++++++++-
filter.h | 203 ++++++++++++++++++----------------------------------
prefilter.h | 4 +-
6 files changed, 105 insertions(+), 143 deletions(-)
diff --git a/bspline.h b/bspline.h
index e01b528..dfc58ac 100644
--- a/bspline.h
+++ b/bspline.h
@@ -580,7 +580,7 @@ public:
// br ( coeffs , bcv[d] , spline_degree , d ) ;
break ;
case EXPLICIT:
- // first apply bracing with BC codes passed in, then solve with BC code GUESS
+ // first create frame with BC codes passed in, then solve with BC code GUESS
// this automatically fills the brace as well, since it's part of the frame.
// TODO: the values in the frame will not come out precisely the same as they
// would by filling the brace after the coefficients have been calculated.
diff --git a/doxy.h b/doxy.h
index 1fde37a..59dd8bb 100644
--- a/doxy.h
+++ b/doxy.h
@@ -81,7 +81,7 @@ On the evaluation side I provide
- evaluation of nD arrays of coordinates (generalized remap function)
- - remap-like functions processing nD arrays of data
+ - functor based remap (aka 'transform') and 'apply' functions
\section install_sec Installation
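To give an idea of the functor-based evaluation which the remap/'transform'/'apply' functions build on, here is a minimal sketch using the evaluator type which also appears in gradient.cc below. The spline 'bspl' is assumed to be set up and prefiltered as shown there, and the sketch assumes the evaluator's operator() returns the interpolated value for a single coordinate:

  typedef vigra::TinyVector < double , 3 > coordinate_type ;
  typedef vspline::evaluator < coordinate_type , double > evaluator_type ;

  evaluator_type ev ( bspl ) ;                    // functor wrapping the b-spline
  coordinate_type c ( 1.5 , 2.5 , 3.5 ) ;
  double v = ev ( c ) ;                           // single-coordinate evaluation

The remap/transform/apply family applies such a functor to whole arrays of coordinates or data, multithreaded and, with Vc, vectorized.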
diff --git a/example/examples.sh b/example/examples.sh
index fa7919a..e4de9cc 100755
--- a/example/examples.sh
+++ b/example/examples.sh
@@ -6,5 +6,5 @@ for f in $(ls *.cc)
do
body=$(basename $f .cc)
echo compiling $body
- clang++ -O3 -std=c++11 -march=native -pthread -o$body $f -lVc -lvigraimpex
+ clang++ -O3 -std=c++11 -DUSE_VC -march=native -pthread -o$body $f -lVc -lvigraimpex
done
diff --git a/example/gradient.cc b/example/gradient.cc
index 1ecf8a7..e059cf7 100644
--- a/example/gradient.cc
+++ b/example/gradient.cc
@@ -39,7 +39,7 @@
/// This is a good test for both the precision of the evaluation and its
/// correct functioning, particularly with higher-D arrays.
///
-/// compile: clang++ -std=c++11 -pthread -o gradient gradient.cc
+/// compile: clang++ -O3 -DUSE_VC -march=native -std=c++11 -pthread -o gradient gradient.cc -lVc
#include <vspline/vspline.h>
#include <random>
@@ -53,29 +53,58 @@ int main ( int argc , char * argv[] )
typedef typename spline_type::view_type view_type ;
typedef typename spline_type::bcv_type bcv_type ;
- shape_type core_shape = { 10 , 4 , 13 } ;
- spline_type bspl ( core_shape , 3 , bcv_type ( vspline::NATURAL ) ) ; // , vspline::EXPLICIT ) ;
+ // let's have a knot point array with nicely odd shape
+
+ shape_type core_shape = { 35 , 43 , 19 } ;
+
+ // we have to use a longish call to the constructor since we want to pass
+ // 0.0 to 'tolerance' and it's way down in the argument list, so we have to
+ // explicitly pass a few arguments which usually take default values before
+ // we have a chance to pass the tolerance
+
+ spline_type bspl ( core_shape , // shape of knot point array
+ 3 , // cubic b-spline
+ bcv_type ( vspline::NATURAL ) , // natural boundary conditions
+ vspline::BRACED , // implicit scheme, bracing coeffs
+ -1 , // default, not using EXPLICIT
+ view_type() , // empty view
+ 0.0 ) ; // tolerance 0.0 for this test!
+
+ // get a view to the bspline's core, to fill it with data
+
view_type core = bspl.core ;
+ // create the gradient in each dimension
+
for ( int d = 0 ; d < bspl.dimension ; d++ )
{
for ( int c = 0 ; c < core_shape[d] ; c++ )
core.bindAt ( d , c ) += c ;
}
+ // now prefilter the spline
+
bspl.prefilter() ;
+ // set up an evaluator
+
typedef vigra::TinyVector < double , 3 > coordinate_type ;
typedef vspline::evaluator < coordinate_type , double > evaluator_type ;
evaluator_type ev ( bspl ) ;
+ // we want to bombard the evaluator with random in-range coordinates
+
std::random_device rd;
std::mt19937 gen(rd());
// std::mt19937 gen(12345); // fix starting value for reproducibility
coordinate_type c ;
+ // here comes our test, feed 100 random 3D coordinates and compare the
+ // evaluator's result with the expected value, which is precisely the
+ // sum of the coordinate's components
+
for ( int times = 0 ; times < 100 ; times++ )
{
for ( int d = 0 ; d < bspl.dimension ; d++ )
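The hunk ends here; as a hedged sketch of how the test described in the comments above can proceed (random in-range coordinate, evaluate, compare with the sum of the components - the distribution bounds and the pass/fail threshold are assumptions, not taken from the file, and <cmath>/<cassert> are assumed included):

  for ( int d = 0 ; d < bspl.dimension ; d++ )
  {
    std::uniform_real_distribution < double > dis ( 0.0 , core_shape[d] - 1.0 ) ;
    c[d] = dis ( gen ) ;                   // random in-range coordinate component
  }
  double result = ev ( c ) ;               // evaluate the spline at c
  double expected = c[0] + c[1] + c[2] ;   // the gradient planes sum up linearly
  assert ( std::fabs ( result - expected ) < 1e-10 ) ; // threshold is an assumption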
diff --git a/filter.h b/filter.h
index 4e2bb70..5bbad9a 100644
--- a/filter.h
+++ b/filter.h
@@ -766,7 +766,7 @@ filter ( int _M , ///< number of input/output elements (DataLength
// n-dimensional arrays. We use the following strategy:
// - perform the prefiltering collinear to each axis separately
// - when processing a specific axis, split the array(s) into chunks and use one job per chunk
-// - perform a traverse on each chunk, copying out 1D subsets collinear to the processing axis
+// - perform a traverse on each chunk, copying out subsets collinear to the processing axis
// to a buffer
// - perform the filter on the buffer
// - copy the filtered data to the target
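To make the buffering pattern concrete, a minimal single-line sketch (names made up; the actual code below moves many lines at once and, with Vc, vectorizes across them):

  // copy one strided line out to a contiguous buffer, filter it, copy it back
  template < typename value_type , typename filter_type >
  void filter_one_line ( value_type * line ,   // start of a 1D subset of the array
                         int stride ,          // stride between its elements
                         int n ,               // number of elements
                         filter_type & flt ,   // 1D forward/backward filter
                         value_type * buffer ) // contiguous scratch memory
  {
    for ( int i = 0 ; i < n ; i++ )
      buffer[i] = line [ i * stride ] ;        // copy out to the buffer
    flt ( buffer , n ) ;                       // filter the buffer in place
    for ( int i = 0 ; i < n ; i++ )
      line [ i * stride ] = buffer[i] ;        // copy the result back
  }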
@@ -964,16 +964,6 @@ void nonaggregating_filter ( range_type < typename source_view_type::difference_
#ifdef USE_VC
-/* currently unused
-// test if an index vector contains sequential indices
-
-template < typename IT >
-bool sequential ( const IT & indexes )
-{
- return Vc::all_of ( indexes - indexes[0] == IT::IndexesFromZero() ) ;
-}
-*/
-
/// extended gather and scatter routines taking 'extrusion parameters'
/// which handle how many times and with which stride the gather/scatter
/// operation is repeated. With these routines, strided memory can be
@@ -1003,38 +993,21 @@ gather ( const typename source_type::value_type * source ,
)
{
register source_type x ;
+ // if the mask is all-true, load the data with an unmasked gather operation
if ( mask.isFull() )
{
-// this is strange: using this code is actually slower on my system:
-//
-// if ( sequential ( indexes ) )
-// {
-// // set 'source' to proper starting point
-// source += int ( indexes[0] ) ;
-// while ( count-- )
-// {
-// x.load ( source ) ;
-// x.store ( target , Vc::Aligned ) ;
-// source += stride ;
-// target += VT::Size ;
-// }
-// }
-// else
+ while ( count-- )
{
- while ( count-- )
- {
- x.gather ( source , indexes ) ;
- auto tx = target_type ( x ) ;
- tx.store ( target , Vc::Aligned ) ;
- source += stride ;
- target += target_type::Size ;
- }
+ x.gather ( source , indexes ) ;
+ auto tx = target_type ( x ) ;
+ tx.store ( target , Vc::Aligned ) ;
+ source += stride ;
+ target += target_type::Size ;
}
}
else
{
- // for both sequential and non-sequential indexes,
- // if there is a mask, perform a gather operation
+ // if there is a partially filled mask, perform a masked gather operation
while ( count-- )
{
x.gather ( source , indexes , mask ) ;
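A hedged sketch of the 'extrusion' idea in isolation, with plain Vc 1.x calls and a signature simplified relative to the routine above:

  // repeat a (masked) gather 'count' times, advancing 'source' by 'stride'
  // and 'target' by one vector's worth of elements after every repetition
  template < typename VT >                  // a Vc vector type, e.g. Vc::float_v
  void gather_extruded ( const typename VT::EntryType * source ,
                         typename VT::EntryType * target ,
                         const typename VT::IndexType & indexes ,
                         const typename VT::MaskType & mask ,
                         int stride ,       // distance between successive gathers
                         int count )        // how often to repeat the gather
  {
    VT x ;
    while ( count-- )
    {
      x.gather ( source , indexes , mask ) ; // masked gather from strided memory
      x.store ( target , Vc::Aligned ) ;     // contiguous store to the buffer
      source += stride ;                     // advance to the next batch
      target += VT::Size ;
    }
  }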
@@ -1057,38 +1030,21 @@ scatter ( const typename source_type::value_type * source ,
)
{
register source_type x ;
+ // if the mask is all-true, store the data with an unmasked scatter operation
if ( mask.isFull() )
{
-// this is strange: using this code is actually slower on my system:
-//
-// if ( sequential ( indexes ) )
-// {
-// // set 'target' to proper starting point
-// target += int ( indexes[0] ) ;
-// while ( count-- )
-// {
-// x.load ( source , Vc::Aligned ) ;
-// x.store ( target ) ;
-// source += VT::Size ;
-// target += stride ;
-// }
-// }
-// else
+ while ( count-- )
{
- while ( count-- )
- {
- x.load ( source , Vc::Aligned ) ;
- auto tx = target_type ( x ) ;
- tx.scatter ( target , indexes ) ;
- source += source_type::Size ;
- target += stride ;
- }
+ x.load ( source , Vc::Aligned ) ;
+ auto tx = target_type ( x ) ;
+ tx.scatter ( target , indexes ) ;
+ source += source_type::Size ;
+ target += stride ;
}
}
else
{
- // for both sequential and non-sequential indexes,
- // if there is a mask, perform a scatter operation
+ // if there is a partially filled mask, perform a masked scatter operation
while ( count-- )
{
x.load ( source , Vc::Aligned ) ;
@@ -1100,8 +1056,8 @@ scatter ( const typename source_type::value_type * source ,
}
}
-/// aggregating_filter keeps a buffer of vector-aligned memory, which it fills
-/// with 1D subarrays of the source array which are collinear to the processing axis.
+/// aggregating_filter keeps a buffer of vector-aligned memory, which it fills with
+/// vsize 1D subarrays of the source array which are collinear to the processing axis.
/// The buffer is then submitted to vectorized forward-backward recursive filtering
/// and finally stored back to the corresponding memory area in target, which may
/// be the same as source, in which case the operation is seemingly performed
@@ -1224,11 +1180,13 @@ void ele_aggregating_filter ( source_view_type &source ,
if ( e < vsize )
- // have got less than vsize only? mask the operation
+ // have got less than vsize? must be the last few items.
+ // mask was all-true before, so now we limit it to the first e fields:
mask = ( simdized_math_type::IndexesFromZero() < e ) ;
- // perform extended gather with extrusion parameters
+ // perform extended gather with extrusion parameters to transport the unfiltered data
+ // to the buffer
gather < simdized_source_type , simdized_math_type >
( source_base_adress ,
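To illustrate the masking of the last batch described above: with vsize lanes and only e valid items left, comparing the lane indices against e yields a mask which is true precisely for the first e lanes, mirroring the assignment in the code:

  auto lane = simdized_math_type::IndexesFromZero() ; // 0 , 1 , ... , vsize - 1
  mask = ( lane < e ) ;   // e.g. e == 3 , vsize == 8  ->  T T T F F F F F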
@@ -1260,6 +1218,7 @@ void ele_aggregating_filter ( source_view_type &source ,
// pretty much the same as the previous operation, with the distinction that
// copying the filtered data from the buffer to the target now needs its own set of
// indexes etc., since all these may be different.
+
// TODO we might permute source_slice's strides to ascending and apply the same
// permutation to target_slice.
@@ -1335,41 +1294,29 @@ void aggregating_filter ( range_type < typename source_type::difference_type > r
auto source = p_original_source->subarray ( range[0] , range[1] ) ;
auto target = p_original_target->subarray ( range[0] , range[1] ) ;
- if ( vigra::ExpandElementResult < value_type > :: size != 1 )
- {
- // value_type is an aggregate type, but we want to operate on elementary types
- // so we element-expand the array and call ele_aggregating_filter, which works on
- // arrays with elementary types.
- auto expanded_source = source.expandElements ( 0 ) ;
- auto expanded_target = target.expandElements ( 0 ) ;
- ele_aggregating_filter < decltype ( expanded_source ) ,
- decltype ( expanded_target ) ,
- math_type >
- ( expanded_source ,
- expanded_target ,
- axis + 1 ,
- gain ,
- bc ,
- nbpoles ,
- pole ,
- tolerance ) ;
- }
- else
- {
- // this looks odd - why do we create new views here when the input
- // contains ele_type data already? Because if the input contains aggregates,
- // a direct call to ele_filter here wouldn't go away, but won't compile, since
- // in ele_aggregating_filter we try and typedef a vector of the contained
- // (aggregate) data type. Hence the acrobatics...
- vigra::MultiArrayView < source.actual_dimension , ele_type >
- es ( source.shape() , source.stride() , (ele_type*)(source.data()) ) ;
- vigra::MultiArrayView < target.actual_dimension , ele_type >
- et ( target.shape() , target.stride() , (ele_type*)(target.data()) ) ;
- ele_aggregating_filter < decltype ( es ) ,
- decltype ( es ) ,
- math_type >
- ( es , et , axis , gain , bc , nbpoles , pole , tolerance ) ;
- }
+ // value_type may be an aggregate type, but we want to operate on elementary types
+ // so we element-expand the array and call ele_aggregating_filter, which works on
+ // arrays with elementary types. If value_type is elementary already, the call to
+ // expandElements inserts a singleton dimension, but this has next to no performance
+ // impact, so contrary to my initial implementation I don't handle the 1-channel
+ // case separately any more.
+
+ auto expanded_source = source.expandElements ( 0 ) ;
+ auto expanded_target = target.expandElements ( 0 ) ;
+
+ // with the element-expanded arrays at hand, we can now delegate to ele_aggregating_filter:
+
+ ele_aggregating_filter < decltype ( expanded_source ) ,
+ decltype ( expanded_target ) ,
+ math_type >
+ ( expanded_source ,
+ expanded_target ,
+ axis + 1 ,
+ gain ,
+ bc ,
+ nbpoles ,
+ pole ,
+ tolerance ) ;
}
#endif
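For illustration, this is what expandElements ( 0 ) does in vigra terms (sizes made up):

  // a 2D array of 3-channel pixels becomes a 3D view of floats, with the
  // channel axis inserted as axis 0:
  vigra::MultiArray < 2 , vigra::TinyVector < float , 3 > >
    rgb ( vigra::Shape2 ( 640 , 480 ) ) ;
  auto xr = rgb.expandElements ( 0 ) ;   // MultiArrayView<3,float>, shape (3,640,480)

  // with an elementary value_type the inserted axis is a singleton:
  vigra::MultiArray < 2 , float > mono ( vigra::Shape2 ( 640 , 480 ) ) ;
  auto xm = mono.expandElements ( 0 ) ;  // shape (1,640,480)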
@@ -1379,7 +1326,7 @@ void aggregating_filter ( range_type < typename source_type::difference_type > r
///
/// filter_1d, which is the routine processing nD arrays along a specific axis, might as well
/// be a function. But functions can't be partially specialized (at least not with my compiler)
-/// so I use a functor instead, which, as a class, can be partially specialized. We want a
+/// so I use a functor instead, which, as a class, can be partially specialized. We'll want a
/// partial specialization for 1D arrays, where all of our usual schemes of multithreading and
/// vectorization don't intrinsically work and we have to employ a different method, see there.
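The underlying language rule in a nutshell (demo names are made up): function templates only allow full specialization, while class templates can be partially specialized, so the 1D case can get its own implementation:

  template < typename value_type , int dim >
  struct demo_filter
  {
    void operator() () { /* general nD code path */ }
  } ;

  template < typename value_type >
  struct demo_filter < value_type , 1 >   // partial specialization for 1D
  {
    void operator() () { /* special treatment for 1D arrays */ }
  } ;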
@@ -1390,7 +1337,7 @@ template < typename input_array_type , ///< type of array with knot point data
class filter_1d
{
public:
- void operator() ( input_array_type &input , ///< source data. the routine can also operate in-place
+ void operator() ( input_array_type &input , ///< source data. the routine can also operate in-place
output_array_type &output , ///< where input == output.
int axis ,
double gain ,
@@ -1401,40 +1348,24 @@ public:
int njobs = default_njobs ) ///< number of jobs to use when multithreading
{
typedef typename input_array_type::value_type value_type ;
- typedef typename input_array_type::difference_type shape_type ;
- typedef range_type < shape_type > range_t ;
- typedef partition_type < range_t > partition_t ;
- typedef typename vigra::MultiArray < 1 , value_type > ::iterator iter_type ;
-
- // multithread() expects a std::function, so here we fix the type of
- // std::function we want to pass to it, create an instance of the std::function
- // and assign the routine appropriate to the task:
-
- typedef
- std::function < void ( range_t ,
- input_array_type * ,
- output_array_type * ,
- int ,
- double ,
- bc_code ,
- int ,
- const double * ,
- double ) > filter_functor_type ;
-
- filter_functor_type filter_functor ;
-
- auto pf = & nonaggregating_filter < input_array_type ,
- output_array_type ,
- typename input_array_type::value_type > ; // for now...
-
+
+ // depending on whether Vc is used or not, we choose the appropriate (single-threaded)
+ // filtering routine, which is to be passed to multithread()
+
#ifdef USE_VC
typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
- pf = & aggregating_filter < input_array_type ,
- output_array_type ,
- ele_type > ;
+ auto pf = & aggregating_filter < input_array_type ,
+ output_array_type ,
+ ele_type > ;
+#else
+
+ auto pf = & nonaggregating_filter < input_array_type ,
+ output_array_type ,
+ value_type > ;
+
#endif
// obtain a partitioning of the data array into subranges. We do this 'manually' here
@@ -1442,10 +1373,10 @@ public:
// (by passing the 'forbid' parameter, which is taken as -1 (no effect) in the default
// partitioning of nD arrays).
- partition_t partitioning = shape_splitter < dim > :: part
- ( input.shape() ,
- njobs ,
- axis ) ;
+ auto partitioning = shape_splitter<dim>::part ( input.shape() , njobs , axis ) ;
+
+ // now use multithread() to distribute ranges of data to individual jobs which are
+ // executed by its thread pool.
multithread ( pf ,
partitioning ,
@@ -1469,6 +1400,8 @@ public:
/// With this 'cheat' we can handle 1D arrays with full multithreading and vectorization,
/// while the 'orthodox' approach would have to process the data in linear order with
/// a single thread. Cleaning up the 'dirty' margins is cheap for large arrays.
+/// The code makes guesses as to whether it's worthwhile to follow this strategy;
+/// the array has to be quite large before 'fake 2D processing' is actually applied.
template < typename input_array_type , ///< type of array with knot point data
typename output_array_type , ///< type of array for coefficients (may be the same)
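To make the 'cheat' concrete, a hedged sketch with vigra views (the lane count and sizes are invented; vigra's default layout keeps dimension 0 unstrided, so each line along axis 0 is one contiguous stretch of the original 1D data):

  int n = 1 << 20 , lanes = 16 ;           // sizes made up for the sketch
  vigra::MultiArray < 1 , float > data ( vigra::Shape1 ( n ) ) ;

  // present the same memory as 'lanes' contiguous lines of n / lanes values:
  vigra::MultiArrayView < 2 , float >
    fake_2d ( vigra::Shape2 ( n / lanes , lanes ) , data.data() ) ;

  // fake_2d can now be filtered along axis 0 with the multithreaded,
  // vectorized nD code; afterwards the stretches where one line wraps
  // into the next hold 'dirty' values and are recalculated correctly.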
diff --git a/prefilter.h b/prefilter.h
index 18fb2ea..ae49859 100644
--- a/prefilter.h
+++ b/prefilter.h
@@ -143,8 +143,8 @@ using namespace vigra ;
/// done very quickly, since no additional memory access is needed beyond a buffer's worth of data
/// already present in core memory.
///
-/// solve is just athin wrapper around filter_nd in filter.h, injecting the actual number of poles,
-/// the poles themselves and a flag wether to use vector code or not.
+/// solve is just a thin wrapper around filter_nd in filter.h, injecting the actual number of poles
+/// and the poles themselves.
///
/// Note how smoothing comes into play here: it's done simply by
/// prepending an additional pole to the filter cascade, taking a positive value between
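For reference, each pole z_i in the cascade feeds the standard pair of recursions (as in Unser's prefiltering papers; the notation is mine, not from the file):

  c^+_k = x_k + z_i \, c^+_{k-1}                    (causal pass)
  c^-_k = z_i \, ( c^-_{k+1} - c^+_k )              (anticausal pass)
  \lambda = \prod_i ( 1 - z_i ) ( 1 - 1 / z_i )     (overall gain)

A smoothing pole is simply one more factor in this cascade, so it costs one extra forward/backward pass and nothing else.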