[dh-make-perl] 02/03: parse Contents files in parallel
Damyan Ivanov
dmn at moszumanska.debian.org
Sun Nov 19 18:03:11 UTC 2017
This is an automated email from the git hooks/post-receive script.
dmn pushed a commit to branch master
in repository dh-make-perl.
commit 8bf21b4eb5505afe88cf29b248b6d0892d2e3e34
Author: Damyan Ivanov <dmn at debian.org>
Date: Sun Nov 19 16:35:14 2017 +0000
parse Contents files in parallel
parsing of experimental+sid+stretch main/contrib/non-free
on i5-7200 (2 cores × 2 threads) with warm cache
(10-runs average):
* dh-make-perl 0.96
real 24,339s
user 24,952s
sys 0,556s
* with this commit
real 14,135s
user 29,238s
sys 0,631s
Of cource, this is only noticeable when one has more than
one "big" Contents file, e.g. sid' and stretch's main.
With experimental+sid main/contrib/non-free:
* dh-make-perl 0.96:
real 14,660s
user 14,966s
sys 0,389s
* with this commit:
real 13,635s
user 16,146s
sys 0,408s
More parallelism can be squeezed out if the Contents files
are distributes so that each child has to process relatively
similar byte count (even if compressed)
---
Build.PL | 3 +
lib/Debian/AptContents.pm | 137 +++++++++++++++++++++++++++++++++++++---------
2 files changed, 113 insertions(+), 27 deletions(-)
diff --git a/Build.PL b/Build.PL
index 5ceb616..89e9a9c 100644
--- a/Build.PL
+++ b/Build.PL
@@ -36,6 +36,8 @@ my $builder = My::Builder->new(
'FindBin' => 0,
'Getopt::Long' => 0,
'IO::File' => 0,
+ 'IO::Pipe' => 0,
+ 'IO::Select' => 0,
'IO::Uncompress::Gunzip' => 0,
'List::MoreUtils' => 0,
'LWP::ConnCache' => 0,
@@ -48,6 +50,7 @@ my $builder = My::Builder->new(
'Parse::DebianChangelog' => 0,
'Software::License::Artistic_2_0' => 0,
'Storable' => 0,
+ 'Sys::CPU' => 0,
'Text::Diff' => 0,
'Text::Wrap' => 0,
'Tie::File' => 0,
diff --git a/lib/Debian/AptContents.pm b/lib/Debian/AptContents.pm
index 541374c..7ee4fb0 100644
--- a/lib/Debian/AptContents.pm
+++ b/lib/Debian/AptContents.pm
@@ -217,32 +217,7 @@ sub read_cache {
$cache->{apt_contents} = {};
push @{ $cache->{contents_files} }, @{ $self->contents_files };
- my @cat_cmd = (
- '/usr/lib/apt/apt-helper', 'cat-file', @{ $self->contents_files }
- );
- open( my $f, "-|", @cat_cmd )
- or die
- "Can't run '/usr/lib/apt/apt-helper cat-file' on Contents files: $!\n";
-
- my $line;
- while ( defined( $line = $f->getline ) ) {
- my ( $file, $packages ) = split( /\s+/, $line );
- next unless $file =~ s{
- ^usr/
- (?:share|lib)/
- (?:perl\d+/ # perl5/
- | perl/(?:\d[\d.]+)/ # or perl/5.10/
- | \S+-\S+-\S+/perl\d+/(?:\d[\d.]+)/ # x86_64-linux-gnu/perl5/5.22/
- )
- }{}x;
- $cache->{apt_contents}{$file} = exists $cache->{apt_contents}{$file}
- ? $cache->{apt_contents}{$file}.','.$packages
- : $packages;
-
- # $packages is a comma-separated list of
- # section/package items. We'll parse it when a file
- # matches. Otherwise we'd parse thousands of entries,
- # while checking only a couple
+
{
my $prefix;
for my $f ( @{ $self->contents_files } ) {
@@ -263,8 +238,116 @@ sub read_cache {
@{ $self->contents_files } )
);
}
+
+ require IO::Pipe; require IO::Select; require Sys::CPU;
+
+ # parsing of Contents files goes to forked children. contents_files
+ # is sorted by size, so that no child can have only long Contents.
+ # the number of children is capped to the number of CPUs on the system
+ my @files = sort { -s $b <=> -s $a } @{ $self->contents_files };
+ my $cpus = Sys::CPU::cpu_count();
+ my ( @kids, %kid_by_fh );
+ {
+ my $kid_no = -1;
+ for my $fn (@files) {
+ $kid_no++;
+
+ if ( @kids < $cpus ) {
+ push @kids, { files => [$fn], lines => 0 };
+ }
+ else {
+ push @{ $kids[ $kid_no % @kids ]->{files} }, $fn;
+ }
+ }
+ }
+
+ my $sel = IO::Select->new;
+
+ # start the children
+ for my $kid (@kids) {
+ my $pipe = IO::Pipe->new;
+ if ( my $pid = fork() ) { # parent
+ $pipe->reader;
+
+ $kid_by_fh{ $pipe->fileno } = $kid;
+ $kid->{io} = $pipe;
+ $kid->{pid} = $pid;
+ $sel->add( $pipe->fileno );
+ }
+ elsif ( defined($pid) ) { # child
+ $pipe->writer;
+ my @cat_cmd = (
+ '/usr/lib/apt/apt-helper', 'cat-file',
+ @{ $kid->{files} }
+ );
+ open( my $f, "-|", @cat_cmd )
+ or die sprintf( "Error running '%s': %d\n",
+ join( ' ', @cat_cmd ), $! );
+
+ my $line;
+ while ( defined( $line = $f->getline ) ) {
+ my ( $file, $packages ) = split( /\s+/, $line );
+ next unless $file =~ s{
+ ^usr/
+ (?:share|lib)/
+ (?:perl\d+/ # perl5/
+ | perl/(?:\d[\d.]+)/ # or perl/5.10/
+ | \S+-\S+-\S+/perl\d+/(?:\d[\d.]+)/ # x86_64-linux-gnu/perl5/5.22/
+ )
+ }{}x;
+ $pipe->print("$file\t$packages\n");
+ }
+ close($f);
+ close($pipe);
+ exit(0);
+ }
+ else {
+ die "fork(): $!";
+ }
+ }
+
+ # read children's output
+ while ( $sel->count
+ and my @ready = IO::Select->select( $sel, undef, $sel ) )
+ {
+ my ( $to_read, undef, $errs ) = @ready;
+
+ for my $fh (@$to_read) {
+ my $kid = $kid_by_fh{$fh};
+ my $io = $kid->{io}; my $file = $kid->{file};
+ if ( defined( my $line = <$io> ) ) {
+ chomp($line);
+ $kid->{lines}++;
+ my ( $file, $packages ) = split( '\t', $line );
+ $cache->{apt_contents}{$file} =
+ exists $cache->{apt_contents}{$file}
+ ? $cache->{apt_contents}{$file} . ',' . $packages
+ : $packages;
+
+ # $packages is a comma-separated list of
+ # section/package items. We'll parse it when a file
+ # matches. Otherwise we'd parse thousands of entries,
+ # while checking only a couple
+ }
+ else {
+ warn sprintf( "child %d (%s) EOF after %d lines\n",
+ $kid->{pid}, $kid->{file}, $kid->{lines} )
+ if 0;
+ $sel->remove($fh);
+ close( $kid->{io} );
+ waitpid( $kid->{pid}, 0 );
+ }
+ }
+
+ for my $fh (@$errs) {
+ my $kid = $kid_by_fh{$fh};
+ $sel->remove($fh);
+ close( $kid->{io} );
+ waitpid( $kid->{pid}, 0 );
+ die sprintf( "child %d (%s) returned %d\n",
+ $kid->{pid}, join( ', ', @{ $kid->{files} } ), $? );
+ }
}
- close($f);
if ( %{ $cache->{apt_contents} } ) {
$self->cache($cache);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/dh-make-perl.git
More information about the Pkg-perl-cvs-commits
mailing list