[dh-make-perl] 02/03: parse Contents files in parallel

Damyan Ivanov dmn at moszumanska.debian.org
Sun Nov 19 18:03:11 UTC 2017


This is an automated email from the git hooks/post-receive script.

dmn pushed a commit to branch master
in repository dh-make-perl.

commit 8bf21b4eb5505afe88cf29b248b6d0892d2e3e34
Author: Damyan Ivanov <dmn at debian.org>
Date:   Sun Nov 19 16:35:14 2017 +0000

    parse Contents files in parallel
    
    parsing of experimental+sid+stretch main/contrib/non-free
    on i5-7200 (2 cores × 2 threads) with warm cache
    (10-runs average):
    
     * dh-make-perl 0.96
    real	24,339s
    user	24,952s
    sys	 0,556s
    
     * with this commit
    real	14,135s
    user	29,238s
    sys	 0,631s
    
    Of cource, this is only noticeable when one has more than
    one "big" Contents file, e.g. sid' and stretch's main.
    
    With experimental+sid main/contrib/non-free:
    
     * dh-make-perl 0.96:
    real	14,660s
    user	14,966s
    sys	 0,389s
    
     * with this commit:
    real	13,635s
    user	16,146s
    sys	 0,408s
    
    More parallelism can be squeezed out if the Contents files
    are distributes so that each child has to process relatively
    similar byte count (even if compressed)
---
 Build.PL                  |   3 +
 lib/Debian/AptContents.pm | 137 +++++++++++++++++++++++++++++++++++++---------
 2 files changed, 113 insertions(+), 27 deletions(-)

diff --git a/Build.PL b/Build.PL
index 5ceb616..89e9a9c 100644
--- a/Build.PL
+++ b/Build.PL
@@ -36,6 +36,8 @@ my $builder = My::Builder->new(
         'FindBin'                   => 0,
         'Getopt::Long'              => 0,
         'IO::File'                  => 0,
+        'IO::Pipe'                  => 0,
+        'IO::Select'                => 0,
         'IO::Uncompress::Gunzip'    => 0,
         'List::MoreUtils'           => 0,
         'LWP::ConnCache'            => 0,
@@ -48,6 +50,7 @@ my $builder = My::Builder->new(
         'Parse::DebianChangelog'    => 0,
         'Software::License::Artistic_2_0' => 0,
         'Storable'                  => 0,
+        'Sys::CPU'                  => 0,
         'Text::Diff'                => 0,
         'Text::Wrap'                => 0,
         'Tie::File'                 => 0,
diff --git a/lib/Debian/AptContents.pm b/lib/Debian/AptContents.pm
index 541374c..7ee4fb0 100644
--- a/lib/Debian/AptContents.pm
+++ b/lib/Debian/AptContents.pm
@@ -217,32 +217,7 @@ sub read_cache {
             $cache->{apt_contents}   = {};
 
             push @{ $cache->{contents_files} }, @{ $self->contents_files };
-            my @cat_cmd = (
-                '/usr/lib/apt/apt-helper', 'cat-file', @{ $self->contents_files }
-            );
-            open( my $f, "-|", @cat_cmd )
-                or die
-                "Can't run '/usr/lib/apt/apt-helper cat-file' on Contents files: $!\n";
-
-            my $line;
-            while ( defined( $line = $f->getline ) ) {
-                my ( $file, $packages ) = split( /\s+/, $line );
-                next unless $file =~ s{
-                    ^usr/
-                    (?:share|lib)/
-                    (?:perl\d+/            # perl5/
-                    | perl/(?:\d[\d.]+)/   # or perl/5.10/
-                    | \S+-\S+-\S+/perl\d+/(?:\d[\d.]+)/  # x86_64-linux-gnu/perl5/5.22/
-                    )
-                }{}x;
-                $cache->{apt_contents}{$file} = exists $cache->{apt_contents}{$file}
-                    ? $cache->{apt_contents}{$file}.','.$packages
-                    : $packages;
-
-                # $packages is a comma-separated list of
-                # section/package items. We'll parse it when a file
-                # matches. Otherwise we'd parse thousands of entries,
-                # while checking only a couple
+
             {
                 my $prefix;
                 for my $f ( @{ $self->contents_files } ) {
@@ -263,8 +238,116 @@ sub read_cache {
                             @{ $self->contents_files } )
                 );
             }
+
+            require IO::Pipe; require IO::Select; require Sys::CPU;
+
+            # parsing of Contents files goes to forked children. contents_files
+            # is sorted by size, so that no child can have only long Contents.
+            # the number of children is capped to the number of CPUs on the system
+            my @files = sort { -s $b <=> -s $a } @{ $self->contents_files };
+            my $cpus = Sys::CPU::cpu_count();
+            my ( @kids, %kid_by_fh );
+            {
+                my $kid_no = -1;
+                for my $fn (@files) {
+                    $kid_no++;
+
+                    if ( @kids < $cpus ) {
+                        push @kids, { files => [$fn], lines => 0 };
+                    }
+                    else {
+                        push @{ $kids[ $kid_no % @kids ]->{files} }, $fn;
+                    }
+                }
+            }
+
+            my $sel = IO::Select->new;
+
+            # start the children
+            for my $kid (@kids) {
+                my $pipe = IO::Pipe->new;
+                if ( my $pid = fork() ) {    # parent
+                    $pipe->reader;
+
+                    $kid_by_fh{ $pipe->fileno } = $kid;
+                    $kid->{io}                  = $pipe;
+                    $kid->{pid}                 = $pid;
+                    $sel->add( $pipe->fileno );
+                }
+                elsif ( defined($pid) ) {    # child
+                    $pipe->writer;
+                    my @cat_cmd = (
+                        '/usr/lib/apt/apt-helper', 'cat-file',
+                        @{ $kid->{files} }
+                    );
+                    open( my $f, "-|", @cat_cmd )
+                        or die sprintf( "Error running '%s': %d\n",
+                        join( ' ', @cat_cmd ), $! );
+
+                    my $line;
+                    while ( defined( $line = $f->getline ) ) {
+                        my ( $file, $packages ) = split( /\s+/, $line );
+                        next unless $file =~ s{
+                            ^usr/
+                            (?:share|lib)/
+                            (?:perl\d+/            # perl5/
+                            | perl/(?:\d[\d.]+)/   # or perl/5.10/
+                            | \S+-\S+-\S+/perl\d+/(?:\d[\d.]+)/  # x86_64-linux-gnu/perl5/5.22/
+                            )
+                        }{}x;
+                        $pipe->print("$file\t$packages\n");
+                    }
+                    close($f);
+                    close($pipe);
+                    exit(0);
+                }
+                else {
+                    die "fork(): $!";
+                }
+            }
+
+            # read children's output
+            while ( $sel->count
+                and my @ready = IO::Select->select( $sel, undef, $sel ) )
+            {
+                my ( $to_read, undef, $errs ) = @ready;
+
+                for my $fh (@$to_read) {
+                    my $kid = $kid_by_fh{$fh};
+                    my $io = $kid->{io}; my $file = $kid->{file};
+                    if ( defined( my $line = <$io> ) ) {
+                        chomp($line);
+                        $kid->{lines}++;
+                        my ( $file, $packages ) = split( '\t', $line );
+                        $cache->{apt_contents}{$file} =
+                            exists $cache->{apt_contents}{$file}
+                            ? $cache->{apt_contents}{$file} . ',' . $packages
+                            : $packages;
+
+                        # $packages is a comma-separated list of
+                        # section/package items. We'll parse it when a file
+                        # matches. Otherwise we'd parse thousands of entries,
+                        # while checking only a couple
+                    }
+                    else {
+                        warn sprintf( "child %d (%s) EOF after %d lines\n",
+                            $kid->{pid}, $kid->{file}, $kid->{lines} )
+                            if 0;
+                        $sel->remove($fh);
+                        close( $kid->{io} );
+                        waitpid( $kid->{pid}, 0 );
+                    }
+                }
+
+                for my $fh (@$errs) {
+                    my $kid = $kid_by_fh{$fh};
+                    $sel->remove($fh);
+                    close( $kid->{io} );
+                    waitpid( $kid->{pid}, 0 );
+                    die sprintf( "child %d (%s) returned %d\n",
+                        $kid->{pid}, join( ', ', @{ $kid->{files} } ), $? );
+                }
             }
-            close($f);
 
             if ( %{ $cache->{apt_contents} } ) {
                 $self->cache($cache);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/dh-make-perl.git



More information about the Pkg-perl-cvs-commits mailing list