[devscripts] 12/14: rc-alert: Parse a package div at a time

James McCoy jamessan at debian.org
Thu Nov 21 03:53:47 UTC 2013


This is an automated email from the git hooks/post-receive script.

jamessan pushed a commit to branch master
in repository devscripts.

commit 2db36e2701705c7cf6b0631a3b8c1e97835b6300
Author: James McCoy <jamessan at debian.org>
Date:   Tue Nov 19 23:50:12 2013 -0500

    rc-alert: Parse a package div at a time
    
    The HTML parsing currently relies heavily on the exact formatting of
    the page that bugs.d.o sends.  If that page is altered in any way (e.g.
    by removing newlines), the parsing breaks.
    
    To remedy that, slurp the entire page instead of working a line at a
    time.  Then it is trivial to extract the contents of each <div
    class="package"> stanza and parse out the relevant information.
    
    Closes: #729779
    Signed-off-by: James McCoy <jamessan at debian.org>
---
 debian/changelog    |    2 ++
 scripts/rc-alert.pl |   29 ++++++++++++++++-------------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/debian/changelog b/debian/changelog
index 90eec65..421a6cc 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -10,6 +10,8 @@ devscripts (2.13.5) UNRELEASED; urgency=low
   * test/*:
     + Pass --no-conf to commands which may be influenced by ~/.devscripts
     + Fix a test failure in test_uscan_online due to different version format.
+  * rc-alert: Be more flexible in the formatting of the HTML being parsed.
+    (Closes: #729779)
 
   [ Evgeni Golov ]
   * debcheckout: allow setting the user for auth mode in the config.  (Closes:
diff --git a/scripts/rc-alert.pl b/scripts/rc-alert.pl
index 6ca4e90..afa5619 100755
--- a/scripts/rc-alert.pl
+++ b/scripts/rc-alert.pl
@@ -180,8 +180,8 @@ if (-d $cachedir) {
     chdir $cachedir or die "$progname: can't cd $cachedir: $!\n";
 
     if ("$curl_or_wget" eq "wget") {
-        # Either use the cached version because the remote hasn't been
-        # updated (-N) or download a complete new copy (--no-continue)
+	# Either use the cached version because the remote hasn't been
+	# updated (-N) or download a complete new copy (--no-continue)
 	if (system('wget', '-qN', '--no-continue', $url) != 0) {
 	    die "$progname: wget failed!\n";
 	}
@@ -259,20 +259,23 @@ if ($debtags) {
 my $found_bugs_start;
 my ($current_package, $comment);
 
+my $html;
+{
+    local $/;
+    $html = <BUGS>;
+}
+
+my @stanzas = $html =~ m%<div class="package">(.*?)</div>%gs;
 my %pkg_store;
-while (defined(my $line = <BUGS>)) {
-    if( $line =~ /^<div class="package">/) {
-	$found_bugs_start = 1;
-    }
-    if( ! defined($found_bugs_start)) {
-	next;
-    } elsif ($line =~ m%<a name="([^\"]+)"><strong>Package:</strong></a> <a href="[^\"]+">%i) {
+foreach my $stanza (@stanzas) {
+    if ($stanza =~ m%<a name="([^\"]+)"><strong>Package:</strong></a> <a href="[^\"]+">%i) {
 	$current_package = $1;
 	$comment = '';
-    } elsif ($line =~ m%<a name="(\d+)"></a>\s*<a href="[^\"]+">\d+</a> (\[[^\]]+\])( \[[^\]]+\])? ([^<]+)%i) {
-	my ($num, $tags, $dists, $name) = ($1, $2, $3, $4);
-	chomp $name;
-	store_if_relevant(pkg => $current_package, num => $num, tags => $tags, dists => $dists, name => $name, comment => $comment);
+	while ($stanza =~ m%<a name="(\d+)"></a>\s*<a href="[^\"]+">\d+</a> (\[[^\]]+\])( \[[^\]]+\])? ([^<]+)%igc) {
+	    my ($num, $tags, $dists, $name) = ($1, $2, $3, $4);
+	    chomp $name;
+	    store_if_relevant(pkg => $current_package, num => $num, tags => $tags, dists => $dists, name => $name, comment => $comment);
+	}
     }
 }
 for (sort {$a <=> $b } keys %pkg_store) { print $pkg_store{$_}; }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/collab-maint/devscripts.git



More information about the devscripts-devel mailing list