[Collab-qa-commits] r1628 - in udd: . scripts sql udd

Lucas Nussbaum lucas at alioth.debian.org
Sat Nov 14 08:18:34 UTC 2009


Author: lucas
Date: 2009-11-14 08:18:33 +0000 (Sat, 14 Nov 2009)
New Revision: 1628

Added:
   udd/scripts/fix-removal-timestamps.py
   udd/udd/removals_gatherer.py
Modified:
   udd/config-org.yaml
   udd/crontabs
   udd/sql/setup.sql
Log:
add removals gatherer

Modified: udd/config-org.yaml
===================================================================
--- udd/config-org.yaml	2009-11-14 08:18:00 UTC (rev 1627)
+++ udd/config-org.yaml	2009-11-14 08:18:33 UTC (rev 1628)
@@ -22,6 +22,7 @@
     dehs: module udd.dehs_gatherer
     ldap: module udd.ldap_gatherer
     wannabuild: module udd.wannabuild_gatherer
+    removals: module udd.removals_gatherer
   timestamp-dir: /org/udd.debian.org/timestamps
   lock-dir: /org/udd.debian.org/locks
   archs:
@@ -445,3 +446,9 @@
     i386, ia64, kfreebsd-amd64, kfreebsd-i386, mips,
     mipsel, powerpc, s390, sparc]
 
+removals:
+  type: removals
+  update-command: wget -q http://ftp-master.debian.org/removals-full.txt -O - | scripts/fix-removal-timestamps.py > /org/udd.debian.org/mirrors/removals-full.txt
+  path: /org/udd.debian.org/mirrors/removals-full.txt
+  table: package_removal
+  schema: package_removal

Modified: udd/crontabs
===================================================================
--- udd/crontabs	2009-11-14 08:18:00 UTC (rev 1627)
+++ udd/crontabs	2009-11-14 08:18:33 UTC (rev 1628)
@@ -8,7 +8,7 @@
 # Ubuntu Sources/Packages
 30 2 * * * $UAR ubuntu-lucid ubuntu-karmic ubuntu-hardy ubuntu-intrepid ubuntu-jaunty
 # Various simple things
-0 4 * * * $UAR dehs debian-popcon ubuntu-popcon lintian debtags carnivore ldap | /org/udd.debian.org/udd/scripts/filter-output.rb
+0 4,16 * * * $UAR dehs debian-popcon ubuntu-popcon lintian debtags carnivore ldap removals | /org/udd.debian.org/udd/scripts/filter-output.rb
 49 */6 * * * $UAR upload-history | /org/udd.debian.org/udd/scripts/filter-output.rb
 0 */12 * * * $UAR testing-migrations
 11 */4 * * * $UAR wannabuild | /org/udd.debian.org/udd/scripts/filter-output.rb

Added: udd/scripts/fix-removal-timestamps.py
===================================================================
--- udd/scripts/fix-removal-timestamps.py	                        (rev 0)
+++ udd/scripts/fix-removal-timestamps.py	2009-11-14 08:18:33 UTC (rev 1628)
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+# This file is a part of the Ultimate Debian Database
+# <http://wiki.debian.org/UltimateDebianDatabase>
+#
+# Copyright (C) 2009 Serafeim Zanikolas <serzan at hellug.gr>
+#
+# This file is distributed under the terms of the General Public
+# License version 3 or (at your option) any later version.
+
+"""
+Quick hack to fix broken timestamp entries in the ftp-archive package
+removals history file.
+
+Before:
+
+    [Date: Tue, 27 Oct 2009 19:41:19 +0000
+    ] [ftpmaster: Archive Administrator]
+
+After applying this script:
+
+    [Date: Tue, 27 Oct 2009 19:41:19 +0000] [ftpmaster: Archive Administrator]
+"""
+
+import sys
+
+prev_line = None  # most recent line read but not yet printed (None: nothing pending)
+for line in sys.stdin:
+    line = line.rstrip()  # drop the newline and any other trailing whitespace
+    if prev_line is None:
+        prev_line = line  # hold the line back until the next one has been seen
+        continue
+    if line.startswith("] [ftpmaster:"):  # second half of a broken "[Date: ..." line
+        assert prev_line
+        print "%s%s" % (prev_line, line)  # rejoin both halves on a single line
+        prev_line = None
+    else:
+        print prev_line
+        prev_line = line
+if prev_line:  # flush the last pending line; an empty final line is not printed
+    print prev_line


Property changes on: udd/scripts/fix-removal-timestamps.py
___________________________________________________________________
Added: svn:executable
   + *

Modified: udd/sql/setup.sql
===================================================================
--- udd/sql/setup.sql	2009-11-14 08:18:00 UTC (rev 1627)
+++ udd/sql/setup.sql	2009-11-14 08:18:33 UTC (rev 1628)
@@ -534,6 +534,29 @@
 );
 GRANT SELECT ON wannabuild TO public;
 
+-- package_removal_batch: one row per removal batch; referenced by package_removal.batch_id
+CREATE TABLE package_removal_batch (
+  id int,
+  time timestamp,
+  ftpmaster text,
+  distribution text,
+  requestor text,
+  reasons text,
+  PRIMARY KEY (id)
+);
+GRANT SELECT ON package_removal_batch TO public;
+
+-- package_removal: one row per (batch, package name, version); arch_array holds the removed architectures
+CREATE TABLE package_removal (
+  batch_id int,
+  name text,
+  version debversion,
+  arch_array text[],
+  PRIMARY KEY(batch_id, name, version),
+  FOREIGN KEY(batch_id) REFERENCES package_removal_batch(id)
+);
+GRANT SELECT ON package_removal TO public;
+
 -- timings of data operations
 CREATE TABLE timestamps (
   id serial,

Added: udd/udd/removals_gatherer.py
===================================================================
--- udd/udd/removals_gatherer.py	                        (rev 0)
+++ udd/udd/removals_gatherer.py	2009-11-14 08:18:33 UTC (rev 1628)
@@ -0,0 +1,269 @@
+#!/usr/bin/env python
+
+# This file is a part of the Ultimate Debian Database
+# <http://wiki.debian.org/UltimateDebianDatabase>
+#
+# Copyright (C) 2009 Serafeim Zanikolas <serzan at hellug.gr>
+#
+# This file is distributed under the terms of the General Public
+# License version 3 or (at your option) any later version.
+
+""" import data about the removal of packages (from the debian archive) in UDD
+
+Raw data source: http://ftp-master.debian.org/removals-full.txt
+
+Sample removal batch from the above file:
+
+=========================================================================
+[Date: Tue,  9 Jan 2001 20:52:51 -0500] [ftpmaster: James Troup]
+Removed the following packages from unstable:
+
+        dsniff |      2.3-1 | source, i386
+Closed bugs: 81709
+
+------------------- Reason -------------------
+ROM; moved to non-US (now depends on libssl)
+----------------------------------------------
+=========================================================================
+
+Note that a removal batch may have many packages removed (unlike the one
+above, where only dsniff is removed).
+
+When run as a standalone script, this module does not connect to the database
+but instead runs a basic sanity test (to make sure that the input file
+hasn't changed in a way that would break the script).
+"""
+
+import sys
+import re
+
+from gatherer import gatherer
+from aux import quote
+
+def fail(msg):  # write msg (plus a newline) to stderr, then raise SystemExit(1)
+    sys.stderr.write("%s\n" % msg)
+    sys.exit(1)  # sys.exit, not exit(): the latter only exists when site is loaded
+
+def parse_removals(stream):
+    """parse the lines of stream; return the list of removal batches found"""
+    # We expect lines to appear in the order below. parser.curr_func is set to
+    # the Parser method handling what we expect to show up next in the file.
+    #
+    # date; ftp-master name
+    # distrib
+    # skip_line*
+    # pkg name | version | arch[, arch] <-- >=1 lines like these
+    # skip_line*
+    #------------------- Reason -------------------
+    # requestor; reasons
+
+    parser = Parser()
+    for line in stream:
+        if parser.skip_line(line):  # empty or whitespace-only line
+            continue
+        if parser.curr_func(line):  # handlers return True when they consume the line
+            continue
+    return parser.removal_batches
+
+def get_gatherer(connection, config, source):  # returns a removals_gatherer built from the arguments
+    return removals_gatherer(connection, config, source)
+
+class removals_gatherer(gatherer):
+    """gatherer that reloads the package removal tables from the removals file"""
+
+    def __init__(self, connection, config, source):
+        gatherer.__init__(self, connection, config, source)
+        self.assert_my_config('path', 'table')  # both keys are read by run()
+
+    def run(self):
+        conf = self.my_config
+        try:
+            input_fd = open(conf['path'])
+        except IOError:
+            fail('failed to open %s' % conf['path'])
+
+        batch_removals = parse_removals(input_fd)
+        input_fd.close()  # everything is parsed into memory; release the handle
+
+        pkg_removal_table = conf['table']
+        pkg_removal_batch_table = "%s_batch" % conf['table']
+
+        cur = self.cursor()
+        cur.execute('DELETE FROM %s' % pkg_removal_table)  # package rows go first
+        cur.execute('DELETE FROM %s' % pkg_removal_batch_table)
+
+        # insert data for batches of removals
+        cur.execute('PREPARE batch_removals_insert ' \
+                        'AS INSERT INTO %s (id, time, ftpmaster, ' \
+                                           'distribution, requestor, ' \
+                                           'reasons)' \
+                        'VALUES ($1, $2, $3, $4, $5, $6)' \
+                    % pkg_removal_batch_table)
+        for i, batch_removal in enumerate(batch_removals):  # i becomes the batch id
+            cur.execute('EXECUTE batch_removals_insert ' \
+                              '(%s, %s, %s, %s, %s, %s)' \
+                            % (i, quote(batch_removal.timestamp),
+                               quote(batch_removal.ftpmaster),
+                               quote(batch_removal.distribution),
+                               quote(batch_removal.requestor),
+                               quote(batch_removal.reasons)))
+        cur.execute('DEALLOCATE batch_removals_insert')
+        cur.execute("ANALYZE %s" % pkg_removal_batch_table)
+
+        # insert data for removals of individual packages
+        cur.execute('PREPARE pkg_removal_insert ' \
+                        'AS INSERT INTO %s (batch_id, name, version, ' \
+                                           'arch_array)' \
+                        'VALUES ($1, $2, $3, $4)' % pkg_removal_table)
+        for i, batch_removal in enumerate(batch_removals):  # same i as the batch id above
+            for pkg in batch_removal.packages:
+                cur.execute('EXECUTE pkg_removal_insert (%s, %s, %s, %s)' \
+                                % (i, quote(pkg.name), quote(pkg.version),
+                                    quote("{%s}" % ",".join(pkg.arches))))
+        cur.execute('DEALLOCATE pkg_removal_insert')
+        cur.execute("ANALYZE %s" % pkg_removal_table)
+
+def test(filename, removal_batches):
+    """sanity check: compare the number of parsed packages against a count
+    taken from filename with a shell one-liner; fail() if they differ"""
+
+    from commands import getstatusoutput
+
+    status, npackage_removals_via_grep = getstatusoutput(\
+            "egrep '[^ ]+ *\| *[^ ]+ *\| *[^ ]+' %s | " \
+            "awk '-F|' '{print $1, $2}' | sed 's/  */ /g' | wc -l" \
+                % filename)
+    if status != 0:
+        fail("failed to extract removed packages with grep")
+    npackage_removals_via_grep = int(npackage_removals_via_grep)
+
+    npackage_removals_via_python = 0
+    ftpmasters = set()
+    distribs = set()
+    # tally removed packages plus the distinct ftpmasters and distributions
+    for pkg_rm_batch in removal_batches:
+        npackage_removals_via_python += len(pkg_rm_batch.packages)
+        ftpmasters.add(pkg_rm_batch.ftpmaster)
+        distribs.add(pkg_rm_batch.distribution)
+
+    if npackage_removals_via_grep != npackage_removals_via_python:
+        fail("%d removed packages have been parsed but %d were expected" % \
+                (npackage_removals_via_python, npackage_removals_via_grep))
+
+    print '%d packages were removed from %d distributions, in %d\n' \
+          'batches of removals done by %d ftpmaster members' % \
+            (npackage_removals_via_python, len(distribs),
+             len(removal_batches), len(ftpmasters))
+
+
+class Package(object):
+    """one removed package: its name, version and list of architectures"""
+    def __init__(self, name, version, arches):
+        self.name, self.version = name, version
+        # arches is a comma-separated string; store it as a list of bare names
+        self.arches = [field.strip() for field in arches.split(",")]
+
+    def __str__(self):
+        return '%s-%s' % (self.name, self.version)
+
+class PackageRemovalBatch(object):
+    """one batch of removals: who removed which packages, when and from where"""
+    def __init__(self, timestamp, ftpmaster):
+        self.timestamp, self.ftpmaster = timestamp, ftpmaster
+        # distribution, requestor and reasons stay None until assigned from
+        # outside; packages grows through add_pkg()
+        self.distribution = self.requestor = self.reasons = None
+        self.packages = []
+
+    def add_pkg(self, pkg):
+        self.packages.append(pkg)
+
+    def __str__(self):
+        # the packages are rendered one per line
+        pkg_lines = "\n".join([str(entry) for entry in self.packages])
+        return "removal of %s at %s by %s from %s" \
+                % (pkg_lines, self.timestamp, self.ftpmaster, self.distribution)
+
+class Parser(object):  # state machine: curr_func is the handler for the next expected line
+    date_master_pat = re.compile(r"\[Date: ([^\]]+)] \[ftpmaster: ([^\]]+)\]")
+    distrib_pat = re.compile(r"Removed the following packages from ([a-z-]+)[:,]*")
+    pkg_version_arches_pat = re.compile(r"\s*(\S*) *\|\s*(\S+)\s*\|\s*(.*)$")
+    reason_pat = re.compile(r"-+\s*Reason\s*-+")
+    rene_pat = re.compile(r"(\[rene[^\]]*\])\s*(.*)")
+
+    def __init__(self):
+        self.removal_batch = None  # batch currently being assembled, if any
+        self.removal_batches = []  # completed batches, in input order
+        self.curr_func = self.parse_removal
+
+    def skip_line(self, line):  # True for empty or whitespace-only lines, else None
+        if line.isspace() or line == "":
+            return True
+
+    def parse_removal(self, line):  # waits for a "[Date: ...] [ftpmaster: ...]" line
+        match = Parser.date_master_pat.search(line)
+        if match:
+            timestamp, ftpmaster = match.groups()
+            self.removal_batch = PackageRemovalBatch(timestamp, ftpmaster)
+            self.curr_func = self.parse_distrib
+            return True
+
+    def parse_distrib(self, line):  # waits for "Removed the following packages from X"
+        match = Parser.distrib_pat.search(line)
+        if match:
+            self.removal_batch.distribution = match.group(1)
+            self.curr_func = self.parse_pkg_version_arch_or_reason_header
+            return True
+
+    def parse_pkg_version_arch_or_reason_header(self, line):
+        match = Parser.pkg_version_arches_pat.search(line)  # "pkg | version | arches"
+        if match:
+            pkg, version, arches = match.groups()
+            pkg_obj = Package(pkg, version, arches)
+            if self.removal_batch:
+                self.removal_batch.add_pkg(pkg_obj)
+                return True
+        elif self.removal_batch:
+            match = Parser.reason_pat.search(line)  # the "--- Reason ---" header
+            if match:
+                self.curr_func = self.parse_requestor_reasons
+                return True
+
+    def parse_requestor_reasons(self, line):  # handles the line after the Reason header
+        match = Parser.rene_pat.search(line)
+        if match:
+            self.removal_batch.requestor = match.group(1)
+            self.removal_batch.reasons = match.group(2)
+        else:
+            fields = line.split(';')
+            if len(fields) == 1: # no ';': assume no requestor (was: fields == 1, never true)
+                self.removal_batch.requestor = None  # NOTE(review): confirm consumers accept None
+                self.removal_batch.reasons = line
+            else:
+                self.removal_batch.requestor = fields[0]
+                self.removal_batch.reasons = ";".join(fields[1:])
+        self.curr_func = self.conclude_batch
+        return True # assume that we always get fed the correct line
+
+    def conclude_batch(self, line):  # a dashed line after the reasons closes the batch
+        if line.startswith("---------") and self.removal_batch is not None:
+            self.removal_batches.append(self.removal_batch)
+            self.removal_batch = None
+            self.curr_func = self.parse_removal
+            return True
+
+if '__main__' == __name__:  # standalone mode: parse the file and print stats only
+    import os
+
+    try:
+        filename = sys.argv[1]
+        input_fd = open(filename)
+    except IndexError:  # no file name was given on the command line
+        fail("syntax: %s <removals-file>\n" \
+             "(when run from the command line will only prints stats)" \
+                % os.path.basename(sys.argv[0]))
+    except IOError:
+        fail("failed to open %s" % filename)
+
+    batch_removals = parse_removals(input_fd)
+    test(filename, batch_removals)  # cross-check the parsed package count




More information about the Collab-qa-commits mailing list