[Collab-qa-commits] r1023 - in udd/src: . udd

he at alioth.debian.org he at alioth.debian.org
Fri Aug 8 23:58:51 UTC 2008


Author: he
Date: 2008-08-08 23:58:51 +0000 (Fri, 08 Aug 2008)
New Revision: 1023

Added:
   udd/src/udd/carnivore_gatherer.py
Modified:
   udd/src/setup-db.sql
Log:
First try at a carnivore data importer. God, I do hate python.
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.


Modified: udd/src/setup-db.sql
===================================================================
--- udd/src/setup-db.sql	2008-08-08 21:20:25 UTC (rev 1022)
+++ udd/src/setup-db.sql	2008-08-08 23:58:51 UTC (rev 1023)
@@ -113,13 +113,30 @@
  (package text, version text, date timestamp with time zone, changed_by text,
   maintainer text, nmu boolean, signed_by text, key_id text);
 
+CREATE TABLE carnivore_emails -- one row per (id, email); an id may own several emails
+ (id int, email text,
+  PRIMARY KEY(id, email));
+
+CREATE TABLE carnivore_names -- one row per (id, name); an id may own several names
+ (id int, name text,
+  PRIMARY KEY(id, name));
+
+CREATE TABLE carnivore_keys -- one row per key and key_type of an id
+ (id int, key text, key_type text,
+  PRIMARY KEY(id, key, key_type));
+
+CREATE TABLE carnivore_login -- at most one login per id
+ (id int, login text,
+  PRIMARY KEY(id));
+
 CREATE INDEX packages_source_idx on packages(source);
-CREATE INDEX packages_distrelcomp_idx on packages(distribution, release, component);
-CREATE INDEX sources_distrelcomp_idx on sources(distribution, release, component);
-
+CREATE INDEX sources_distribution_idx on sources(distribution); -- single-column indexes replace sources_distrelcomp_idx
+CREATE INDEX sources_release_idx on sources(release);
+CREATE INDEX sources_component_idx on sources(component);
 CREATE INDEX ubuntu_packages_source_idx on ubuntu_packages(source);
-CREATE INDEX ubuntu_packages_distrelcomp_idx on packages(distribution, release, component);
-CREATE INDEX ubuntu_sources_distrelcomp_idx on ubuntu_sources(distribution, release, component);
+CREATE INDEX ubuntu_sources_distribution_idx on ubuntu_sources(distribution); -- single-column indexes replace ubuntu_sources_distrelcomp_idx
+CREATE INDEX ubuntu_sources_release_idx on ubuntu_sources(release);
+CREATE INDEX ubuntu_sources_component_idx on ubuntu_sources(component);
 
 GRANT SELECT ON packages TO PUBLIC;
 GRANT SELECT ON sources TO PUBLIC;

Added: udd/src/udd/carnivore_gatherer.py
===================================================================
--- udd/src/udd/carnivore_gatherer.py	                        (rev 0)
+++ udd/src/udd/carnivore_gatherer.py	2008-08-08 23:58:51 UTC (rev 1023)
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+
+"""
+This script imports the carnivore data into the database
+See merkel.debian.org:/org/qa.debian.org/carnivore/
+"""
+
+import aux
+import sys
+import gzip
+from gatherer import gatherer
+import re
+
+def get_gatherer(connection, config):
+  """Return a carnivore_gatherer built from connection and config."""
+  return carnivore_gatherer(connection, config)
+
+class carnivore_gatherer(gatherer):
+  """
+  Gatherer that loads the carnivore report into the emails, names, keys
+  and login tables named in the configuration of the source.
+  """
+
+  #Fields that may appear in the report but are not imported:
+  carnivore_field_ignores = ["Packages", "X-MIA", "X-Warning"]
+  #Maps each imported field to the record key it is collected under
+  #("name") and to the way its content is parsed ("content-type"):
+  carnivore_field_to_DB_map = {
+    "Using emails":    {"name": "emails", "content-type": "comma-separated"},
+    "Known as":        {"name": "names", "content-type": "comma-separated"},
+    "DD":              {"name": "login", "content-type": "unique-login"},
+    "Key in keyring":  {"name": "keyring_key", "content-type": "multiple entries"},
+    "Key in ldap":     {"name": "ldap_key", "content-type": "multiple entries"},
+    "Key in emeritus": {"name": "emeritus_key", "content-type": "multiple entries"},
+    "Key in removed":  {"name": "removed_key", "content-type": "multiple entries"},
+  }
+  #Key types; each one is collected under the record key "<type>_key":
+  carnivore_key_types = ['keyring', 'ldap', 'emeritus', 'removed']
+  #Extracts the login part of a debian.org address:
+  carnivore_login_re = re.compile(r'(\w+)@debian\.org')
+
+  def __init__(self, connection, config):
+    gatherer.__init__(self, connection, config)
+
+  def run(self, source):
+    """
+    Replace the contents of the carnivore tables with the data found in
+    the file configured as 'path' for source.
+
+    Raises aux.ConfigException if a required configuration key is missing.
+    """
+    my_config = self.config[source]
+
+    #check that the config contains everything we need:
+    for key in ['path', 'emails-table', 'names-table', 'keys-table', 'login-table']:
+      if not key in my_config:
+        raise aux.ConfigException("%s not configured for source %s" % (key, source))
+
+    #start harassing the DB, preparing the final inserts and making place
+    #for the new data:
+    cur = self.cursor()
+
+    for table in ['emails', 'names', 'keys', 'login']:
+      cur.execute("DELETE FROM %s" % my_config["%s-table" % table])
+
+    cur.execute("""PREPARE carnivore_email_insert
+      AS INSERT INTO %s (id, email)
+      VALUES ($1, $2)""" % (my_config['emails-table']))
+    cur.execute("""PREPARE carnivore_name_insert
+      AS INSERT INTO %s (id, name)
+      VALUES ($1, $2)""" % (my_config['names-table']))
+    cur.execute("""PREPARE carnivore_keys_insert
+      AS INSERT INTO %s (id, key, key_type)
+      VALUES ($1, $2, $3)""" % (my_config['keys-table']))
+    cur.execute("""PREPARE carnivore_login_insert
+      AS INSERT INTO %s (id, login)
+      VALUES ($1, $2)""" % (my_config['login-table']))
+
+    carnivore_data = open(my_config['path'])
+    try:
+      (line_number, record_number) = (0, 1)
+      record = {}
+      for line in carnivore_data:
+        line_number += 1
+        if len(line) == 0 or line.isspace():
+          #a blank line ends the current record:
+          self._store_record(cur, record_number, record)
+          record_number += 1
+          record = {}
+        else:
+          self._parse_line(line, line_number, record)
+      #the last record is not necessarily followed by a blank line:
+      self._store_record(cur, record_number, record)
+    finally:
+      carnivore_data.close()
+
+  def _store_record(self, cur, record_number, record):
+    """Insert one parsed record under the id record_number."""
+    #We require a minimum of data in each record:
+    if not ('emails' in record and 'names' in record):
+      return
+    #Values are passed as parameters so that the driver quotes them; a name
+    #containing an apostrophe would otherwise break the statement.
+    #NOTE(review): assumes the cursor follows the %s paramstyle (as psycopg2
+    #does) - confirm against gatherer.cursor().
+    for email in record['emails']:
+      cur.execute("EXECUTE carnivore_email_insert (%s, %s)", (record_number, email))
+    for name in record['names']:
+      cur.execute("EXECUTE carnivore_name_insert (%s, %s)", (record_number, name))
+    #login and the key lists are optional:
+    if 'login' in record:
+      cur.execute("EXECUTE carnivore_login_insert (%s, %s)", (record_number, record['login']))
+    for key_type in self.carnivore_key_types:
+      for key in record.get("%s_key" % key_type, []):
+        cur.execute("EXECUTE carnivore_keys_insert (%s, %s, %s)", (record_number, key, key_type))
+
+  def _parse_line(self, line, line_number, record):
+    """Parse one 'Field: content' line and add its data to record."""
+    parts = line.split(': ', 1)
+    if len(parts) == 2:
+      (field, content) = (parts[0], parts[1].strip())
+    else:
+      (field, content) = ('', '')
+    if not (len(field) and len(content)):
+      print("Couldn't parse line %d: %s" % (line_number, line.rstrip()))
+      return
+
+    if field in self.carnivore_field_ignores:
+      return
+    if field not in self.carnivore_field_to_DB_map:
+      print("Unknown field in line %d: %s" % (line_number, field))
+      return
+    info = self.carnivore_field_to_DB_map[field]
+
+    if info["content-type"] == "unique-login":
+      match = self.carnivore_login_re.search(content)
+      if match:
+        record[info["name"]] = match.group(1)
+      else:
+        print("Couldn't find a debian.org login in line %d: %s" % (line_number, content))
+    elif info["content-type"] == "comma-separated":
+      record[info["name"]] = content.split(", ")
+    elif info["content-type"] == "multiple entries":
+      record.setdefault(info["name"], []).append(content)
+
+if __name__ == '__main__':
+  #There is no command-line entry point in this module:
+  sys.stderr.write("carnivore_gatherer.py cannot be run directly; use get_gatherer()\n")
+  sys.exit(1)




More information about the Collab-qa-commits mailing list