[Collab-qa-commits] r1023 - in udd/src: . udd
he at alioth.debian.org
he at alioth.debian.org
Fri Aug 8 23:58:51 UTC 2008
Author: he
Date: 2008-08-08 23:58:51 +0000 (Fri, 08 Aug 2008)
New Revision: 1023
Added:
udd/src/udd/carnivore_gatherer.py
Modified:
udd/src/setup-db.sql
Log:
First try at a carnivore data importer. God, I do hate python.
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
Modified: udd/src/setup-db.sql
===================================================================
--- udd/src/setup-db.sql 2008-08-08 21:20:25 UTC (rev 1022)
+++ udd/src/setup-db.sql 2008-08-08 23:58:51 UTC (rev 1023)
@@ -113,13 +113,30 @@
(package text, version text, date timestamp with time zone, changed_by text,
maintainer text, nmu boolean, signed_by text, key_id text);
+CREATE TABLE carnivore_emails
+ (id int, email text,
+ PRIMARY KEY(id));
+
+CREATE TABLE carnivore_names
+ (id int, name text,
+ PRIMARY KEY(id));
+
+CREATE TABLE carnivore_keys
+ (id int, key text, key_type text,
+ PRIMARY KEY(id));
+
+CREATE TABLE carnivore_login
+ (id int, login text,
+ PRIMARY KEY(id));
+
CREATE INDEX packages_source_idx on packages(source);
-CREATE INDEX packages_distrelcomp_idx on packages(distribution, release, component);
-CREATE INDEX sources_distrelcomp_idx on sources(distribution, release, component);
-
+CREATE INDEX sources_distribution_idx on sources(distribution);
+CREATE INDEX sources_release_idx on sources(release);
+CREATE INDEX sources_component_idx on sources(component);
CREATE INDEX ubuntu_packages_source_idx on ubuntu_packages(source);
-CREATE INDEX ubuntu_packages_distrelcomp_idx on packages(distribution, release, component);
-CREATE INDEX ubuntu_sources_distrelcomp_idx on ubuntu_sources(distribution, release, component);
+CREATE INDEX ubuntu_sources_distribution_idx on ubuntu_sources(distribution);
+CREATE INDEX ubuntu_sources_release_idx on ubuntu_sources(release);
+CREATE INDEX ubuntu_sources_component_idx on ubuntu_sources(component);
GRANT SELECT ON packages TO PUBLIC;
GRANT SELECT ON sources TO PUBLIC;
Added: udd/src/udd/carnivore_gatherer.py
===================================================================
--- udd/src/udd/carnivore_gatherer.py (rev 0)
+++ udd/src/udd/carnivore_gatherer.py 2008-08-08 23:58:51 UTC (rev 1023)
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+
+"""
+This script imports the carnivore data into the database
+See merkel.debian.org:/org/qa.debian.org/carnivore/
+"""
+
+import aux
+import sys
+import gzip
+from gatherer import gatherer
+import re
+
+def get_gatherer(connection, config):
+ return carnivore_gatherer(connection, config)
+
+class carnivore_gatherer(gatherer):
+ carnivore_field_ignores = ["Packages", "X-MIA", "X-Warning"]
+ carnivore_field_to_DB_map = {
+ "Using emails": {"name": "emails", "content-type": "comma-separated"},
+ "Known as": {"name": "names", "content-type": "comma-separated"},
+ "DD": {"name": "login", "content-type": "unique-login"},
+ "Key in keyring": {"name": "keyring_key", "content-type": "multiple entries"},
+ "Key in ldap": {"name": "ldap_key", "content-type": "multiple entries"},
+ "Key in emeritus": {"name": "emeritus_key", "content-type": "multiple entries"},
+ "Key in removed": {"name": "removed_key", "content-type": "multiple entries"},
+ }
+
+ def __init__(self, connection, config):
+ gatherer.__init__(self, connection, config)
+
+ def run(self, source):
+ try:
+ my_config = self.config[source]
+ except:
+ raise
+
+ #check that the config contains everything we need:
+ for key in ['path', 'emails-table', 'names-table', 'keys-table', 'login-table']:
+ if not key in my_config:
+ raise aux.ConfigException, "%s not configured for source %s" % (key, source)
+
+ #start harassing the DB, preparing the final inserts and making place
+ #for the new data:
+ cur = self.cursor()
+
+ for table in ['emails', 'names', 'keys', 'login']:
+ cur.execute("DELETE FROM %s" % my_config["%s-table" % table])
+
+ cur.execute("""PREPARE carnivore_email_insert
+ AS INSERT INTO %s (id, email)
+ VALUES ($1, $2)""" % (my_config['emails-table']))
+ cur.execute("""PREPARE carnivore_name_insert
+ AS INSERT INTO %s (id, name)
+ VALUES ($1, $2)""" % (my_config['name-table']))
+ cur.execute("""PREPARE carnivore_keys_insert
+ AS INSERT INTO %s (id, key, key_type)
+ VALUES ($1, $2, $3)""" % (my_config['keys-table']))
+ cur.execute("""PREPARE carnivore_login_insert
+ AS INSERT INTO %s (id, login)
+ VALUES ($1, $2)""" % (my_config['login-table']))
+
+ carnivore_data = file.open(my_config['path'])
+ (line_number, record_number) = (0, 1);
+ record = {}
+ for line in carnivore_data:
+ line_number += 1
+ if len(line) == 0 or line.isspace():
+ #We require a minimum of data in each record:
+ if 'emails' in record and 'names' in record:
+ #collect all queries:
+ qs = []
+ for email in record[emails]:
+ qs.append("EXECUTE carnivore_email_insert (%d, '%s')" % (record_number, email))
+ for name in record[names]:
+ qs.append("EXECUTE carnivore_name_insert (%d, '%s')" % (record_number, name))
+ if record[login]:
+ qs.append("EXECUTE carnivore_login_insert (%d, '%s')" % (record_number, record[login]))
+ for key_type in ['keyring', 'ldap', 'emeritus', 'removed']
+ if record["%s_key" % key_type]:
+ for key in record["%s_key" % key_type]:
+ qs.append("EXECUTE carnivore_keys_insert (%d, '%s', '%s')" % (record_number, key, key_type))
+ for query in qs:
+ cur.execute(query)
+ record_number += 1
+ record = {}
+ else:
+ (field, content) = line.split(': ', 1)
+ if not (len(field) and len(content)):
+ print "Couldn't parse line %d: %s" % (line_number, line)
+ else:
+ field_info = {}
+ if field in carnivore_field_ignores:
+ continue
+ elif carnivore_field_to_DB_map[field]:
+ info = carnivore_field_to_DB_map[field]
+ else:
+ print "Unknown field in line %d: %s" % (line_number, field)
+ continue
+
+ if info["content-type"] == "unique-login":
+ match = re.compile('(\w+)@debian.org').search(content)
+ record[info["name"]] = match.group(1)
+ elif info["content-type"] == "comma-separated":
+ record[info["name"]] = content.rstrip().split(", ")
+ elif info["content-type"] == "multiple entries":
+ if info["name"] not in record:
+ record[info["name"]] = []
+ record[info["name"]].append(content.rstrip())
+
+if __name__ == '__main__':
+ main()
More information about the Collab-qa-commits mailing list