[Collab-qa-commits] r2330 - udd/udd
Andreas Tille
tille at alioth.debian.org
Wed May 16 08:15:06 UTC 2012
Author: tille
Date: 2012-05-16 08:15:05 +0000 (Wed, 16 May 2012)
New Revision: 2330
Modified:
udd/udd/bibref_gatherer.py
udd/udd/blends_prospective_gatherer.py
Log:
Define an upstream_reader class that can be reused in blends_prospective_gatherer to check for upstream files that should have been imported but were not.
Modified: udd/udd/bibref_gatherer.py
===================================================================
--- udd/udd/bibref_gatherer.py 2012-05-16 05:58:33 UTC (rev 2329)
+++ udd/udd/bibref_gatherer.py 2012-05-16 08:15:05 UTC (rev 2330)
@@ -62,32 +62,116 @@
other_known_keys = ('Archive', 'Contact', 'CRAN', 'Donation', 'Download', 'Help', 'Homepage', 'Name', 'Watch', 'Webservice')
-class bibref_gatherer(gatherer):
+class upstream_reader():
"""
- Bibliographic references from debian/upstream files
+ Read references from single debian/upstream file
"""
- def __init__(self, connection, config, source):
- gatherer.__init__(self, connection, config, source)
- self.assert_my_config('table')
+ def __init__(self, ufile, source, log):
+ uf = open(ufile)
+ self.source = source
+ self.references = None
+ self.fields = None
+ self.log = log
+ self.ubibrefs = []
+ self.ubibrefsinglelist = []
- self.log = logging.getLogger(self.__class__.__name__)
- if debug==1:
- self.log.setLevel(logging.DEBUG)
+ try:
+ self.fields = yaml.load(uf.read())
+ except yaml.scanner.ScannerError, err:
+ self.log.error("Scanner error in file %s: %s" % (ufile, str(err)))
+ return
+ except yaml.parser.ParserError, err:
+ self.log.error("Parser error in file %s: %s" % (ufile, str(err)))
+ return
+ except yaml.reader.ReaderError, err:
+ self.log.error("Encoding problem in file %s: %s" % (ufile, str(err)))
+ try:
+ self.references=self.fields['Reference']
+ except KeyError:
+ warn_keys = []
+ for key in self.fields.keys():
+ if key not in other_known_keys:
+ warn_keys.append(key)
+ if len(warn_keys) > 0:
+ log.warning("No references found for source package %s (Keys: %s)" % (self.source, str(warn_keys)))
+ return
+ return
+ except TypeError:
+ self.log.info("debian/upstream file of source package %s does not seem to be a YAML file" % (self.source))
+ return
+
+ def parse(self):
+ if isinstance(self.references, list):
+ # upstream file contains more than one reference
+ rank={} # record different ranks per binary package
+ rank[''] = 0 # default is to have no specific Debian package which is marked by '' in the package column
+ refid = 0
+ for singleref in self.references:
+ singleref['refid'] = refid # refid is not used currently but might make sense to identify references internally
+ singleref['package'] = ''
+ package_found = False
+ for r in singleref.keys():
+ key = r.lower()
+ if key != 'debian-package':
+ continue
+ # self.log.warning("Source package '%s' has key 'debian-package'", self.source)
+ pkg = singleref['package'] = singleref[r]
+ package_found = True
+ if rank.has_key(pkg):
+ rank[pkg] += 1
+ else:
+ rank[pkg] = 0
+ singleref['rank'] = rank[pkg]
+ if not package_found:
+ singleref['rank'] = rank['']
+ rank[''] += 1
+ for singleref in self.references:
+ self.setref(singleref, singleref['package'], singleref['rank'])
+ elif isinstance(self.references, str):
+ # upstream file has wrongly formatted reference
+ self.log.error("File %s has following references: %s" % (ufile, self.references))
else:
- self.log.setLevel(logging.INFO)
- handler = logging.handlers.RotatingFileHandler(filename=self.__class__.__name__+'.log',mode='w')
- formatter = logging.Formatter("%(asctime)s - %(levelname)s - (%(lineno)d): %(message)s")
- handler.setFormatter(formatter)
- self.log.addHandler(handler)
+ # upstream file has exactly one reference
+ package = ''
+ for r in self.references.keys():
+ key = r.lower()
+ if key != 'debian-package':
+ continue
+ self.log.warning("Source package '%s' has key 'debian-package'", self.source)
+ package = self.references[r]
+ self.setref(self.references, package, 0)
- self.bibrefs = []
- self.bibrefsinglelist = []
+ for key in self.fields.keys():
+ keyl=key.lower()
+ if keyl.startswith('reference-'):
+ # sometimes DOI and PMID are stored separately:
+ if keyl.endswith('doi'):
+ if self.references.has_key('doi') or self.references.has_key('DOI'):
+ self.log.warning("Extra key in source package '%s': %s - please remove from upstream file!", self.source, key)
+ continue
+ rdoi={}
+ rdoi['rank'] = 0
+ rdoi['source'] = self.source
+ rdoi['key'] = 'doi'
+ rdoi['value'] = self.fields[key]
+ rdoi['package'] = '' ### Hack!!! we should get rid of Reference-DOI soon to enable specifying 'debian-package' relieable
+ self.ubibrefs.append(rdoi)
+ elif keyl.endswith('pmid'):
+ if self.references.has_key('pmid') or self.references.has_key('PMID'):
+ self.log.warning("Extra key in source package '%s': %s - please remove from upstream file!", self.source, key)
+ continue
+ rpmid={}
+ rpmid['rank'] = 0
+ rpmid['source'] = self.source
+ rpmid['key'] = 'pmid'
+ rpmid['value'] = self.fields[key]
+ rpmid['package'] = '' ### Hack!!! we should get rid of Reference-PMID soon to enable specifying 'debian-package' relieable
+ self.ubibrefs.append(rpmid)
+ else:
+ print "Source package %s has %s : %s" % (self.source, key, self.fields[key])
- self.bibtexfile = 'debian.bib'
- self.bibtex_example_tex = 'debian.tex'
-
- def setref(self, references, source, package, rank):
+ def setref(self, references, package, rank):
year=''
defined_fields = { 'address' : 0,
'article' : 0,
@@ -120,17 +204,17 @@
continue
if defined_fields.has_key(key):
if defined_fields[key] > 0:
- self.log.error("Duplicated key in source package '%s': %s", source, key)
+ self.log.error("Duplicated key in source package '%s': %s", self.source, key)
continue
else:
defined_fields[key] = 1
else:
if key not in ('rank', 'package', 'refid'): # ignore internal maintenance fields
- self.log.warning("Unexpected key in source package '%s': %s", source, key)
+ self.log.warning("Unexpected key in source package '%s': %s", self.source, key)
defined_fields[key] = 1
ref={}
ref['rank'] = rank
- ref['source'] = source
+ ref['source'] = self.source
ref['key'] = key
ref['package'] = package
if isinstance(references[r], int) or isinstance(references[r], float):
@@ -139,40 +223,66 @@
try:
ref['value'] = references[r].strip()
except AttributeError, err:
- self.log.error("Cannot parse value for source %s: r = %s -> value = %s" % (source, r, str(references[r])))
+ self.log.error("Cannot parse value for source %s: r = %s -> value = %s" % (self.source, r, str(references[r])))
ref['value'] = '???'
if key == 'author':
# Try to catch broken author formating
new_author = re.sub(',\s* and\s*' , ' and ', ref['value'])
if new_author != ref['value']:
- self.log.warning("Author of source package %s does contain invalid BibTeX format: %s will be turned into %s", source, ref['value'], new_author)
+ self.log.warning("Author of source package %s does contain invalid BibTeX format: %s will be turned into %s", self.source, ref['value'], new_author)
ref['value'] = new_author
if ref['value'].count(',') > ref['value'].lower().count(' and ') + 1:
- self.log.warning("Suspicious authors field in source package %s with way more ',' than ' and ' strings: %s", source, ref['value'])
+ self.log.warning("Suspicious authors field in source package %s with way more ',' than ' and ' strings: %s", self.source, ref['value'])
match = seek_broken_authors_re.search(ref['value'])
if match:
- self.log.warning("Suspicious authors field in source package %s - seems to have comma separated authors: %s", source, ref['value'])
- self.bibrefs.append(ref)
+ self.log.warning("Suspicious authors field in source package %s - seems to have comma separated authors: %s", self.source, ref['value'])
+ self.ubibrefs.append(ref)
if r.lower() == 'year':
year = ref['value']
# Create unique BibTeX key
- bibtexkey = source
- if bibtexkey in self.bibrefsinglelist and year != '':
- bibtexkey = source+year
- if bibtexkey in self.bibrefsinglelist:
+ bibtexkey = self.source
+ if bibtexkey in self.ubibrefsinglelist and year != '':
+ bibtexkey = self.source+year
+ if bibtexkey in self.ubibrefsinglelist:
# if there are more than one reference per source package and even in
# the same year append the rank as letter
bibtexkey += 'abcdefghijklmnopqrstuvwxyz'[rank]
ref={}
ref['rank'] = rank
- ref['source'] = source
+ ref['source'] = self.source
ref['key'] = 'bibtex'
ref['value'] = re.sub('\+', '-', re.sub('\.', '-', bibtexkey)) # avoid '.' and '+' in BibTeX keys
ref['package'] = package
- self.bibrefsinglelist.append(bibtexkey)
- self.bibrefs.append(ref)
+ self.ubibrefsinglelist.append(bibtexkey)
+ self.ubibrefs.append(ref)
return ref
+ def get_bibrefs(self):
+ return self.ubibrefs
+
+class bibref_gatherer(gatherer):
+ """
+ Bibliographic references from debian/upstream files
+ """
+
+ def __init__(self, connection, config, source):
+ gatherer.__init__(self, connection, config, source)
+ self.assert_my_config('table')
+
+ self.log = logging.getLogger(self.__class__.__name__)
+ if debug==1:
+ self.log.setLevel(logging.DEBUG)
+ else:
+ self.log.setLevel(logging.INFO)
+ handler = logging.handlers.RotatingFileHandler(filename=self.__class__.__name__+'.log',mode='w')
+ formatter = logging.Formatter("%(asctime)s - %(levelname)s - (%(lineno)d): %(message)s")
+ handler.setFormatter(formatter)
+ self.log.addHandler(handler)
+
+
+ self.bibtexfile = 'debian.bib'
+ self.bibtex_example_tex = 'debian.tex'
+
def run(self):
my_config = self.my_config
#start harassing the DB, preparing the final inserts and making place
@@ -181,6 +291,8 @@
u_dirs = listdir(my_config['path'])
+ bibrefs = []
+
for u in u_dirs:
upath=my_config['path']+'/'+u
sources = []
@@ -190,106 +302,21 @@
for source in sources:
# print source
ufile = upath+'/'+source+'.upstream'
- uf = open(ufile)
- try:
- fields = yaml.load(uf.read())
- except yaml.scanner.ScannerError, err:
- self.log.error("Scanner error in file %s: %s" % (ufile, str(err)))
+
+ upstream = upstream_reader(ufile, source, self.log)
+ if not upstream.references:
continue
- except yaml.parser.ParserError, err:
- self.log.error("Parser error in file %s: %s" % (ufile, str(err)))
- continue
- except yaml.reader.ReaderError, err:
- self.log.error("Encoding problem in file %s: %s" % (ufile, str(err)))
- continue
- try:
- references=fields['Reference']
- except KeyError:
- warn_keys = []
- for key in fields.keys():
- if key not in other_known_keys:
- warn_keys.append(key)
- if len(warn_keys) > 0:
- self.log.warning("No references found for source package %s (Keys: %s)" % (source, str(warn_keys)))
- continue
- except TypeError:
- self.log.info("debian/upstream file of source package %s does not seem to be a YAML file" % (source))
- continue
- if isinstance(references, list):
- # upstream file contains more than one reference
- rank={} # record different ranks per binary package
- rank[''] = 0 # default is to have no specific Debian package which is marked by '' in the package column
- refid = 0
- for singleref in references:
- singleref['refid'] = refid # refid is not used currently but might make sense to identify references internally
- singleref['package'] = ''
- package_found = False
- for r in singleref.keys():
- key = r.lower()
- if key != 'debian-package':
- continue
- # self.log.warning("Source package '%s' has key 'debian-package'", source)
- pkg = singleref['package'] = singleref[r]
- package_found = True
- if rank.has_key(pkg):
- rank[pkg] += 1
- else:
- rank[pkg] = 0
- singleref['rank'] = rank[pkg]
- if not package_found:
- singleref['rank'] = rank['']
- rank[''] += 1
- for singleref in references:
- self.setref(singleref, source, singleref['package'], singleref['rank'])
- elif isinstance(references, str):
- # upstream file has wrongly formatted reference
- self.log.error("File %s has following references: %s" % (ufile, references))
- else:
- # upstream file has exactly one reference
- package = ''
- for r in references.keys():
- key = r.lower()
- if key != 'debian-package':
- continue
- self.log.warning("Source package '%s' has key 'debian-package'", source)
- package = references[r]
- self.setref(references, source, package, 0)
+ upstream.parse()
+
+ for ref in upstream.get_bibrefs():
+ bibrefs.append(ref)
- for key in fields.keys():
- keyl=key.lower()
- if keyl.startswith('reference-'):
- # sometimes DOI and PMID are stored separately:
- if keyl.endswith('doi'):
- if references.has_key('doi') or references.has_key('DOI'):
- self.log.warning("Extra key in source package '%s': %s - please remove from upstream file!", source, key)
- continue
- rdoi={}
- rdoi['rank'] = 0
- rdoi['source'] = source
- rdoi['key'] = 'doi'
- rdoi['value'] = fields[key]
- rdoi['package'] = '' ### Hack!!! we should get rid of Reference-DOI soon to enable specifying 'debian-package' relieable
- self.bibrefs.append(rdoi)
- elif keyl.endswith('pmid'):
- if references.has_key('pmid') or references.has_key('PMID'):
- self.log.warning("Extra key in source package '%s': %s - please remove from upstream file!", source, key)
- continue
- rpmid={}
- rpmid['rank'] = 0
- rpmid['source'] = source
- rpmid['key'] = 'pmid'
- rpmid['value'] = fields[key]
- rpmid['package'] = '' ### Hack!!! we should get rid of Reference-PMID soon to enable specifying 'debian-package' relieable
- self.bibrefs.append(rpmid)
- else:
- print "Source package %s has %s : %s" % (source, key, fields[key])
# only truncate table if there are really some references found
- if len(self.bibrefs) == 0:
+ if len(bibrefs) == 0:
self.log.error("No references found in any upstream file.")
exit(1)
- # print self.bibrefsinglelist
cur.execute("TRUNCATE %s" % (my_config['table']))
query = """PREPARE bibref_insert (text, text, text, text, int) AS
INSERT INTO %s
@@ -298,7 +325,7 @@
cur.execute(query)
query = "EXECUTE bibref_insert (%(source)s, %(key)s, %(value)s, %(package)s, %(rank)s)"
- for ref in self.bibrefs:
+ for ref in bibrefs:
try:
cur.execute(query, ref)
except UnicodeEncodeError, err:
Modified: udd/udd/blends_prospective_gatherer.py
===================================================================
--- udd/udd/blends_prospective_gatherer.py 2012-05-16 05:58:33 UTC (rev 2329)
+++ udd/udd/blends_prospective_gatherer.py 2012-05-16 08:15:05 UTC (rev 2330)
@@ -18,6 +18,8 @@
from debian import deb822
import email.Utils
+from bibref_gatherer import upstream_reader
+
debug=0
def get_gatherer(connection, config, source):
@@ -104,14 +106,17 @@
cur.execute("EXECUTE check_source (%s)", (source,))
if cur.fetchone()[0] > 0:
# print "Source %s is in DB. Ignore for prospective packages" % source
- upstream=upath+'/'+source+'.upstream'
- if not exists(upstream):
+ ufile=upath+'/'+source+'.upstream'
+ if not exists(ufile):
continue
cur.execute("EXECUTE check_reference (%s)", (source,))
if cur.fetchone()[0] > 0:
# UDD seems to contain the references specified in source.upstream file
- print "DEBUG: I know about the references in", upstream
continue
+ upstream = upstream_reader(ufile, source, self.log)
+ if not upstream.references:
+ # There are no valid references found in this upstream file or it is no valid YAML
+ continue
self.log.warning("%s has upstream file but no references in UDD" % (source, ))
continue
More information about the Collab-qa-commits
mailing list