[Collab-qa-commits] r2180 - in udd: . scripts udd

Andreas Tille tille at alioth.debian.org
Mon Mar 26 21:59:17 UTC 2012


Author: tille
Date: 2012-03-26 21:59:16 +0000 (Mon, 26 Mar 2012)
New Revision: 2180

Modified:
   udd/config-org.yaml
   udd/scripts/fetch_bibref.sh
   udd/udd/bibref_gatherer.py
Log:
New bibref gatherer based on the debian/upstream files collected in SVN


Modified: udd/config-org.yaml
===================================================================
--- udd/config-org.yaml	2012-03-25 06:16:52 UTC (rev 2179)
+++ udd/config-org.yaml	2012-03-26 21:59:16 UTC (rev 2180)
@@ -895,7 +895,7 @@
 
 bibref:
   type: bibref
-  update-command: /org/udd.debian.org/udd/scripts/fetch_bibref.sh
+  update-command: /org/udd.debian.org/udd/scripts/fetch_bibref.sh /org/udd.debian.org/mirrors/bibref svn://svn.debian.org/svn/collab-qa/packages-metadata
   path: /org/udd.debian.org/mirrors/bibref
   cache: /org/udd.debian.org/mirrors/cache
   table: bibref

Modified: udd/scripts/fetch_bibref.sh
===================================================================
--- udd/scripts/fetch_bibref.sh	2012-03-25 06:16:52 UTC (rev 2179)
+++ udd/scripts/fetch_bibref.sh	2012-03-26 21:59:16 UTC (rev 2180)
@@ -2,10 +2,8 @@
 
 set -e
 
-TARGETDIR=/org/udd.debian.org/mirrors/bibref
-FETCHURL=http://upstream-metadata.debian.net/~plessy/biblio.yaml
-YAMLFILE=bibref.yaml
+TARGETDIR=$1
+FETCHURL=$2
 mkdir -p $TARGETDIR
-# set -x
-rm -rf $TARGETDIR/${YAMLFILE}
-wget -q ${FETCHURL} -O ${TARGETDIR}/${YAMLFILE}
+rm -rf $TARGETDIR
+svn export $FETCHURL $TARGETDIR >/dev/null
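
For context, the rewritten script now takes the target directory and the SVN
repository URL as positional parameters instead of hard-coding them. A manual
run with the same arguments as the update-command in config-org.yaml above
would look like this (a sketch only; normally UDD invokes the script itself):

/org/udd.debian.org/udd/scripts/fetch_bibref.sh \
    /org/udd.debian.org/mirrors/bibref \
    svn://svn.debian.org/svn/collab-qa/packages-metadata

Note that the script removes and re-exports the whole target directory on
every run, so nothing else should live under that path.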

Modified: udd/udd/bibref_gatherer.py
===================================================================
--- udd/udd/bibref_gatherer.py	2012-03-25 06:16:52 UTC (rev 2179)
+++ udd/udd/bibref_gatherer.py	2012-03-26 21:59:16 UTC (rev 2180)
@@ -6,52 +6,199 @@
 
 from gatherer import gatherer
 from sys import stderr, exit
-from yaml import safe_load_all
+from os import listdir
+from fnmatch import fnmatch
+import yaml
+from psycopg2 import IntegrityError, InternalError
+import re
+import logging
+import logging.handlers
 
-online=0
+debug=0
 
 def get_gatherer(connection, config, source):
   return bibref_gatherer(connection, config, source)
 
 class bibref_gatherer(gatherer):
   """
-  Bibliographic references from upstream-metadata.debian.net.
+  Bibliographic references from debian/upstream files
   """
 
   def __init__(self, connection, config, source):
     gatherer.__init__(self, connection, config, source)
     self.assert_my_config('table')
 
+    self.log = logging.getLogger(self.__class__.__name__)
+    if debug==1:
+        self.log.setLevel(logging.DEBUG)
+    else:
+        self.log.setLevel(logging.INFO)
+    handler = logging.handlers.RotatingFileHandler(filename=self.__class__.__name__+'.log',mode='w')
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - (%(lineno)d): %(message)s")
+    handler.setFormatter(formatter)
+    self.log.addHandler(handler)
+
+    self.bibrefs = []
+    self.bibrefsinglelist = []
+
+  def setref(self, references, package, rank):
+    year=''
+    defined_fields = { 'article'   : 0,
+                       'author'    : 0,
+                       'booktitle' : 0,
+                       'doi'       : 0,
+                       'editor'    : 0,
+                       'eprint'    : 0,
+                       'issn'      : 0,
+                       'journal'   : 0,
+                       'license'   : 0,
+                       'month'     : 0,
+                       'number'    : 0,
+                       'pages'     : 0,
+                       'publisher' : 0,
+                       'pmid'      : 0,
+                       'title'     : 0,
+                       'url'       : 0,
+                       'volume'    : 0,
+                       'year'      : 0,
+                     }
+    for r in references.keys():
+      key = r.lower()
+      if defined_fields.has_key(key):
+        if defined_fields[key] > 0:
+          self.log.error("Duplicated key in package '%s': %s", package, key)
+          continue
+        else:
+          defined_fields[key] = 1
+      else:
+          self.log.warning("Unexpected key in package '%s': %s", package, key)
+          defined_fields[key] = 1
+      ref={}
+      ref['rank']    = rank
+      ref['package'] = package
+      ref['key']     = key
+      if isinstance(references[r], int):
+        ref['value']   = str(references[r])
+      else:
+        ref['value']   = references[r]
+      self.bibrefs.append(ref)
+      if r.lower() == 'year':
+        year = ref['value']
+    # Create unique BibTeX key
+    bibtexkey = package
+    if bibtexkey in self.bibrefsinglelist and year != '':
+      bibtexkey = package+year
+    if bibtexkey in self.bibrefsinglelist:
+      # if there is more than one reference per package, even in the
+      # same year, append the rank as a letter
+      bibtexkey += 'abcdefghijklmnopqrstuvwxyz'[rank]
+    ref={}
+    ref['rank']    = rank
+    ref['package'] = package
+    ref['key']     = 'bibtex'
+    ref['value']   = bibtexkey
+    self.bibrefsinglelist.append(bibtexkey)
+    self.bibrefs.append(ref)
+    return ref
+
   def run(self):
     my_config = self.my_config
     #start harassing the DB, preparing the final inserts and making place
     #for the new data:
     cur = self.cursor()
 
-    bibref_file = my_config['bibref_yaml']
-    fp = open(bibref_file, 'r')
-    result = fp.read()
-    fp.close()
+    u_dirs = listdir(my_config['path'])
 
-    if not len(result) > 0:
-      print >>stderr, "BibRef input file does not contain data.  Leave table %s unchanged and stop processing here" % (my_config['table'])
+    for u in u_dirs:
+      upath=my_config['path']+'/'+u
+      packages = []
+      for file in listdir(upath):
+        if fnmatch(file, '*.upstream'):
+          packages.append(re.sub("\.upstream", "", file))
+      # packages = listdir(upath)
+      for package in packages:
+        print package
+        ufile = upath+'/'+package+'.upstream'
+        uf = open(ufile)
+        try:
+          fields = yaml.load(uf.read())
+        except yaml.scanner.ScannerError, err:
+          self.log.error("Syntax error in file %s: %s" % (ufile, str(err)))
+          continue
+        try:
+          references=fields['Reference']
+        except KeyError:
+          self.log.warning("No references found for package %s (Keys: %s)" % (package,str(fields.keys())))
+          continue
+        except TypeError:
+          self.log.warning("debian/upstream file of package %s does not seem to be a YAML file" % (package))
+          continue
+
+        if isinstance(references, list):
+          # upstream file contains more than one reference
+          rank=0
+          for singleref in references:
+            self.setref(singleref, package, rank)
+            rank += 1
+        elif isinstance(references, str):
+          # upstream file has wrongly formatted reference
+          self.log.error("File %s has following references: %s" % (ufile, references))
+        else:
+          # upstream file has exactly one reference
+          self.setref(references, package, 0)
+
+        for key in fields.keys():
+          keyl=key.lower()
+          if keyl.startswith('reference-'):
+            # sometimes DOI and PMID are stored separately:
+            if keyl.endswith('doi'):
+              if references.has_key('doi') or references.has_key('DOI'):
+                self.log.warning("Extra key in package '%s': %s - please remove from upstream file!", package, key)
+                continue
+              rdoi={}
+              rdoi['rank']    = 0
+              rdoi['package'] = package
+              rdoi['key']     = 'doi'
+              rdoi['value']   = fields[key]
+              self.bibrefs.append(rdoi)
+            elif keyl.endswith('pmid'):
+              if references.has_key('pmid') or references.has_key('PMID'):
+                self.log.warning("Extra key in package '%s': %s - please remove from upstream file!", package, key)
+                continue
+              rpmid={}
+              rpmid['rank']    = 0
+              rpmid['package'] = package
+              rpmid['key']     = 'pmid'
+              rpmid['value']   = fields[key]
+              self.bibrefs.append(rpmid)
+            else:
+              print "Package %s has %s : %s" % (package, key, fields[key])
+    # only truncate table if there are really some references found
+    if len(self.bibrefs) == 0:
+      self.log.error("No references found in any upstream file.")
       exit(1)
+
+    # print self.bibrefsinglelist
     cur.execute("TRUNCATE %s" % (my_config['table']))
-    query = """PREPARE bibref_insert (text, text, text) AS
+    query = """PREPARE bibref_insert (text, text, text, int) AS
                    INSERT INTO %s
-                   (package, key, value)
-                    VALUES ($1, $2, $3)""" % (my_config['table'])
+                   (package, key, value, rank)
+                    VALUES ($1, $2, $3, $4)""" % (my_config['table'])
     cur.execute(query)
 
-    for res in safe_load_all(result):
-      package, key, value = res
-      value = unicode(value)
-      query = "EXECUTE bibref_insert (%s, %s, %s)"
+    query = "EXECUTE bibref_insert (%(package)s, %(key)s, %(value)s, %(rank)s)"
+    for ref in self.bibrefs:
       try:
-        cur.execute(query, (package, key, value.encode('utf-8')))
+        cur.execute(query, ref)
       except UnicodeEncodeError, err:
-        print >>stderr, "Unable to inject data for package %s, key %s, value %s. %s" % (package, key, value, err)
-        print >>stderr,  "-->", res
+        self.log.error("Unable to inject data: %s\n%s" % (str(ref),str(err)))
+        exit(1)
+      except IntegrityError, err:
+        self.log.error("Unable to inject data: %s\n%s" % (str(ref),str(err)))
+        exit(1)
+      except InternalError, err:
+        self.log.error("Unable to inject data: %s\n%s" % (str(ref),str(err)))
+        exit(1)
     cur.execute("DEALLOCATE bibref_insert")
     cur.execute("ANALYZE %s" % my_config['table'])
 



