[Collab-qa-commits] r2142 - in udd: scripts udd

Sat Feb 11 21:21:36 UTC 2012

Author: tille
Date: 2012-02-11 21:21:36 +0000 (Sat, 11 Feb 2012)
New Revision: 2142

Modified:
   udd/scripts/fetch_ddtp_translations.sh
   udd/udd/ddtp_gatherer.py
Log:
Commit a first shot at the new ddtp importer, to at least enable import for others and to start testing applications.
TODO:
 - New housekeeping method to import only changed translations (will be based on housekeeping table)
 - Remove debugging stuff


Modified: udd/scripts/fetch_ddtp_translations.sh
===================================================================

--- udd/scripts/fetch_ddtp_translations.sh	2012-02-11 21:13:46 UTC (rev 2141)
+++ udd/scripts/fetch_ddtp_translations.sh	2012-02-11 21:21:36 UTC (rev 2142)
@@ -1,42 +1,30 @@
 #!/bin/sh
+# Translation files will be taken from the mirror available on the local machine
+# However, we do some housekeeping to record which translations have really
+# changed and need to be imported, and which ones were not touched by translators
+# and thus can be ignored.
 
 set -e
 
-ZIPEXT=bz2
-
 TARGETPATH=$1
 MIRROR=$2
-shift
-shift
-RELEASES=$*
-HTTPMIRROR="http://$MIRROR"
-# RSYNCMIRROR="$MIRROR::debian/"
-# rm -rf "$TARGETPATH"
-for rel in $RELEASES; do
-    TARGETDIR="$TARGETPATH"/${rel}
-    find "$TARGETPATH"/${rel} -name '*.md5' -exec mv '{}' '{}'.prev \;
-    rm -rf "$TARGETDIR"/*.${ZIPEXT}
-    [ -d $TARGETDIR ] || mkdir -p $TARGETDIR
-    # store a copy of md5 sums of previous files
-    `dirname $0`/getlinks.pl "$HTTPMIRROR"/dists/${rel}/main/i18n/ "$TARGETPATH"/${rel} "Translation-.*\.${ZIPEXT}$"
-    # create md5 sums of translation files to enable deciding whether processing is needed or not
-    for zipfile in `find "$TARGETPATH"/${rel} -name "*.${ZIPEXT}"` ; do md5sum $zipfile > "$TARGETPATH"/${rel}/`basename $zipfile .${ZIPEXT}`.md5 ; done
-    # getlinks.pl always returns 0 independently from success so we have to verify that the target dir is
-    # not empty.
-    NUMFILES=`ls "$TARGETPATH"/${rel} | wc -l`
-    if [ $NUMFILES -le 0 ] ; then
-	echo "Downloading translation for release ${rel} failed. Stopped."
-	exit 66
+
+for indexdir in `find $MIRROR -name i18n -type d | sed "s?$MIRROR/\(.\+\)/i18n?\1?"` ; do
+    # rel=`echo $index | sed "s?$MIRROR/*\([^/]\+\)/.*?\1?"`
+    targetfile="${TARGETPATH}/${indexdir}"
+    mkdir -p `dirname $targetfile`
+    # create backup of previous index file
+    if [ -f "$targetfile" ] ; then
+	mv "$targetfile" "$targetfile".prev
     fi
-    ## The rsync-able Translations do not (yet) contain package version info
-    ## This might happen later but it requires deeper changes in several tools
-    ## including apt - so we have to download via http from ddtp directly which
-    ## does not support rsync
-    # rsync -a --no-motd --include "Translation-*.${ZIPEXT}" --exclude "*" "$RSYNCMIRROR"/dists/${rel}/main/i18n/ $TARGETDIR
+    index=${MIRROR}/$indexdir/i18n/Index
+    if [ -f $index ] ; then
+	grep "\.bz2" $index | sed -e 's/^ //' -e 's/ \+/ /g' > $targetfile
+    else
+	for trans in `find ${MIRROR}/$indexdir/i18n -mindepth 1 -maxdepth 1 -name "*.bz2"` ; do
+	    echo "`sha1sum $trans | cut -d' ' -f1``ls -l $trans | sed 's/^[-rwlx]\+ [0-9]\+ [^ ]\+ [^ ]\+\([ 0-9]\+[0-9]\) .*/\1/'` `basename $trans`" >> $targetfile
+	done
+    fi
 done
 
 exit 0
-
-# alternatively use wget
-cd "$TARGETPATH"
-wget -erobots=off -m $HTTPMIRROR

Modified: udd/udd/ddtp_gatherer.py
===================================================================
--- udd/udd/ddtp_gatherer.py	2012-02-11 21:13:46 UTC (rev 2141)
+++ udd/udd/ddtp_gatherer.py	2012-02-11 21:21:36 UTC (rev 2142)
@@ -14,11 +14,12 @@
     from debian import deb822
 except:
     from debian_bundle import deb822
-from os import listdir, access, F_OK
+from os import listdir, path, access, F_OK
 from sys import stderr, exit
 from filecmp import cmp
 import gzip
 import bz2
+import hashlib
 from psycopg2 import IntegrityError, InternalError, ProgrammingError
 
 import logging
@@ -51,12 +52,11 @@
 class ddtp_gatherer(gatherer):
   # DDTP translations
 
-  select_language_gz_re    = re.compile('^Translation-(\w+)\.gz$')
-  select_language_bz2_re   = re.compile('^Translation-(\w+)\.bz2$')
+  select_language_re   = re.compile('^Translation-(\w+)\.bz2$')
 
   def __init__(self, connection, config, source):
     gatherer.__init__(self, connection, config, source)
-    self.assert_my_config('path', 'files', 'table', 'releases')
+    self.assert_my_config('path', 'files', 'mirrorpath', 'table')
     my_config = self.my_config
     self.log = logging.getLogger(self.__class__.__name__)
     if debug==1:
@@ -89,54 +89,63 @@
 
   def run(self):
     my_config = self.my_config
-    #start harassing the DB, preparing the final inserts and making place
-    #for the new data:
     cur = self.cursor()
 
-    releases=my_config['releases'].split(' ')
+    cur.execute('SELECT component FROM packages GROUP by component')
+    rows = cur.fetchall()
+    valid_components = []
+    for r in rows:
+        valid_components.append(r[0])
+    releases = listdir(my_config['path'])
     for rel in releases:
-      dir = my_config['path']+'/'+rel+'/'
-      if not access(dir, F_OK):
-	self.log.error("Directory %s for release %s does not exist", dir, rel)
-        continue
-      for filename in listdir(dir):
-        match = ddtp_gatherer.select_language_gz_re.match(filename)
-        if not match:
-          match = ddtp_gatherer.select_language_bz2_re.match(filename)
+      cpath = my_config['path']+'/'+rel+'/'
+      components = listdir(cpath)
+      for comp in components:
+        if comp not in valid_components:
+          self.log.error("Invallid component '%s' file found in %s", comp, cpath)
+          continue
+        cfp = open(cpath+'/'+comp,'r')
+        trfilepath = my_config['mirrorpath']+'/'+rel+'/'+comp+'/i18n/'
+        for line in cfp.readlines():
+          (sha1, size, file) = line.strip().split(' ')
+          trfile = trfilepath + file
+          # check whether hash recorded in index file fits real file
+          f = open(trfile)
+          h = hashlib.sha1()
+          h.update(f.read())
+          hash = h.hexdigest()
+          f.close()
+          if sha1 != hash:
+            self.log.error("Hash mismatch between file %s and index found in %s/%s.", trfile, cpath, comp)
+            continue
+          fsize = path.getsize(trfile)
+          if int(size) != fsize:
+            self.log.error("Size mismatch between file %s (%i) and index found in %s/%s (%s).", trfile, fsize, cpath, comp, size)
+            continue
+          match = ddtp_gatherer.select_language_re.match(file)
           if not match:
+            self.log.error("Can not parse language of file %s.", trfile)
             continue
-          COMPRESSIONEXTENSION='bz2'
-        else:
-          COMPRESSIONEXTENSION='gz'
-        lang = match.groups()[0]
-        md5file=dir + 'Translation-' + lang + '.md5'
-        try:
-          if ( cmp(md5file, md5file + '.prev' ) ):
-            self.log.debug("%s has not changed.  No update needed.", md5file)
-            continue
-          else:
-            self.log.debug("%s changed.  Go on updating language %s (%s)", md5file, lang, rel)
-            pass
-        except OSError:
-          self.log.info('md5file for language %s in release %s missing -> Go updating', lang, rel)
+          lang = match.groups()[0]
+          self.import_translations(trfile, rel, lang)
+        cfp.close()
+       
+    cur.execute("DEALLOCATE ddtp_insert")
+    cur.execute("ANALYZE %s" % my_config['table'])
 
+
+  def import_translations(self, trfile, rel, lang):
+        print trfile, rel, lang
+
+        cur = self.cursor()
         # Delete only records where we actually have Translation files.  This
         # prevents dump deletion of all data in case of broken downloads
         cur.execute('EXECUTE ddtp_delete (%s, %s)', (rel, lang))
         self.log.debug('EXECUTE ddtp_delete (%s, %s)', (rel, lang))
-        
-        if debug == 1:
-    	  cur.execute('SELECT COUNT(*) FROM ddtp WHERE release = %s AND language = %s', (rel, lang))
-          if cur.rowcount > 0:
-            remaining = cur.fetchone()[0]
-            self.log.debug("Remaining translations for language %s in release %s: %s" %(lang, rel, str(remaining)))
 
         i18n_error_flag=0
         descstring = 'Description-'+lang
-        if COMPRESSIONEXTENSION =='gz':
-          g = gzip.GzipFile(dir + filename)
-        else:
-          g = bz2.BZ2File(dir + filename)
+        g = bz2.BZ2File(trfile)
         try:
           for stanza in deb822.Sources.iter_paragraphs(g, shared_storage=False):
             if i18n_error_flag == 1:
@@ -184,9 +193,6 @@
         # commit every successfully language to make sure we get any languages in an will not be blocked by a single failing import
         self.connection.commit()
 
-    cur.execute("DEALLOCATE ddtp_insert")
-    cur.execute("ANALYZE %s" % my_config['table'])
-
 if __name__ == '__main__':
   main()