[Collab-qa-commits] r2142 - in udd: scripts udd
Andreas Tille
tille at alioth.debian.org
Sat Feb 11 21:21:36 UTC 2012
Author: tille
Date: 2012-02-11 21:21:36 +0000 (Sat, 11 Feb 2012)
New Revision: 2142
Modified:
udd/scripts/fetch_ddtp_translations.sh
udd/udd/ddtp_gatherer.py
Log:
Commit a first shot at the new DDTP importer, to at least enable the import for others and to start testing applications
TODO:
- New housekeeping method to import only changed translations (will be based on housekeeping table)
- Remove debugging stuff
Modified: udd/scripts/fetch_ddtp_translations.sh
===================================================================
--- udd/scripts/fetch_ddtp_translations.sh 2012-02-11 21:13:46 UTC (rev 2141)
+++ udd/scripts/fetch_ddtp_translations.sh 2012-02-11 21:21:36 UTC (rev 2142)
@@ -1,42 +1,30 @@
#!/bin/sh
+# Translation files are taken from the mirror available on the local machine.
+# However, we do some housekeeping to record which translations have really
+# changed and need to be imported, and which ones were not touched by translators
+# and thus can be ignored.
set -e
-ZIPEXT=bz2
-
TARGETPATH=$1
MIRROR=$2
-shift
-shift
-RELEASES=$*
-HTTPMIRROR="http://$MIRROR"
-# RSYNCMIRROR="$MIRROR::debian/"
-# rm -rf "$TARGETPATH"
-for rel in $RELEASES; do
- TARGETDIR="$TARGETPATH"/${rel}
- find "$TARGETPATH"/${rel} -name '*.md5' -exec mv '{}' '{}'.prev \;
- rm -rf "$TARGETDIR"/*.${ZIPEXT}
- [ -d $TARGETDIR ] || mkdir -p $TARGETDIR
- # store a copy of md5 sums of previous files
- `dirname $0`/getlinks.pl "$HTTPMIRROR"/dists/${rel}/main/i18n/ "$TARGETPATH"/${rel} "Translation-.*\.${ZIPEXT}$"
- # create md5 sums of translation files to enable deciding whether processing is needed or not
- for zipfile in `find "$TARGETPATH"/${rel} -name "*.${ZIPEXT}"` ; do md5sum $zipfile > "$TARGETPATH"/${rel}/`basename $zipfile .${ZIPEXT}`.md5 ; done
- # getlinks.pl always returns 0 independently from success so we have to verify that the target dir is
- # not empty.
- NUMFILES=`ls "$TARGETPATH"/${rel} | wc -l`
- if [ $NUMFILES -le 0 ] ; then
- echo "Downloading translation for release ${rel} failed. Stopped."
- exit 66
+
+for indexdir in `find $MIRROR -name i18n -type d | sed "s?$MIRROR/\(.\+\)/i18n?\1?"` ; do
+ # rel=`echo $index | sed "s?$MIRROR/*\([^/]\+\)/.*?\1?"`
+ targetfile="${TARGETPATH}/${indexdir}"
+ mkdir -p `dirname $targetfile`
+ # create backup of previous index file
+ if [ -f "$targetfile" ] ; then
+ mv "$targetfile" "$targetfile".prev
fi
- ## The rsync-able Translations do not (yet) contain package version info
- ## This might happen later but it requires deeper changes in several tools
- ## including apt - so we have to download via http from ddtp directly which
- ## does not support rsync
- # rsync -a --no-motd --include "Translation-*.${ZIPEXT}" --exclude "*" "$RSYNCMIRROR"/dists/${rel}/main/i18n/ $TARGETDIR
+ index=${MIRROR}/$indexdir/i18n/Index
+ if [ -f $index ] ; then
+ grep "\.bz2" $index | sed -e 's/^ //' -e 's/ \+/ /g' > $targetfile
+ else
+ for trans in `find ${MIRROR}/$indexdir/i18n -mindepth 1 -maxdepth 1 -name "*.bz2"` ; do
+ echo "`sha1sum $trans | cut -d' ' -f1``ls -l $trans | sed 's/^[-rwlx]\+ [0-9]\+ [^ ]\+ [^ ]\+\([ 0-9]\+[0-9]\) .*/\1/'` `basename $trans`" >> $targetfile
+ done
+ fi
done
exit 0
-
-# alternatively use wget
-cd "$TARGETPATH"
-wget -erobots=off -m $HTTPMIRROR
Modified: udd/udd/ddtp_gatherer.py
===================================================================
--- udd/udd/ddtp_gatherer.py 2012-02-11 21:13:46 UTC (rev 2141)
+++ udd/udd/ddtp_gatherer.py 2012-02-11 21:21:36 UTC (rev 2142)
@@ -14,11 +14,12 @@
from debian import deb822
except:
from debian_bundle import deb822
-from os import listdir, access, F_OK
+from os import listdir, path, access, F_OK
from sys import stderr, exit
from filecmp import cmp
import gzip
import bz2
+import hashlib
from psycopg2 import IntegrityError, InternalError, ProgrammingError
import logging
@@ -51,12 +52,11 @@
class ddtp_gatherer(gatherer):
# DDTP translations
- select_language_gz_re = re.compile('^Translation-(\w+)\.gz$')
- select_language_bz2_re = re.compile('^Translation-(\w+)\.bz2$')
+ select_language_re = re.compile('^Translation-(\w+)\.bz2$')
def __init__(self, connection, config, source):
gatherer.__init__(self, connection, config, source)
- self.assert_my_config('path', 'files', 'table', 'releases')
+ self.assert_my_config('path', 'files', 'mirrorpath', 'table')
my_config = self.my_config
self.log = logging.getLogger(self.__class__.__name__)
if debug==1:
@@ -89,54 +89,63 @@
def run(self):
my_config = self.my_config
- #start harassing the DB, preparing the final inserts and making place
- #for the new data:
cur = self.cursor()
- releases=my_config['releases'].split(' ')
+ cur.execute('SELECT component FROM packages GROUP by component')
+ rows = cur.fetchall()
+ valid_components = []
+ for r in rows:
+ valid_components.append(r[0])
+ releases = listdir(my_config['path'])
for rel in releases:
- dir = my_config['path']+'/'+rel+'/'
- if not access(dir, F_OK):
- self.log.error("Directory %s for release %s does not exist", dir, rel)
- continue
- for filename in listdir(dir):
- match = ddtp_gatherer.select_language_gz_re.match(filename)
- if not match:
- match = ddtp_gatherer.select_language_bz2_re.match(filename)
+ cpath = my_config['path']+'/'+rel+'/'
+ components = listdir(cpath)
+ for comp in components:
+ if comp not in valid_components:
+ self.log.error("Invallid component '%s' file found in %s", comp, cpath)
+ continue
+ cfp = open(cpath+'/'+comp,'r')
+ trfilepath = my_config['mirrorpath']+'/'+rel+'/'+comp+'/i18n/'
+ for line in cfp.readlines():
+ (sha1, size, file) = line.strip().split(' ')
+ trfile = trfilepath + file
+ # check whether hash recorded in index file fits real file
+ f = open(trfile)
+ h = hashlib.sha1()
+ h.update(f.read())
+ hash = h.hexdigest()
+ f.close()
+ if sha1 != hash:
+ self.log.error("Hash mismatch between file %s and index found in %s/%s.", trfile, cpath, comp)
+ continue
+ fsize = path.getsize(trfile)
+ if int(size) != fsize:
+ self.log.error("Size mismatch between file %s (%i) and index found in %s/%s (%s).", trfile, fsize, cpath, comp, size)
+ continue
+ match = ddtp_gatherer.select_language_re.match(file)
if not match:
+ self.log.error("Can not parse language of file %s.", trfile)
continue
- COMPRESSIONEXTENSION='bz2'
- else:
- COMPRESSIONEXTENSION='gz'
- lang = match.groups()[0]
- md5file=dir + 'Translation-' + lang + '.md5'
- try:
- if ( cmp(md5file, md5file + '.prev' ) ):
- self.log.debug("%s has not changed. No update needed.", md5file)
- continue
- else:
- self.log.debug("%s changed. Go on updating language %s (%s)", md5file, lang, rel)
- pass
- except OSError:
- self.log.info('md5file for language %s in release %s missing -> Go updating', lang, rel)
+ lang = match.groups()[0]
+ self.import_translations(trfile, rel, lang)
+ cfp.close()
+
+ cur.execute("DEALLOCATE ddtp_insert")
+ cur.execute("ANALYZE %s" % my_config['table'])
+
+ def import_translations(self, trfile, rel, lang):
+ print trfile, rel, lang
+
+ cur = self.cursor()
# Delete only records where we actually have Translation files. This
# prevents dump deletion of all data in case of broken downloads
cur.execute('EXECUTE ddtp_delete (%s, %s)', (rel, lang))
self.log.debug('EXECUTE ddtp_delete (%s, %s)', (rel, lang))
-
- if debug == 1:
- cur.execute('SELECT COUNT(*) FROM ddtp WHERE release = %s AND language = %s', (rel, lang))
- if cur.rowcount > 0:
- remaining = cur.fetchone()[0]
- self.log.debug("Remaining translations for language %s in release %s: %s" %(lang, rel, str(remaining)))
i18n_error_flag=0
descstring = 'Description-'+lang
- if COMPRESSIONEXTENSION =='gz':
- g = gzip.GzipFile(dir + filename)
- else:
- g = bz2.BZ2File(dir + filename)
+ g = bz2.BZ2File(trfile)
try:
for stanza in deb822.Sources.iter_paragraphs(g, shared_storage=False):
if i18n_error_flag == 1:
@@ -184,9 +193,6 @@
# commit every successfully language to make sure we get any languages in an will not be blocked by a single failing import
self.connection.commit()
- cur.execute("DEALLOCATE ddtp_insert")
- cur.execute("ANALYZE %s" % my_config['table'])
-
if __name__ == '__main__':
main()
More information about the Collab-qa-commits
mailing list