r377 - in /debtorrent/trunk: hippy.py uniquely.py

camrdale-guest at users.alioth.debian.org camrdale-guest at users.alioth.debian.org
Sun May 11 05:44:24 UTC 2008


Author: camrdale-guest
Date: Sun May 11 05:44:23 2008
New Revision: 377

URL: http://svn.debian.org/wsvn/debtorrent/?sc=1&rev=377
Log:
Speed up hippy and uniquely for deployment on merkel.

Modified:
    debtorrent/trunk/hippy.py
    debtorrent/trunk/uniquely.py

Modified: debtorrent/trunk/hippy.py
URL: http://svn.debian.org/wsvn/debtorrent/debtorrent/trunk/hippy.py?rev=377&op=diff
==============================================================================
--- debtorrent/trunk/hippy.py (original)
+++ debtorrent/trunk/hippy.py Sun May 11 05:44:23 2008
@@ -2,67 +2,43 @@
 
 """Calculate the sub-piece hashes for large package files.
 
-Run this script in the directory where the extrapieces files are to be stored.
-It's only command line argument is the Berkeley database containing the cached
-data from previous runs. Pass the paths of Release files to process into the
+The command line arguments are the Berkeley database containing the cached
+data from previous runs, and the unique name of the Packages file being
+processed. Pass the paths of package files to process into the
 standard input.
 
 For example::
 
-    find /var/www/debian -maxdepth 3 -name "Release" | hippy ../hippycache.bdb
+    find dists -name 'Packages.gz' | sort |
+            while read a; do
+                    b=$(echo $a | tr / _)
+                    d=~/public_html/extrapieces/${b%.gz}-extrapieces.gz
+                    zcat $a | sed -n 's/^Filename: //p' |
+                            grep -v '^[.]/' |
+                            sort | (~/hippy ~/mycache $a || echo >&2 "Failed: $a") |
+                            gzip -9 > ${d}.new
+                    mv ${d}.new $d
+            done
+
+ at var MAX_PIECE_SIZE: the maximum piece size to use, pieces may be smaller than this
+ at var CHUNK_SIZE: the download chunk size used by DebTorrent (used to calculate
+    an optimal piece size)
+ at var MAX_AGE: the maximum iterations of the program to keep subpiece information
+    for the Packages file after it's no longer in the file (for slow mirrors)
 
 """
 
 import bsddb, sha, binascii
-import os, sys, gzip
+import os, sys
 import struct
-from bz2 import BZ2File
 from math import ceil
-from debian_bundle import deb822
 
 MAX_PIECE_SIZE = 512*1024
 CHUNK_SIZE = 16*1024
+MAX_AGE = 14
 
 # The Packages files to read
 EXTENSION = ".gz"
-
-def read_release(filename):
-    """Read the headers and Packages file names from a Release file.
-    
-    @type filename: C{string}
-    @param filename: the Release file to read
-    @rtype: C{dictionary}, C{list} of C{string}
-    @return: the headers and full file names of Packages files
-    
-    """
-
-    # Initialize the Release file variables
-    read_packages = False
-    headers = {}
-    packages = []
-    
-    f = open(filename, 'r')
-    
-    rel = deb822.Release(f)
-    for header in rel:
-        if header.lower() not in ["md5sum", "sha1", "sha256"]:
-            # Read the headers from the file
-            headers[header] = rel[header]
-            
-    # Read the Packages file names
-    for file in rel.get('MD5Sum', []):
-        if file['name'].endswith("Packages"+EXTENSION) and file['name'] not in packages:
-            packages.append(file['name'])
-    for file in rel.get('SHA1', []):
-        if file['name'].endswith("Packages"+EXTENSION) and file['name'] not in packages:
-            packages.append(file['name'])
-    for file in rel.get('SHA256', []):
-        if file['name'].endswith("Packages"+EXTENSION) and file['name'] not in packages:
-            packages.append(file['name'])
-    
-    f.close()
-    
-    return headers, packages
 
 def hash(file, piece_size):
     """Read a file and hash its sub-pieces.
@@ -121,45 +97,54 @@
     """Convert a cache value to a list of package names.
     
     The cache is stored as a string. The list is a repeating sequence of one
-    byte length followed by a string of that length. Therefore, the longest
-    string that can be stored is 256.
+    byte length followed by a string of that length, followed by one byte
+    indicating the number of times the package has been missing from the
+    Packages file plus 1. Therefore, the longest string that can be stored is
+    256.
     
     @type cache_value: C{string}
     @param cache_value: the cached value for this file
-    @rtype: C{list} of C{string}
-    @return: the list of package names stored in the cache
+    @rtype: C{dictionary}
+    @return: keys are the package names stored in the cache, values are the
+        number of times the package has not been found in the Packages file
     
     """
 
     if cache_value == "":
-        return []
-
-    deb_list = []
+        return {}
+
+    debs = {}
     while len(cache_value) > 0:
         length = ord(cache_value[0])
         deb = cache_value[1:length+1]
-        cache_value = cache_value[length+1:]
-        deb_list.append(deb)
-        
-    return deb_list
-
-def list2cache(deb_list):
+        num = ord(cache_value[length+1]) - 1
+        cache_value = cache_value[length+2:]
+        debs[deb] = num
+        
+    return debs
+
+def list2cache(debs):
     """Convert a list of package names to a cacheable value.
     
-    @type deb_list: C{list} of C{string}
-    @param deb_list: the package names to create a cache value for
+    @type debs: C{dictionary}
+    @param debs: the package names to create a cache value for, keys are the
+        names of the packages, values are the integer number of times this
+        package has not been found in the Packages file
     @rtype: C{string}
     @return: the cacheable string
     
     """
     
-    if not deb_list:
+    if not debs:
         return ""
     
+    deb_list = debs.keys()
+    deb_list.sort()
     cache_value = ""
     for deb in deb_list:
         assert len(deb) < 256
-        cache_value += chr(len(deb)) + deb
+        assert debs[deb] >= 0
+        cache_value += chr(len(deb)) + deb + chr(min(255, debs[deb]+1))
     return cache_value
 
 def cache2hash(cache_value):
@@ -223,8 +208,7 @@
     @param filename: the file to calculate sub pieces for
     
     """
-    
-    filename = filename.rstrip()
+    sys.stderr.write('        Hashing: %s\n' % filename)
     
     # Get the size of the file
     size = os.stat(filename).st_size
@@ -242,111 +226,89 @@
         
     return sha1, piece_list
 
-def get_packages(filename):
-    """Read the new piece data from a Packages file.
-    
-    @type filename: C{string}
-    @param filename: the Packages file to open and parse
-    @rtype: C{list} of C{string}
-    @return: the package files listed in the Packages file
-    
-    """
-
-    # Open the possibly compressed file
-    if filename.endswith(".gz"):
-        f = gzip.open(filename, 'r')
-    elif filename.endswith(".bz2"):
-        f = BZ2File(filename, "r")
-    else:
-        f = open(filename, 'r')
-
-    debs = []
-    
-    for pkg in deb822.Packages.iter_paragraphs(f, fields = ['Filename']):
-        if pkg.get('Filename', ''):
-            debs.append(pkg['Filename'])
-    
-    f.close()
-    
-    return debs
-
-def run(cache, releasefile):
+def run(cache, pkg_file):
     """Process a single Release file.
     
     @type cache: C{bsddb.BTree}
     @param cache: an already opened bDB b-tree
-    @type releasefile: C{string}
-    @param releasefile: the Release file to process
-
-    """
-    
-    # Process the Release file
-    print "Processing: %s" % releasefile
-    root_dir = releasefile[:releasefile.index('/dists/')+1]
-    release_dir = releasefile[len(root_dir):].rsplit('/', 1)[0] + '/'
-    release_headers, packages = read_release(releasefile)
-    
-    file_prefix = "dists_" + release_headers.get("Codename", "") + "_"
-    file_suffix = "_Packages-extrapieces.gz"
-    
-    for packages_file in packages:
-        sub_filename = file_prefix + '_'.join(packages_file.split('/')[:-1]) + file_suffix
-        pkey = release_dir + packages_file + ":pl"
-        
-        # Get the list of packages in the packages file
-        debs = get_packages(root_dir + release_dir + packages_file)
-        
-        # Retrieve the saved list of sub-pieced packages in the Packages file
-        if cache.has_key(pkey):
-            packages_list = cache2list(cache[pkey])
+    @type pkg_file: C{string}
+    @param pkg_file: the name of the Packages file being processed
+
+    """
+    sys.stderr.write('Processing: %s\n' % pkg_file)
+    
+    # Retrieve the saved list of sub-pieced packages in the Packages file
+    pkey = pkg_file + ":pl"
+    if cache.has_key(pkey):
+        old_debs = cache2list(cache[pkey])
+    else:
+        old_debs = {}
+    new_debs = {}
+    
+    sys.stderr.write('    Found %d old files\n' % len(old_debs))
+
+    # Print the piece hashes for files in the Packages file
+    for filename in sys.stdin:
+        filename = filename.rstrip()
+        fnkey = filename + ":pc"
+
+        # Check if this file's sub-pieces are already known
+        if cache.has_key(fnkey):
+            sha1, piece_list = cache2hash(cache[fnkey])
         else:
-            packages_list = []
-        all_debs = {}.fromkeys(packages_list, 1)
-
-        # First, sub-piece any new package files
-        for deb in debs:
-            filename = root_dir + deb
-            fnkey = deb + ":pc"
-
-            # Check if this file's sub-pieces are already known
-            if cache.has_key(fnkey):
-                sha1, piece_list = cache2hash(cache[fnkey])
-            else:
-                print '    Hashing new package:', deb
-                sha1, piece_list = sub_piece(filename)
-
-                # Save the result for next time
-                cache[fnkey] = hash2cache(sha1, piece_list)
-                
-            # If it has sub-pieces, save it to the list
-            if piece_list:
-                all_debs[deb] = 1
-        
-        # Write the list back to the cache
-        packages_list = all_debs.keys()
-        packages_list.sort()
-        cache[pkey] = list2cache(packages_list)
-
-        # Write the sub-piece data to the file
-        sub_file = gzip.open(sub_filename, 'w')
-        for deb in packages_list:
-            fnkey = deb + ":pc"
+            # Not known, hash the file
+            sha1, piece_list = sub_piece(filename)
+
+            # Save the result for next time
+            cache[fnkey] = hash2cache(sha1, piece_list)
+        
+        # Remove the file from the old list
+        old_debs.pop(filename, None)
+
+        if piece_list:
+            # Print the resulting sub-piece hashes
+            print "Filename: %s" % (filename, )
+            print "SHA1: %s" % (sha1, )
+            print "SHA1-Pieces:"
+            for x in piece_list:
+                print " %s %d" % x
+            print
+
+            # Save the file to the new list
+            new_debs[filename] = 0
             
-            # Check to make sure
-            if cache.has_key(fnkey):
-                # Get the cached result
-                sha1, piece_list = cache2hash(cache[fnkey])
-                
-                # Print the resulting sub-piece hashes
-                sub_file.write("Filename: %s\n" % (deb))
-                sub_file.write("SHA1: %s\n" % (sha1))
-                sub_file.write("SHA1-Pieces:\n")
-                for x in piece_list:
-                    sub_file.write(" %s %d\n" % x)
-                sub_file.write("\n")
-            else:
-                print "WARNING: no sub-piece data found for " + deb
-        sub_file.close()
+    sys.stderr.write('    Have %d missing old files\n' % len(old_debs))
+        
+    # Also print any unexpired old entries
+    for missing_file in old_debs:
+        # Expire entries after they have been missing for MAX_AGE runs
+        if old_debs[missing_file] >= MAX_AGE:
+            sys.stderr.write('        Expired: %s\n' % missing_file)
+            continue
+        
+        fnkey = missing_file + ":pc"
+
+        # Check if this file's sub-pieces are already known
+        if not cache.has_key(fnkey):
+            continue
+        
+        sha1, piece_list = cache2hash(cache[fnkey])
+
+        if piece_list:
+            # Print the resulting sub-piece hashes
+            print "Filename: %s" % (missing_file, )
+            print "SHA1: %s" % (sha1, )
+            print "SHA1-Pieces:"
+            for x in piece_list:
+                print " %s %d" % x
+            print
+        
+            # Increment the age of the missing file
+            new_debs[missing_file] = old_debs[missing_file] + 1
+    
+    # Write the list back to the cache
+    sys.stderr.write('    Saving %d new files\n' % len(new_debs))
+    cache[pkey] = list2cache(new_debs)
 
 if __name__ == '__main__':
     
@@ -354,10 +316,10 @@
     cache_file = sys.argv[1]
     cache = bsddb.btopen(cache_file, "w")
 
-    # Read Release file names from standard in
-    for filename in sys.stdin:
-        filename = filename.rstrip()
-        run(cache, filename)
+    # Get the Packages file name being processed
+    pkg_file = sys.argv[2]
+    
+    run(cache, pkg_file)
 
     # Close the cache file
     cache.sync()

Modified: debtorrent/trunk/uniquely.py
URL: http://svn.debian.org/wsvn/debtorrent/debtorrent/trunk/uniquely.py?rev=377&op=diff
==============================================================================
--- debtorrent/trunk/uniquely.py (original)
+++ debtorrent/trunk/uniquely.py Sun May 11 05:44:23 2008
@@ -7,10 +7,11 @@
 import gzip
 from bz2 import BZ2File
 from math import ceil
-from os import remove
+from os import remove, rename, system
 from os.path import exists
 from time import strftime, gmtime
 from debian_bundle import deb822
+from tempfile import mkstemp
 
 # The piece size to use (must match the '-extrapieces' file's piece size)
 DEFAULT_PIECESIZE = 512*1024
@@ -197,13 +198,22 @@
     
     """
 
+    f, tmpfile = mkstemp(prefix = 'uniquely')
+#    f.close()
+    
     # Open the possibly compressed file
     if filename.endswith(".gz"):
-        f = gzip.open(filename, 'r')
+        ret = system("zcat '%s' | grep -E '^(Filename:.*|Size:.*|Architecture:.*|)$' > '%s'" % (filename, tmpfile))
+        if ret != 0:
+            raise RuntimeError, 'Failed to decompress %s' % filename
+        filename = tmpfile
     elif filename.endswith(".bz2"):
-        f = BZ2File(filename, "r")
-    else:
-        f = open(filename, 'r')
+        ret = system("bzcat '%s' | grep -E '^(Filename:.*|Size:.*|Architecture:.*|)$' > '%s'" % (filename, tmpfile))
+        if ret != 0:
+            raise RuntimeError, 'Failed to decompress %s' % filename
+        filename = tmpfile
+
+    f = open(filename, 'r')
 
     pieces = {}
     new_pieces = []
@@ -233,6 +243,8 @@
 
     f.close()
     
+    remove(tmpfile)
+    
     return pieces, new_pieces
 
 def add_new(pieces, new_pieces, headers):
@@ -288,7 +300,7 @@
     
     """
 
-    f = gzip.open(filename, 'w')
+    f = gzip.open(filename + '.new', 'w')
     
     # Write the headers
     for header in HEADER_ORDER:
@@ -304,6 +316,7 @@
         f.write(format_string % (p, pieces[p]))
     
     f.close()
+    rename(filename + '.new', filename)
 
 def run(releasefile):
     """Process a single Release file.




More information about the Debtorrent-commits mailing list