r273 - /debtorrent/branches/unique/hippy.py

camrdale-guest at users.alioth.debian.org camrdale-guest at users.alioth.debian.org
Sun Aug 19 23:23:10 UTC 2007


Author: camrdale-guest
Date: Sun Aug 19 23:23:10 2007
New Revision: 273

URL: http://svn.debian.org/wsvn/debtorrent/?sc=1&rev=273
Log:
Update hippy to save old sub-piece data for use with out-of-date mirrors.

Modified:
    debtorrent/branches/unique/hippy.py

Modified: debtorrent/branches/unique/hippy.py
URL: http://svn.debian.org/wsvn/debtorrent/debtorrent/branches/unique/hippy.py?rev=273&op=diff
==============================================================================
--- debtorrent/branches/unique/hippy.py (original)
+++ debtorrent/branches/unique/hippy.py Sun Aug 19 23:23:10 2007
@@ -1,14 +1,76 @@
 #!/usr/bin/env python
 
-"""Calculate the sub-piece hashes for large package files."""
+"""Calculate the sub-piece hashes for large package files.
+
+Run this script in the directory where the extrapieces files are to be stored.
+Its only command line argument is the Berkeley database containing the cached
+data from previous runs. Pass the paths of Release files to process into the
+standard input.
+
+For example::
+
+    find /var/www/debian -maxdepth 3 -name "Release" | hippy ../hippycache.bdb
+
+"""
 
 import bsddb, sha, binascii
-import os, sys
+import os, sys, gzip
 import struct
+from bz2 import BZ2File
 from math import ceil
 
 MAX_PIECE_SIZE = 512*1024
 CHUNK_SIZE = 16*1024
+
+# The Packages files to read
+EXTENSION = ".gz"
+
+def read_release(filename):
+    """Read the headers and Packages file names from a Release file.
+    
+    @type filename: C{string}
+    @param filename: the Release file to read
+    @rtype: C{dictionary}, C{list} of C{string}
+    @return: the headers and full file names of Packages files
+    
+    """
+
+    # Initialize the Release file variables
+    read_packages = False
+    headers = {}
+    packages = []
+    
+    f = open(filename, 'r')
+    
+    for line in f:
+        line = line.rstrip()
+
+        if line[:1] != " ":
+            read_packages = False
+            try:
+                # Read the various headers from the file
+                h, v = line.split(":", 1)
+                if h == "MD5Sum" or h == "SHA1" or h == "SHA256":
+                    read_packages = True
+                elif len(v) > 0:
+                    headers[h] = v[1:]
+            except:
+                # Bad header line, just ignore it
+                print "WARNING: Ignoring badly formatted Release line:", line
+
+            # Skip to the next line
+            continue
+        
+        # Read file names from the multiple hash sections of the file
+        if read_packages:
+            p = line.split()
+            if len(p) == 3 and p[2].endswith("Packages"+EXTENSION):
+                if p[2] not in packages:
+                    packages.append(p[2])
+    
+    f.close()
+    
+    return headers, packages
 
 def hash(file, piece_size):
     """Read a file and hash it's sub-pieces.
@@ -63,8 +125,53 @@
     n = 1 + size / MAX_PIECE_SIZE
     return max(MAX_PIECE_SIZE/2, int(ceil((float(size)/n)/CHUNK_SIZE))*CHUNK_SIZE)
 
+def cache2list(cache_value):
+    """Convert a cache value to a list of package names.
+    
+    The cache is stored as a string. The list is a repeating sequence of one
+    byte length followed by a string of that length. Therefore, the longest
+    string that can be stored is 255 bytes long.
+    
+    @type cache_value: C{string}
+    @param cache_value: the cached value for this file
+    @rtype: C{list} of C{string}
+    @return: the list of package names stored in the cache
+    
+    """
+
+    if cache_value == "":
+        return []
+
+    deb_list = []
+    while len(cache_value) > 0:
+        length = ord(cache_value[0])
+        deb = cache_value[1:length+1]
+        cache_value = cache_value[length+1:]
+        deb_list.append(deb)
+        
+    return deb_list
+
+def list2cache(deb_list):
+    """Convert a list of package names to a cacheable value.
+    
+    @type deb_list: C{list} of C{string}
+    @param deb_list: the package names to create a cache value for
+    @rtype: C{string}
+    @return: the cacheable string
+    
+    """
+    
+    if not deb_list:
+        return ""
+    
+    cache_value = ""
+    for deb in deb_list:
+        assert len(deb) < 256
+        cache_value += chr(len(deb)) + deb
+    return cache_value
+
 def cache2hash(cache_value):
-    """Convert a list of sub-piece hashes to a cacheable value.
+    """Convert a cache value to a list of sub-piece hashes.
     
     The cache is stored as a string. The first 20 bytes are the SHA1 hash of
     the entire file. Then there are repeating 24 byte sequences, the first 4
@@ -117,50 +224,144 @@
         cache_value += struct.pack(">i", length) + binascii.a2b_hex(hash)
     return cache_value
 
-def sub_piece(cache, filename):
+def sub_piece(filename):
     """Calculate and print the sub-pieces for a single file.
+    
+    @type filename: C{string}
+    @param filename: the file to calculate sub pieces for
+    
+    """
+    
+    filename = filename.rstrip()
+    
+    # Get the size of the file
+    size = os.stat(filename).st_size
+    
+    if size <= MAX_PIECE_SIZE:
+        # No sub-pieces are needed for this file
+        sha1 = ""
+        piece_list = []
+    else:
+        # Calculate all the sub-piece hashes
+        piece_size = optimal_piece_size(size)
+        file = open(filename)
+        sha1, piece_list = hash(file, piece_size)
+        file.close()
+        
+    return sha1, piece_list
+
+def get_packages(filename):
+    """Read the list of package file names from a Packages file.
+    
+    @type filename: C{string}
+    @param filename: the Packages file to open and parse
+    @rtype: C{list} of C{string}
+    @return: the package files listed in the Packages file
+    
+    """
+
+    # Open the possibly compressed file
+    if filename.endswith(".gz"):
+        f = gzip.open(filename, 'r')
+    elif filename.endswith(".bz2"):
+        f = BZ2File(filename, "r")
+    else:
+        f = open(filename, 'r')
+
+    debs = []
+    
+    p = [None]
+    for line in f:
+        line = line.rstrip()
+
+        if line == "":
+            if p[0]:
+                debs.append(p[0])
+            p = [None]
+        if line[:9] == "Filename:":
+            p[0] = line[10:]
+    
+    f.close()
+    
+    return debs
+
+def run(cache, releasefile):
+    """Process a single Release file.
     
     @type cache: C{bsddb.BTree}
     @param cache: an already opened bDB b-tree
-    @type filename: C{String}
-    @param filename: the file to calculate sub pieces for
-    
-    """
-    
-    filename = filename.rstrip()
-    
-    # Check if this file's sub-pieces are already known
-    fnkey = filename + ":pc"
-    if cache.has_key(fnkey):
-        # Use the cached result
-        sha1, piece_list = cache2hash(cache[fnkey])
-    else:
-        # Get the size fo the file
-        size = os.stat(filename).st_size
-        
-        if size <= MAX_PIECE_SIZE:
-            # No sub-pieces are needed for this file
-            cache_value = ""
-            piece_list = []
+    @type releasefile: C{string}
+    @param releasefile: the Release file to process
+
+    """
+    
+    # Process the Release file
+    print "Processing: %s" % releasefile
+    root_dir = releasefile[:releasefile.index('/dists/')+1]
+    release_dir = releasefile[len(root_dir):].rsplit('/', 1)[0] + '/'
+    release_headers, packages = read_release(releasefile)
+    
+    file_prefix = "dists_" + release_headers.get("Codename", "") + "_"
+    file_suffix = "_Packages-extrapieces.gz"
+    
+    for packages_file in packages:
+        sub_filename = file_prefix + '_'.join(packages_file.split('/')[:-1]) + file_suffix
+        pkey = release_dir + packages_file + ":pl"
+        
+        # Get the list of packages in the packages file
+        debs = get_packages(root_dir + release_dir + packages_file)
+        
+        # Retrieve the saved list of sub-pieced packages in the Packages file
+        if cache.has_key(pkey):
+            packages_list = cache2list(cache[pkey])
         else:
-            # Calculate all the sub-piece hashes
-            piece_size = optimal_piece_size(size)
-            file = open(filename)
-            sha1, piece_list = hash(file, piece_size)
-            cache_value = hash2cache(sha1, piece_list)
-            file.close()
+            packages_list = []
+        all_debs = {}.fromkeys(packages_list, 1)
+
+        # First, sub-piece any new package files
+        for deb in debs:
+            filename = root_dir + deb
+            fnkey = deb + ":pc"
+
+            # Check if this file's sub-pieces are already known
+            if cache.has_key(fnkey):
+                sha1, piece_list = cache2hash(cache[fnkey])
+            else:
+                print '    Hashing new package:', deb
+                sha1, piece_list = sub_piece(filename)
+
+                # Save the result for next time
+                cache[fnkey] = hash2cache(sha1, piece_list)
+                
+            # If it has sub-pieces, save it to the list
+            if piece_list:
+                all_debs[deb] = 1
+        
+        # Write the list back to the cache
+        packages_list = all_debs.keys()
+        packages_list.sort()
+        cache[pkey] = list2cache(packages_list)
+
+        # Write the sub-piece data to the file
+        sub_file = gzip.open(sub_filename, 'w')
+        for deb in packages_list:
+            fnkey = deb + ":pc"
             
-        # Save the result for next time
-        cache[fnkey] = cache_value
-
-    if piece_list:
-        # Print the resulting sub-piece hashes
-        print "Filename: %s" % (filename)
-        print "SHA1: %s" % (sha1)
-        print "SHA1-Pieces:"
-        for x in piece_list:
-            print " %s %d" % x
-        print ""
+            # Check to make sure the sub-piece data is in the cache
+            if cache.has_key(fnkey):
+                # Get the cached result
+                sha1, piece_list = cache2hash(cache[fnkey])
+                
+                # Print the resulting sub-piece hashes
+                sub_file.write("Filename: %s\n" % (deb))
+                sub_file.write("SHA1: %s\n" % (sha1))
+                sub_file.write("SHA1-Pieces:\n")
+                for x in piece_list:
+                    sub_file.write(" %s %d\n" % x)
+                sub_file.write("\n")
+            else:
+                print "WARNING: no sub-piece data found for " + deb
+        sub_file.close()
 
 if __name__ == '__main__':
     
@@ -168,9 +369,10 @@
     cache_file = sys.argv[1]
     cache = bsddb.btopen(cache_file, "w")
 
-    # Read files to sub-piece from standard in
+    # Read Release file names from standard in
     for filename in sys.stdin:
-        sub_piece(cache, filename)
+        filename = filename.rstrip()
+        run(cache, filename)
 
     # Close the cache file
     cache.sync()




More information about the Debtorrent-commits mailing list