r273 - /debtorrent/branches/unique/hippy.py
camrdale-guest at users.alioth.debian.org
camrdale-guest at users.alioth.debian.org
Sun Aug 19 23:23:10 UTC 2007
Author: camrdale-guest
Date: Sun Aug 19 23:23:10 2007
New Revision: 273
URL: http://svn.debian.org/wsvn/debtorrent/?sc=1&rev=273
Log:
Update hippy to save old sub-piece data for use with out-of-date mirrors.
Modified:
debtorrent/branches/unique/hippy.py
Modified: debtorrent/branches/unique/hippy.py
URL: http://svn.debian.org/wsvn/debtorrent/debtorrent/branches/unique/hippy.py?rev=273&op=diff
==============================================================================
--- debtorrent/branches/unique/hippy.py (original)
+++ debtorrent/branches/unique/hippy.py Sun Aug 19 23:23:10 2007
@@ -1,14 +1,76 @@
#!/usr/bin/env python
-"""Calculate the sub-piece hashes for large package files."""
+"""Calculate the sub-piece hashes for large package files.
+
+Run this script in the directory where the extrapieces files are to be stored.
+Its only command-line argument is the Berkeley database containing the cached
+data from previous runs. Pass the paths of Release files to process into the
+standard input.
+
+For example::
+
+ find /var/www/debian -maxdepth 3 -name "Release" | hippy ../hippycache.bdb
+
+"""
import bsddb, sha, binascii
-import os, sys
+import os, sys, gzip
import struct
+from bz2 import BZ2File
from math import ceil
MAX_PIECE_SIZE = 512*1024
CHUNK_SIZE = 16*1024
+
+# The Packages files to read
+EXTENSION = ".gz"
+
+def read_release(filename):
+ """Read the headers and Packages file names from a Release file.
+
+    @type filename: C{string}
+ @param filename: the Release file to read
+ @rtype: C{dictionary}, C{list} of C{string}
+ @return: the headers and full file names of Packages files
+
+ """
+
+ # Initialize the Release file variables
+ read_packages = False
+ headers = {}
+ packages = []
+
+ f = open(filename, 'r')
+
+ for line in f:
+ line = line.rstrip()
+
+ if line[:1] != " ":
+ read_packages = False
+ try:
+ # Read the various headers from the file
+ h, v = line.split(":", 1)
+ if h == "MD5Sum" or h == "SHA1" or h == "SHA256":
+ read_packages = True
+ elif len(v) > 0:
+ headers[h] = v[1:]
+ except:
+ # Bad header line, just ignore it
+ print "WARNING: Ignoring badly formatted Release line:", line
+
+ # Skip to the next line
+ continue
+
+ # Read file names from the multiple hash sections of the file
+ if read_packages:
+ p = line.split()
+ if len(p) == 3 and p[2].endswith("Packages"+EXTENSION):
+ if p[2] not in packages:
+ packages.append(p[2])
+
+ f.close()
+
+ return headers, packages
def hash(file, piece_size):
"""Read a file and hash it's sub-pieces.
@@ -63,8 +125,53 @@
n = 1 + size / MAX_PIECE_SIZE
return max(MAX_PIECE_SIZE/2, int(ceil((float(size)/n)/CHUNK_SIZE))*CHUNK_SIZE)
+def cache2list(cache_value):
+ """Convert a cache value to a list of package names.
+
+ The cache is stored as a string. The list is a repeating sequence of one
+ byte length followed by a string of that length. Therefore, the longest
+    string that can be stored is 255.
+
+ @type cache_value: C{string}
+ @param cache_value: the cached value for this file
+ @rtype: C{list} of C{string}
+ @return: the list of package names stored in the cache
+
+ """
+
+ if cache_value == "":
+ return []
+
+ deb_list = []
+ while len(cache_value) > 0:
+ length = ord(cache_value[0])
+ deb = cache_value[1:length+1]
+ cache_value = cache_value[length+1:]
+ deb_list.append(deb)
+
+ return deb_list
+
+def list2cache(deb_list):
+ """Convert a list of package names to a cacheable value.
+
+ @type deb_list: C{list} of C{string}
+ @param deb_list: the package names to create a cache value for
+ @rtype: C{string}
+ @return: the cacheable string
+
+ """
+
+ if not deb_list:
+ return ""
+
+ cache_value = ""
+ for deb in deb_list:
+ assert len(deb) < 256
+ cache_value += chr(len(deb)) + deb
+ return cache_value
+
def cache2hash(cache_value):
- """Convert a list of sub-piece hashes to a cacheable value.
+ """Convert a cache value to a list of sub-piece hashes.
The cache is stored as a string. The first 20 bytes are the SHA1 hash of
the entire file. Then there are repeating 24 byte sequences, the first 4
@@ -117,50 +224,144 @@
cache_value += struct.pack(">i", length) + binascii.a2b_hex(hash)
return cache_value
-def sub_piece(cache, filename):
+def sub_piece(filename):
"""Calculate and print the sub-pieces for a single file.
+
+    @type filename: C{string}
+ @param filename: the file to calculate sub pieces for
+
+ """
+
+ filename = filename.rstrip()
+
+ # Get the size of the file
+ size = os.stat(filename).st_size
+
+ if size <= MAX_PIECE_SIZE:
+ # No sub-pieces are needed for this file
+ sha1 = ""
+ piece_list = []
+ else:
+ # Calculate all the sub-piece hashes
+ piece_size = optimal_piece_size(size)
+ file = open(filename)
+ sha1, piece_list = hash(file, piece_size)
+ file.close()
+
+ return sha1, piece_list
+
+def get_packages(filename):
+ """Read the new piece data from a Packages file.
+
+    @type filename: C{string}
+ @param filename: the Packages file to open and parse
+ @rtype: C{list} of C{string}
+ @return: the package files listed in the Packages file
+
+ """
+
+ # Open the possibly compressed file
+ if filename.endswith(".gz"):
+ f = gzip.open(filename, 'r')
+ elif filename.endswith(".bz2"):
+ f = BZ2File(filename, "r")
+ else:
+ f = open(filename, 'r')
+
+ debs = []
+
+ p = [None]
+ for line in f:
+ line = line.rstrip()
+
+ if line == "":
+ if p[0]:
+ debs.append(p[0])
+ p = [None]
+ if line[:9] == "Filename:":
+ p[0] = line[10:]
+
+ f.close()
+
+ return debs
+
+def run(cache, releasefile):
+ """Process a single Release file.
@type cache: C{bsddb.BTree}
@param cache: an already opened bDB b-tree
- @type filename: C{String}
- @param filename: the file to calculate sub pieces for
-
- """
-
- filename = filename.rstrip()
-
- # Check if this file's sub-pieces are already known
- fnkey = filename + ":pc"
- if cache.has_key(fnkey):
- # Use the cached result
- sha1, piece_list = cache2hash(cache[fnkey])
- else:
- # Get the size fo the file
- size = os.stat(filename).st_size
-
- if size <= MAX_PIECE_SIZE:
- # No sub-pieces are needed for this file
- cache_value = ""
- piece_list = []
+    @type releasefile: C{string}
+ @param releasefile: the Release file to process
+
+ """
+
+ # Process the Release file
+ print "Processing: %s" % releasefile
+ root_dir = releasefile[:releasefile.index('/dists/')+1]
+ release_dir = releasefile[len(root_dir):].rsplit('/', 1)[0] + '/'
+ release_headers, packages = read_release(releasefile)
+
+ file_prefix = "dists_" + release_headers.get("Codename", "") + "_"
+ file_suffix = "_Packages-extrapieces.gz"
+
+ for packages_file in packages:
+ sub_filename = file_prefix + '_'.join(packages_file.split('/')[:-1]) + file_suffix
+ pkey = release_dir + packages_file + ":pl"
+
+ # Get the list of packages in the packages file
+ debs = get_packages(root_dir + release_dir + packages_file)
+
+ # Retrieve the saved list of sub-pieced packages in the Packages file
+ if cache.has_key(pkey):
+ packages_list = cache2list(cache[pkey])
else:
- # Calculate all the sub-piece hashes
- piece_size = optimal_piece_size(size)
- file = open(filename)
- sha1, piece_list = hash(file, piece_size)
- cache_value = hash2cache(sha1, piece_list)
- file.close()
+ packages_list = []
+ all_debs = {}.fromkeys(packages_list, 1)
+
+ # First, sub-piece any new package files
+ for deb in debs:
+ filename = root_dir + deb
+ fnkey = deb + ":pc"
+
+ # Check if this file's sub-pieces are already known
+ if cache.has_key(fnkey):
+ sha1, piece_list = cache2hash(cache[fnkey])
+ else:
+ print ' Hashing new package:', deb
+ sha1, piece_list = sub_piece(filename)
+
+ # Save the result for next time
+ cache[fnkey] = hash2cache(sha1, piece_list)
+
+ # If it has sub-pieces, save it to the list
+ if piece_list:
+ all_debs[deb] = 1
+
+ # Write the list back to the cache
+ packages_list = all_debs.keys()
+ packages_list.sort()
+ cache[pkey] = list2cache(packages_list)
+
+ # Write the sub-piece data to the file
+ sub_file = gzip.open(sub_filename, 'w')
+ for deb in packages_list:
+ fnkey = deb + ":pc"
- # Save the result for next time
- cache[fnkey] = cache_value
-
- if piece_list:
- # Print the resulting sub-piece hashes
- print "Filename: %s" % (filename)
- print "SHA1: %s" % (sha1)
- print "SHA1-Pieces:"
- for x in piece_list:
- print " %s %d" % x
- print ""
+ # Check to make sure
+ if cache.has_key(fnkey):
+ # Get the cached result
+ sha1, piece_list = cache2hash(cache[fnkey])
+
+ # Print the resulting sub-piece hashes
+ sub_file.write("Filename: %s\n" % (deb))
+ sub_file.write("SHA1: %s\n" % (sha1))
+ sub_file.write("SHA1-Pieces:\n")
+ for x in piece_list:
+ sub_file.write(" %s %d\n" % x)
+ sub_file.write("\n")
+ else:
+ print "WARNING: no sub-piece data found for " + deb
+ sub_file.close()
if __name__ == '__main__':
@@ -168,9 +369,10 @@
cache_file = sys.argv[1]
cache = bsddb.btopen(cache_file, "w")
- # Read files to sub-piece from standard in
+ # Read Release file names from standard in
for filename in sys.stdin:
- sub_piece(cache, filename)
+ filename = filename.rstrip()
+ run(cache, filename)
# Close the cache file
cache.sync()
More information about the Debtorrent-commits
mailing list