r235 - /debtorrent/branches/unique/hippy.py

Tue Aug 14 04:17:10 UTC 2007

Author: camrdale-guest
Date: Tue Aug 14 04:17:09 2007
New Revision: 235

URL: http://svn.debian.org/wsvn/debtorrent/?sc=1&rev=235
Log:
Rewrite hippy for readability.
Calculate rather than simulate optimal piece size.

Modified:
    debtorrent/branches/unique/hippy.py

Modified: debtorrent/branches/unique/hippy.py
URL: http://svn.debian.org/wsvn/debtorrent/debtorrent/branches/unique/hippy.py?rev=235&op=diff
==============================================================================

--- debtorrent/branches/unique/hippy.py (original)
+++ debtorrent/branches/unique/hippy.py Tue Aug 14 04:17:09 2007
@@ -1,83 +1,175 @@
 #!/usr/bin/env python
+
+"""Calculate the sub-piece hashes for large package files."""
 
 import bsddb, sha, binascii
 import os, sys
-from gzip import GzipFile
-from StringIO import StringIO
+import struct
+from math import ceil
 
-def hash(file, piecesize):
-        h = []
-	fullh = sha.new()
-        while 1:
-                x = file.read(piecesize)
-                if x == "": break
-                h.append((sha.new(x).hexdigest(), len(x)))
-		fullh.update(x)
-        return (fullh.hexdigest(), h)
+MAX_PIECE_SIZE = 512*1024
+CHUNK_SIZE = 16*1024
 
-piecesize = 512*1024
-chunksize = 16*1024
+def hash(file, piece_size):
+    """Read a file and hash its sub-pieces.
 
-def optimalpiecesize(size):
-    def eval(s,c,m):
-        b = m/c
-        return [ i*c for i in range(int(b/2), b+1) if s - i*c*int(s/m) <= i*c ]
+    @type file: C{file}
+    @param file: an already opened file-like object to read from
+    @type piece_size: C{int}
+    @param piece_size: the piece size to divide the file into
+    @rtype: C{string}, C{list} of (C{string}, C{int})
+    @return: the 40-byte hex representation of the SHA1 hash of the file, and
+        the 40-byte hex representation of the SHA1 hash of the piece and the
+        length of the piece, for each sub-piece of the file
+    
+    """
+    
+    hashes = []
+    file_hash = sha.new()
 
-    def score(s,c,m):
-        l = int(s/m)
-        return [ (abs(i - (s - l*i)), i) for i in eval(s,c,m) ]
+    while 1:
+        data = file.read(piece_size)
+        if data == "":
+            break
+        
+        hashes.append((sha.new(data).hexdigest(), len(data)))
+        file_hash.update(data)
 
-    def bestest(s,c,m): 
-        return min( score(s,c,m) )
+    return file_hash.hexdigest(), hashes
 
-    return bestest(size,chunksize,piecesize)[1]
 
-cache_file = sys.argv[1]
-pieces = {}
+def optimal_piece_size(size):
+    """Calculate the optimal piece size to use for a file.
+    
+    The optimal piece size is the largest possible piece size such that the
+    piece size is larger than the extra piece, the piece size is a multiple of
+    the chunk size, and the difference between the piece size and the extra
+    piece size is a minimum.
 
-cache = bsddb.btopen(cache_file, "w")
+    This function currently contains an error, as it returns a non-optimal
+    piece size when the size is a multiple of the maximum piece size. This
+    error is kept for backwards compatibility with previous versions. To
+    correct it::
+        
+        n = 1 + (size-1) / MAX_PIECE_SIZE
+    
+    @type size: C{long}
+    @param size: the file size
+    @rtype: C{int}
+    @return: the optimal piece size
+    
+    """
+    
+    n = 1 + size / MAX_PIECE_SIZE
+    return max(MAX_PIECE_SIZE/2, int(ceil((float(size)/n)/CHUNK_SIZE))*CHUNK_SIZE)
 
-def str2hash(s):
-    r = []
-    if s == "": return None, []
+def cache2hash(cache_value):
+    """Convert a cached value back to a list of sub-piece hashes.
+    
+    The cache is stored as a string. The first 20 bytes are the SHA1 hash of
+    the entire file. Then there are repeating 24 byte sequences, the first 4
+    bytes being the length of the piece in network (big-endian) order, the
+    next 20 bytes being the SHA1 hash of the piece. If there are no sub-pieces
+    for the file, the cached string is empty.
+    
+    @type cache_value: C{string}
+    @param cache_value: the cached value for this file
+    @rtype: C{string}, C{list} of (C{string}, C{int})
+    @return: the 40-byte hex representation of the SHA1 hash of the file, and
+        the 40-byte hex representation of the SHA1 hash of the piece and the
+        length of the piece, for each sub-piece of the file
+    
+    """
 
-    fh,s = binascii.b2a_hex(s[:20]), s[20:]
-    while len(s) > 0:
-        (l,h,s) = s[:4], s[4:24], s[24:]
-	r.append( (binascii.b2a_hex(h), long(binascii.b2a_hex(l), 16)) )
-    return fh,r
+    if cache_value == "":
+        return None, []
 
-def hash2str(fh, hs):
-    s = binascii.a2b_hex(fh)
-    for (h, l) in hs:
-	s += binascii.a2b_hex("%08x" % l) + binascii.a2b_hex(h)
-    return s
+    piece_list = []
+    file_hash = binascii.b2a_hex(cache_value[:20])
+    cache_value = cache_value[20:]
+    
+    while len(cache_value) > 0:
+        length = struct.unpack(">i", cache_value[:4])[0]
+        hash = binascii.b2a_hex(cache_value[4:24])
+        cache_value = cache_value[24:]
+        piece_list.append((hash, length))
+        
+    return file_hash, piece_list
 
-for filename in sys.stdin:
+def hash2cache(sha1, piece_list):
+    """Convert a list of sub-piece hashes to a cacheable value.
+    
+    @type sha1: C{string}
+    @param sha1: the 40-byte hex representation of the SHA1 hash of the file
+    @type piece_list: C{list} of (C{string}, C{int})
+    @param piece_list: for each sub-piece of the file, the 40-byte hex
+        representation of the SHA1 hash and the length of the piece
+    @rtype: C{string}
+    @return: the cacheable string
+    
+    """
+    
+    if not piece_list:
+        return ""
+    
+    cache_value = binascii.a2b_hex(sha1)
+    for (hash, length) in piece_list:
+        cache_value += struct.pack(">i", length) + binascii.a2b_hex(hash)
+    return cache_value
+
+def sub_piece(filename):
+    """Calculate and print the sub-pieces for a single file.
+    
+    @type filename: C{string}
+    @param filename: the file to calculate sub pieces for
+    
+    """
+    
     filename = filename.rstrip()
+    
+    # Check if this file's sub-pieces are already known
     fnkey = filename + ":pc"
     if cache.has_key(fnkey):
-    	sha1, result = str2hash(cache[fnkey])
+        # Use the cached result
+        sha1, piece_list = cache2hash(cache[fnkey])
     else:
-    	size = os.stat(filename).st_size
-    	if size <= piecesize:
-		values = ""
-		result = []
-	else:
-        	ps = optimalpiecesize(size)
-        	file = open(filename)
-        	sha1, result = hash(file, ps)
-		values = hash2str(sha1, result)
-        	file.close()
-	cache[fnkey] = values
+        # Get the size of the file
+        size = os.stat(filename).st_size
+        
+        if size <= MAX_PIECE_SIZE:
+            # No sub-pieces are needed for this file
+            cache_value = ""
+            piece_list = []
+        else:
+            # Calculate all the sub-piece hashes
+            piece_size = optimal_piece_size(size)
+            file = open(filename)
+            sha1, piece_list = hash(file, piece_size)
+            cache_value = hash2cache(sha1, piece_list)
+            file.close()
+            
+        # Save the result for next time
+        cache[fnkey] = cache_value
 
-    if result:
-    	print "Filename: %s" % (filename)
-	print "SHA1: %s" % (sha1)
-    	print "SHA1-Pieces:"
-    	for x in result:
+    if piece_list:
+        # Print the resulting sub-piece hashes
+        print "Filename: %s" % (filename)
+        print "SHA1: %s" % (sha1)
+        print "SHA1-Pieces:"
+        for x in piece_list:
             print " %s %d" % x
-    	print ""
+        print ""
 
-cache.sync()
-cache.close()
+if __name__ == '__main__':
+    
+    # Open the cache file specified on the command line
+    cache_file = sys.argv[1]
+    cache = bsddb.btopen(cache_file, "w")
+
+    # Read files to sub-piece from standard in
+    for filename in sys.stdin:
+        sub_piece(filename)
+
+    # Close the cache file
+    cache.sync()
+    cache.close()