r235 - /debtorrent/branches/unique/hippy.py
camrdale-guest at users.alioth.debian.org
camrdale-guest at users.alioth.debian.org
Tue Aug 14 04:17:10 UTC 2007
Author: camrdale-guest
Date: Tue Aug 14 04:17:09 2007
New Revision: 235
URL: http://svn.debian.org/wsvn/debtorrent/?sc=1&rev=235
Log:
Rewrite hippy for readability.
Calculate rather than simulate optimal piece size.
Modified:
debtorrent/branches/unique/hippy.py
Modified: debtorrent/branches/unique/hippy.py
URL: http://svn.debian.org/wsvn/debtorrent/debtorrent/branches/unique/hippy.py?rev=235&op=diff
==============================================================================
--- debtorrent/branches/unique/hippy.py (original)
+++ debtorrent/branches/unique/hippy.py Tue Aug 14 04:17:09 2007
@@ -1,83 +1,175 @@
#!/usr/bin/env python
+
+"""Calculate the sub-piece hashes for large package files."""
import bsddb, sha, binascii
import os, sys
-from gzip import GzipFile
-from StringIO import StringIO
+import struct
+from math import ceil
-def hash(file, piecesize):
- h = []
- fullh = sha.new()
- while 1:
- x = file.read(piecesize)
- if x == "": break
- h.append((sha.new(x).hexdigest(), len(x)))
- fullh.update(x)
- return (fullh.hexdigest(), h)
+MAX_PIECE_SIZE = 512*1024
+CHUNK_SIZE = 16*1024
-piecesize = 512*1024
-chunksize = 16*1024
+def hash(file, piece_size):
+ """Read a file and hash its sub-pieces.
-def optimalpiecesize(size):
- def eval(s,c,m):
- b = m/c
- return [ i*c for i in range(int(b/2), b+1) if s - i*c*int(s/m) <= i*c ]
+ @type file: C{file}
+ @param file: an already opened file-like object to read from
+ @type piece_size: C{int}
+ @param piece_size: the piece size to divide the file into
+ @rtype: C{string}, C{list} of (C{string}, C{int})
+ @return: the 40-byte hex representation of the SHA1 hash of the file, and
+ the 40-byte hex representation of the SHA1 hash of the piece and the
+ length of the piece, for each sub-piece of the file
+
+ """
+
+ hashes = []
+ file_hash = sha.new()
- def score(s,c,m):
- l = int(s/m)
- return [ (abs(i - (s - l*i)), i) for i in eval(s,c,m) ]
+ while 1:
+ data = file.read(piece_size)
+ if data == "":
+ break
+
+ hashes.append((sha.new(data).hexdigest(), len(data)))
+ file_hash.update(data)
- def bestest(s,c,m):
- return min( score(s,c,m) )
+ return file_hash.hexdigest(), hashes
- return bestest(size,chunksize,piecesize)[1]
-cache_file = sys.argv[1]
-pieces = {}
+def optimal_piece_size(size):
+ """Calculate the optimal piece size to use for a file.
+
+ The optimal piece size is the largest possible piece size such that the
+ piece size is larger than the extra piece, the piece size is a multiple of
+ the chunk size, and the difference between the piece size and the extra
+ piece size is a minimum.
-cache = bsddb.btopen(cache_file, "w")
+ This function currently contains an error, as it returns a non-optimal
+ piece size when the size is a multiple of the maximum piece size. This
+ error is kept for backwards compatibility with previous versions. To
+ correct it::
+
+ n = 1 + (size-1) / MAX_PIECE_SIZE
+
+ @type size: C{long}
+ @param size: the file size
+ @rtype: C{int}
+ @return: the optimal piece size
+
+ """
+
+ n = 1 + size / MAX_PIECE_SIZE
+ return max(MAX_PIECE_SIZE/2, int(ceil((float(size)/n)/CHUNK_SIZE))*CHUNK_SIZE)
-def str2hash(s):
- r = []
- if s == "": return None, []
+def cache2hash(cache_value):
+ """Convert a cached value to a list of sub-piece hashes.
+
+ The cache is stored as a string. The first 20 bytes are the SHA1 hash of
+ the entire file. Then there are repeating 24 byte sequences, the first 4
+ bytes being the length of the piece in network (big-endian) order, the
+ next 20 bytes being the SHA1 hash of the piece. If there are no sub-pieces
+ for the file, the cached string is empty.
+
+ @type cache_value: C{string}
+ @param cache_value: the cached value for this file
+ @rtype: C{string}, C{list} of (C{string}, C{int})
+ @return: the 40-byte hex representation of the SHA1 hash of the file, and
+ the 40-byte hex representation of the SHA1 hash of the piece and the
+ length of the piece, for each sub-piece of the file
+
+ """
- fh,s = binascii.b2a_hex(s[:20]), s[20:]
- while len(s) > 0:
- (l,h,s) = s[:4], s[4:24], s[24:]
- r.append( (binascii.b2a_hex(h), long(binascii.b2a_hex(l), 16)) )
- return fh,r
+ if cache_value == "":
+ return None, []
-def hash2str(fh, hs):
- s = binascii.a2b_hex(fh)
- for (h, l) in hs:
- s += binascii.a2b_hex("%08x" % l) + binascii.a2b_hex(h)
- return s
+ piece_list = []
+ file_hash = binascii.b2a_hex(cache_value[:20])
+ cache_value = cache_value[20:]
+
+ while len(cache_value) > 0:
+ length = struct.unpack(">i", cache_value[:4])[0]
+ hash = binascii.b2a_hex(cache_value[4:24])
+ cache_value = cache_value[24:]
+ piece_list.append((hash, length))
+
+ return file_hash, piece_list
-for filename in sys.stdin:
+def hash2cache(sha1, piece_list):
+ """Convert a list of sub-piece hashes to a cacheable value.
+
+ @type sha1: C{string}
+ @param sha1: the 40-byte hex representation of the SHA1 hash of the file
+ @type piece_list: C{list} of (C{string}, C{int})
+ @param piece_list: for each sub-piece of the file, the 40-byte hex
+ representation of the SHA1 hash and the length of the piece
+ @rtype: C{string}
+ @return: the cacheable string
+
+ """
+
+ if not piece_list:
+ return ""
+
+ cache_value = binascii.a2b_hex(sha1)
+ for (hash, length) in piece_list:
+ cache_value += struct.pack(">i", length) + binascii.a2b_hex(hash)
+ return cache_value
+
+def sub_piece(filename):
+ """Calculate and print the sub-pieces for a single file.
+
+ @type filename: C{string}
+ @param filename: the file to calculate sub pieces for
+
+ """
+
filename = filename.rstrip()
+
+ # Check if this file's sub-pieces are already known
fnkey = filename + ":pc"
if cache.has_key(fnkey):
- sha1, result = str2hash(cache[fnkey])
+ # Use the cached result
+ sha1, piece_list = cache2hash(cache[fnkey])
else:
- size = os.stat(filename).st_size
- if size <= piecesize:
- values = ""
- result = []
- else:
- ps = optimalpiecesize(size)
- file = open(filename)
- sha1, result = hash(file, ps)
- values = hash2str(sha1, result)
- file.close()
- cache[fnkey] = values
+ # Get the size of the file
+ size = os.stat(filename).st_size
+
+ if size <= MAX_PIECE_SIZE:
+ # No sub-pieces are needed for this file
+ cache_value = ""
+ piece_list = []
+ else:
+ # Calculate all the sub-piece hashes
+ piece_size = optimal_piece_size(size)
+ file = open(filename)
+ sha1, piece_list = hash(file, piece_size)
+ cache_value = hash2cache(sha1, piece_list)
+ file.close()
+
+ # Save the result for next time
+ cache[fnkey] = cache_value
- if result:
- print "Filename: %s" % (filename)
- print "SHA1: %s" % (sha1)
- print "SHA1-Pieces:"
- for x in result:
+ if piece_list:
+ # Print the resulting sub-piece hashes
+ print "Filename: %s" % (filename)
+ print "SHA1: %s" % (sha1)
+ print "SHA1-Pieces:"
+ for x in piece_list:
print " %s %d" % x
- print ""
+ print ""
-cache.sync()
-cache.close()
+if __name__ == '__main__':
+
+ # Open the cache file specified on the command line
+ cache_file = sys.argv[1]
+ cache = bsddb.btopen(cache_file, "w")
+
+ # Read files to sub-piece from standard in
+ for filename in sys.stdin:
+ sub_piece(filename)
+
+ # Close the cache file
+ cache.sync()
+ cache.close()
More information about the Debtorrent-commits
mailing list