r219 - /debtorrent/branches/unique/uniquely.py

camrdale-guest at users.alioth.debian.org
Sat Aug 11 20:48:51 UTC 2007


Author: camrdale-guest
Date: Sat Aug 11 20:48:51 2007
New Revision: 219

URL: http://svn.debian.org/wsvn/debtorrent/?sc=1&rev=219
Log:
Update uniquely to order the pieces by full path name, and rewrite for readability.

Modified:
    debtorrent/branches/unique/uniquely.py

Modified: debtorrent/branches/unique/uniquely.py
URL: http://svn.debian.org/wsvn/debtorrent/debtorrent/branches/unique/uniquely.py?rev=219&op=diff
==============================================================================
--- debtorrent/branches/unique/uniquely.py (original)
+++ debtorrent/branches/unique/uniquely.py Sat Aug 11 20:48:51 2007
@@ -2,35 +2,93 @@
 
 """Process a Release file, creating, finding and updating any torrent files."""
 
-import bsddb, sha, binascii
+import sha
 import sys
 import gzip
 from bz2 import BZ2File
 from math import ceil
 from os import remove
 from os.path import exists
-
-# Some default values
-default_piecesize = 512*1024
-extension = ".gz"
-# can not contain Date, Infohash, NextPiece or OriginalPieces
-default_hash_fields = ["Codename", "Suite", "Component", "Architecture",
+from time import strftime, gmtime
+
+# The piece size to use (must match the '-extrapieces' file's piece size)
+DEFAULT_PIECESIZE = 512*1024
+
+# The Packages files to read
+EXTENSION = ".gz"
+
+# The fields to hash to determine the torrent identifier
+# (can not contain Date, Infohash, NextPiece or OriginalPieces)
+DEFAULT_HASH_FIELDS = ["Codename", "Suite", "Component", "Architecture",
                        "PieceSize", "OriginalDate"]
-default_tracker = "http://dttracker.debian.net:6969/announce"
-header_order = ["Torrent", "Infohash", "InfohashArchs", "OriginalDate", "Date",
+
+# The tracker announce URL to use
+DEFAULT_TRACKER = "http://dttracker.debian.net:6969/announce"
+
+# The order to write the headers in (headers not listed won't be written)
+HEADER_ORDER = ["Torrent", "Infohash", "InfohashArchs", "OriginalDate", "Date",
                 "PieceSize", "NextPiece", "OriginalPieces", "Codename", "Suite",
                 "Component", "Architecture", "Tracker", "TorrentHashFields"]
 
+def read_release(filename):
+    """Read the headers and Packages file names from a Release file.
+    
+    @type filename: C{string}
+    @param filename: the Release file to read
+    @rtype: C{dictionary}, C{list} of C{string}
+    @return: the headers and full file names of Packages files
+    
+    """
+
+    # Initialize the Release file variables
+    release_dir = filename.rsplit('/', 1)[0]
+    read_packages = False
+    headers = {}
+    packages = []
+    
+    f = open(filename, 'r')
+    
+    for line in f:
+        line = line.rstrip()
+
+        if line[:1] != " ":
+            read_packages = False
+            try:
+                # Read the various headers from the file
+                h, v = line.split(":", 1)
+                if h == "MD5Sum" or h == "SHA1" or h == "SHA256":
+                    read_packages = True
+                elif len(v) > 0:
+                    headers[h] = v[1:]
+            except:
+                # Bad header line, just ignore it
+                print "WARNING: Ignoring badly formatted Release line:", line
+
+            # Skip to the next line
+            continue
+        
+        # Read file names from the multiple hash sections of the file
+        if read_packages:
+            p = line.split()
+            if len(p) == 3 and p[2].endswith("Packages"+EXTENSION):
+                if release_dir + "/" + p[2] not in packages:
+                    packages.append(release_dir + "/" + p[2])
+    
+    f.close()
+    
+    return headers, packages
+
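+# A minimal illustration (values assumed, not taken from a real archive):
+# given "dists/sid/Release" containing
+#   Codename: sid
+#   MD5Sum:
+#    0123456789abcdef... 1234 main/binary-i386/Packages.gz
+# read_release() returns ({"Codename": "sid", ...},
+# ["dists/sid/main/binary-i386/Packages.gz"]).
+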
 def get_old(old_file):
     """Read the headers and piece ordering data from an old file.
     
     @type old_file: C{string}
     @param old_file: the old piece ordering file to open
-    @rtype: (C{dictionary}, C{dictionary})
+    @rtype: C{dictionary}, C{dictionary}
     @return: the old piece ordering (keys are the file names, values are the
         starting piece number) and headers
     
     """
+
     pieces = {}
     headers = {}
     
@@ -62,12 +120,64 @@
         # Delete the file and return empty variables to create a new torrent
         if exists(old_file):
             remove(old_file)
-        pass
     
     return pieces, headers
 
-def get_new(filename, old_files, headers, old_all_files, all_pieces,
-            all_new_pieces):
+def update_headers(headers, release_headers, component, arch):
+    """Update the headers with new fields from the Release file.
+    
+    @type headers: C{dictionary}
+    @param headers: the headers from the piece ordering file
+    @type release_headers: C{dictionary}
+    @param release_headers: the headers from the Release file
+    @type component: C{string}
+    @param component: the component name (e.g. main, contrib, non-free)
+    @type arch: C{string}
+    @param arch: the architecture name (e.g. i386, amd64, all)
+    @rtype: C{boolean}
+    @return: whether a new torrent has been created
+    
+    """
+
+    # Set any required Release headers
+    if len(release_headers.get("Date", "")) == 0:
+        # Use today's date
+        release_headers["Date"] = strftime('%a, %d %b %Y %H:%M:%S +0000', gmtime())
+    
+    # Create/update the headers
+    headers.setdefault("OriginalDate", release_headers["Date"])
+    headers["Date"] = release_headers["Date"]
+    headers.setdefault("PieceSize", str(DEFAULT_PIECESIZE))
+    headers.setdefault("NextPiece", str(0))
+    headers["Codename"] = release_headers.get("Codename", "")
+    headers["Suite"] = release_headers.get("Suite", "")
+    headers["Component"] = component
+    headers["Architecture"] = arch
+    headers.setdefault("Tracker", DEFAULT_TRACKER)
+    headers.setdefault("TorrentHashFields", " ".join(DEFAULT_HASH_FIELDS))
+    
+    # Calculate the new hash
+    sha1 = sha.new()
+    for header in headers["TorrentHashFields"].split():
+        sha1.update(headers[header])
+    new_hash = sha1.hexdigest()
+    
+    # Check if the hash has changed
+    if headers.get("Torrent", "") == new_hash:
+        return False
+    else:
+        # If it has, then reset the torrent to create a new one
+        headers["OriginalDate"] = release_headers["Date"]
+        headers["NextPiece"] = str(0)
+        headers.pop("OriginalPieces", "")
+        sha1 = sha.new()
+        for header in headers["TorrentHashFields"].split():
+            sha1.update(headers[header])
+        headers["Torrent"] = sha1.hexdigest()
+
+        return True
+
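+# Illustrative sketch (field values assumed): the torrent identifier is the
+# SHA-1 of the TorrentHashFields values concatenated in order, e.g.
+#   sha.new("sid" + "unstable" + "main" + "i386" + "524288"
+#           + "Sat, 11 Aug 2007 20:48:51 +0000").hexdigest()
+# so changing any one of those fields yields a new identifier, and therefore
+# a new torrent.
+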
+def get_new(filename, old_files, old_all_files, all_pieces, all_new_pieces):
     """Read the new piece data from a Packages file.
     
     Reads the Packages file, finding old files in it and copying their data to
@@ -84,8 +194,6 @@
     @type old_files: C{dictionary}
     @param old_files: the original piece ordering, keys are the file names,
         values are the starting piece number
-    @type headers: C{dictionary}
-    @param headers: the original headers
     @type old_all_files: C{dictionary}
     @param old_all_files: the original piece ordering for architecture:all
         files, keys are the file names, values are the starting piece number
@@ -101,10 +209,6 @@
     
     """
 
-    # Get the needed header information
-    next_piece = int(headers["NextPiece"])
-    piece_size = int(headers["PieceSize"])
-    
     # Open the possibly compressed file
     if filename.endswith(".gz"):
         f = gzip.open(filename, 'r')
@@ -114,6 +218,7 @@
         f = open(filename, 'r')
 
     pieces = {}
+    new_pieces = []
     
     p = [None, None, None]
     for line in f:
@@ -138,9 +243,8 @@
                         pieces[old_files[p[0]]] = p[0]
                         del old_files[p[0]]
                     else:
-                        # Add new file to the end of the torrent
-                        pieces[next_piece] = p[0]
-                        next_piece += int(ceil(p[1]/float(piece_size)))
+                        # Found new file, save it for later processing
+                        new_pieces.append((p[0], p[1]))
 
             p = [None, None, None]
         if line[:9] == "Filename:":
@@ -152,258 +256,186 @@
     
     f.close()
     
+    return pieces, new_pieces
+
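+# Sketch of the data flow (file names illustrative): a Packages stanza whose
+# Filename is already in old_files keeps its old starting piece number in
+# 'pieces'; an unseen file is collected in 'new_pieces' as a (name, size)
+# tuple for add_new() to place later. architecture:all files follow the same
+# pattern through old_all_files, all_pieces and all_new_pieces.
+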
+def add_new(pieces, new_pieces, headers):
+    """Read the new piece data from a Packages file.
+    
+    Adds new files to the end of the piece ordering. The 'pieces' input is 
+    modified by having the new pieces added to it. The 'new_pieces' input
+    list is sorted. The 'NextPiece' header in the input 'headers' is updated.
+    
+    @type pieces: C{dictionary}
+    @param pieces: the current piece ordering, keys are the starting piece
+        numbers, values are the file names
+    @type new_pieces: C{list} of (C{string}, C{long})
+    @param new_pieces: the file name and file size of the new files that have
+        been found and are to be added to the piece ordering
+    @type headers: C{dictionary}
+    @param headers: the headers from the piece ordering file
+    
+    """
+
+    # Get the needed header information
+    next_piece = int(headers["NextPiece"])
+    piece_size = int(headers["PieceSize"])
+    
+    new_pieces.sort()
+    old_file = ""
+    old_size = 0L
+    for (file, size) in new_pieces:
+        if file == old_file:
+            if size != old_size:
+                print "WARNING: multiple files with different size:", file
+        else:
+            pieces[next_piece] = file
+            next_piece += int(ceil(size/float(piece_size)))
+            
+        old_file = file
+        old_size = size
+
+    # Set the final header values
     headers["NextPiece"] = str(next_piece)
-
-    return pieces
-
-#cache_file = sys.argv[1]
-#cache = bsddb.btopen(cache_file, "w")
-
-# The only input is the Release file to process
-releasefile = sys.argv[1]
-print "Processing: %s" % releasefile
-
-# Initialize the Release file variables
-release_dir = releasefile.rsplit('/', 1)[0]
-origin = ""
-label = ""
-suite = ""
-codename = ""
-date = ""
-components = []
-archs = []
-read_files = False
-packages = []
-packages_sha1 = {}
-packages_size = {}
-
-f = open(releasefile, 'r')
-
-for line in f:
-    line = line.rstrip()
-    
-    # Read the various headers from the file
-    if line[:7] == "Origin:":
-        origin = line[8:]
-    if line[:6] == "Label:":
-        label = line[7:]
-    if line[:6] == "Suite:":
-        suite = line[7:]
-    if line[:9] == "Codename:":
-        codename = line[10:]
-    if line[:5] == "Date:":
-        date = line[6:]
-    if line[:11] == "Components:":
-        components = line[12:].split()
-    if line[:14] == "Architectures:":
-        archs = line[15:].split()
-
-    # Read multiple lines from the SHA1 section of the file
-    if line[:1] != " ":
-        read_files = False
-    if read_files:
-        p = line.split()
-        if len(p) == 3 and p[2].endswith("Packages"+extension):
-            packages.append(release_dir + "/" + p[2])
-            packages_sha1[p[2]] = binascii.a2b_hex(p[0])
-            packages_size[p[2]] = long(p[1])
-    if line[:7] == "MD5Sum:":
-        read_files = True
-
-f.close()
-
-torrent_prefix = "dists_" + codename + "_"
-torrent_suffix = "_Packages-torrent.gz"
-
-for component in components:
-    # Get the old 'all' data
-    all_file = torrent_prefix + component + "_binary-all" + torrent_suffix
-    old_all_files, all_headers = get_old(all_file)
-    all_pieces = {}
-    all_new_pieces = []
-    new_all_torrent = False
-
-    # Create the all headers
-    all_headers.setdefault("OriginalDate", date)
-    all_headers["Date"] = date
-    all_headers.setdefault("PieceSize", str(default_piecesize))
-    all_headers.setdefault("NextPiece", str(0))
-    all_headers["Codename"] = codename
-    all_headers["Suite"] = suite
-    all_headers["Component"] = component
-    all_headers["Architecture"] = "all"
-    all_headers.setdefault("Tracker", default_tracker)
-    all_headers.setdefault("TorrentHashFields", " ".join(default_hash_fields))
-    
-    # Calculate the new hash
-    sha1 = sha.new()
-    for header in all_headers["TorrentHashFields"].split():
-        sha1.update(all_headers[header])
-    new_hash = sha1.hexdigest()
-
-    # Check if the hash has changed
-    if all_headers.get("Torrent", "") != new_hash:
-        # If it has, then reset the torrent
-        new_all_torrent = True
-        old_all_files = {}
-        all_headers["OriginalDate"] = date
-        all_headers["NextPiece"] = str(0)
-        all_headers.pop("OriginalPieces", "")
-        sha1 = sha.new()
-        for header in all_headers["TorrentHashFields"].split():
-            sha1.update(all_headers[header])
-        all_headers["Torrent"] = sha1.hexdigest()
-
-    for arch in archs:
-        torrent_file = torrent_prefix + component + "_binary-" + arch + torrent_suffix
-
-        # Find the Packages file that will be parsed
-        found = False
-        for filename in packages:
-            if (filename.find(component) >= 0 and 
-                filename.find("binary-"+arch) >= 0):
-                found = True
-                break
-        if not found:
-            print "WARNING: no matching Packages file for component %s, arch %s" % (component, arch)
-            if exists(torrent_file):
-                remove(torrent_file)
-            continue
-        packages.pop(packages.index(filename))
-
-        # Get the old data for this torrent, if any existed
-        print torrent_file + ": reading ...",
+    headers.setdefault("OriginalPieces", headers["NextPiece"])
+
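+# Worked example (numbers assumed): with PieceSize 524288 and NextPiece 10,
+# a new 1048576-byte file is recorded at piece 10 and NextPiece advances by
+# ceil(1048576/524288.0) = 2, to 12.
+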
+def write_file(filename, pieces, headers):
+    """Print the new data to the file.
+    
+    @type filename: C{string}
+    @param filename: the file to write to
+    @type pieces: C{dictionary}
+    @param pieces: the current piece ordering, keys are the starting piece
+        numbers, values are the file names
+    @type headers: C{dictionary}
+    @param headers: the headers from the piece ordering file
+    
+    """
+
+    f = gzip.open(filename, 'w')
+    
+    # Write the headers
+    for header in HEADER_ORDER:
+        if header in headers:
+            f.write("%s: %s\n" % (header, headers[header]))
+    f.write("PieceNumbers:\n")
+    
+    # Write the starting piece numbers
+    ps = pieces.keys()
+    ps.sort()
+    format_string = " %"+str(len(str(max(ps))))+"d %s\n"
+    for p in ps:
+        f.write(format_string % (p, pieces[p]))
+    
+    f.close()
+
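+# The written file looks roughly like this (values illustrative):
+#   Torrent: <sha1 hexdigest>
+#   PieceSize: 524288
+#   NextPiece: 14
+#   PieceNumbers:
+#      0 pool/main/a/abc/abc_1.0_i386.deb
+#     12 pool/main/b/bar/bar_2.1_i386.deb
+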
+def run(releasefile):
+    """Process a single Release file.
+    
+    @type releasefile: C{string}
+    @param releasefile: the Release file to process
+
+    """
+    
+    # Process the Release file
+    print "Processing: %s" % releasefile
+    release_headers, packages = read_release(releasefile)
+    
+    torrent_prefix = "dists_" + release_headers.get("Codename", "") + "_"
+    torrent_suffix = "_Packages-torrent.gz"
+    
+    for component in release_headers.get("Components", "").split():
+        # Get the old 'all' data
+        all_file = torrent_prefix + component + "_binary-all" + torrent_suffix
+        old_all_pieces, all_headers = get_old(all_file)
+        all_pieces = {}
+        all_new_pieces = []
+        new_all_torrent = False
+    
+        # First update the 'all' headers
+        if update_headers(all_headers, release_headers, component, "all"):
+            # If it has, then reset the torrent
+            new_all_torrent = True
+            old_all_pieces = {}
+    
+        for arch in release_headers.get("Architectures", "").split():
+            torrent_file = torrent_prefix + component + "_binary-" + arch + torrent_suffix
+    
+            # Find the Packages file that will be parsed
+            found = False
+            for filename in packages:
+                if (filename.find(component) >= 0 and 
+                    filename.find("binary-"+arch) >= 0):
+                    found = True
+                    break
+            if not found:
+                print "WARNING: no matching Packages file for component %s, arch %s" % (component, arch)
+                if exists(torrent_file):
+                    remove(torrent_file)
+                continue
+            packages.pop(packages.index(filename))
+    
+            # Get the old data for this torrent, if any existed
+            print torrent_file + ": reading ...",
+            sys.stdout.flush()
+            old_pieces, headers = get_old(torrent_file)
+    
+            # Update the headers from the Release file ones
+            if update_headers(headers, release_headers, component, arch):
+                print "new torrent created ...",
+                sys.stdout.flush()
+                old_pieces = {}
+    
+            # Parse the Packages file for the new data
+            print "updating ...",
+            sys.stdout.flush()
+            pieces, new_pieces = get_new(filename, old_pieces, old_all_pieces, 
+                                         all_pieces, all_new_pieces)
+    
+            if pieces or new_pieces:
+                # Add any new pieces to the end of pieces
+                add_new(pieces, new_pieces, headers)
+                
+                # Write the headers
+                print "writing ...",
+                sys.stdout.flush()
+                write_file(torrent_file, pieces, headers)
+            else:
+                print "empty ...",
+                if exists(torrent_file):
+                    remove(torrent_file)
+                
+            print "done."
+    
+        print all_file + ": reading ...",
+        if new_all_torrent:
+            print "new torrent created ...",
         sys.stdout.flush()
-        old_files, headers = get_old(torrent_file)
-
-        # Create the headers
-        headers.setdefault("OriginalDate", date)
-        headers["Date"] = date
-        headers.setdefault("PieceSize", str(default_piecesize))
-        headers.setdefault("NextPiece", str(0))
-        headers["Codename"] = codename
-        headers["Suite"] = suite
-        headers["Component"] = component
-        headers["Architecture"] = arch
-        headers.setdefault("Tracker", default_tracker)
-        headers.setdefault("TorrentHashFields", " ".join(default_hash_fields))
+        # If there were 'all' files found
+        if all_pieces or all_new_pieces:
+            # Process the new 'all' files found
+            print "updating ...",
+            sys.stdout.flush()
+            add_new(all_pieces, all_new_pieces, all_headers)
         
-        # Calculate the new hash
-        sha1 = sha.new()
-        for header in headers["TorrentHashFields"].split():
-            sha1.update(headers[header])
-        new_hash = sha1.hexdigest()
-        
-        # Check if the hash has changed
-        if headers.get("Torrent", "") != new_hash:
-            # If it has, then reset the torrent
-            print "new torrent created ...",
-            sys.stdout.flush()
-            old_files = {}
-            headers["OriginalDate"] = date
-            headers["NextPiece"] = str(0)
-            headers.pop("OriginalPieces", "")
-            sha1 = sha.new()
-            for header in headers["TorrentHashFields"].split():
-                sha1.update(headers[header])
-            headers["Torrent"] = sha1.hexdigest()
-
-        # Parse the Packages file for the new data
-        print "updating ...",
-        sys.stdout.flush()
-        new_pieces = get_new(filename, old_files, headers, old_all_files, 
-                             all_pieces, all_new_pieces)
-
-        # Set the final header values
-        headers.setdefault("OriginalPieces", headers["NextPiece"])
-
-        if new_pieces:
-            # Write the headers
+            # Write the all_headers
             print "writing ...",
             sys.stdout.flush()
-            f = gzip.open(torrent_file, 'w')
-            for header in header_order:
-                if header in headers:
-                    f.write("%s: %s\n" % (header, headers[header]))
-            f.write("PieceNumbers:\n")
-            
-            # Write the starting piece numbers
-            pieces = new_pieces.keys()
-            pieces.sort()
-            format_string = " %"+str(len(str(max(pieces))))+"d %s\n"
-            for piece in pieces:
-                f.write(format_string % (piece, new_pieces[piece]))
-            
-            f.close()
+            write_file(all_file, all_pieces, all_headers)
         else:
             print "empty ...",
-            if exists(torrent_file):
-                remove(torrent_file)
-            
+            if exists(all_file):
+                remove(all_file)
+    
         print "done."
-
-    print all_file + ": reading ...",
-    if new_all_torrent:
-        print "new torrent created ...",
-    sys.stdout.flush()
-    # If there were 'all' files found
-    if all_pieces or all_new_pieces:
-        # Process the new 'all' files found
-        print "updating ...",
-        sys.stdout.flush()
-        next_piece = int(all_headers["NextPiece"])
-        piece_size = int(all_headers["PieceSize"])
-        all_new_pieces.sort()
-        old_file = ""
-        old_size = 0L
-        for (file, size) in all_new_pieces:
-            if file == old_file:
-                if size != old_size:
-                    print "WARNING: multiple architecture:all files with different size:", file
-            else:
-                all_pieces[next_piece] = file
-                next_piece += int(ceil(size/float(piece_size)))
-                
-            old_file = file
-            old_size = size
-    
-        # Set the final header values
-        all_headers["NextPiece"] = str(next_piece)
-        all_headers.setdefault("OriginalPieces", all_headers["NextPiece"])
-    
-        # Write the all_headers
-        print "writing ...",
-        sys.stdout.flush()
-        f = gzip.open(all_file, 'w')
-        for header in header_order:
-            if header in all_headers:
-                f.write("%s: %s\n" % (header, all_headers[header]))
-        f.write("PieceNumbers:\n")
-            
-        # Write the all starting piece numbers
-        pieces = all_pieces.keys()
-        pieces.sort()
-        format_string = " %"+str(len(str(max(pieces))))+"d %s\n"
-        for piece in pieces:
-            f.write(format_string % (piece, all_pieces[piece]))
-        
-        f.close()
+    
+    if packages:
+        print "The following packages files were not used:"
+        for package in packages:
+            print "    %s" % package
+
+if __name__ == '__main__':
+    if len(sys.argv) >= 2:
+        for file in sys.argv[1:]:
+            run(file)
     else:
-        print "empty ...",
-        if exists(all_file):
-            remove(all_file)
-
-    print "done."
-
-if packages:
-    print "The following packages files were not used:"
-    for package in packages:
-        print "    %s" % package
-        
-
-#    fnkey = filename + ":pc"
-#    if cache.has_key(fnkey):
-#        sha1, result = str2hash(cache[fnkey])
-#    cache[fnkey] = values
-#cache.sync()
-#cache.close()
+        print "Usage: " + sys.argv[0] + " Releasefile [Releasefile ...]"



