r210 - /debtorrent/branches/unique/uniquely.py
camrdale-guest at users.alioth.debian.org
Thu Aug 9 18:21:55 UTC 2007
Author: camrdale-guest
Date: Thu Aug 9 18:21:55 2007
New Revision: 210
URL: http://svn.debian.org/wsvn/debtorrent/?sc=1&rev=210
Log:
Mostly finished uniquely script.
Modified:
debtorrent/branches/unique/uniquely.py
Modified: debtorrent/branches/unique/uniquely.py
URL: http://svn.debian.org/wsvn/debtorrent/debtorrent/branches/unique/uniquely.py?rev=210&op=diff
==============================================================================
--- debtorrent/branches/unique/uniquely.py (original)
+++ debtorrent/branches/unique/uniquely.py Thu Aug 9 18:21:55 2007
@@ -1,10 +1,12 @@
#!/usr/bin/env python
+"""Process a Release file, creating, finding and updating any torrent files."""
+
import bsddb, sha, binascii
-import os, sys
+import sys
import gzip
-from StringIO import StringIO
-from math import ceil, log
+from bz2 import BZ2File
+from math import ceil
# Some default values
default_piecesize = 512*1024
@@ -12,17 +14,151 @@
default_hash_fields = ["Codename", "Suite", "Component", "Architecture",
                       "PieceSize", "OriginalDate"]
header_order = ["Torrent", "Infohash", "OriginalDate", "Date", "PieceSize",
-                "Codename", "Suite", "Component", "Architecture",
-                "TorrentHashFields"]
+                "NextPiece", "OriginalPieces", "Codename", "Suite",
+                "Component", "Architecture", "TorrentHashFields"]
+
+def get_old(old_file):
+    """Read the headers and piece ordering data from an old file.
+
+    @type old_file: C{string}
+    @param old_file: the old piece ordering file to open
+    @rtype: (C{dictionary}, C{dictionary})
+    @return: the old piece ordering (keys are the file names, values are the
+        starting piece number) and headers
+
+    """
+    pieces = {}
+    headers = {}
+
+    try:
+        f = gzip.open(old_file, 'r')
+
+        # Read the headers from the file
+        for line in f:
+            line = line.rstrip()
+
+            h, v = line.split(":", 1)
+            if h == "PieceNumbers":
+                break
+
+            headers[h] = v[1:]
+
+        # Read the piece ordering from the file
+        for line in f:
+            line = line.rstrip()
+
+            if line[:1] != " ":
+                break
+
+            piece, file = line.split()
+            pieces[file] = int(piece)
+
+        f.close()
+    except:
+        # Just return the empty variables, causing a new torrent to be generated
+        pass
+
+    return pieces, headers
+
+def get_new(filename, old_files, headers, old_all_files, all_pieces,
+            all_new_pieces):
+    """Read the new piece data from a Packages file.
+
+    Reads the Packages file, finding old files in it and copying their data to
+    the new ordering, and adding any new files found to the end of the
+    ordering. The old_files input is modified by removing the found files from
+    it, and the 'NextPiece' header in the input headers is changed.
+
+    Any architecture:all files found are processed and added to the 'all'
+    piece ordering. This is done by modifying the input old_all_files,
+    all_pieces, and all_new_pieces variables.
+
+    @type filename: C{string}
+    @param filename: the Packages file to open and parse
+    @type old_files: C{dictionary}
+    @param old_files: the original piece ordering, keys are the file names,
+        values are the starting piece number
+    @type headers: C{dictionary}
+    @param headers: the original headers
+    @type old_all_files: C{dictionary}
+    @param old_all_files: the original piece ordering for architecture:all
+        files, keys are the file names, values are the starting piece number
+    @type all_pieces: C{dictionary}
+    @param all_pieces: the new piece ordering for architecture:all files,
+        keys are the starting piece numbers, values are the file names
+    @type all_new_pieces: C{list} of (C{string}, C{long})
+    @param all_new_pieces: the file name and file size of the new
+        architecture:all files that have been found
+    @rtype: C{dictionary}
+    @return: the new piece ordering, keys are the starting piece numbers,
+        values are the file names
+
+    """
+
+    # Get the needed header information
+    next_piece = int(headers["NextPiece"])
+    piece_size = int(headers["PieceSize"])
+
+    # Open the possibly compressed file
+    if filename.endswith(".gz"):
+        f = gzip.open(filename, 'r')
+    elif filename.endswith(".bz2"):
+        f = BZ2File(filename, "r")
+    else:
+        f = open(filename, 'r')
+
+    pieces = {}
+
+    p = [None, None, None]
+    for line in f:
+        line = line.rstrip()
+        if line == "":
+            if (p[0] and p[1] and p[2]):
+                # Check which torrent to add the info to
+                if p[2] == 'all':
+                    if p[0] in all_pieces.values():
+                        # Already found the old file
+                        pass
+                    elif p[0] in old_all_files:
+                        # Found old file, so add it
+                        all_pieces[old_all_files[p[0]]] = p[0]
+                        del old_all_files[p[0]]
+                    elif (p[0], p[1]) not in all_new_pieces:
+                        # Found new file, save it for later processing
+                        all_new_pieces.append((p[0], p[1]))
+                else:
+                    if p[0] in old_files:
+                        # Found old file, so add it
+                        pieces[old_files[p[0]]] = p[0]
+                        del old_files[p[0]]
+                    else:
+                        # Add new file to the end of the torrent
+                        pieces[next_piece] = p[0]
+                        next_piece += int(ceil(p[1]/float(piece_size)))
+
+            p = [None, None, None]
+        if line[:9] == "Filename:":
+            p[0] = line[10:]
+        if line[:5] == "Size:":
+            p[1] = long(line[6:])
+        if line[:13] == "Architecture:":
+            p[2] = line[14:]
+
+    f.close()
+
+    headers["NextPiece"] = str(next_piece)
+
+    return pieces
#cache_file = sys.argv[1]
-#%cache = bsddb.btopen(cache_file, "w")
+#cache = bsddb.btopen(cache_file, "w")
# The only input is the Release file to process
releasefile = sys.argv[1]
print "Processing: %s" % releasefile
# Initialize the Release file variables
+release_dir = releasefile.rsplit('/', 1)[0]
origin = ""
label = ""
suite = ""
@@ -42,7 +178,7 @@
    # Read the various headers from the file
    if line[:7] == "Origin:":
-        origin = line[7:]
+        origin = line[8:]
    if line[:6] == "Label:":
        label = line[7:]
    if line[:6] == "Suite:":
@@ -55,17 +191,17 @@
        components = line[12:].split()
    if line[:14] == "Architectures:":
        archs = line[15:].split()
-
+
    # Read multiple lines from the SHA1 section of the file
    if line[:1] != " ":
        read_files = False
    if read_files:
        p = line.split()
-        if len(p) == 3 and p[2].EndsWith("Packages"+extension):
-            packages.append(p[2])
+        if len(p) == 3 and p[2].endswith("Packages"+extension):
+            packages.append(release_dir + "/" + p[2])
            packages_sha1[p[2]] = binascii.a2b_hex(p[0])
-            packages_size[p[2]] = long(p[2])
-    if line[:5] == "SHA1:":
+            packages_size[p[2]] = long(p[1])
+    if line[:7] == "MD5Sum:":
        read_files = True
f.close()
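Only a handful of Release headers matter here, plus the checksum section, whose entries are "hex-digest size path" triples, one per line with a leading space; that leading space is exactly what the line[:1] != " " test uses to detect the end of the section. An invented fragment in that shape:

    Origin: Debian
    Label: Debian
    Suite: unstable
    Codename: sid
    Components: main contrib non-free
    Architectures: i386 amd64
    MD5Sum:
     0c5e9a7d3f214b6f9e8d1a2b3c4d5e6f 10197354 main/binary-i386/Packages.gz
     9f8e7d6c5b4a39281706f5e4d3c2b1a0  2876443 contrib/binary-i386/Packages.gz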
@@ -77,16 +213,35 @@
    # Get the old 'all' data
    all_file = torrent_prefix + component + "_binary-all" + torrent_suffix
    old_all_files, all_headers = get_old(all_file)
+    all_pieces = {}
+    all_new_pieces = []
+
+    # Create the all headers
+    all_headers.setdefault("OriginalDate", date)
+    all_headers["Date"] = date
+    all_headers.setdefault("PieceSize", str(default_piecesize))
+    all_headers.setdefault("NextPiece", str(0))
+    all_headers["Codename"] = codename
+    all_headers["Suite"] = suite
+    all_headers["Component"] = component
+    all_headers["Architecture"] = "all"
+    all_headers.setdefault("TorrentHashFields", " ".join(default_hash_fields))
+    if "Torrent" not in all_headers:
+        sha1 = sha.new()
+        for header in all_headers["TorrentHashFields"].split():
+            sha1.update(all_headers[header])
+        all_headers["Torrent"] = sha1.hexdigest()
    for arch in archs:
        # Find the Packages file that will be parsed
        found = False
        for filename in packages:
-            if filename.find(component) >= 0 and filename.find("binary-"+arch) >= 0:
+            if (filename.find(component) >= 0 and
+                filename.find("binary-"+arch) >= 0):
                found = True
                break
        if not found:
-            print "WARNING: no matching Packages file for component %s, arch %s" % component, arch
+            print "WARNING: no matching Packages file for component %s, arch %s" % (component, arch)
            continue
        packages.pop(packages.index(filename))
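get_new() reads just three fields from each stanza of the matched Packages file, and relies on the blank line between stanzas to know when to process what it has collected; a stanza is only used once all of Filename, Size and Architecture have been seen. An invented stanza, trimmed to the relevant fields:

    Package: apt
    Version: 0.7.6
    Architecture: i386
    Size: 1501284
    Filename: pool/main/a/apt/apt_0.7.6_i386.deb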
@@ -96,16 +251,15 @@
        old_files, headers = get_old(torrent_file)
        # Create the headers
-        if "OriginalDate" not in headers:
-            headers["OriginalDate"] = date
-        if "PieceSize" not in headers:
-            headers["PieceSize"] = default_piecesize
+        headers.setdefault("OriginalDate", date)
+        headers["Date"] = date
+        headers.setdefault("PieceSize", str(default_piecesize))
+        headers.setdefault("NextPiece", str(0))
        headers["Codename"] = codename
        headers["Suite"] = suite
        headers["Component"] = component
        headers["Architecture"] = arch
-        if "TorrentHashFields" not in headers:
-            headers["TorrentHashFields"] = " ".join(default_hash_fields)
+        headers.setdefault("TorrentHashFields", " ".join(default_hash_fields))
        if "Torrent" not in headers:
            sha1 = sha.new()
            for header in headers["TorrentHashFields"].split():
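The "Torrent" identifier created here is a plain SHA-1 over the values of the TorrentHashFields headers, concatenated in order. A standalone sketch of the same computation, using invented header values (sha is the Python 2 module the script already imports):

    import sha

    headers = {"Codename": "sid", "Suite": "unstable", "Component": "main",
               "Architecture": "i386", "PieceSize": "524288",
               "OriginalDate": "Thu, 09 Aug 2007 18:21:55 UTC"}
    sha1 = sha.new()
    for field in ["Codename", "Suite", "Component", "Architecture",
                  "PieceSize", "OriginalDate"]:
        sha1.update(headers[field])
    print sha1.hexdigest()

Because the hash covers OriginalDate rather than Date, regenerating the torrent on a later date keeps the identifier stable as long as none of the hashed fields change.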
@@ -114,59 +268,72 @@
        # Parse the Packages file for the new data
        print "updating ... ",
-        new_files, removed_files = get_new(filename, old_files, headers["PieceSize"])
-
-        # Write the headers
+        new_pieces = get_new(filename, old_files, headers, old_all_files,
+                             all_pieces, all_new_pieces)
+
+        headers.setdefault("OriginalPieces", headers["NextPiece"])
+
+        if new_pieces:
+            # Write the headers
+            print "writing ... ",
+            f = gzip.open(torrent_file, 'w')
+            for header in header_order:
+                if header in headers:
+                    f.write("%s: %s\n" % (header, headers[header]))
+            f.write("PieceNumbers:\n")
+
+            # Write the starting piece numbers
+            pieces = new_pieces.keys()
+            pieces.sort()
+            format_string = " %"+str(len(str(max(pieces))))+"d %s\n"
+            for piece in pieces:
+                f.write(format_string % (piece, new_pieces[piece]))
+
+            f.close()
+        print "done."
+
+    print all_file + ": ",
+    # If there were 'all' files found
+    if all_pieces or all_new_pieces:
+        # Process the new 'all' files found
+        print "updating ... ",
+        next_piece = int(all_headers["NextPiece"])
+        piece_size = int(all_headers["PieceSize"])
+        all_new_pieces.sort()
+        old_file = ""
+        old_size = 0L
+        for (file, size) in all_new_pieces:
+            if file == old_file:
+                if size != old_size:
+                    print "WARNING: multiple architecture:all files with different size:", file
+            else:
+                all_pieces[next_piece] = file
+                next_piece += int(ceil(size/float(piece_size)))
+
+            old_file = file
+            old_size = size
+
+        # Set the new next piece to use
+        all_headers["NextPiece"] = str(next_piece)
+        all_headers.setdefault("OriginalPieces", all_headers["NextPiece"])
+
+        # Write the all_headers
        print "writing ... ",
-        f = gzip.open(torrent_file, 'w')
+        f = gzip.open(all_file, 'w')
        for header in header_order:
-            if header in headers:
-                f.write("%s: %s\n" % header, headers[header])
+            if header in all_headers:
+                f.write("%s: %s\n" % (header, all_headers[header]))
        f.write("PieceNumbers:\n")
-
-        # Write the starting piece numbers
-        pieces = new_pieces.keys()
+
+        # Write the all starting piece numbers
+        pieces = all_pieces.keys()
        pieces.sort()
-        format_string = " %"+str(int(ceil(log(max(pieces), 10))))+"d %s\n"
+        format_string = " %"+str(len(str(max(pieces))))+"d %s\n"
        for piece in pieces:
-            f.write(format_string % piece, new_pieces[piece])
+            f.write(format_string % (piece, all_pieces[piece]))
        f.close()
-        print "done."
-
-        # Create the all headers
-        if "OriginalDate" not in all_headers:
-            all_headers["OriginalDate"] = date
-        if "PieceSize" not in all_headers:
-            all_headers["PieceSize"] = default_piecesize
-        all_headers["Codename"] = codename
-        all_headers["Suite"] = suite
-        all_headers["Component"] = component
-        all_headers["Architecture"] = "all"
-        if "TorrentHashFields" not in all_headers:
-            all_headers["TorrentHashFields"] = " ".join(default_hash_fields)
-        if "Torrent" not in all_headers:
-            sha1 = sha.new()
-            for header in all_headers["TorrentHashFields"].split():
-                sha1.update(all_headers[header])
-            all_headers["Torrent"] = sha1.hexdigest()
-
-        # Write the all_headers
-        print all_file + ": writing ... ",
-        f = gzip.open(all_file, 'w')
-        for header in header_order:
-            if header in all_headers:
-                f.write("%s: %s\n" % header, all_headers[header])
-        f.write("PieceNumbers:\n")
-
-        # Write the all starting piece numbers
-        pieces = all_new_pieces.keys()
-        pieces.sort()
-        format_string = " %"+str(int(ceil(log(max(pieces), 10))))+"d %s\n"
-        for piece in pieces:
-            f.write(format_string % piece, all_new_pieces[piece])
-
-        f.close()
+
    print "done."
if packages:
@@ -174,11 +341,10 @@
    for package in packages:
        print " %s" % package
-"""
-    fnkey = filename + ":pc"
-    if cache.has_key(fnkey):
-        sha1, result = str2hash(cache[fnkey])
-    cache[fnkey] = values
-"""
+
+#    fnkey = filename + ":pc"
+#    if cache.has_key(fnkey):
+#        sha1, result = str2hash(cache[fnkey])
+#    cache[fnkey] = values
#cache.sync()
#cache.close()
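One small fix buried in the writing code above: the column width for the piece numbers is now len(str(max(pieces))) instead of int(ceil(log(max(pieces), 10))). The old formula is one column short for exact powers of ten, and raises an exception when the largest starting piece is 0. A standalone comparison (not part of the commit):

    from math import ceil, log

    for n in [1, 9, 10, 99, 100]:
        print n, int(ceil(log(n, 10))), len(str(n))
    # 1   -> 0 vs 1  (old width too narrow)
    # 9   -> 1 vs 1
    # 10  -> 1 vs 2  (old width too narrow)
    # 99  -> 2 vs 2
    # 100 -> 2 vs 3  (old width too narrow)
    # log(0, 10) raises ValueError, while len(str(0)) is 1.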