r377 - in /debtorrent/trunk: hippy.py uniquely.py
camrdale-guest at users.alioth.debian.org
camrdale-guest at users.alioth.debian.org
Sun May 11 05:44:24 UTC 2008
Author: camrdale-guest
Date: Sun May 11 05:44:23 2008
New Revision: 377
URL: http://svn.debian.org/wsvn/debtorrent/?sc=1&rev=377
Log:
Speed up hippy and uniquely for deployment on merkel.
Modified:
debtorrent/trunk/hippy.py
debtorrent/trunk/uniquely.py
Modified: debtorrent/trunk/hippy.py
URL: http://svn.debian.org/wsvn/debtorrent/debtorrent/trunk/hippy.py?rev=377&op=diff
==============================================================================
--- debtorrent/trunk/hippy.py (original)
+++ debtorrent/trunk/hippy.py Sun May 11 05:44:23 2008
@@ -2,67 +2,43 @@
"""Calculate the sub-piece hashes for large package files.
-Run this script in the directory where the extrapieces files are to be stored.
-It's only command line argument is the Berkeley database containing the cached
-data from previous runs. Pass the paths of Release files to process into the
+The command line arguments are the Berkeley database containing the cached
+data from previous runs, and the unique name of the Packages file being
+processed. Pass the paths of package files to process into the
standard input.
For example::
- find /var/www/debian -maxdepth 3 -name "Release" | hippy ../hippycache.bdb
+ find dists -name 'Packages.gz' | sort |
+ while read a; do
+ b=$(echo $a | tr / _)
+ d=~/public_html/extrapieces/${b%.gz}-extrapieces.gz
+ zcat $a | sed -n 's/^Filename: //p' |
+ grep -v '^[.]/' |
+ sort | (~/hippy ~/mycache $a || echo >&2 "Failed: $a") |
+ gzip -9 > ${d}.new
+ mv ${d}.new $d
+ done
+
+ at var MAX_PIECE_SIZE: the maximum piece size to use, pieces may be smaller than this
+ at var CHUNK_SIZE: the download chunk size used by DebTorrent (used to calculate
+ an optimal piece size)
+ at var MAX_AGE: the maximum iterations of the program to keep subpiece information
+ for the Packages file after it's no longer in the file (for slow mirrors)
"""
import bsddb, sha, binascii
-import os, sys, gzip
+import os, sys
import struct
-from bz2 import BZ2File
from math import ceil
-from debian_bundle import deb822
MAX_PIECE_SIZE = 512*1024
CHUNK_SIZE = 16*1024
+MAX_AGE = 14
# The Packages files to read
EXTENSION = ".gz"
-
-def read_release(filename):
- """Read the headers and Packages file names from a Release file.
-
- @type filename: C{string}
- @param filename: the Release file to read
- @rtype: C{dictionary}, C{list} of C{string}
- @return: the headers and full file names of Packages files
-
- """
-
- # Initialize the Release file variables
- read_packages = False
- headers = {}
- packages = []
-
- f = open(filename, 'r')
-
- rel = deb822.Release(f)
- for header in rel:
- if header.lower() not in ["md5sum", "sha1", "sha256"]:
- # Read the headers from the file
- headers[header] = rel[header]
-
- # Read the Packages file names
- for file in rel.get('MD5Sum', []):
- if file['name'].endswith("Packages"+EXTENSION) and file['name'] not in packages:
- packages.append(file['name'])
- for file in rel.get('SHA1', []):
- if file['name'].endswith("Packages"+EXTENSION) and file['name'] not in packages:
- packages.append(file['name'])
- for file in rel.get('SHA256', []):
- if file['name'].endswith("Packages"+EXTENSION) and file['name'] not in packages:
- packages.append(file['name'])
-
- f.close()
-
- return headers, packages
def hash(file, piece_size):
    """Read a file and hash its sub-pieces.
@@ -121,45 +97,54 @@
"""Convert a cache value to a list of package names.
The cache is stored as a string. The list is a repeating sequence of one
- byte length followed by a string of that length. Therefore, the longest
- string that can be stored is 256.
+ byte length followed by a string of that length, followed by one byte
+ indicating the number of times the package has been missing from the
+ Packages file plus 1. Therefore, the longest string that can be stored is
+ 256.
@type cache_value: C{string}
@param cache_value: the cached value for this file
- @rtype: C{list} of C{string}
- @return: the list of package names stored in the cache
+ @rtype: C{dictionary}
+ @return: keys are the package names stored in the cache, values are the
+ number of times the package has not been found in the Packages file
"""
if cache_value == "":
- return []
-
- deb_list = []
+ return {}
+
+ debs = {}
while len(cache_value) > 0:
length = ord(cache_value[0])
deb = cache_value[1:length+1]
- cache_value = cache_value[length+1:]
- deb_list.append(deb)
-
- return deb_list
-
-def list2cache(deb_list):
+ num = ord(cache_value[length+1]) - 1
+ cache_value = cache_value[length+2:]
+ debs[deb] = num
+
+ return debs
+
+def list2cache(debs):
"""Convert a list of package names to a cacheable value.
- @type deb_list: C{list} of C{string}
- @param deb_list: the package names to create a cache value for
+ @type debs: C{dictionary}
+ @param debs: the package names to create a cache value for, keys are the
+ names of the packages, values are the integer number of times this
+ package has not been found in the Packages file
@rtype: C{string}
@return: the cacheable string
"""
- if not deb_list:
+ if not debs:
return ""
+ deb_list = debs.keys()
+ deb_list.sort()
cache_value = ""
for deb in deb_list:
assert len(deb) < 256
- cache_value += chr(len(deb)) + deb
+ assert debs[deb] >= 0
+ cache_value += chr(len(deb)) + deb + chr(min(255, debs[deb]+1))
return cache_value
def cache2hash(cache_value):
@@ -223,8 +208,7 @@
@param filename: the file to calculate sub pieces for
"""
-
- filename = filename.rstrip()
+ sys.stderr.write(' Hashing: %s\n' % filename)
# Get the size of the file
size = os.stat(filename).st_size
@@ -242,111 +226,89 @@
return sha1, piece_list
-def get_packages(filename):
- """Read the new piece data from a Packages file.
-
- @type filename: C{string}
- @param filename: the Packages file to open and parse
- @rtype: C{list} of C{string}
- @return: the package files listed in the Packages file
-
- """
-
- # Open the possibly compressed file
- if filename.endswith(".gz"):
- f = gzip.open(filename, 'r')
- elif filename.endswith(".bz2"):
- f = BZ2File(filename, "r")
- else:
- f = open(filename, 'r')
-
- debs = []
-
- for pkg in deb822.Packages.iter_paragraphs(f, fields = ['Filename']):
- if pkg.get('Filename', ''):
- debs.append(pkg['Filename'])
-
- f.close()
-
- return debs
-
-def run(cache, releasefile):
+def run(cache, pkg_file):
"""Process a single Release file.
@type cache: C{bsddb.BTree}
@param cache: an already opened bDB b-tree
- @type releasefile: C{string}
- @param releasefile: the Release file to process
-
- """
-
- # Process the Release file
- print "Processing: %s" % releasefile
- root_dir = releasefile[:releasefile.index('/dists/')+1]
- release_dir = releasefile[len(root_dir):].rsplit('/', 1)[0] + '/'
- release_headers, packages = read_release(releasefile)
-
- file_prefix = "dists_" + release_headers.get("Codename", "") + "_"
- file_suffix = "_Packages-extrapieces.gz"
-
- for packages_file in packages:
- sub_filename = file_prefix + '_'.join(packages_file.split('/')[:-1]) + file_suffix
- pkey = release_dir + packages_file + ":pl"
-
- # Get the list of packages in the packages file
- debs = get_packages(root_dir + release_dir + packages_file)
-
- # Retrieve the saved list of sub-pieced packages in the Packages file
- if cache.has_key(pkey):
- packages_list = cache2list(cache[pkey])
+ @type pkg_file: C{string}
+ @param pkg_file: the name of the Packages file being processed
+
+ """
+ sys.stderr.write('Processing: %s\n' % pkg_file)
+
+ # Retrieve the saved list of sub-pieced packages in the Packages file
+ pkey = pkg_file + ":pl"
+ if cache.has_key(pkey):
+ old_debs = cache2list(cache[pkey])
+ else:
+ old_debs = {}
+ new_debs = {}
+
+ sys.stderr.write(' Found %d old files\n' % len(old_debs))
+
+    # Print the piece hashes for files in the Packages file
+ for filename in sys.stdin:
+ filename = filename.rstrip()
+ fnkey = filename + ":pc"
+
+ # Check if this file's sub-pieces are already known
+ if cache.has_key(fnkey):
+ sha1, piece_list = cache2hash(cache[fnkey])
else:
- packages_list = []
- all_debs = {}.fromkeys(packages_list, 1)
-
- # First, sub-piece any new package files
- for deb in debs:
- filename = root_dir + deb
- fnkey = deb + ":pc"
-
- # Check if this file's sub-pieces are already known
- if cache.has_key(fnkey):
- sha1, piece_list = cache2hash(cache[fnkey])
- else:
- print ' Hashing new package:', deb
- sha1, piece_list = sub_piece(filename)
-
- # Save the result for next time
- cache[fnkey] = hash2cache(sha1, piece_list)
-
- # If it has sub-pieces, save it to the list
- if piece_list:
- all_debs[deb] = 1
-
- # Write the list back to the cache
- packages_list = all_debs.keys()
- packages_list.sort()
- cache[pkey] = list2cache(packages_list)
-
- # Write the sub-piece data to the file
- sub_file = gzip.open(sub_filename, 'w')
- for deb in packages_list:
- fnkey = deb + ":pc"
+ # Not known, hash the file
+ sha1, piece_list = sub_piece(filename)
+
+ # Save the result for next time
+ cache[fnkey] = hash2cache(sha1, piece_list)
+
+ # Remove the file from the old list
+ old_debs.pop(filename, None)
+
+ if piece_list:
+ # Print the resulting sub-piece hashes
+ print "Filename: %s" % (filename, )
+ print "SHA1: %s" % (sha1, )
+ print "SHA1-Pieces:"
+ for x in piece_list:
+ print " %s %d" % x
+ print
+
+ # Save the file to the new list
+ new_debs[filename] = 0
- # Check to make sure
- if cache.has_key(fnkey):
- # Get the cached result
- sha1, piece_list = cache2hash(cache[fnkey])
-
- # Print the resulting sub-piece hashes
- sub_file.write("Filename: %s\n" % (deb))
- sub_file.write("SHA1: %s\n" % (sha1))
- sub_file.write("SHA1-Pieces:\n")
- for x in piece_list:
- sub_file.write(" %s %d\n" % x)
- sub_file.write("\n")
- else:
- print "WARNING: no sub-piece data found for " + deb
- sub_file.close()
+ sys.stderr.write(' Have %d missing old files\n' % len(old_debs))
+
+ # Also print any unexpired old entries
+ for missing_file in old_debs:
+ # Expire entries after they have been missing for MAX_AGE runs
+ if old_debs[missing_file] >= MAX_AGE:
+ sys.stderr.write(' Expired: %s\n' % missing_file)
+ continue
+
+ fnkey = missing_file + ":pc"
+
+ # Check if this file's sub-pieces are already known
+ if not cache.has_key(fnkey):
+ continue
+
+ sha1, piece_list = cache2hash(cache[fnkey])
+
+ if piece_list:
+ # Print the resulting sub-piece hashes
+ print "Filename: %s" % (missing_file, )
+ print "SHA1: %s" % (sha1, )
+ print "SHA1-Pieces:"
+ for x in piece_list:
+ print " %s %d" % x
+ print
+
+ # Increment the age of the missing file
+ new_debs[missing_file] = old_debs[missing_file] + 1
+
+ # Write the list back to the cache
+ sys.stderr.write(' Saving %d new files\n' % len(new_debs))
+ cache[pkey] = list2cache(new_debs)
if __name__ == '__main__':
@@ -354,10 +316,10 @@
cache_file = sys.argv[1]
cache = bsddb.btopen(cache_file, "w")
- # Read Release file names from standard in
- for filename in sys.stdin:
- filename = filename.rstrip()
- run(cache, filename)
+ # Get the Packages file name being processed
+ pkg_file = sys.argv[2]
+
+ run(cache, pkg_file)
# Close the cache file
cache.sync()
Modified: debtorrent/trunk/uniquely.py
URL: http://svn.debian.org/wsvn/debtorrent/debtorrent/trunk/uniquely.py?rev=377&op=diff
==============================================================================
--- debtorrent/trunk/uniquely.py (original)
+++ debtorrent/trunk/uniquely.py Sun May 11 05:44:23 2008
@@ -7,10 +7,11 @@
import gzip
from bz2 import BZ2File
from math import ceil
-from os import remove
+from os import remove, rename, system
from os.path import exists
from time import strftime, gmtime
from debian_bundle import deb822
+from tempfile import mkstemp
# The piece size to use (must match the '-extrapieces' file's piece size)
DEFAULT_PIECESIZE = 512*1024
@@ -197,13 +198,22 @@
"""
+ f, tmpfile = mkstemp(prefix = 'uniquely')
+# f.close()
+
# Open the possibly compressed file
if filename.endswith(".gz"):
- f = gzip.open(filename, 'r')
+ ret = system("zcat '%s' | grep -E '^(Filename:.*|Size:.*|Architecture:.*|)$' > '%s'" % (filename, tmpfile))
+ if ret != 0:
+ raise RuntimeError, 'Failed to decompress %s' % filename
+ filename = tmpfile
elif filename.endswith(".bz2"):
- f = BZ2File(filename, "r")
- else:
- f = open(filename, 'r')
+ ret = system("bzcat '%s' | grep -E '^(Filename:.*|Size:.*|Architecture:.*|)$' > '%s'" % (filename, tmpfile))
+ if ret != 0:
+ raise RuntimeError, 'Failed to decompress %s' % filename
+ filename = tmpfile
+
+ f = open(filename, 'r')
pieces = {}
new_pieces = []
@@ -233,6 +243,8 @@
f.close()
+ remove(tmpfile)
+
return pieces, new_pieces
def add_new(pieces, new_pieces, headers):
@@ -288,7 +300,7 @@
"""
- f = gzip.open(filename, 'w')
+ f = gzip.open(filename + '.new', 'w')
# Write the headers
for header in HEADER_ORDER:
@@ -304,6 +316,7 @@
f.write(format_string % (p, pieces[p]))
f.close()
+ rename(filename + '.new', filename)
def run(releasefile):
"""Process a single Release file.
More information about the Debtorrent-commits
mailing list