[misc] 01/01: rewrite fetch_logs in python, so it can skip sizes, do mtimes, ...
Chris West
faux-guest at moszumanska.debian.org
Thu Mar 9 12:19:18 UTC 2017
This is an automated email from the git hooks/post-receive script.
faux-guest pushed a commit to branch master
in repository misc.
commit e24cc5f102b787397e34a9961a2cf57cc5ff8c98
Author: Chris West (Faux) <git at goeswhere.com>
Date: Thu Feb 9 07:54:25 2017 +0000
rewrite fetch_logs in python, so it can skip sizes, do mtimes, ...
---
fetch-logs | 306 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
fetch-logs.sh | 15 ---
2 files changed, 306 insertions(+), 15 deletions(-)
diff --git a/fetch-logs b/fetch-logs
new file mode 100755
index 0000000..ffb82b8
--- /dev/null
+++ b/fetch-logs
@@ -0,0 +1,306 @@
+#!/usr/bin/env python3
+import argparse
+import datetime
+import html.parser
+import http.client
+import io
+import os
+import shutil
+import sys
+import tempfile
+import urllib.request
+
+from email.utils import format_datetime
+from typing import List, Optional, Iterable
+
+
class File:
    """One entry scraped from an Apache directory listing."""

    def __init__(self):
        # All three fields start unset; the parser fills them in one td at a time.
        self.name = None  # type: str
        self.mtime = None  # type: datetime.datetime
        self.size = None  # type: int

    def __str__(self):
        when = self.mtime.isoformat() if self.mtime else self.mtime
        return 'File({!r}, {}, {})'.format(self.name, when, self.size)
+
+
class Parsey(html.parser.HTMLParser):
    """Extract File rows from an Apache fancy-index page.

    Assumed table layout: one <tr> per entry, where the 2nd <td> holds the
    name, the 3rd the modification time and the 4th the size.  Row 3 is the
    "parent directory" row and is skipped.
    """

    def __init__(self):
        super().__init__()
        self.row = 0
        self.in_row = False  # type: bool
        self.td = 0  # type: int
        self.file = File()  # type: File
        self.files = []  # type: List[File]

    def handle_starttag(self, tag: str, attrs):
        if 'tr' == tag:
            self.in_row = True
            self.row += 1
        elif 'td' == tag:
            self.td += 1

    def handle_endtag(self, tag: str):
        if 'tr' == tag:
            self.td = 0
            self.in_row = False
            # BUG FIX: compare size against None rather than by truthiness,
            # so zero-byte files are not silently dropped from the listing.
            if self.file.name and self.file.mtime and self.file.size is not None:
                self.files.append(self.file)
            self.file = File()

    def handle_data(self, data: str):
        if not self.in_row:
            return

        # "parent directory" row
        if self.row == 3:
            return

        if 2 == self.td:
            self.file.name = data
        elif 3 == self.td:
            self.file.mtime = apache_time(data)
        elif 4 == self.td:
            self.file.size = apache_size(data)

    def error(self, message):
        # Required override on older html.parser; surface parse errors loudly.
        raise Exception(message)
+
+
def apache_time(text: str) -> datetime.datetime:
    """Parse an Apache autoindex timestamp column into a datetime.

    >>> apache_time('2017-01-15 13:53')
    datetime.datetime(2017, 1, 15, 13, 53)
    """
    cleaned = text.strip()
    return datetime.datetime.strptime(cleaned, '%Y-%m-%d %H:%M')
+
+
def apache_size(data):
    """Parse an Apache autoindex human-readable size into bytes.

    Accepts a trailing G/M/K multiplier (case-insensitive), a trailing
    space (Apache pads unsuffixed byte counts with spaces), or a plain
    digit string (e.g. from --max-size).  Raises Exception otherwise.

    BUG FIX: the original doctest expected the literal text ``16*1024``,
    which can never match the interpreter's output and so always failed.

    >>> apache_size('16K')
    16384
    >>> apache_size(' 611 ')
    611
    """
    suffix = data[-1].upper()
    if 'G' == suffix:
        return int(1024 * 1024 * 1024 * float(data[:-1]))
    if 'M' == suffix:
        return int(1024 * 1024 * float(data[:-1]))
    if 'K' == suffix:
        return int(1024 * float(data[:-1]))
    # Generalisation: also accept a bare digit string, not only the
    # space-padded form Apache emits; previously '611' raised.
    if ' ' == suffix or suffix.isdigit():
        return int(data)

    raise Exception("can't read " + data)
+
+
class Sponge:
    """Context manager that soaks up writes into a temp file, then atomically
    renames it over *dest* on success or deletes it on error (like
    moreutils' sponge).
    """

    def __init__(self, dest: str):
        self.fd = None  # type: io.BufferedRandom
        self.name = None  # type: str
        self.dest = dest

    def __enter__(self):
        # Create the temp file next to the destination so the final rename
        # cannot cross a filesystem boundary (which would not be atomic).
        fd_int, self.name = tempfile.mkstemp(
            dir=os.path.dirname(self.dest),
            prefix='.',
            suffix='.tmp~')
        self.fd = open(fd_int, 'w+b')
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.fd.close()
        if exc_type:
            # Failure: discard the partial download.
            os.unlink(self.name)
        else:
            # BUG FIX: os.replace() overwrites an existing destination on
            # every platform; os.rename() raises on Windows in that case.
            os.replace(self.name, self.dest)
+
+
def fetch(url: str, path: str):
    """Download *url* into *path*, spooling through Sponge so *path* is only
    replaced once the transfer has completed successfully."""
    with urllib.request.urlopen(url) as resp:  # type: urllib.response.HTTPResponse
        if resp.status != 200:
            raise Exception('fetching {} failed ({}): {}'.format(url, resp.status, resp))

        with Sponge(path) as dest:
            shutil.copyfileobj(resp, dest.fd)
+
+
class Store:
    """Local mirror of one folder of the test-result archive.

    Translates between three namespaces: server URLs (under *base*),
    server-absolute paths (starting with '/'), and local files (under
    *local_root*).
    """

    def __init__(self,
                 local_root: str,
                 folder: str,
                 base='https://tests.reproducible-builds.org/',
                 verbose: bool = False):
        assert base.endswith('/')

        # Both roots keep a trailing slash so joining with a server path
        # (which starts with '/') is plain string concatenation.
        self.out = os.path.realpath(local_root) + '/'
        # NOTE(review): realpath() on a server path like '/debian/...'
        # usually returns it unchanged, but would resolve any matching
        # local symlinks -- os.path.normpath was presumably intended.
        self.folder = os.path.realpath(folder) + '/'
        self.base = base
        self.files = []  # type: List[File]
        self.verbose = verbose

        os.makedirs(self.to_local_path(self.folder), exist_ok=True)

    def mtime(self, path: str) -> Optional[datetime.datetime]:
        """Local modification time of server *path*, or None if not mirrored yet."""
        try:
            return datetime.datetime.fromtimestamp(os.path.getmtime(self.to_local_path(path)))
        except FileNotFoundError:
            return None

    def to_local_path(self, path: str) -> str:
        # '/debian/x' -> '<local_root>/debian/x'
        return self.out + path[1:]

    def to_url(self, path: str) -> str:
        # '/debian/x' -> '<base>debian/x'
        assert path.startswith('/')
        return self.base + path[1:]

    def file_named(self, name: str) -> str:
        # Server-absolute path of *name* inside this store's folder.
        assert not name.startswith('/')
        return self.folder + name

    def load_index(self, max_age: datetime.timedelta):
        """(Re-)download the folder's index.html if older than *max_age*,
        then parse it into self.files."""
        index = self.file_named('index.html')
        index_fetched = self.mtime(index)

        if not index_fetched or index_fetched < (datetime.datetime.now() - max_age):
            if self.verbose:
                print("info: index out of date, downloading...")
            # empty file name: not index.html on the server
            fetch(self.to_url(self.file_named('')), self.to_local_path(index))

        self.load_from(index)

    def load_from(self, path: str):
        """Parse a previously-downloaded listing at server *path* into self.files."""
        with open(self.to_local_path(path)) as f:
            parsey = Parsey()
            if self.verbose:
                sys.stdout.write("info: parsing HTML... ")
                sys.stdout.flush()
            parsey.feed(f.read())
            if self.verbose:
                print("done.")
            self.files = parsey.files

    def up_to_date(self, file: File) -> bool:
        """True if the local copy is at least as new as the server's listing says."""
        mtime = self.mtime(self.file_named(file.name))
        if not mtime:
            return False
        return mtime >= file.mtime

    def download_many(self, files: Iterable[File]):
        """Fetch each file over a single reused HTTPS connection, writing
        each atomically via Sponge."""
        parse = urllib.parse.urlparse(self.base)  # type: urllib.parse.ParseResult
        assert 'https' == parse.scheme
        conn = http.client.HTTPSConnection(parse.netloc)
        try:
            for file in files:
                path = self.file_named(file.name)

                if self.verbose:
                    print('info: fetching {}'.format(path))

                conn.request('GET', path, None, headers={
                    'User-Agent': 'fetch-logs',
                })

                resp = conn.getresponse()
                # NOTE(review): no If-Modified-Since header is sent, so a
                # 304 should never actually arrive here; freshness is
                # decided client-side by up_to_date().
                if 304 == resp.status:  # not modified
                    continue

                if 200 != resp.status:
                    raise Exception('fetching {} failed ({}): {}'.format(path, resp.status, resp))

                with Sponge(self.to_local_path(path)) as dest:
                    shutil.copyfileobj(resp, dest.fd)

        finally:
            conn.close()
+
+
def package_name(name: str) -> str:
    """Return the source-package part of a log name: 'foo_1.2.rbuild' -> 'foo'.

    Raises ValueError if *name* contains no underscore.
    """
    underscore = name.index('_')
    return name[:underscore]
+
+
def print_package_set(set: Iterable['File'], name: str):
    """Print a titled, '---'-underlined, sorted bullet list of file names.

    NOTE(review): the first parameter shadows the builtin ``set``;
    renaming it could break keyword callers, so it is left as-is.
    """
    print(name)
    print("---")
    print()
    names = sorted(entry.name for entry in set)
    for line in names:
        print(' * ' + line)
    print()
+
+
def main():
    """Command-line entry point: validate arguments, refresh the folder
    index, then download every wanted log that is small enough and whose
    local copy is out of date."""
    allowed_types = {'dbdtxt', 'rbuild'}
    allowed_suites = {'unstable', 'testing'}
    allowed_arches = {'amd64', 'i386', 'arm64', 'armhf'}

    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('-d', '--download', action='store_true')
    parser.add_argument('-t', '--type', type=str, help='|'.join(allowed_types), default='rbuild')
    parser.add_argument('-s', '--suite', type=str, help='|'.join(allowed_suites), default='unstable')
    parser.add_argument('-a', '--arch', type=str, help='|'.join(allowed_arches), default='amd64')
    parser.add_argument('-o', '--output-dir', type=str, help='output directory path', default='logs')
    parser.add_argument('-m', '--max-size', type=str, default='1M',
                        help='maximum file size to fetch (suffixes allowed)')
    parser.add_argument('packages', nargs='*')
    args = parser.parse_args()

    if not args.download:
        parser.print_help()
        print("--download required")
        return

    if args.type not in allowed_types:
        parser.print_help()
        print("invalid type")
        return

    # BUG FIX: these two checks previously printed the complaint but fell
    # through and kept running with the invalid value; bail out like the
    # other validations do.
    if args.suite not in allowed_suites:
        parser.print_help()
        print("invalid suite")
        return

    if args.arch not in allowed_arches:
        parser.print_help()
        print("invalid arch")
        return

    # --max-size accepts either a plain byte count or an Apache-style suffix.
    if args.max_size.isdigit():
        max_size = int(args.max_size)
    else:
        max_size = apache_size(args.max_size.strip())

    # BUG FIX: honour --output-dir; it was parsed but the local root was
    # hard-coded to 'logs/'.
    store = Store(args.output_dir, '/debian/{}/{}/{}/'.format(args.type, args.suite, args.arch),
                  verbose=args.verbose)

    store.load_index(datetime.timedelta(hours=1))

    # No positional packages means "everything in the index".
    wanted = set(args.packages)
    if not wanted:
        wanted = set(package_name(file.name) for file in store.files)

    # Partition the index into skip/keep buckets before touching the network.
    to_fetch = set()
    too_big = set()
    up_to_date = set()
    for file in store.files:
        name = package_name(file.name)
        if name not in wanted:
            continue
        if file.size > max_size:
            too_big.add(file)
            continue
        if store.up_to_date(file):
            up_to_date.add(file)
            continue
        to_fetch.add(file)

    if args.verbose:
        print_package_set(too_big, "too big")
        print_package_set(up_to_date, "up to date")
        print_package_set(to_fetch, "to fetch")

    print('skipped: {} (size), {} (not needed); total to download: {}'
          .format(len(too_big), len(up_to_date), len(to_fetch)))

    store.download_many(to_fetch)
+
+
# Script entry point: importing this module has no side effects.
if __name__ == '__main__':
    main()
diff --git a/fetch-logs.sh b/fetch-logs.sh
deleted file mode 100755
index 232e8c4..0000000
--- a/fetch-logs.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/sh
-set -eu
-
-U=https://tests.reproducible-builds.org/debian/rbuild/unstable/amd64/
-# ############################################ vvvvvv
-U=https://tests.reproducible-builds.org/debian/dbdtxt/unstable/amd64/
-U=https://tests.reproducible-builds.org/debian/dbdtxt/unstable/i386/
-
-mkdir -p logs
-curl ${U} > rbuild.lst
-for n in "$@"; do grep '"'${n}_ rbuild.lst | cut -d\" -f8; done > wanted.lst
-(
- cd logs;
- <../wanted.lst sed 's,^,'${U}',' | xargs wget -N
-)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/misc.git
More information about the Reproducible-commits
mailing list