[Reproducible-commits] [debbindiff] 01/01: Analyze md5sums in .deb and skip identical files
Jérémy Bobbio
lunar at moszumanska.debian.org
Thu Jul 30 21:46:23 UTC 2015
This is an automated email from the git hooks/post-receive script.
lunar pushed a commit to branch master
in repository debbindiff.
commit ad2c698d736b8ee6ab9b1a737f0e198ff347e737
Author: Jérémy Bobbio <lunar at debian.org>
Date: Thu Jul 30 21:43:35 2015 +0000
Analyze md5sums in .deb and skip identical files
We trust the md5sums in the control archive of a Debian package
to be accurate. As we already identified md5sums files as such,
we now parse them to compare their content. Files in data.tar
which are identified as having the same checksum will not be
compared later on. Huge savings!
---
debbindiff/comparators/__init__.py | 3 +-
debbindiff/comparators/deb.py | 72 ++++++++++++++++++++++++++++++++++++--
2 files changed, 71 insertions(+), 4 deletions(-)
diff --git a/debbindiff/comparators/__init__.py b/debbindiff/comparators/__init__.py
index cd489e8..f737ffb 100644
--- a/debbindiff/comparators/__init__.py
+++ b/debbindiff/comparators/__init__.py
@@ -29,7 +29,7 @@ from debbindiff.comparators.binary import \
from debbindiff.comparators.bzip2 import Bzip2File
from debbindiff.comparators.java import ClassFile
from debbindiff.comparators.cpio import CpioFile
-from debbindiff.comparators.deb import DebFile, Md5sumsFile
+from debbindiff.comparators.deb import DebFile, Md5sumsFile, DebDataTarFile
from debbindiff.comparators.debian import DotChangesFile
from debbindiff.comparators.device import Device
from debbindiff.comparators.directory import Directory, compare_directories
@@ -83,6 +83,7 @@ FILE_CLASSES = (
Device,
DotChangesFile,
Md5sumsFile,
+ DebDataTarFile,
TextFile,
Bzip2File,
CpioFile,
diff --git a/debbindiff/comparators/deb.py b/debbindiff/comparators/deb.py
index 4d975af..48fe2c6 100644
--- a/debbindiff/comparators/deb.py
+++ b/debbindiff/comparators/deb.py
@@ -28,6 +28,7 @@ import debbindiff.comparators
from debbindiff.comparators.binary import File, needs_content
from debbindiff.comparators.utils import \
Archive, ArchiveMember, get_ar_content
+from debbindiff.comparators.tar import TarContainer, get_tar_listing
AR_EXTRACTION_BUFFER_SIZE = 32768
@@ -64,6 +65,13 @@ class DebFile(File):
def recognizes(file):
return DebFile.RE_FILE_TYPE.match(file.magic_file_type)
+ @property
+ def files_with_same_content_in_data(self):
+ return self._files_with_same_content_in_data
+
+ def set_files_with_same_content_in_data(self, files):
+ self._files_with_same_content_in_data = files
+
@needs_content
def compare_details(self, other, source=None):
differences = []
@@ -84,10 +92,68 @@ class Md5sumsFile(File):
file.name == './md5sums' and \
isinstance(file.container.source, ArchiveMember) and \
isinstance(file.container.source.container.source, ArchiveMember) and \
- file.container.source.container.source.name.startswith('control.tar.')
+ file.container.source.container.source.name.startswith('control.tar.') and \
+ isinstance(file.container.source.container.source.container.source, DebFile)
+ @staticmethod
+ def parse_md5sums(path):
+ d = {}
+ with open(path) as f:
+ for line in f.readlines():
+ md5sum, path = re.split(r'\s+', line.strip(), maxsplit=1)
+ d[path] = md5sum
+ return d
+
+ @needs_content
def compare(self, other, source=None):
if self.has_same_content_as(other):
return None
- return Difference(None, self.path, other.path, source='md5sums',
- comment="Files in package differs")
+ try:
+ my_md5sums = Md5sumsFile.parse_md5sums(self.path)
+ other_md5sums = Md5sumsFile.parse_md5sums(other.path)
+ same = set()
+ for path in set(my_md5sums.keys()).intersection(set(other_md5sums.keys())):
+ if my_md5sums[path] == other_md5sums[path]:
+ same.add('./%s' % path)
+ self.container.source.container.source.container.source.set_files_with_same_content_in_data(same)
+ logger.debug('Identifed %d files as identical in data archive', len(same))
+ return Difference(None, self.path, other.path, source='md5sums',
+ comment="Files in package differs")
+ except ValueError as e:
+ difference = self.compare_bytes(other)
+ difference.comment = 'Malformed md5sums file'
+ return Difference
+
+
+class DebTarContainer(TarContainer):
+ def __init__(self, archive, ignore_files):
+ super(DebTarContainer, self).__init__(archive)
+ assert type(ignore_files) is set
+ self._ignore_files = ignore_files
+
+ def get_member_names(self):
+ names = set(super(DebTarContainer, self).get_member_names())
+ return names - self._ignore_files
+
+
+class DebDataTarFile(File):
+ @staticmethod
+ def recognizes(file):
+ return isinstance(file, ArchiveMember) and \
+ isinstance(file.container.source, ArchiveMember) and \
+ file.container.source.name.startswith('data.tar.') and \
+ isinstance(file.container.source.container.source, DebFile)
+
+ @needs_content
+ def compare_details(self, other, source=None):
+ differences = []
+ ignore_files = self.container.source.container.source.files_with_same_content_in_data
+ with DebTarContainer(self, ignore_files).open() as my_container, \
+ DebTarContainer(other, ignore_files).open() as other_container:
+ # look up differences in file list and file metadata
+ my_listing = get_tar_listing(my_container.archive)
+ other_listing = get_tar_listing(other_container.archive)
+ differences.append(Difference.from_unicode(
+ my_listing, other_listing, self.name, other.name, source="metadata"))
+ differences.extend(my_container.compare(other_container, source))
+ return differences
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/debbindiff.git
More information about the Reproducible-commits
mailing list