[Reproducible-commits] [debbindiff] 01/01: Analyze md5sums in .deb and skip identical files

Jérémy Bobbio lunar at moszumanska.debian.org
Thu Jul 30 21:46:23 UTC 2015


This is an automated email from the git hooks/post-receive script.

lunar pushed a commit to branch master
in repository debbindiff.

commit ad2c698d736b8ee6ab9b1a737f0e198ff347e737
Author: Jérémy Bobbio <lunar at debian.org>
Date:   Thu Jul 30 21:43:35 2015 +0000

    Analyze md5sums in .deb and skip identical files
    
    We trust the md5sums in the control archive of a Debian package
    to be accurate. As we already identify md5sums files as such,
    we now parse them to compare their content. Files in data.tar
    which are identified as having the same checksum will not be
    compared later on. Huge savings!
---
 debbindiff/comparators/__init__.py |  3 +-
 debbindiff/comparators/deb.py      | 72 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/debbindiff/comparators/__init__.py b/debbindiff/comparators/__init__.py
index cd489e8..f737ffb 100644
--- a/debbindiff/comparators/__init__.py
+++ b/debbindiff/comparators/__init__.py
@@ -29,7 +29,7 @@ from debbindiff.comparators.binary import \
 from debbindiff.comparators.bzip2 import Bzip2File
 from debbindiff.comparators.java import ClassFile
 from debbindiff.comparators.cpio import CpioFile
-from debbindiff.comparators.deb import DebFile, Md5sumsFile
+from debbindiff.comparators.deb import DebFile, Md5sumsFile, DebDataTarFile
 from debbindiff.comparators.debian import DotChangesFile
 from debbindiff.comparators.device import Device
 from debbindiff.comparators.directory import Directory, compare_directories
@@ -83,6 +83,7 @@ FILE_CLASSES = (
     Device,
     DotChangesFile,
     Md5sumsFile,
+    DebDataTarFile,
     TextFile,
     Bzip2File,
     CpioFile,
diff --git a/debbindiff/comparators/deb.py b/debbindiff/comparators/deb.py
index 4d975af..48fe2c6 100644
--- a/debbindiff/comparators/deb.py
+++ b/debbindiff/comparators/deb.py
@@ -28,6 +28,7 @@ import debbindiff.comparators
 from debbindiff.comparators.binary import File, needs_content
 from debbindiff.comparators.utils import \
     Archive, ArchiveMember, get_ar_content
+from debbindiff.comparators.tar import TarContainer, get_tar_listing
 
 AR_EXTRACTION_BUFFER_SIZE = 32768
 
@@ -64,6 +65,13 @@ class DebFile(File):
     def recognizes(file):
         return DebFile.RE_FILE_TYPE.match(file.magic_file_type)
 
+    @property
+    def files_with_same_content_in_data(self):
+        return self._files_with_same_content_in_data
+
+    def set_files_with_same_content_in_data(self, files):
+        self._files_with_same_content_in_data = files
+
     @needs_content
     def compare_details(self, other, source=None):
         differences = []
@@ -84,10 +92,68 @@ class Md5sumsFile(File):
                file.name == './md5sums' and \
                isinstance(file.container.source, ArchiveMember) and \
                isinstance(file.container.source.container.source, ArchiveMember) and \
-               file.container.source.container.source.name.startswith('control.tar.')
+               file.container.source.container.source.name.startswith('control.tar.') and \
+               isinstance(file.container.source.container.source.container.source, DebFile)
 
+    @staticmethod
+    def parse_md5sums(path):
+        d = {}
+        with open(path) as f:
+            for line in f.readlines():
+                md5sum, path = re.split(r'\s+', line.strip(), maxsplit=1)
+                d[path] = md5sum
+        return d
+
+    @needs_content
     def compare(self, other, source=None):
         if self.has_same_content_as(other):
            return None
-        return Difference(None, self.path, other.path, source='md5sums',
-                          comment="Files in package differs")
+        try:
+            my_md5sums = Md5sumsFile.parse_md5sums(self.path)
+            other_md5sums = Md5sumsFile.parse_md5sums(other.path)
+            same = set()
+            for path in set(my_md5sums.keys()).intersection(set(other_md5sums.keys())):
+                if my_md5sums[path] == other_md5sums[path]:
+                    same.add('./%s' % path)
+            self.container.source.container.source.container.source.set_files_with_same_content_in_data(same)
+            logger.debug('Identifed %d files as identical in data archive', len(same))
+            return Difference(None, self.path, other.path, source='md5sums',
+                              comment="Files in package differs")
+        except ValueError as e:
+            difference = self.compare_bytes(other)
+            difference.comment = 'Malformed md5sums file'
+            return Difference
+
+
+class DebTarContainer(TarContainer):
+    def __init__(self, archive, ignore_files):
+        super(DebTarContainer, self).__init__(archive)
+        assert type(ignore_files) is set
+        self._ignore_files = ignore_files
+
+    def get_member_names(self):
+        names = set(super(DebTarContainer, self).get_member_names())
+        return names - self._ignore_files
+
+
+class DebDataTarFile(File):
+    @staticmethod
+    def recognizes(file):
+        return isinstance(file, ArchiveMember) and \
+               isinstance(file.container.source, ArchiveMember) and \
+               file.container.source.name.startswith('data.tar.') and \
+               isinstance(file.container.source.container.source, DebFile)
+
+    @needs_content
+    def compare_details(self, other, source=None):
+        differences = []
+        ignore_files = self.container.source.container.source.files_with_same_content_in_data
+        with DebTarContainer(self, ignore_files).open() as my_container, \
+             DebTarContainer(other, ignore_files).open() as other_container:
+            # look up differences in file list and file metadata
+            my_listing = get_tar_listing(my_container.archive)
+            other_listing = get_tar_listing(other_container.archive)
+            differences.append(Difference.from_unicode(
+                                  my_listing, other_listing, self.name, other.name, source="metadata"))
+            differences.extend(my_container.compare(other_container, source))
+        return differences

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/debbindiff.git



More information about the Reproducible-commits mailing list