[Reproducible-commits] [diffoscope] 06/06: Analyze md5sums file before working on data.tar

Jérémy Bobbio lunar at moszumanska.debian.org
Sat Dec 5 23:18:59 UTC 2015


This is an automated email from the git hooks/post-receive script.

lunar pushed a commit to branch master
in repository diffoscope.

commit 02c25aae33a18b188d6f2e19e4464c27ef91a0f7
Author: Jérémy Bobbio <lunar at debian.org>
Date:   Sat Dec 5 14:54:26 2015 +0000

    Analyze md5sums file before working on data.tar
    
    We used to be clever and analyze md5sums content to recognize files to ignore
    when they were compared themselves. Sadly, this cannot be made to work easily
    in the context of parallel processing.
    
    Instead, we now explicitly extract md5sums files to analyze their content
    when comparing data.tar. Given control.tar.gz is pretty small, this
    should not be a significant performance hit.
    
    The new method Container.lookup_file() allows retrieving an arbitrary file
    nested several containers deep.
    
    Huge thanks to Joachim Breitner for the initial implementation.
---
 diffoscope/comparators/deb.py   | 80 +++++++++++++++++++----------------------
 diffoscope/comparators/utils.py | 15 ++++++++
 tests/comparators/test_deb.py   |  5 ++-
 3 files changed, 54 insertions(+), 46 deletions(-)

diff --git a/diffoscope/comparators/deb.py b/diffoscope/comparators/deb.py
index 3851323..8448d4b 100644
--- a/diffoscope/comparators/deb.py
+++ b/diffoscope/comparators/deb.py
@@ -41,14 +41,15 @@ class DebFile(File):
         return DebFile.RE_FILE_TYPE.match(file.magic_file_type)
 
     @property
-    def files_with_same_content_in_data(self):
-        if hasattr(self, '_files_with_same_content_in_data'):
-            return self._files_with_same_content_in_data
-        else:
-            return set()
-
-    def set_files_with_same_content_in_data(self, files):
-        self._files_with_same_content_in_data = files
+    def md5sums(self):
+        if not hasattr(self, '_md5sums'):
+            md5sums_file = self.as_container.lookup_file('control.tar.gz', 'control.tar', './md5sums')
+            if md5sums_file:
+                self._md5sums = md5sums_file.parse()
+            else:
+                logger.debug('Unable to find a md5sums file')
+                self._md5sums = set()
+        return self._md5sums
 
     def compare_details(self, other, source=None):
         my_content = get_ar_content(self.path)
@@ -66,46 +67,39 @@ class Md5sumsFile(File):
                file.container.source.container.source.name.startswith('control.tar.') and \
                isinstance(file.container.source.container.source.container.source, DebFile)
 
-    @staticmethod
-    def parse_md5sums(path):
-        d = {}
-        with open(path, 'r', encoding='utf-8') as f:
-            for line in f.readlines():
-                md5sum, path = re.split(r'\s+', line.strip(), maxsplit=1)
-                d[path] = md5sum
-        return d
+    def parse(self):
+        try:
+            md5sums = {}
+            with open(self.path, 'r', encoding='utf-8') as f:
+                for line in f.readlines():
+                    md5sum, path = re.split(r'\s+', line.strip(), maxsplit=1)
+                    md5sums['./%s' % path] = md5sum
+            return md5sums
+        except (UnicodeDecodeError, ValueError):
+            logger.debug('Malformed md5sums, ignoring.')
+            return set()
 
     def compare(self, other, source=None):
-        if other.path is None:
-            return None
-        try:
-            my_md5sums = Md5sumsFile.parse_md5sums(self.path)
-            other_md5sums = Md5sumsFile.parse_md5sums(other.path)
-            same = set()
-            for path in my_md5sums.keys() & other_md5sums.keys():
-                if my_md5sums[path] == other_md5sums[path]:
-                    same.add('./%s' % path)
-            self.container.source.container.source.container.source.set_files_with_same_content_in_data(same)
-            logger.debug('Identifed %d files as identical in data archive', len(same))
-            return Difference(None, self.path, other.path, source='md5sums',
-                              comment="Files in package differs")
-        except ValueError as e:
-            difference = self.compare_bytes(other)
-            difference.add_comment('Malformed md5sums file: %s' % e)
-            return difference
+        return Difference(None, self.path, other.path, source='md5sums',
+                          comment="Files in package differs")
 
 
 class DebTarContainer(TarContainer):
-    def __init__(self, archive):
-        super().__init__(archive)
-        ignore_files = archive.container.source.container.source.files_with_same_content_in_data
-        assert type(ignore_files) is set
-        self._ignore_files = ignore_files
-
-    def get_member_names(self):
-        names = set(super().get_member_names())
-        logger.debug('Ignoring %d/%d files known identical in data.tar', len(self._ignore_files), len(names))
-        return names - self._ignore_files
+    def comparisons(self, other):
+        if self.source:
+            my_md5sums = self.source.container.source.container.source.md5sums
+        else:
+            my_md5sums = set()
+        if other.source:
+            other_md5sums = other.source.container.source.container.source.md5sums
+        else:
+            other_md5sums = set()
+        for my_member, other_member, comment in super().comparisons(other):
+            if my_member.name == other_member.name and \
+               my_md5sums.get(my_member.name, 'my') == other_md5sums.get(other_member.name, 'other'):
+                logger.debug('Skip %s: identical md5sum', my_member.name)
+                continue
+            yield my_member, other_member, comment
 
 
 class DebDataTarFile(File):
diff --git a/diffoscope/comparators/utils.py b/diffoscope/comparators/utils.py
index 59b2961..6528da2 100644
--- a/diffoscope/comparators/utils.py
+++ b/diffoscope/comparators/utils.py
@@ -168,6 +168,21 @@ class Container(object, metaclass=ABCMeta):
         """Returns a directory. The key is what is used to match when comparing containers."""
         return {name: self.get_member(name) for name in self.get_member_names()}
 
+    def lookup_file(self, *names):
+        """Try to fetch a specific file by digging in containers."""
+        name, remainings = names[0], names[1:]
+        file = self.get_member(name)
+        logger.debug('lookup_file(%s) -> %s', names, file)
+        if not file:
+            return None
+        diffoscope.comparators.specialize(file)
+        if not remainings:
+            return file
+        container = file.as_container
+        if not container:
+            return None
+        return container.lookup_file(*remainings)
+
     @abstractmethod
     def get_member_names(self):
         raise NotImplemented
diff --git a/tests/comparators/test_deb.py b/tests/comparators/test_deb.py
index b0a5790..3423865 100644
--- a/tests/comparators/test_deb.py
+++ b/tests/comparators/test_deb.py
@@ -84,9 +84,8 @@ def test_md5sums(differences):
 
 @pytest.mark.skipif(tool_missing('ar'), reason='missing ar')
 def test_identical_files_in_md5sums(deb1, deb2):
-    deb1.compare(deb2)
-    assert deb1.files_with_same_content_in_data == set(['./usr/share/doc/test/README.Debian',
-                                                        './usr/share/doc/test/copyright'])
+    for name in ['./usr/share/doc/test/README.Debian', './usr/share/doc/test/copyright']:
+        assert deb1.md5sums[name] == deb2.md5sums[name]
 
 @pytest.mark.skipif(tool_missing('ar'), reason='missing ar')
 def test_identification_of_data_tar(deb1, deb2, monkeypatch):

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git



More information about the Reproducible-commits mailing list