[Reproducible-commits] [debbindiff] 04/04: Implement fuzzy-matching of files in the same container

Jérémy Bobbio lunar at moszumanska.debian.org
Fri Jul 31 05:44:51 UTC 2015


This is an automated email from the git hooks/post-receive script.

lunar pushed a commit to branch master
in repository debbindiff.

commit 52271e74344710e9e33ec1c025c3bafc9cb7cc83
Author: Jérémy Bobbio <lunar at debian.org>
Date:   Fri Jul 31 05:41:59 2015 +0000

    Implement fuzzy-matching of files in the same container
    
    We now use ssdeep to perform fuzzy-matching of files. This is
    currently limited to a single container. It works well enough
    for comparing source tarballs with a different prefix. Sadly,
    it doesn't work for debug symbols which are not detected as similar
    enough.
---
 debbindiff/comparators/__init__.py |  16 ++++++++++++
 debbindiff/comparators/binary.py   |   8 ++++++
 debbindiff/comparators/utils.py    |  15 ++++++++---
 debian/control                     |   1 +
 setup.py                           |   1 +
 tests/comparators/test_utils.py    |  51 +++++++++++++++++++++++++++++++++++++
 tests/data/fuzzy1.tar              | Bin 0 -> 10240 bytes
 tests/data/fuzzy2.tar              | Bin 0 -> 10240 bytes
 tests/data/fuzzy3.tar              | Bin 0 -> 10240 bytes
 9 files changed, 89 insertions(+), 3 deletions(-)

diff --git a/debbindiff/comparators/__init__.py b/debbindiff/comparators/__init__.py
index b1634ad..b35e309 100644
--- a/debbindiff/comparators/__init__.py
+++ b/debbindiff/comparators/__init__.py
@@ -22,6 +22,7 @@ import magic
 import os.path
 import re
 import sys
+import ssdeep
 from debbindiff import logger, tool_required
 from debbindiff.difference import Difference
 from debbindiff.comparators.binary import \
@@ -122,3 +123,18 @@ def specialize(file):
             return file
     logger.debug('Unidentified file. Magic says: %s' % file.magic_file_type)
     return file
+
+
+fuzzy_threshold = 85
+
+
+def perform_fuzzy_matching(files1, files2):
+    already_compared = set()
+    for file1 in filter(lambda f: not f.is_directory(), files1):
+        for file2 in filter(lambda f: not f.is_directory(), files2):
+            similarity = ssdeep.compare(file1.fuzzy_hash, file2.fuzzy_hash)
+            logger.debug('fuzzy matching %s %s: %d', file1.name, file2.name, similarity)
+            if similarity >= fuzzy_threshold:
+                yield file1, file2, similarity
+                already_compared.add(file2)
+                break
diff --git a/debbindiff/comparators/binary.py b/debbindiff/comparators/binary.py
index afa19a0..4481cf6 100644
--- a/debbindiff/comparators/binary.py
+++ b/debbindiff/comparators/binary.py
@@ -26,6 +26,7 @@ import os.path
 import re
 from stat import S_ISCHR, S_ISBLK
 import subprocess
+import ssdeep
 import magic
 from debbindiff.difference import Difference
 from debbindiff import tool_required, RequiredToolNotFound, logger
@@ -111,6 +112,13 @@ class File(object):
                 self._magic_file_type = File.guess_file_type(self.path)
         return self._magic_file_type
 
+    @property
+    def fuzzy_hash(self):
+        if not hasattr(self, '_fuzzy_hash'):
+            with self.get_content():
+                self._fuzzy_hash = ssdeep.hash_from_file(self.path)
+        return self._fuzzy_hash
+
     @abstractmethod
     @contextmanager
     def get_content(self):
diff --git a/debbindiff/comparators/utils.py b/debbindiff/comparators/utils.py
index 7734a40..1857970 100644
--- a/debbindiff/comparators/utils.py
+++ b/debbindiff/comparators/utils.py
@@ -171,15 +171,24 @@ class Container(object):
 
     def compare(self, other, source=None):
         differences = []
-        my_names = self.get_member_names()
-        other_names = other.get_member_names()
-        for name in sorted(set(my_names).intersection(other_names)):
+        my_names = set(self.get_member_names())
+        other_names = set(other.get_member_names())
+        for name in sorted(my_names.intersection(other_names)):
             logger.debug('compare member %s', name)
             my_file = self.get_member(name)
             other_file = other.get_member(name)
             differences.append(
                 debbindiff.comparators.compare_files(
                     my_file, other_file, source=name))
+        my_extra_files = map(self.get_member, my_names.difference(other_names))
+        other_extra_files = map(other.get_member, other_names.difference(my_names))
+        for my_file, other_file, similarity in debbindiff.comparators.perform_fuzzy_matching(my_extra_files, other_extra_files):
+            difference = debbindiff.comparators.compare_files(my_file, other_file)
+            if difference is None:
+                difference = Difference(None, my_file.name, other_file.name)
+            difference.comment = (difference.comment or '') + \
+                'Files similar (%d%%) despite different names' % similarity
+            differences.append(difference)
         return differences
 
 
diff --git a/debian/control b/debian/control
index 895acb3..38f59dd 100644
--- a/debian/control
+++ b/debian/control
@@ -15,6 +15,7 @@ Build-Depends: debhelper (>= 9),
                python-pytest,
                python-rpm,
                python-setuptools,
+               python-ssdeep,
                rpm-common
 Standards-Version: 3.9.6
 X-Python-Version: >= 2.7
diff --git a/setup.py b/setup.py
index 5f97127..41aee26 100644
--- a/setup.py
+++ b/setup.py
@@ -38,6 +38,7 @@ setup(name='debbindiff',
           'magic',
           'rpm',
           'libarchive-c',
+          'ssdeep',
           ],
       classifiers=[
           'Development Status :: 3 - Alpha',
diff --git a/tests/comparators/test_utils.py b/tests/comparators/test_utils.py
new file mode 100644
index 0000000..301db8a
--- /dev/null
+++ b/tests/comparators/test_utils.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# debbindiff: highlight differences between two builds of Debian packages
+#
+# Copyright © 2015 Jérémy Bobbio <lunar at debian.org>
+#
+# debbindiff is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# debbindiff is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with debbindiff.  If not, see <http://www.gnu.org/licenses/>.
+
+import codecs
+import os.path
+import pytest
+from debbindiff.comparators import specialize
+from debbindiff.comparators.binary import FilesystemFile
+from debbindiff.comparators.text import TextFile
+
+@pytest.fixture
+def fuzzy_tar1():
+    return specialize(FilesystemFile(os.path.join(os.path.dirname(__file__), '../data/fuzzy1.tar')))
+
+@pytest.fixture
+def fuzzy_tar2():
+    return specialize(FilesystemFile(os.path.join(os.path.dirname(__file__), '../data/fuzzy2.tar')))
+
+@pytest.fixture
+def fuzzy_tar3():
+    return specialize(FilesystemFile(os.path.join(os.path.dirname(__file__), '../data/fuzzy3.tar')))
+
+def test_fuzzy_matching(fuzzy_tar1, fuzzy_tar2):
+    differences = fuzzy_tar1.compare(fuzzy_tar2).details
+    expected_diff = codecs.open(os.path.join(os.path.dirname(__file__), '../data/text_iso8859_expected_diff'), encoding='utf-8').read()
+    assert differences[1].source1 == './matching'
+    assert differences[1].source2 == './fuzzy'
+    assert 'similar' in differences[1].comment
+    assert differences[1].unified_diff == expected_diff
+
+def test_fuzzy_matching_only_once(fuzzy_tar1, fuzzy_tar3):
+    differences = fuzzy_tar1.compare(fuzzy_tar3).details
+    assert len(differences) == 2
+    expected_diff = codecs.open(os.path.join(os.path.dirname(__file__), '../data/text_iso8859_expected_diff'), encoding='utf-8').read()
diff --git a/tests/data/fuzzy1.tar b/tests/data/fuzzy1.tar
new file mode 100644
index 0000000..4ca1f15
Binary files /dev/null and b/tests/data/fuzzy1.tar differ
diff --git a/tests/data/fuzzy2.tar b/tests/data/fuzzy2.tar
new file mode 100644
index 0000000..3acff7d
Binary files /dev/null and b/tests/data/fuzzy2.tar differ
diff --git a/tests/data/fuzzy3.tar b/tests/data/fuzzy3.tar
new file mode 100644
index 0000000..ec81e80
Binary files /dev/null and b/tests/data/fuzzy3.tar differ

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/debbindiff.git



More information about the Reproducible-commits mailing list