[Reproducible-commits] [debbindiff] 04/04: Implement fuzzy-matching of files in the same container
Jérémy Bobbio
lunar at moszumanska.debian.org
Fri Jul 31 05:44:51 UTC 2015
This is an automated email from the git hooks/post-receive script.
lunar pushed a commit to branch master
in repository debbindiff.
commit 52271e74344710e9e33ec1c025c3bafc9cb7cc83
Author: Jérémy Bobbio <lunar at debian.org>
Date: Fri Jul 31 05:41:59 2015 +0000
Implement fuzzy-matching of files in the same container
We now use ssdeep to perform fuzzy-matching of files. This is
currently limited to a single container. It works well enough
for comparing source tarballs with a different prefix. Sadly,
it doesn't work for debug symbols which are not detected as similar
enough.
---
debbindiff/comparators/__init__.py | 16 ++++++++++++
debbindiff/comparators/binary.py | 8 ++++++
debbindiff/comparators/utils.py | 15 ++++++++---
debian/control | 1 +
setup.py | 1 +
tests/comparators/test_utils.py | 51 +++++++++++++++++++++++++++++++++++++
tests/data/fuzzy1.tar | Bin 0 -> 10240 bytes
tests/data/fuzzy2.tar | Bin 0 -> 10240 bytes
tests/data/fuzzy3.tar | Bin 0 -> 10240 bytes
9 files changed, 89 insertions(+), 3 deletions(-)
diff --git a/debbindiff/comparators/__init__.py b/debbindiff/comparators/__init__.py
index b1634ad..b35e309 100644
--- a/debbindiff/comparators/__init__.py
+++ b/debbindiff/comparators/__init__.py
@@ -22,6 +22,7 @@ import magic
import os.path
import re
import sys
+import ssdeep
from debbindiff import logger, tool_required
from debbindiff.difference import Difference
from debbindiff.comparators.binary import \
@@ -122,3 +123,18 @@ def specialize(file):
return file
logger.debug('Unidentified file. Magic says: %s' % file.magic_file_type)
return file
+
+
+fuzzy_threshold = 85
+
+
+def perform_fuzzy_matching(files1, files2):
+ already_compared = set()
+ for file1 in filter(lambda f: not f.is_directory(), files1):
+ for file2 in filter(lambda f: not f.is_directory(), files2):
+ similarity = ssdeep.compare(file1.fuzzy_hash, file2.fuzzy_hash)
+ logger.debug('fuzzy matching %s %s: %d', file1.name, file2.name, similarity)
+ if similarity >= fuzzy_threshold:
+ yield file1, file2, similarity
+ already_compared.add(file2)
+ break
diff --git a/debbindiff/comparators/binary.py b/debbindiff/comparators/binary.py
index afa19a0..4481cf6 100644
--- a/debbindiff/comparators/binary.py
+++ b/debbindiff/comparators/binary.py
@@ -26,6 +26,7 @@ import os.path
import re
from stat import S_ISCHR, S_ISBLK
import subprocess
+import ssdeep
import magic
from debbindiff.difference import Difference
from debbindiff import tool_required, RequiredToolNotFound, logger
@@ -111,6 +112,13 @@ class File(object):
self._magic_file_type = File.guess_file_type(self.path)
return self._magic_file_type
+ @property
+ def fuzzy_hash(self):
+ if not hasattr(self, '_fuzzy_hash'):
+ with self.get_content():
+ self._fuzzy_hash = ssdeep.hash_from_file(self.path)
+ return self._fuzzy_hash
+
@abstractmethod
@contextmanager
def get_content(self):
diff --git a/debbindiff/comparators/utils.py b/debbindiff/comparators/utils.py
index 7734a40..1857970 100644
--- a/debbindiff/comparators/utils.py
+++ b/debbindiff/comparators/utils.py
@@ -171,15 +171,24 @@ class Container(object):
def compare(self, other, source=None):
differences = []
- my_names = self.get_member_names()
- other_names = other.get_member_names()
- for name in sorted(set(my_names).intersection(other_names)):
+ my_names = set(self.get_member_names())
+ other_names = set(other.get_member_names())
+ for name in sorted(my_names.intersection(other_names)):
logger.debug('compare member %s', name)
my_file = self.get_member(name)
other_file = other.get_member(name)
differences.append(
debbindiff.comparators.compare_files(
my_file, other_file, source=name))
+ my_extra_files = map(self.get_member, my_names.difference(other_names))
+ other_extra_files = map(other.get_member, other_names.difference(my_names))
+ for my_file, other_file, similarity in debbindiff.comparators.perform_fuzzy_matching(my_extra_files, other_extra_files):
+ difference = debbindiff.comparators.compare_files(my_file, other_file)
+ if difference is None:
+ difference = Difference(None, my_file.name, other_file.name)
+ difference.comment = (difference.comment or '') + \
+ 'Files similar (%d%%) despite different names' % similarity
+ differences.append(difference)
return differences
diff --git a/debian/control b/debian/control
index 895acb3..38f59dd 100644
--- a/debian/control
+++ b/debian/control
@@ -15,6 +15,7 @@ Build-Depends: debhelper (>= 9),
python-pytest,
python-rpm,
python-setuptools,
+ python-ssdeep,
rpm-common
Standards-Version: 3.9.6
X-Python-Version: >= 2.7
diff --git a/setup.py b/setup.py
index 5f97127..41aee26 100644
--- a/setup.py
+++ b/setup.py
@@ -38,6 +38,7 @@ setup(name='debbindiff',
'magic',
'rpm',
'libarchive-c',
+ 'ssdeep',
],
classifiers=[
'Development Status :: 3 - Alpha',
diff --git a/tests/comparators/test_utils.py b/tests/comparators/test_utils.py
new file mode 100644
index 0000000..301db8a
--- /dev/null
+++ b/tests/comparators/test_utils.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# debbindiff: highlight differences between two builds of Debian packages
+#
+# Copyright © 2015 Jérémy Bobbio <lunar at debian.org>
+#
+# debbindiff is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# debbindiff is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with debbindiff. If not, see <http://www.gnu.org/licenses/>.
+
+import codecs
+import os.path
+import pytest
+from debbindiff.comparators import specialize
+from debbindiff.comparators.binary import FilesystemFile
+from debbindiff.comparators.text import TextFile
+
+@pytest.fixture
+def fuzzy_tar1():
+ return specialize(FilesystemFile(os.path.join(os.path.dirname(__file__), '../data/fuzzy1.tar')))
+
+@pytest.fixture
+def fuzzy_tar2():
+ return specialize(FilesystemFile(os.path.join(os.path.dirname(__file__), '../data/fuzzy2.tar')))
+
+@pytest.fixture
+def fuzzy_tar3():
+ return specialize(FilesystemFile(os.path.join(os.path.dirname(__file__), '../data/fuzzy3.tar')))
+
+def test_fuzzy_matching(fuzzy_tar1, fuzzy_tar2):
+ differences = fuzzy_tar1.compare(fuzzy_tar2).details
+ expected_diff = codecs.open(os.path.join(os.path.dirname(__file__), '../data/text_iso8859_expected_diff'), encoding='utf-8').read()
+ assert differences[1].source1 == './matching'
+ assert differences[1].source2 == './fuzzy'
+ assert 'similar' in differences[1].comment
+ assert differences[1].unified_diff == expected_diff
+
+def test_fuzzy_matching_only_once(fuzzy_tar1, fuzzy_tar3):
+ differences = fuzzy_tar1.compare(fuzzy_tar3).details
+ assert len(differences) == 2
+ expected_diff = codecs.open(os.path.join(os.path.dirname(__file__), '../data/text_iso8859_expected_diff'), encoding='utf-8').read()
diff --git a/tests/data/fuzzy1.tar b/tests/data/fuzzy1.tar
new file mode 100644
index 0000000..4ca1f15
Binary files /dev/null and b/tests/data/fuzzy1.tar differ
diff --git a/tests/data/fuzzy2.tar b/tests/data/fuzzy2.tar
new file mode 100644
index 0000000..3acff7d
Binary files /dev/null and b/tests/data/fuzzy2.tar differ
diff --git a/tests/data/fuzzy3.tar b/tests/data/fuzzy3.tar
new file mode 100644
index 0000000..ec81e80
Binary files /dev/null and b/tests/data/fuzzy3.tar differ
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/debbindiff.git
More information about the Reproducible-commits
mailing list