[Reproducible-commits] [diffoscope] 01/09: Replace ssdeep by tlsh
Jérémy Bobbio
lunar at moszumanska.debian.org
Thu Aug 20 22:09:41 UTC 2015
This is an automated email from the git hooks/post-receive script.
lunar pushed a commit to branch master
in repository diffoscope.
commit 6c9206de3cb2fe6578baace71bd2a23320c4a41b
Author: Jérémy Bobbio <lunar at debian.org>
Date: Fri Jul 31 10:21:28 2015 +0000
Replace ssdeep by tlsh
It's just way more accurate! A typical example is identifying debug symbols.
---
debian/control | 2 +-
diffoscope/comparators/__init__.py | 22 +++++++++++++---------
diffoscope/comparators/binary.py | 8 ++++++--
diffoscope/comparators/utils.py | 4 ++--
setup.py | 2 +-
5 files changed, 23 insertions(+), 15 deletions(-)
diff --git a/debian/control b/debian/control
index db93d93..2673149 100644
--- a/debian/control
+++ b/debian/control
@@ -15,7 +15,7 @@ Build-Depends: debhelper (>= 9),
python-pytest,
python-rpm,
python-setuptools,
- python-ssdeep,
+ python-tlsh,
rpm-common
Standards-Version: 3.9.6
X-Python-Version: >= 2.7
diff --git a/diffoscope/comparators/__init__.py b/diffoscope/comparators/__init__.py
index fa8bd5c..399929e 100644
--- a/diffoscope/comparators/__init__.py
+++ b/diffoscope/comparators/__init__.py
@@ -23,7 +23,7 @@ import operator
import os.path
import re
import sys
-import ssdeep
+import tlsh
from diffoscope import logger, tool_required
from diffoscope.difference import Difference
from diffoscope.comparators.binary import \
@@ -126,20 +126,24 @@ def specialize(file):
return file
-fuzzy_threshold = 85
+fuzzy_threshold = 60
def perform_fuzzy_matching(files1, files2):
files2 = set(files2)
already_compared = set()
for file1 in filter(lambda f: not f.is_directory(), files1):
- comparisons = [(ssdeep.compare(file1.fuzzy_hash, file2.fuzzy_hash), file2)
- for file2 in files2 - already_compared
- if not file2.is_directory()]
+ if not file1.fuzzy_hash:
+ continue
+ comparisons = []
+ for file2 in files2 - already_compared:
+ if file2.is_directory() or not file2.fuzzy_hash:
+ continue
+ comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), file2))
if comparisons:
comparisons.sort(key=operator.itemgetter(0))
- similarity, file2 = comparisons[-1]
- logger.debug('fuzzy top match %s %s: %d', file1.name, file2.name, similarity)
- if similarity >= fuzzy_threshold:
- yield file1, file2, similarity
+ score, file2 = comparisons[0]
+ logger.debug('fuzzy top match %s %s: %d difference score', file1.name, file2.name, score)
+ if score < fuzzy_threshold:
+ yield file1, file2, score
already_compared.add(file2)
diff --git a/diffoscope/comparators/binary.py b/diffoscope/comparators/binary.py
index e92ca82..3192a6f 100644
--- a/diffoscope/comparators/binary.py
+++ b/diffoscope/comparators/binary.py
@@ -26,7 +26,7 @@ import os.path
import re
from stat import S_ISCHR, S_ISBLK
import subprocess
-import ssdeep
+import tlsh
import magic
from diffoscope.difference import Difference
from diffoscope import tool_required, RequiredToolNotFound, logger
@@ -116,7 +116,11 @@ class File(object):
def fuzzy_hash(self):
if not hasattr(self, '_fuzzy_hash'):
with self.get_content():
- self._fuzzy_hash = ssdeep.hash_from_file(self.path)
+ # tlsh is not meaningful with files smaller than 512 bytes
+ if os.stat(self.path).st_size >= 512:
+ self._fuzzy_hash = tlsh.hash(open(self.path).read())
+ else:
+ self._fuzzy_hash = None
return self._fuzzy_hash
@abstractmethod
diff --git a/diffoscope/comparators/utils.py b/diffoscope/comparators/utils.py
index e39e5ec..af54f87 100644
--- a/diffoscope/comparators/utils.py
+++ b/diffoscope/comparators/utils.py
@@ -180,12 +180,12 @@ class Container(object):
my_file, other_file, source=name))
my_extra_files = map(self.get_member, my_names.difference(other_names))
other_extra_files = map(other.get_member, other_names.difference(my_names))
- for my_file, other_file, similarity in diffoscope.comparators.perform_fuzzy_matching(my_extra_files, other_extra_files):
+ for my_file, other_file, score in diffoscope.comparators.perform_fuzzy_matching(my_extra_files, other_extra_files):
difference = diffoscope.comparators.compare_files(my_file, other_file)
if difference is None:
difference = Difference(None, my_file.name, other_file.name)
difference.add_comment(
- 'Files similar (%d%%) despite different names' % similarity)
+ 'Files similar despite different names (difference score: %d)' % score)
differences.append(difference)
return differences
diff --git a/setup.py b/setup.py
index 7ffac88..b9c8cac 100644
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,7 @@ setup(name='diffoscope',
'magic',
'rpm',
'libarchive-c',
- 'ssdeep',
+ 'tlsh',
],
classifiers=[
'Development Status :: 3 - Alpha',
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git
More information about the Reproducible-commits
mailing list