[Reproducible-commits] [diffoscope] 01/09: Replace ssdeep by tlsh

Jérémy Bobbio lunar at moszumanska.debian.org
Thu Aug 20 22:09:41 UTC 2015


This is an automated email from the git hooks/post-receive script.

lunar pushed a commit to branch master
in repository diffoscope.

commit 6c9206de3cb2fe6578baace71bd2a23320c4a41b
Author: Jérémy Bobbio <lunar at debian.org>
Date:   Fri Jul 31 10:21:28 2015 +0000

    Replace ssdeep by tlsh
    
    It's just way more accurate! A typical example is identifying debug symbols.
---
 debian/control                     |  2 +-
 diffoscope/comparators/__init__.py | 22 +++++++++++++---------
 diffoscope/comparators/binary.py   |  8 ++++++--
 diffoscope/comparators/utils.py    |  4 ++--
 setup.py                           |  2 +-
 5 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/debian/control b/debian/control
index db93d93..2673149 100644
--- a/debian/control
+++ b/debian/control
@@ -15,7 +15,7 @@ Build-Depends: debhelper (>= 9),
                python-pytest,
                python-rpm,
                python-setuptools,
-               python-ssdeep,
+               python-tlsh,
                rpm-common
 Standards-Version: 3.9.6
 X-Python-Version: >= 2.7
diff --git a/diffoscope/comparators/__init__.py b/diffoscope/comparators/__init__.py
index fa8bd5c..399929e 100644
--- a/diffoscope/comparators/__init__.py
+++ b/diffoscope/comparators/__init__.py
@@ -23,7 +23,7 @@ import operator
 import os.path
 import re
 import sys
-import ssdeep
+import tlsh
 from diffoscope import logger, tool_required
 from diffoscope.difference import Difference
 from diffoscope.comparators.binary import \
@@ -126,20 +126,24 @@ def specialize(file):
     return file
 
 
-fuzzy_threshold = 85
+fuzzy_threshold = 60
 
 
 def perform_fuzzy_matching(files1, files2):
     files2 = set(files2)
     already_compared = set()
     for file1 in filter(lambda f: not f.is_directory(), files1):
-        comparisons = [(ssdeep.compare(file1.fuzzy_hash, file2.fuzzy_hash), file2)
-                       for file2 in files2 - already_compared
-                       if not file2.is_directory()]
+        if not file1.fuzzy_hash:
+            continue
+        comparisons = []
+        for file2 in files2 - already_compared:
+            if file2.is_directory() or not file2.fuzzy_hash:
+                continue
+            comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), file2))
         if comparisons:
             comparisons.sort(key=operator.itemgetter(0))
-            similarity, file2 = comparisons[-1]
-            logger.debug('fuzzy top  match %s %s: %d', file1.name, file2.name, similarity)
-            if similarity >= fuzzy_threshold:
-                yield file1, file2, similarity
+            score, file2 = comparisons[0]
+            logger.debug('fuzzy top match %s %s: %d difference score', file1.name, file2.name, score)
+            if score < fuzzy_threshold:
+                yield file1, file2, score
                 already_compared.add(file2)
diff --git a/diffoscope/comparators/binary.py b/diffoscope/comparators/binary.py
index e92ca82..3192a6f 100644
--- a/diffoscope/comparators/binary.py
+++ b/diffoscope/comparators/binary.py
@@ -26,7 +26,7 @@ import os.path
 import re
 from stat import S_ISCHR, S_ISBLK
 import subprocess
-import ssdeep
+import tlsh
 import magic
 from diffoscope.difference import Difference
 from diffoscope import tool_required, RequiredToolNotFound, logger
@@ -116,7 +116,11 @@ class File(object):
     def fuzzy_hash(self):
         if not hasattr(self, '_fuzzy_hash'):
             with self.get_content():
-                self._fuzzy_hash = ssdeep.hash_from_file(self.path)
+                # tlsh is not meaningful with files smaller than 512 bytes
+                if os.stat(self.path).st_size >= 512:
+                    self._fuzzy_hash = tlsh.hash(open(self.path).read())
+                else:
+                    self._fuzzy_hash = None
         return self._fuzzy_hash
 
     @abstractmethod
diff --git a/diffoscope/comparators/utils.py b/diffoscope/comparators/utils.py
index e39e5ec..af54f87 100644
--- a/diffoscope/comparators/utils.py
+++ b/diffoscope/comparators/utils.py
@@ -180,12 +180,12 @@ class Container(object):
                     my_file, other_file, source=name))
         my_extra_files = map(self.get_member, my_names.difference(other_names))
         other_extra_files = map(other.get_member, other_names.difference(my_names))
-        for my_file, other_file, similarity in diffoscope.comparators.perform_fuzzy_matching(my_extra_files, other_extra_files):
+        for my_file, other_file, score in diffoscope.comparators.perform_fuzzy_matching(my_extra_files, other_extra_files):
             difference = diffoscope.comparators.compare_files(my_file, other_file)
             if difference is None:
                 difference = Difference(None, my_file.name, other_file.name)
             difference.add_comment(
-                'Files similar (%d%%) despite different names' % similarity)
+                'Files similar despite different names (difference score: %d)' % score)
             differences.append(difference)
         return differences
 
diff --git a/setup.py b/setup.py
index 7ffac88..b9c8cac 100644
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,7 @@ setup(name='diffoscope',
           'magic',
           'rpm',
           'libarchive-c',
-          'ssdeep',
+          'tlsh',
           ],
       classifiers=[
           'Development Status :: 3 - Alpha',

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git



More information about the Reproducible-commits mailing list