[Reproducible-commits] [debbindiff] 01/01: Work-in-progress: replace ssdeep by tlsh

Jérémy Bobbio lunar at moszumanska.debian.org
Fri Jul 31 10:22:34 UTC 2015


This is an automated email from the git hooks/post-receive script.

lunar pushed a commit to branch pu/tlsh
in repository debbindiff.

commit 6cbefca592034e98c85b719b332a4b720bcdeb79
Author: Jérémy Bobbio <lunar at debian.org>
Date:   Fri Jul 31 10:21:28 2015 +0000

    Work-in-progress: replace ssdeep by tlsh
---
 debbindiff/comparators/__init__.py | 20 ++++++++++++--------
 debbindiff/comparators/binary.py   |  8 ++++++--
 debbindiff/comparators/utils.py    |  2 +-
 debian/control                     |  2 +-
 setup.py                           |  2 +-
 5 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/debbindiff/comparators/__init__.py b/debbindiff/comparators/__init__.py
index cf28f82..0cc8b3f 100644
--- a/debbindiff/comparators/__init__.py
+++ b/debbindiff/comparators/__init__.py
@@ -23,7 +23,7 @@ import operator
 import os.path
 import re
 import sys
-import ssdeep
+import tlsh
 from debbindiff import logger, tool_required
 from debbindiff.difference import Difference
 from debbindiff.comparators.binary import \
@@ -126,20 +126,24 @@ def specialize(file):
     return file
 
 
-fuzzy_threshold = 85
+fuzzy_threshold = 60
 
 
 def perform_fuzzy_matching(files1, files2):
     files2 = set(files2)
     already_compared = set()
     for file1 in filter(lambda f: not f.is_directory(), files1):
-        comparisons = [(ssdeep.compare(file1.fuzzy_hash, file2.fuzzy_hash), file2)
-                       for file2 in files2 - already_compared
-                       if not file2.is_directory()]
+        if not file1.fuzzy_hash:
+            continue
+        comparisons = []
+        for file2 in files2 - already_compared:
+            if file2.is_directory() or not file2.fuzzy_hash:
+                continue
+            comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), file2))
         if comparisons:
             comparisons.sort(key=operator.itemgetter(0))
-            similarity, file2 = comparisons[-1]
-            logger.debug('fuzzy top  match %s %s: %d', file1.name, file2.name, similarity)
-            if similarity >= fuzzy_threshold:
+            similarity, file2 = comparisons[0]
+            logger.debug('fuzzy top match %s %s: %d difference score', file1.name, file2.name, similarity)
+            if similarity < fuzzy_threshold:
                 yield file1, file2, similarity
                 already_compared.add(file2)
diff --git a/debbindiff/comparators/binary.py b/debbindiff/comparators/binary.py
index 4481cf6..4a8274e 100644
--- a/debbindiff/comparators/binary.py
+++ b/debbindiff/comparators/binary.py
@@ -26,7 +26,7 @@ import os.path
 import re
 from stat import S_ISCHR, S_ISBLK
 import subprocess
-import ssdeep
+import tlsh
 import magic
 from debbindiff.difference import Difference
 from debbindiff import tool_required, RequiredToolNotFound, logger
@@ -116,7 +116,11 @@ class File(object):
     def fuzzy_hash(self):
         if not hasattr(self, '_fuzzy_hash'):
             with self.get_content():
-                self._fuzzy_hash = ssdeep.hash_from_file(self.path)
+                # tlsh is not meaningful with files smaller than 512 bytes
+                if os.stat(self.path).st_size >= 512:
+                    self._fuzzy_hash = tlsh.hash(open(self.path).read())
+                else:
+                    self._fuzzy_hash = None
         return self._fuzzy_hash
 
     @abstractmethod
diff --git a/debbindiff/comparators/utils.py b/debbindiff/comparators/utils.py
index b471837..e7cd67f 100644
--- a/debbindiff/comparators/utils.py
+++ b/debbindiff/comparators/utils.py
@@ -185,7 +185,7 @@ class Container(object):
             if difference is None:
                 difference = Difference(None, my_file.name, other_file.name)
             difference.comment = (difference.comment or '') + \
-                'Files similar (%d%%) despite different names' % similarity
+                'Files similar despite different names (difference score: %d%%)' % similarity
             differences.append(difference)
         return differences
 
diff --git a/debian/control b/debian/control
index e854ee4..75e09e1 100644
--- a/debian/control
+++ b/debian/control
@@ -15,7 +15,7 @@ Build-Depends: debhelper (>= 9),
                python-pytest,
                python-rpm,
                python-setuptools,
-               python-ssdeep,
+               python-tlsh,
                rpm-common
 Standards-Version: 3.9.6
 X-Python-Version: >= 2.7
diff --git a/setup.py b/setup.py
index 41aee26..0225740 100644
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,7 @@ setup(name='debbindiff',
           'magic',
           'rpm',
           'libarchive-c',
-          'ssdeep',
+          'tlsh',
           ],
       classifiers=[
           'Development Status :: 3 - Alpha',

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/debbindiff.git



More information about the Reproducible-commits mailing list