[Reproducible-commits] [debbindiff] 01/01: Work-in-progress: replace ssdeep by tlsh
Jérémy Bobbio
lunar at moszumanska.debian.org
Fri Jul 31 10:22:34 UTC 2015
This is an automated email from the git hooks/post-receive script.
lunar pushed a commit to branch pu/tlsh
in repository debbindiff.
commit 6cbefca592034e98c85b719b332a4b720bcdeb79
Author: Jérémy Bobbio <lunar at debian.org>
Date: Fri Jul 31 10:21:28 2015 +0000
Work-in-progress: replace ssdeep by tlsh
---
debbindiff/comparators/__init__.py | 20 ++++++++++++--------
debbindiff/comparators/binary.py | 8 ++++++--
debbindiff/comparators/utils.py | 2 +-
debian/control | 2 +-
setup.py | 2 +-
5 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/debbindiff/comparators/__init__.py b/debbindiff/comparators/__init__.py
index cf28f82..0cc8b3f 100644
--- a/debbindiff/comparators/__init__.py
+++ b/debbindiff/comparators/__init__.py
@@ -23,7 +23,7 @@ import operator
import os.path
import re
import sys
-import ssdeep
+import tlsh
from debbindiff import logger, tool_required
from debbindiff.difference import Difference
from debbindiff.comparators.binary import \
@@ -126,20 +126,24 @@ def specialize(file):
return file
-fuzzy_threshold = 85
+fuzzy_threshold = 60
def perform_fuzzy_matching(files1, files2):
files2 = set(files2)
already_compared = set()
for file1 in filter(lambda f: not f.is_directory(), files1):
- comparisons = [(ssdeep.compare(file1.fuzzy_hash, file2.fuzzy_hash), file2)
- for file2 in files2 - already_compared
- if not file2.is_directory()]
+ if not file1.fuzzy_hash:
+ continue
+ comparisons = []
+ for file2 in files2 - already_compared:
+ if file2.is_directory() or not file2.fuzzy_hash:
+ continue
+ comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), file2))
if comparisons:
comparisons.sort(key=operator.itemgetter(0))
- similarity, file2 = comparisons[-1]
- logger.debug('fuzzy top match %s %s: %d', file1.name, file2.name, similarity)
- if similarity >= fuzzy_threshold:
+ similarity, file2 = comparisons[0]
+ logger.debug('fuzzy top match %s %s: %d difference score', file1.name, file2.name, similarity)
+ if similarity < fuzzy_threshold:
yield file1, file2, similarity
already_compared.add(file2)
diff --git a/debbindiff/comparators/binary.py b/debbindiff/comparators/binary.py
index 4481cf6..4a8274e 100644
--- a/debbindiff/comparators/binary.py
+++ b/debbindiff/comparators/binary.py
@@ -26,7 +26,7 @@ import os.path
import re
from stat import S_ISCHR, S_ISBLK
import subprocess
-import ssdeep
+import tlsh
import magic
from debbindiff.difference import Difference
from debbindiff import tool_required, RequiredToolNotFound, logger
@@ -116,7 +116,11 @@ class File(object):
def fuzzy_hash(self):
if not hasattr(self, '_fuzzy_hash'):
with self.get_content():
- self._fuzzy_hash = ssdeep.hash_from_file(self.path)
+ # tlsh is not meaningful with files smaller than 512 bytes
+ if os.stat(self.path).st_size >= 512:
+ self._fuzzy_hash = tlsh.hash(open(self.path).read())
+ else:
+ self._fuzzy_hash = None
return self._fuzzy_hash
@abstractmethod
diff --git a/debbindiff/comparators/utils.py b/debbindiff/comparators/utils.py
index b471837..e7cd67f 100644
--- a/debbindiff/comparators/utils.py
+++ b/debbindiff/comparators/utils.py
@@ -185,7 +185,7 @@ class Container(object):
if difference is None:
difference = Difference(None, my_file.name, other_file.name)
difference.comment = (difference.comment or '') + \
- 'Files similar (%d%%) despite different names' % similarity
+ 'Files similar despite different names (difference score: %d%%)' % similarity
differences.append(difference)
return differences
diff --git a/debian/control b/debian/control
index e854ee4..75e09e1 100644
--- a/debian/control
+++ b/debian/control
@@ -15,7 +15,7 @@ Build-Depends: debhelper (>= 9),
python-pytest,
python-rpm,
python-setuptools,
- python-ssdeep,
+ python-tlsh,
rpm-common
Standards-Version: 3.9.6
X-Python-Version: >= 2.7
diff --git a/setup.py b/setup.py
index 41aee26..0225740 100644
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,7 @@ setup(name='debbindiff',
'magic',
'rpm',
'libarchive-c',
- 'ssdeep',
+ 'tlsh',
],
classifiers=[
'Development Status :: 3 - Alpha',
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/debbindiff.git
More information about the Reproducible-commits mailing list