[Reproducible-commits] [diffoscope] 06/06: Add --fuzzy-threshold option
Jérémy Bobbio
lunar at moszumanska.debian.org
Wed Sep 2 12:47:15 UTC 2015
This is an automated email from the git hooks/post-receive script.
lunar pushed a commit to branch master
in repository diffoscope.
commit 4342fa3605397c4de99c99a796969decc6159ad3
Author: Jérémy Bobbio <lunar at debian.org>
Date: Wed Sep 2 11:08:25 2015 +0000
Add --fuzzy-threshold option
This allows specifying the TLSH score used as the cut-off for fuzzy matching.
Specifying 0 will disable fuzzy-matching entirely.
Thanks to Jakub Wilk for prompting me to implement this.
Closes: #797557
---
diffoscope/__main__.py | 8 +++++++-
diffoscope/comparators/__init__.py | 8 ++++----
diffoscope/config.py | 8 ++++++++
3 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/diffoscope/__main__.py b/diffoscope/__main__.py
index 8426dd8..927e9ad 100644
--- a/diffoscope/__main__.py
+++ b/diffoscope/__main__.py
@@ -58,11 +58,16 @@ def create_parser():
parser.add_argument('--max-diff-block-lines', dest='max_diff_block_lines', type=int,
help='maximum number of lines per diff block (default: %d)' %
Config.general.max_diff_block_lines,
- default=Config.general.max_diff_input_lines)
+ default=Config.general.max_diff_block_lines)
parser.add_argument('--max-diff-input-lines', dest='max_diff_input_lines', type=int,
help='maximum number of lines fed to diff (default: %d)' %
Config.general.max_diff_input_lines,
default=Config.general.max_diff_input_lines)
+ parser.add_argument('--fuzzy-threshold', dest='fuzzy_threshold', type=int,
+ help='threshold for fuzzy-matching '
+ '(0 to disable, %d is default, 400 is high fuzziness)' %
+ (Config.general.fuzzy_threshold),
+ default=Config.general.fuzzy_threshold)
parser.add_argument('--css', metavar='url', dest='css_url',
help='link to an extra CSS for the HTML report')
parser.add_argument('file1', help='first file to compare')
@@ -101,6 +106,7 @@ def run_diffoscope(parsed_args):
Config.general.max_diff_block_lines = parsed_args.max_diff_block_lines
Config.general.max_diff_input_lines = parsed_args.max_diff_input_lines
Config.general.max_report_size = parsed_args.max_report_size
+ Config.general.fuzzy_threshold = parsed_args.fuzzy_threshold
if parsed_args.debug:
logger.setLevel(logging.DEBUG)
set_locale()
diff --git a/diffoscope/comparators/__init__.py b/diffoscope/comparators/__init__.py
index 119b385..3181985 100644
--- a/diffoscope/comparators/__init__.py
+++ b/diffoscope/comparators/__init__.py
@@ -25,6 +25,7 @@ import re
import sys
import tlsh
from diffoscope import logger, tool_required
+from diffoscope.config import Config
from diffoscope.difference import Difference
from diffoscope.comparators.binary import \
File, FilesystemFile, compare_binary_files
@@ -126,10 +127,9 @@ def specialize(file):
return file
-fuzzy_threshold = 60
-
-
def perform_fuzzy_matching(files1, files2):
+ if Config.general.fuzzy_threshold == 0:
+ return
files2 = set(files2)
already_compared = set()
for file1 in filter(lambda f: not f.is_directory(), files1):
@@ -144,6 +144,6 @@ def perform_fuzzy_matching(files1, files2):
comparisons.sort(key=operator.itemgetter(0))
score, file2 = comparisons[0]
logger.debug('fuzzy top match %s %s: %d difference score', file1.name, file2.name, score)
- if score < fuzzy_threshold:
+ if score < Config.general.fuzzy_threshold:
yield file1, file2, score
already_compared.add(file2)
diff --git a/diffoscope/config.py b/diffoscope/config.py
index cf927ca..4468fc8 100644
--- a/diffoscope/config.py
+++ b/diffoscope/config.py
@@ -30,6 +30,7 @@ class Config(object):
self._max_diff_block_lines = 50
self._max_diff_input_lines = 100000 # GNU diff cannot process arbitrary large files :(
self._max_report_size = 2000 * 2 ** 10 # 2000 kB
+ self._fuzzy_threshold = 60
@classproperty
def general(cls):
@@ -61,3 +62,10 @@ class Config(object):
def max_report_size(self, value):
self._max_report_size = value
+ @property
+ def fuzzy_threshold(self):
+ return self._fuzzy_threshold
+
+ @fuzzy_threshold.setter
+ def fuzzy_threshold(self, value):
+ self._fuzzy_threshold = value
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git
More information about the Reproducible-commits
mailing list