[Reproducible-commits] [diffoscope] 06/06: Add --fuzzy-threshold option

Jérémy Bobbio lunar at moszumanska.debian.org
Wed Sep 2 12:47:15 UTC 2015


This is an automated email from the git hooks/post-receive script.

lunar pushed a commit to branch master
in repository diffoscope.

commit 4342fa3605397c4de99c99a796969decc6159ad3
Author: Jérémy Bobbio <lunar at debian.org>
Date:   Wed Sep 2 11:08:25 2015 +0000

    Add --fuzzy-threshold option
    
    This allows specifying the TLSH score used as the cut-off for fuzzy matching.
    Specifying 0 will disable fuzzy-matching entirely.
    
    Thanks to Jakub Wilk for prompting me to implement this.
    
    Closes: #797557
---
 diffoscope/__main__.py             | 8 +++++++-
 diffoscope/comparators/__init__.py | 8 ++++----
 diffoscope/config.py               | 8 ++++++++
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/diffoscope/__main__.py b/diffoscope/__main__.py
index 8426dd8..927e9ad 100644
--- a/diffoscope/__main__.py
+++ b/diffoscope/__main__.py
@@ -58,11 +58,16 @@ def create_parser():
     parser.add_argument('--max-diff-block-lines', dest='max_diff_block_lines', type=int,
                         help='maximum number of lines per diff block (default: %d)' %
                         Config.general.max_diff_block_lines,
-                        default=Config.general.max_diff_input_lines)
+                        default=Config.general.max_diff_block_lines)
     parser.add_argument('--max-diff-input-lines', dest='max_diff_input_lines', type=int,
                         help='maximum number of lines fed to diff (default: %d)' %
                         Config.general.max_diff_input_lines,
                         default=Config.general.max_diff_input_lines)
+    parser.add_argument('--fuzzy-threshold', dest='fuzzy_threshold', type=int,
+                        help='threshold for fuzzy-matching '
+                             '(0 to disable, %d is default, 400 is high fuzziness)' %
+                             (Config.general.fuzzy_threshold),
+                        default=Config.general.fuzzy_threshold)
     parser.add_argument('--css', metavar='url', dest='css_url',
                         help='link to an extra CSS for the HTML report')
     parser.add_argument('file1', help='first file to compare')
@@ -101,6 +106,7 @@ def run_diffoscope(parsed_args):
     Config.general.max_diff_block_lines = parsed_args.max_diff_block_lines
     Config.general.max_diff_input_lines = parsed_args.max_diff_input_lines
     Config.general.max_report_size = parsed_args.max_report_size
+    Config.general.fuzzy_threshold = parsed_args.fuzzy_threshold
     if parsed_args.debug:
         logger.setLevel(logging.DEBUG)
     set_locale()
diff --git a/diffoscope/comparators/__init__.py b/diffoscope/comparators/__init__.py
index 119b385..3181985 100644
--- a/diffoscope/comparators/__init__.py
+++ b/diffoscope/comparators/__init__.py
@@ -25,6 +25,7 @@ import re
 import sys
 import tlsh
 from diffoscope import logger, tool_required
+from diffoscope.config import Config
 from diffoscope.difference import Difference
 from diffoscope.comparators.binary import \
     File, FilesystemFile, compare_binary_files
@@ -126,10 +127,9 @@ def specialize(file):
     return file
 
 
-fuzzy_threshold = 60
-
-
 def perform_fuzzy_matching(files1, files2):
+    if Config.general.fuzzy_threshold == 0:
+        return
     files2 = set(files2)
     already_compared = set()
     for file1 in filter(lambda f: not f.is_directory(), files1):
@@ -144,6 +144,6 @@ def perform_fuzzy_matching(files1, files2):
             comparisons.sort(key=operator.itemgetter(0))
             score, file2 = comparisons[0]
             logger.debug('fuzzy top match %s %s: %d difference score', file1.name, file2.name, score)
-            if score < fuzzy_threshold:
+            if score < Config.general.fuzzy_threshold:
                 yield file1, file2, score
                 already_compared.add(file2)
diff --git a/diffoscope/config.py b/diffoscope/config.py
index cf927ca..4468fc8 100644
--- a/diffoscope/config.py
+++ b/diffoscope/config.py
@@ -30,6 +30,7 @@ class Config(object):
         self._max_diff_block_lines = 50
         self._max_diff_input_lines = 100000 # GNU diff cannot process arbitrary large files :(
         self._max_report_size = 2000 * 2 ** 10 # 2000 kB
+        self._fuzzy_threshold = 60
 
     @classproperty
     def general(cls):
@@ -61,3 +62,10 @@ class Config(object):
     def max_report_size(self, value):
         self._max_report_size = value
 
+    @property
+    def fuzzy_threshold(self):
+        return self._fuzzy_threshold
+
+    @fuzzy_threshold.setter
+    def fuzzy_threshold(self, value):
+        self._fuzzy_threshold = value

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git



More information about the Reproducible-commits mailing list