[diffoscope] 03/05: Split fuzzy matching to its own module.

Chris Lamb chris at chris-lamb.co.uk
Tue Dec 27 17:16:32 UTC 2016


This is an automated email from the git hooks/post-receive script.

lamby pushed a commit to branch master
in repository diffoscope.

commit d8bff90cbd9858b7e1fef3dbd393ea8dc596d1af
Author: Chris Lamb <lamby at debian.org>
Date:   Tue Dec 27 14:50:31 2016 +0000

    Split fuzzy matching to its own module.
---
 diffoscope/comparators/__init__.py    | 28 -------------------
 diffoscope/comparators/utils/fuzzy.py | 52 +++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 28 deletions(-)

diff --git a/diffoscope/comparators/__init__.py b/diffoscope/comparators/__init__.py
index 80a4deb..e9a6e3d 100644
--- a/diffoscope/comparators/__init__.py
+++ b/diffoscope/comparators/__init__.py
@@ -32,10 +32,6 @@ from diffoscope.difference import Difference
 
 from .binary import NonExistingFile
 
-try:
-    import tlsh
-except ImportError:
-    tlsh = None
 
 COMPARATORS = (
     ('directory.Directory',),
@@ -127,28 +123,4 @@ def specialize(file):
     logger.debug('Unidentified file. Magic says: %s', file.magic_file_type)
     return file
 
-
-def perform_fuzzy_matching(members1, members2):
-    if tlsh == None or Config().fuzzy_threshold == 0:
-        return
-    already_compared = set()
-    # Perform local copies because they will be modified by consumer
-    members1 = dict(members1)
-    members2 = dict(members2)
-    for name1, file1 in members1.items():
-        if file1.is_directory() or not file1.fuzzy_hash:
-            continue
-        comparisons = []
-        for name2, file2 in members2.items():
-            if name2 in already_compared or file2.is_directory() or not file2.fuzzy_hash:
-                continue
-            comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
-        if comparisons:
-            comparisons.sort(key=operator.itemgetter(0))
-            score, name2 = comparisons[0]
-            logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
-            if score < Config().fuzzy_threshold:
-                yield name1, name2, score
-                already_compared.add(name2)
-
 FILE_CLASSES = import_comparators(COMPARATORS)
diff --git a/diffoscope/comparators/utils/fuzzy.py b/diffoscope/comparators/utils/fuzzy.py
new file mode 100644
index 0000000..16e224c
--- /dev/null
+++ b/diffoscope/comparators/utils/fuzzy.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+#
+# diffoscope: in-depth comparison of files, archives, and directories
+#
+# Copyright © 2014-2015 Jérémy Bobbio <lunar at debian.org>
+#
+# diffoscope is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# diffoscope is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
+
+import operator
+
+from diffoscope import logger
+from diffoscope.config import Config
+
+try:
+    import tlsh
+except ImportError:
+    tlsh = None
+
+
+def perform_fuzzy_matching(members1, members2):
+    """Yield (name1, name2, score) for each best match scoring below the fuzzy threshold."""
+    if tlsh is None or Config().fuzzy_threshold == 0:
+        return
+    already_compared = set()
+    # Perform local copies because they will be modified by consumer
+    members1, members2 = dict(members1), dict(members2)
+    for name1, file1 in members1.items():
+        if file1.is_directory() or not file1.fuzzy_hash:
+            continue
+        comparisons = []
+        for name2, file2 in members2.items():
+            if name2 in already_compared or file2.is_directory() or not file2.fuzzy_hash:
+                continue
+            comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
+        if comparisons:
+            # Lowest difference score wins; min() keeps the first of equal scores.
+            score, name2 = min(comparisons, key=operator.itemgetter(0))
+            logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
+            if score < Config().fuzzy_threshold:
+                yield name1, name2, score
+                already_compared.add(name2)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git



More information about the Reproducible-commits mailing list