[diffoscope] 03/05: Split fuzzy matching to its own module.
Chris Lamb
chris at chris-lamb.co.uk
Tue Dec 27 17:16:32 UTC 2016
This is an automated email from the git hooks/post-receive script.
lamby pushed a commit to branch master
in repository diffoscope.
commit d8bff90cbd9858b7e1fef3dbd393ea8dc596d1af
Author: Chris Lamb <lamby at debian.org>
Date: Tue Dec 27 14:50:31 2016 +0000
Split fuzzy matching to its own module.
---
diffoscope/comparators/__init__.py | 28 -------------------
diffoscope/comparators/utils/fuzzy.py | 52 +++++++++++++++++++++++++++++++++++
2 files changed, 52 insertions(+), 28 deletions(-)
diff --git a/diffoscope/comparators/__init__.py b/diffoscope/comparators/__init__.py
index 80a4deb..e9a6e3d 100644
--- a/diffoscope/comparators/__init__.py
+++ b/diffoscope/comparators/__init__.py
@@ -32,10 +32,6 @@ from diffoscope.difference import Difference
from .binary import NonExistingFile
-try:
- import tlsh
-except ImportError:
- tlsh = None
COMPARATORS = (
('directory.Directory',),
@@ -127,28 +123,4 @@ def specialize(file):
logger.debug('Unidentified file. Magic says: %s', file.magic_file_type)
return file
-
-def perform_fuzzy_matching(members1, members2):
- if tlsh == None or Config().fuzzy_threshold == 0:
- return
- already_compared = set()
- # Perform local copies because they will be modified by consumer
- members1 = dict(members1)
- members2 = dict(members2)
- for name1, file1 in members1.items():
- if file1.is_directory() or not file1.fuzzy_hash:
- continue
- comparisons = []
- for name2, file2 in members2.items():
- if name2 in already_compared or file2.is_directory() or not file2.fuzzy_hash:
- continue
- comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
- if comparisons:
- comparisons.sort(key=operator.itemgetter(0))
- score, name2 = comparisons[0]
- logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
- if score < Config().fuzzy_threshold:
- yield name1, name2, score
- already_compared.add(name2)
-
FILE_CLASSES = import_comparators(COMPARATORS)
diff --git a/diffoscope/comparators/utils/fuzzy.py b/diffoscope/comparators/utils/fuzzy.py
new file mode 100644
index 0000000..16e224c
--- /dev/null
+++ b/diffoscope/comparators/utils/fuzzy.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+#
+# diffoscope: in-depth comparison of files, archives, and directories
+#
+# Copyright © 2014-2015 Jérémy Bobbio <lunar at debian.org>
+#
+# diffoscope is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# diffoscope is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
+
+import operator
+
+from diffoscope import logger
+from diffoscope.config import Config
+
+try:
+ import tlsh
+except ImportError:
+ tlsh = None
+
+
+def perform_fuzzy_matching(members1, members2):
+ if tlsh == None or Config().fuzzy_threshold == 0:
+ return
+ already_compared = set()
+ # Perform local copies because they will be modified by consumer
+ members1 = dict(members1)
+ members2 = dict(members2)
+ for name1, file1 in members1.items():
+ if file1.is_directory() or not file1.fuzzy_hash:
+ continue
+ comparisons = []
+ for name2, file2 in members2.items():
+ if name2 in already_compared or file2.is_directory() or not file2.fuzzy_hash:
+ continue
+ comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
+ if comparisons:
+ comparisons.sort(key=operator.itemgetter(0))
+ score, name2 = comparisons[0]
+ logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
+ if score < Config().fuzzy_threshold:
+ yield name1, name2, score
+ already_compared.add(name2)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git
More information about the Reproducible-commits mailing list