[Reproducible-commits] [debbindiff] 06/08: Perform comparison with best fuzzy matching instead of first above threshold
Jérémy Bobbio
lunar at moszumanska.debian.org
Fri Jul 31 08:54:51 UTC 2015
This is an automated email from the git hooks/post-receive script.
lunar pushed a commit to branch master
in repository debbindiff.
commit c342857c91aaad5e1ee643948c6f55e206e970b9
Author: Jérémy Bobbio <lunar at debian.org>
Date: Fri Jul 31 08:36:15 2015 +0000
Perform comparison with best fuzzy matching instead of first above threshold
---
debbindiff/comparators/__init__.py | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/debbindiff/comparators/__init__.py b/debbindiff/comparators/__init__.py
index b35e309..cf28f82 100644
--- a/debbindiff/comparators/__init__.py
+++ b/debbindiff/comparators/__init__.py
@@ -19,6 +19,7 @@
# along with debbindiff. If not, see <http://www.gnu.org/licenses/>.
import magic
+import operator
import os.path
import re
import sys
@@ -129,12 +130,16 @@ fuzzy_threshold = 85
def perform_fuzzy_matching(files1, files2):
+ files2 = set(files2)
already_compared = set()
for file1 in filter(lambda f: not f.is_directory(), files1):
- for file2 in filter(lambda f: not f.is_directory(), files2):
- similarity = ssdeep.compare(file1.fuzzy_hash, file2.fuzzy_hash)
- logger.debug('fuzzy matching %s %s: %d', file1.name, file2.name, similarity)
+ comparisons = [(ssdeep.compare(file1.fuzzy_hash, file2.fuzzy_hash), file2)
+ for file2 in files2 - already_compared
+ if not file2.is_directory()]
+ if comparisons:
+ comparisons.sort(key=operator.itemgetter(0))
+ similarity, file2 = comparisons[-1]
+ logger.debug('fuzzy top match %s %s: %d', file1.name, file2.name, similarity)
if similarity >= fuzzy_threshold:
yield file1, file2, similarity
already_compared.add(file2)
- break
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/debbindiff.git
More information about the Reproducible-commits mailing list