[Forensics-changes] [ssdeep] 01/05: Imported Upstream version 2.12
Helmut Grohne
helmutg at moszumanska.debian.org
Fri Oct 24 17:11:15 UTC 2014
This is an automated email from the git hooks/post-receive script.
helmutg pushed a commit to branch debian
in repository ssdeep.
commit 2bd40e73e2341ff43fab09e16275d08d9dc85a65
Author: Helmut Grohne <helmut at subdivi.de>
Date: Fri Oct 24 18:38:56 2014 +0200
Imported Upstream version 2.12
---
ChangeLog | 20 ++++
Makefile.am | 8 +-
Makefile.in | 8 +-
NEWS | 15 +++
configure | 20 ++--
configure.ac | 2 +-
edit_dist.c | 338 +++++++++++++++--------------------------------------------
edit_dist.h | 18 ++++
fuzzy.c | 5 +-
ssdeep.1 | 2 +-
10 files changed, 163 insertions(+), 273 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 420bbde..0d0f7c0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,23 @@
+2014-10-24: Jesse Kornblum <research at jessekornblum.com>:
+
+ * Makefile.am: Adding edit_dist.h
+
+2014-10-20: Jesse Kornblum <research at jessekornblum.com>:
+
+ * edit_dist.c: Replaced with new, explicitly GPL licensed code.
+ * fuzzy.c: Removed edit distance function prototype
+ * edit_dist.h: Added edit distance function prototype
+
+2014-10-16: Jesse Kornblum <research at jessekornblum.com>:
+
+ * configure.ac: Version bump to 2.12
+ * fuzzy.c: Fixed bug when comparing identical hashes, bug #16.
+ * normal.sh: Use multiple cores when compiling
+
+2014-09-26: Jesse Kornblum <research at jessekornblum.com>:
+
+ * Makefile.am: Made libfuzzy compile as a shared library again.
+
2014-09-09: Jesse Kornblum <research at jessekornblum.com>:
* fuzzy.c: Fixing edge case bug for signature creation.
diff --git a/Makefile.am b/Makefile.am
index ea90344..52b3b71 100755
--- a/Makefile.am
+++ b/Makefile.am
@@ -8,20 +8,20 @@ ACLOCAL_AMFLAGS = -I m4
lib_LTLIBRARIES=libfuzzy.la
libfuzzy_la_SOURCES=fuzzy.c edit_dist.c find-file-size.c
-libfuzzy_la_LDFLAGS=-static -no-undefined -version-info 2:0:0
+libfuzzy_la_LDFLAGS=-no-undefined -version-info 2:0:0
-include_HEADERS=fuzzy.h
+include_HEADERS=fuzzy.h edit_dist.h
man_MANS=ssdeep.1
ssdeep_SOURCES = main.cpp match.cpp engine.cpp filedata.cpp \
- dig.cpp cycles.cpp helpers.cpp ui.cpp \
+ dig.cpp cycles.cpp helpers.cpp ui.cpp edit_dist.h \
main.h fuzzy.h tchar-local.h ssdeep.h filedata.h match.h
dll: $(libfuzzy_la_SOURCES)
$(CC) $(CFLAGS) -shared -o fuzzy.dll $(libfuzzy_la_SOURCES) \
-Wl,--output-def,fuzzy.def,--out-implib,libfuzzy.a
- $(STRIP) fuzzy.dll
+ $(STRIP) fuzzy.dll
CLEANFILES=fuzzy.dll fuzzy.def
diff --git a/Makefile.in b/Makefile.in
index c63371f..8c412a6 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -373,11 +373,11 @@ ssdeep_LDFLAGS = -static
ACLOCAL_AMFLAGS = -I m4
lib_LTLIBRARIES = libfuzzy.la
libfuzzy_la_SOURCES = fuzzy.c edit_dist.c find-file-size.c
-libfuzzy_la_LDFLAGS = -static -no-undefined -version-info 2:0:0
-include_HEADERS = fuzzy.h
+libfuzzy_la_LDFLAGS = -no-undefined -version-info 2:0:0
+include_HEADERS = fuzzy.h edit_dist.h
man_MANS = ssdeep.1
ssdeep_SOURCES = main.cpp match.cpp engine.cpp filedata.cpp \
- dig.cpp cycles.cpp helpers.cpp ui.cpp \
+ dig.cpp cycles.cpp helpers.cpp ui.cpp edit_dist.h \
main.h fuzzy.h tchar-local.h ssdeep.h filedata.h match.h
CLEANFILES = fuzzy.dll fuzzy.def
@@ -1022,7 +1022,7 @@ uninstall-man: uninstall-man1
dll: $(libfuzzy_la_SOURCES)
$(CC) $(CFLAGS) -shared -o fuzzy.dll $(libfuzzy_la_SOURCES) \
-Wl,--output-def,fuzzy.def,--out-implib,libfuzzy.a
- $(STRIP) fuzzy.dll
+ $(STRIP) fuzzy.dll
README.TXT: ssdeep.1
man ./ssdeep.1 | col -bx > README.TXT
diff --git a/NEWS b/NEWS
index ce4d88a..43334ae 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,18 @@
+** Version 2.12 - 24 Oct 2014
+
+* Bug Fixes
+
+ - Fixed issue when comparing identical hashes but with different
+ block sizes.
+
+
+** Version 2.11.1 - 27 Sep 2014
+
+* Bug Fixes
+
+ - Made libfuzzy compile as a shared library again.
+
+
** Version 2.11 - 11 Sep 2014
* New Features
diff --git a/configure b/configure
index 218df6f..4d2b09d 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for SSDEEP 2.11.
+# Generated by GNU Autoconf 2.69 for SSDEEP 2.12.
#
# Report bugs to <research at jessekornblum.com>.
#
@@ -590,8 +590,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='SSDEEP'
PACKAGE_TARNAME='ssdeep'
-PACKAGE_VERSION='2.11'
-PACKAGE_STRING='SSDEEP 2.11'
+PACKAGE_VERSION='2.12'
+PACKAGE_STRING='SSDEEP 2.12'
PACKAGE_BUGREPORT='research at jessekornblum.com'
PACKAGE_URL=''
@@ -1322,7 +1322,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures SSDEEP 2.11 to adapt to many kinds of systems.
+\`configure' configures SSDEEP 2.12 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of SSDEEP 2.11:";;
+ short | recursive ) echo "Configuration of SSDEEP 2.12:";;
esac
cat <<\_ACEOF
@@ -1501,7 +1501,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-SSDEEP configure 2.11
+SSDEEP configure 2.12
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1991,7 +1991,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by SSDEEP $as_me 2.11, which was
+It was created by SSDEEP $as_me 2.12, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2854,7 +2854,7 @@ fi
# Define the identity of the package.
PACKAGE='ssdeep'
- VERSION='2.11'
+ VERSION='2.12'
cat >>confdefs.h <<_ACEOF
@@ -16493,7 +16493,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by SSDEEP $as_me 2.11, which was
+This file was extended by SSDEEP $as_me 2.12, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -16559,7 +16559,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-SSDEEP config.status 2.11
+SSDEEP config.status 2.12
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index a83dab6..848d602 100755
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([SSDEEP],[2.11],[research at jessekornblum.com])
+AC_INIT([SSDEEP],[2.12],[research at jessekornblum.com])
AM_INIT_AUTOMAKE
AC_CONFIG_FILES([Makefile])
diff --git a/edit_dist.c b/edit_dist.c
index b3dfaad..6979f38 100644
--- a/edit_dist.c
+++ b/edit_dist.c
@@ -1,266 +1,102 @@
-/*
- This edit distance code is taken from trn3.6. A few minor
- modifications have been made by Andrew Tridgell <tridge at samba.org>
- for use in spamsum.
-*/
-
-
-/***************************************************************************/
-
-
-/* The authors make no claims as to the fitness or correctness of this software
- * for any use whatsoever, and it is provided as is. Any use of this software
- * is at the user's own risk.
+/**
+ * Modified levenshtein distance calculation
+ *
+ * This program can be used, redistributed or modified under any of
+ * Boost Software License 1.0, GPL v2 or GPL v3
+ * See the file COPYING for details.
+ *
+ * $Id$
+ *
+ * Copyright (C) 2014 kikairoya <kikairoya at gmail.com>
+ * Copyright (C) 2014 Jesse Kornblum <research at jessekornblum.com>
*/
+#include <string.h>
+
+#define EDIT_DISTN_MAXLEN 64 /* MAX_SPAMSUM */
+#define EDIT_DISTN_INSERT_COST 1
+#define EDIT_DISTN_REMOVE_COST 1
+#define EDIT_DISTN_REPLACE_COST 2
+
+#define MIN(x,y) ((x)<(y)?(x):(y))
+
+int edit_distn(const char *s1, size_t s1len, const char *s2, size_t s2len) {
+ int t[2][EDIT_DISTN_MAXLEN+1];
+ int *t1 = t[0];
+ int *t2 = t[1];
+ int *t3;
+ size_t i1, i2;
+ for (i2 = 0; i2 <= s2len; i2++)
+ t[0][i2] = i2;
+ for (i1 = 0; i1 < s1len; i1++) {
+ t2[0] = i1 + 1;
+ for (i2 = 0; i2 < s2len; i2++) {
+ int cost_a = t1[i2+1] + EDIT_DISTN_INSERT_COST;
+ int cost_d = t2[i2] + EDIT_DISTN_REMOVE_COST;
+ int cost_r = t1[i2] + (s1[i1] == s2[i2] ? 0 : EDIT_DISTN_REPLACE_COST);
+ t2[i2+1] = MIN(MIN(cost_a, cost_d), cost_r);
+ }
+ t3 = t1;
+ t1 = t2;
+ t2 = t3;
+ }
+ return t1[s2len];
+}
+
+#ifdef __UNITTEST
#include <stdio.h>
#include <stdlib.h>
-/* edit_dist -- returns the minimum edit distance between two strings
-
- Program by: Mark Maimone CMU Computer Science 13 Nov 89
- Last Modified: 28 Jan 90
-
- If the input strings have length n and m, the algorithm runs in time
- O(nm) and space O(min(m,n)).
+#define HELLOWORLD "Hello World!"
-HISTORY
- 13 Nov 89 (mwm) Created edit_dist() and set_costs().
+// Convenience method for getting the edit distance of two strings
+int edit_dist(const char *a, const char *b) {
+ unsigned int a_len = 0, b_len = 0;
+ if (a)
+ a_len = strlen(a);
+ if (b)
+ b_len = strlen(b);
- 28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD
- computations will work even when THRESHOLD is not a multiple of
- sizeof(int).
-
- 17 May 93 (mwm) Improved performance when used with trn's newsgroup
- processing; assume all costs are 1, and you can terminate when a
- threshold is exceeded.
-*/
-
-#define MIN_DIST 100
-
-#define TRN_SPEEDUP /* Use a less-general version of the
- routine, one that's better for trn.
- All change costs are 1, and it's okay
- to terminate if the edit distance is
- known to exceed MIN_DIST */
-
-#define THRESHOLD 4000 /* worry about allocating more memory only
- when this # of bytes is exceeded */
-#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))
-
-#define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))
-
-#define swap_int(x,y) do { int _iswap = (x); (x) = (y); (y) = _iswap; } while (0)
-#define swap_char(x,y) do { const char *_cswap = (x); (x) = (y); (y) = _cswap; } while (0)
-
-static inline int min3(int x, int y, int z) {
- return x < y ? (x < z ? x : z) : (z < y) ? z : y;
-}
-static inline int min2(int x, int y)
-{
- return x < y ? x : y;
+ return edit_distn(a, a_len, b, b_len);
}
-static int insert_cost = 1;
-static int delete_cost = 1;
-#ifndef TRN_SPEEDUP
-static int change_cost = 1;
-static int swap_cost = 1;
-#endif
-
-/* edit_distn -- returns the edit distance between two strings, or -1 on
- failure */
-
-int
-edit_distn(const char *from, int from_len, const char *to, int to_len)
-{
-#ifndef TRN_SPEEDUP
- register int ins, del, ch; /* local copies of edit costs */
-#endif
- register int row, col, index; /* dynamic programming counters */
- register int radix; /* radix for modular indexing */
-#ifdef TRN_SPEEDUP
- register int low;
-#endif
- int *buffer; /* pointer to storage for one row
- of the d.p. array */
- int store[THRESHOLD / sizeof (int)];
- /* a small amount of static
- storage, to be used when the
- input strings are small enough */
-
-/* Handle trivial cases when one string is empty */
-
- if (from == NULL || !from_len)
- if (to == NULL || !to_len)
- return 0;
- else
- return to_len * insert_cost;
- else if (to == NULL || !to_len)
- return from_len * delete_cost;
-
-/* Initialize registers */
-
- radix = 2 * from_len + 3;
-#ifdef TRN_SPEEDUP
-#define ins 1
-#define del 1
-#define ch 3
-#define swap_cost 5
-#else
- ins = insert_cost;
- del = delete_cost;
- ch = change_cost;
-#endif
-
-/* Make from short enough to fit in the static storage, if it's at all
- possible */
-
- if (from_len > to_len && from_len > STRLENTHRESHOLD) {
- swap_int(from_len, to_len);
- swap_char(from, to);
-#ifndef TRN_SPEEDUP
- swap_int(ins, del);
-#endif
- } /* if from_len > to_len */
-
-/* Allocate the array storage (from the heap if necessary) */
-
- if (from_len <= STRLENTHRESHOLD)
- buffer = store;
- else
- buffer = (int *) malloc(radix * sizeof (int));
-
-/* Here's where the fun begins. We will find the minimum edit distance
- using dynamic programming. We only need to store two rows of the matrix
- at a time, since we always progress down the matrix. For example,
- given the strings "one" and "two", and insert, delete and change costs
- equal to 1:
-
- _ o n e
- _ 0 1 2 3
- t 1 1 2 3
- w 2 2 2 3
- o 3 2 3 3
-
- The dynamic programming recursion is defined as follows:
-
- ar(x,0) := x * insert_cost
- ar(0,y) := y * delete_cost
- ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
- a(x - 1, y) + insert_cost,
- a(x, y - 1) + delete_cost,
- a(x - 2, y - 2) + (from[x] == to[y-1] &&
- from[x-1] == to[y] ? swap_cost :
- infinity))
-
- Since this only looks at most two rows and three columns back, we need
- only store the values for the two preceeding rows. In this
- implementation, we do not explicitly store the zero column, so only 2 *
- from_len + 2 words are needed. However, in the implementation of the
- swap_cost check, the current matrix value is used as a buffer; we
- can't overwrite the earlier value until the swap_cost check has
- been performed. So we use 2 * from_len + 3 elements in the buffer.
-*/
-
-#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
- buffer[mod(index)]))
-#define NW(x,y) ar(x, y, index + from_len + 2)
-#define N(x,y) ar(x, y, index + from_len + 3)
-#define W(x,y) ar(x, y, index + radix - 1)
-#define NNWW(x,y) ar(x, y, index + 1)
-#define mod(x) ((x) % radix)
-
- index = 0;
-
-#ifdef DEBUG_EDITDIST
- printf(" ");
- for (col = 0; col < from_len; col++)
- printf(" %c ", from[col]);
- printf("\n ");
-
- for (col = 0; col <= from_len; col++)
- printf("%2d ", col * del);
-#endif
-
-/* Row 0 is handled implicitly; its value at a given column is col*del.
- The loop below computes the values for Row 1. At this point we know the
- strings are nonempty. We also don't need to consider swap costs in row
- 1.
-
- COMMENT: the indicies row and col below point into the STRING, so
- the corresponding MATRIX indicies are row+1 and col+1.
-*/
-
- buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
-#ifdef TRN_SPEEDUP
- low = buffer[mod(index + radix - 1)];
-#endif
-
-#ifdef DEBUG_EDITDIST
- printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
-#endif
-
- for (col = 1; col < from_len; col++) {
- buffer[index] = min3(
- col * del + ((from[col] == to[0]) ? 0 : ch),
- (col + 1) * del + ins,
- buffer[index - 1] + del);
-#ifdef TRN_SPEEDUP
- if (buffer[index] < low)
- low = buffer[index];
-#endif
- index++;
-
-#ifdef DEBUG_EDITDIST
- printf("%2d ", buffer[index - 1]);
-#endif
+// Exercises edit_dist on a and b. If the result matches the expected value,
+// returns 0. If not, displays the message and returns 1.
+int run_test(char *a, char *b, int expected, char *msg) {
+ int actual = edit_dist(a,b);
+ if (actual == expected)
+ return 0;
+
+ printf ("FAIL: Expected %d, got %d for %s:%s, %s\n",
+ expected,
+ actual,
+ a,
+ b,
+ msg);
+ return 1;
+}
- } /* for col = 1 */
+#define RUN_TEST(A,B,EXPECTED,MSG) failures += run_test(A,B,EXPECTED,MSG)
-#ifdef DEBUG_EDITDIST
- printf("\n %c %2d ", to[1], 2 * ins);
-#endif
+int main(void) {
+ unsigned int failures = 0;
-/* Now handle the rest of the matrix */
+ RUN_TEST(NULL, HELLOWORLD, 12, "Null source");
+ RUN_TEST(HELLOWORLD, NULL, 12, "Null dest");
+ RUN_TEST("", HELLOWORLD, 12, "Empty source");
+ RUN_TEST(HELLOWORLD, "", 12, "Empty destination");
+ RUN_TEST(HELLOWORLD, HELLOWORLD, 0, "Equal strings");
+ RUN_TEST("Hello world", "Hell world", 1, "Delete");
+ RUN_TEST("Hell world", "Hello world", 1, "Insert");
+ RUN_TEST("Hello world", "Hello owrld", 2, "Swap");
+ RUN_TEST("Hello world", "HellX world", 2, "Change");
- for (row = 1; row < to_len; row++) {
- for (col = 0; col < from_len; col++) {
- buffer[index] = min3(
- NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
- N(row, col + 1) + ins,
- W(row + 1, col) + del);
- if (from[col] == to[row - 1] && col > 0 &&
- from[col - 1] == to[row])
- buffer[index] = min2(buffer[index],
- NNWW(row - 1, col - 1) + swap_cost);
+ if (failures) {
+ printf ("\n%u tests failed.\n", failures);
+ return EXIT_FAILURE;
+ }
-#ifdef DEBUG_EDITDIST
- printf("%2d ", buffer[index]);
-#endif
-#ifdef TRN_SPEEDUP
- if (buffer[index] < low || col == 0)
- low = buffer[index];
-#endif
-
- index = mod(index + 1);
- } /* for col = 1 */
-#ifdef DEBUG_EDITDIST
- if (row < to_len - 1)
- printf("\n %c %2d ", to[row+1], (row + 2) * ins);
- else
- printf("\n");
-#endif
-#ifdef TRN_SPEEDUP
- if (low > MIN_DIST)
- break;
+ printf ("All tests passed.\n");
+ return EXIT_SUCCESS;
+}
#endif
- } /* for row = 1 */
-
- row = buffer[mod(index + radix - 1)];
- if (buffer != store)
- free((char *) buffer);
- return row;
-} /* edit_distn */
-
-
diff --git a/edit_dist.h b/edit_dist.h
new file mode 100644
index 0000000..76838d9
--- /dev/null
+++ b/edit_dist.h
@@ -0,0 +1,18 @@
+#ifndef __EDIT_DIST_H
+#define __EDIT_DIST_H
+/**
+ * Modified levenshtein distance calculation
+ *
+ * This program can be used, redistributed or modified under any of
+ * Boost Software License 1.0, GPL v2 or GPL v3
+ * See the file COPYING for details.
+ *
+ * $Id$
+ *
+ * Copyright (C) 2014 kikairoya <kikairoya at gmail.com>
+ * Copyright (C) 2014 Jesse Kornblum <research at jessekornblum.com>
+ */
+
+int edit_distn(const char *s1, size_t s1len, const char *s2, size_t s2len);
+
+#endif // ifndef __EDIT_DIST_H
diff --git a/fuzzy.c b/fuzzy.c
index 66c882d..547e7f1 100644
--- a/fuzzy.c
+++ b/fuzzy.c
@@ -28,7 +28,9 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+
#include "fuzzy.h"
+#include "edit_dist.h"
#if defined(__GNUC__) && __GNUC__ >= 3
#define likely(x) __builtin_expect(!!(x), 1)
@@ -590,7 +592,6 @@ static uint32_t score_strings(const char *s1,
{
uint32_t score;
size_t len1, len2;
- int edit_distn(const char *from, int from_len, const char *to, int to_len);
len1 = strlen(s1);
len2 = strlen(s2);
@@ -732,7 +733,7 @@ int fuzzy_compare(const char *str1, const char *str2)
// Now that we know the strings are both well formed, are they
// identical? We could save ourselves some work here
- if (strlen(s1) == strlen(s2)) {
+ if (block_size1 == block_size2 && strlen(s1) == strlen(s2)) {
if (!strncmp(s1, s2, strlen(s1))) {
free (s1);
free (s2);
diff --git a/ssdeep.1 b/ssdeep.1
index 62bc1a9..2e2791c 100644
--- a/ssdeep.1
+++ b/ssdeep.1
@@ -1,4 +1,4 @@
-.TH SSDEEP "1" "Version 2.11 \- 11 Sep 2014" "Facebook" "Facebook"
+.TH SSDEEP "1" "Version 2.12 \- 24 Oct 2014" "Facebook" "Facebook"
.SH NAME
ssdeep - Computes context triggered piecewise hashes (fuzzy hashes)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/forensics/ssdeep.git
More information about the forensics-changes
mailing list