[Pkg-clamav-commits] [SCM] Debian repository for ClamAV branch, debian/unstable, updated. debian/0.95+dfsg-1-6156-g094ec9b

Sun Apr 4 01:19:20 UTC 2010

The following commit has been merged in the debian/unstable branch:
commit e0ac80ab3812e0b3e32927adfd2316483e83de4c
Author: Török Edvin <edwin at clamav.net>
Date:   Tue Feb 9 12:01:31 2010 +0200

    Prepare for prefiltering: add new files.
    
    Move SO_search to filtering.c as filtering_search.
    Add unit test.

diff --git a/libclamav/Makefile.am b/libclamav/Makefile.am
index e203242..c2da9ef 100644
--- a/libclamav/Makefile.am
+++ b/libclamav/Makefile.am
@@ -263,7 +263,11 @@ libclamav_la_SOURCES = \
 	version.c\
 	version.h\
 	mpool.c\
-	mpool.h \
+	mpool.h\
+	filtering.h\
+	filtering.c\
+	perflogging.c\
+	perflogging.h\
 	default.h\
 	sha256.c\
 	sha256.h
diff --git a/libclamav/Makefile.in b/libclamav/Makefile.in
index e65e6db..10c796d 100644
--- a/libclamav/Makefile.in
+++ b/libclamav/Makefile.in
@@ -101,7 +101,7 @@ am_libclamav_la_OBJECTS = matcher-ac.lo matcher-bm.lo matcher.lo \
 	phish_whitelist.lo regex_list.lo regex_suffix.lo mspack.lo \
 	cab.lo entconv.lo hashtab.lo dconf.lo lzma_iface.lo explode.lo \
 	textnorm.lo dlp.lo js-norm.lo uniq.lo version.lo mpool.lo \
-	sha256.lo
+	filtering.lo perflogging.lo sha256.lo
 libclamav_la_OBJECTS = $(am_libclamav_la_OBJECTS)
 libclamav_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
@@ -560,7 +560,11 @@ libclamav_la_SOURCES = \
 	version.c\
 	version.h\
 	mpool.c\
-	mpool.h \
+	mpool.h\
+	filtering.h\
+	filtering.c\
+	perflogging.c\
+	perflogging.h\
 	default.h\
 	sha256.c\
 	sha256.h
@@ -672,6 +676,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/entconv.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/explode.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/filetypes.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/filtering.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fsg.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hashtab.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htmlnorm.Plo@am__quote@
@@ -705,6 +710,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/packlibs.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pdf.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pe.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/perflogging.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/petite.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phish_domaincheck_db.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phish_whitelist.Plo@am__quote@
diff --git a/libclamav/filtering.c b/libclamav/filtering.c
new file mode 100644
index 0000000..d7f77b5
--- /dev/null
+++ b/libclamav/filtering.c
@@ -0,0 +1,744 @@
+/*
+ *  A fast filter for static patterns.
+ *
+ *  Copyright (C) 2008 Sourcefire, Inc.
+ *
+ *  Authors: Török Edvin
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ *  MA 02110-1301, USA.
+ */
+#if HAVE_CONFIG_H
+#include "clamav-config.h"
+#endif
+#include "filtering.h"
+#include "matcher-ac.h"
+#include <string.h>
+#include <assert.h>
+#include "perflogging.h"
+/* ----- shift-or filtering -------------- */
+
+/*
+ * Description of algorithm:
+ *
+ * Multiple patterns are added to the filter.
+ * The filter retains an approximation of these patterns, which can lead to
+ * false positive matches, but not false negative matches.
+ *
+ * For each position in the filter we retain what qgrams can match at that
+ * position, for example (if we'd use characters as qgrams):
+ * pattern1: atu
+ * pattern2: bzf
+ * pattern3: xat
+ * 
+ * filter accepts:
+ * [abx][tza][uft]
+ *
+ * But it also accepts (false positives):
+ * azu, azf, azt, ...
+ *
+ * It doesn't however accept:
+ * aaa, atz, ...
+ *
+ * This is implemented by having a bit-level state-machine with MAXSOPATLEN (=8) states,
+ * each bit of the state telling whether the corresponding state is active.
+ *
+ * The states are activated sequentially, each transition decision is made
+ * considering if we can accept the character at position X.
+ * Since we can start a match at any position, position 0 is
+ * reactivated each time.
+ * When the last position is activated, the filter reports a match.
+ * If we can't accept the character at position X, the state remains inactive,
+ * and further states aren't activated (unless we activate this state in the
+ * future).
+ *
+ * Essentially this is an automaton like this:
+ *
+ *  /\    (a|b|x)        (t|z|a)        (u|f|t)
+ * [S1] ---------> [S2] -------> [S3] ---------> [S4] -> match
+ *  \_______________/             |               
+ *  \_____________________________/               
+ *
+ *
+ * But we are tracking multiple active states at each time (or run N automatons
+ * in parallel if you like, N = number of states).
+ *
+ * We can have S3 and S2 active, meaning that if the next character is
+ * acceptable, it transitions to S1,S3 and S4 being active, otherwise it
+ * transitions to S1 being active.
+ *
+ * Active states can either be represented as a binary 1 or 0, and using
+ * bit-shifting and masking.
+ * If we choose 1, we must use &, and after shifting always reactivate bit 0.
+ * If we choose 0, we must use |, and after shifting we don't need to do
+ * anything (since by shifting a 0 is implicitly introduced).
+ *
+ * This file implements the latter (shift-or) method.
+ *
+ * The discussion above considered patterns to be of the same length (or truncated to
+ * be so). In reality patterns are of variable length, and we often have short
+ * patterns.
+ *
+ * Thus another bitmap was introduced, meaning that if (end[Q] == set), then
+ * a pattern can end at this position.
+ * Also we would fill the pattern's position filters quite quickly with only 256
+ * choices for a position, so the algorithm uses overlapping qgrams of length 2:
+ * 'abcd' is 3 qgrams: 'ab','bc','cd'
+ *
+ * The algorithm is very sensitive to the end[Q] filter, since it can have false
+ * positives due to short patterns!
+ * For optimal performance we need:
+ *   - patterns as long as possible
+ *   - probability for end[Q] to match low (avoid 0000, and other common cases)
+ *   - choose the most "diverse" subset from a long pattern
+ *
+ * diverse = referring to what we are scanning, so that the filter rarely
+ * matches, so this actually means that we *want* to avoid adding more
+ * characters to the filter, if we have 2 patterns:
+ * abxfg, and dalabxpo, it may be preferable to shift the 2nd one so that we
+ * don't add new character at the beginning.
+ *
+ * With NDB signatures there are more challenges to overcome:
+ *    e8??0000000aa
+ *
+ *    will make the filter accept:
+ *    e8<all-256-values-here>, <all-256-values>00, ... 000000aa
+ *
+ *    We should delay the pattern end as long as possible, especially if it is  0000
+ *    The problem is that now the filter accepts 0000 on position 3, regardless
+ *    of what we have on position 1 (even if we have something else than e8), so
+ *    we have to be very careful not to allow 0000 on first position too,
+ *    otherwise the filter will happily accept 000000000000.
+ *
+ * To optimize cache usage there are 2 end filters, one character (fits L1), and one qgram
+ * based (fits L2), both must match for the filter to consider it a match.   
+ *
+ *
+ */
+/* bitmap stored in 32-bit words; 1U keeps the shift well-defined for bit 31 (1 << 31 overflows a signed int) */
+#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1U << ((val) & 0x1f)))
+#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1U << ((val) & 0x1f)))
+/* Reset the filter: every bit of B and end is set, so no q-gram is accepted at any position (a cleared bit means "accepted"). */
+void filter_init(struct filter *m)
+{
+	memset(m->end, 0xff, sizeof m->end);
+	memset(m->B, 0xff, sizeof m->B);
+}
+
+/* because the search state and the per-qgram masks are uint8_t */
+#define MAXSOPATLEN 8
+/* non-zero iff q-gram val is already accepted at position pos, i.e. its bit in B[val] is cleared */
+static inline int filter_isset(const struct filter *m, unsigned pos, uint16_t val)
+{
+	return (m->B[val] & (1<<pos)) == 0;
+}
+/* accept q-gram val at position pos; the FILTER_LOAD counter is only touched for entries that were not accepted yet */
+static inline void filter_set_atpos(struct filter *m, unsigned pos, uint16_t val)
+{
+	if (filter_isset(m, pos, val))
+		return;
+	cli_perf_log_count(FILTER_LOAD, pos);
+	m->B[val] &= ~(1<<pos);
+}
+
+/* non-zero iff a pattern may end with q-gram a at position pos, i.e. its bit in end[a] is cleared */
+static inline int filter_end_isset(const struct filter *m, unsigned pos, uint16_t a)
+{
+	return (m->end[a] & (1<<pos)) == 0;
+}
+/* let patterns end with q-gram a at position pos; FILTER_END_LOAD is only touched for entries that were not set yet */
+static inline void filter_set_end(struct filter *m, unsigned pos, uint16_t a)
+{
+	if (filter_end_isset(m, pos, a))
+		return;
+	cli_perf_log_count(FILTER_END_LOAD, pos);
+	m->end[a] &= ~(1 << pos);
+}
+#define MAX_CHOICES 8
+/* just an arbitrary limit, if patterns are longer, we cut them;
+ * the filter can only use MAXSOPATLEN (8) characters,
+ * this longer buffer is needed so that we can choose the "best" subpattern from
+ * it */
+#define MAXPATLEN 255
+
+/* Merge a static pattern into the filter: add('abc'); add('bcd'); will match [ab][bc][cd].
+ * Returns the number of pattern bytes that were added (2..MAXSOPATLEN), or -1 when len < 2. */
+int filter_add_static(struct filter *m, const unsigned char *pattern, unsigned long len, const char *name)
+{
+	uint16_t q;
+	uint8_t j, maxlen;
+	uint32_t best = 0xffffffff; /* lowest penalty found so far */
+	uint8_t best_pos = 0; /* start offset that produced it */
+
+	cli_perf_log_count(TRIE_ORIG_LEN, len > 8 ? 8 : len);
+	/* TODO: choose best among MAX_CHOICES */
+	/* cut length */
+	if(len > MAXPATLEN) {
+		len = MAXPATLEN;
+	}
+	if(len < 2)
+		return -1;
+
+	/* we want subsigs to be as long as possible */
+	if (len > 4) {
+		maxlen = len - 4;
+		if (maxlen == 1) maxlen = 2;
+	} else
+		maxlen = 2;
+	for(j=0;(best < 100 && j<MAX_CHOICES) || (j < maxlen) ;j++) {
+		uint32_t num = MAXSOPATLEN; /* penalty for starting the subpattern at offset j, lower is better */
+		uint8_t k;
+		if (j+2 > len)
+			break;
+		for(k=j;k<len-1 && (k-j < MAXSOPATLEN);k++) {
+			q = cli_readint16( &pattern[k] );
+			/* we want to favor subsigs that add as little as
+			 * possible to the filter */
+			num += filter_isset(m, k-j, q) ? 0 : MAXSOPATLEN - (k-j);
+			if ((k == j || k == j+1) && (q == 0x0000 || q == 0xffff))
+				num += k==j ?  10000 : 1000;/* bad */
+		}
+		/* it is very important to keep the end set small */
+		num += 10*(filter_end_isset(m, k-j-1, q) ? 0 : 1);
+		/* it is very important to have signatures as long as possible
+		 * */
+		num += 5*(MAXSOPATLEN - (k-j));
+		/* if we are lower length than threshold penalize */
+		if (k-j+1 < 4)
+			num += 200;
+		/* favour longer patterns */
+		num -= (2*MAXSOPATLEN - (k + 1+j))*(k-j)/2;
+
+		if (num < best) {
+			best = num;
+			best_pos = j;
+		}
+	}
+
+	assert(best_pos < len-1);
+	if (pattern[best_pos] == 0 && pattern[best_pos+1] == 0) {
+		cli_warnmsg("filter: subsignature begins with zero (static): %s\n", name);
+	}
+	pattern += best_pos;
+	len -= best_pos;
+	/* cut length */
+	if(len > MAXSOPATLEN) {
+		len = MAXSOPATLEN;
+	}
+	/* Shift-Or like preprocessing */
+	for(j=0;j < len-1;j++) {
+		/* use overlapping little-endian 2-grams. We need them overlapping because matching can start at any position */
+		q = cli_readint16( &pattern[j] );
+		filter_set_atpos(m, j, q);
+	}
+	/* we use variable length patterns, use last q-gram to mark pattern end,
+	 * can lead to false positives.*/
+	/* mark that at state j, the q-gram q can end the pattern */
+	if(j) {
+		j--;
+		filter_set_end(m, j, q);
+	}
+	return j+2;
+}
+
+struct char_spec {
+	/* candidate bytes are enumerated with an index running from start to end (inclusive) in
+	 * increments of step; if alt is non-null the byte is alt->str[index], else the index itself */
+	struct cli_ac_alt *alt;
+	uint8_t start;
+	uint8_t end;
+	uint8_t step;
+};
+/* map enumeration index i to a candidate byte: str[i] of the attached alternate table, or i itself without one */
+static inline unsigned char spec_ith_char(const struct char_spec *spec, unsigned i)
+{
+	const struct cli_ac_alt *table = spec->alt;
+
+	if (!table)
+		return i;
+	assert (table->chmode);
+	assert (i < table->num);
+	return table->str[i];
+}
+
+static const struct char_spec full_range = {NULL, 0,0xff,1};
+/* Non-zero iff both specs equal full_range; compared field by field because memcmp() would also compare padding bytes. */
+static inline int spec_is_fullrange(const struct char_spec *spec0, const struct char_spec *spec1)
+{
+	return spec0->alt == full_range.alt && spec0->start == full_range.start && spec0->end == full_range.end && spec0->step == full_range.step &&
+	       spec1->alt == full_range.alt && spec1->start == full_range.start && spec1->end == full_range.end && spec1->step == full_range.step;
+}
+
+
+#ifndef MIN
+#define MIN(a,b) ((a) < (b) ? (a) : (b)) /* NB: one of the arguments is evaluated twice */
+#endif
+
+enum badness { /* ordered from worst to best: the code compares these values and takes MIN() of them */
+	reject, /* never included in a candidate subpattern */
+	/* try to avoid if possible */
+	avoid_first,
+	avoid_anywhere, /* includes avoid_first! */
+	/* not that bad, but still not best */
+	dontlike,
+	accept,
+	like /* only slightly better than accept */
+};
+static inline void get_score(enum badness badness, unsigned i, const struct filter *m, const struct char_spec *spec0, const struct char_spec *spec1, int32_t *score, int32_t *score_end)
+{ /* scores the q-gram (spec0, spec1) at filter position i: *score for the position filter, *score_end for the end filter */
+	int32_t base;
+	unsigned k0, k1, num_introduced = 0, num_end_introduced = 0;
+	switch (badness) {
+		case reject:
+			/* not reached */
+			assert(0);
+			base = -0x7fffff;
+			break;
+		case avoid_first:
+			if (!i)
+				base = -0x700000;
+			else
+				base = 0;
+			break;
+		case avoid_anywhere:
+			if (!i)
+				base = -0x720000;
+			else
+				base = -0x1000;
+			break;
+		case dontlike:
+			base = 0;
+			break;
+		case accept:
+			base = 0x200;
+			break;
+		case like:
+			/* a bit better only */
+			base = 0x201;
+			break;
+	}
+	if (base < 0) { /* negative bases are returned as is, without looking at the filter contents */
+		*score = base;
+		*score_end = base;
+		return;
+	}
+	/* at most 256 iterations here, otherwise base would be negative */
+	for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
+		for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
+			unsigned char c0 = spec_ith_char(spec0, k0);
+			unsigned char c1 = spec_ith_char(spec1, k1);
+			uint16_t a = c0 | (c1<<8); /* little-endian q-gram: c0 is the low byte */
+			num_introduced += filter_isset(m, i, a); /* NOTE(review): counts q-grams already in the filter, despite the name - confirm intent */
+			num_end_introduced += filter_end_isset(m, i, a);
+		}
+	}
+	*score = base - num_introduced;
+	*score_end = base - num_end_introduced;
+	if (badness == avoid_first && i) {
+		/* what is bad to begin with, is bad at end too */
+		*score_end -= 0x1000;
+	}
+}
+
+struct choice {
+	enum badness base; /* badness the candidate was registered with */
+	unsigned begin; /* index of the candidate's first character */
+	unsigned len; /* number of characters, computed as ie - i + 1 by add_choice() */
+};
+/* Register candidate [i, ie] with the given badness in choices[0..*cnt); ignored if ie < i+1 or MAX_CHOICES entries already exist. */
+static inline void add_choice(struct choice *choices, unsigned *cnt, unsigned i, unsigned ie, enum badness badness)
+{
+	struct choice *choice;
+	int i_neg = -1; /* index of the entry to overwrite, -1 = append */
+	assert(ie < MAXPATLEN);
+	if (ie < i+1)
+		return;
+	if (*cnt >= MAX_CHOICES)
+		return;
+	if (badness > avoid_first && *cnt >= (MAX_CHOICES >> 1)) {
+		unsigned j;
+		/* replace very bad picks once the table is at least half full: pick the worst entry that is worse than the new one */
+		for (j=0;j<*cnt;j++) {
+			if (choices[j].base < badness) {
+				if (i_neg == -1 || choices[j].base < choices[i_neg].base) {
+					i_neg = j;
+				}
+			}
+		}
+	}
+	if (i_neg != -1) {
+		choice = &choices[i_neg];
+	} else {
+		choice = &choices[(*cnt)++];
+	}
+	choice->begin = i;
+	choice->len = ie - i + 1;
+	choice->base = badness;
+}
+/* number of values the loop "for (k = start; k <= end; k += step)" visits, e.g. 16 for a low-nibble spec (step 0x10) */
+static inline int32_t spec_iter(const struct char_spec *spec)
+{
+	assert(spec->step);
+	return (spec->step + spec->end - spec->start)/spec->step;
+}
+/* Merge an AC pattern (wildcards/alternates allowed) into the filter; returns the number of characters used, or -1 if no usable subpattern was found. */
+int  filter_add_acpatt(struct filter *m, const struct cli_ac_patt *pat)
+{
+	unsigned i, j = 0, stop = 0, l=0;
+	uint16_t k0, k1;
+
+	struct char_spec chars[MAXPATLEN];
+	enum badness char_badness[MAXPATLEN];
+	unsigned char patc[MAXPATLEN];
+	unsigned altcnt = 0;
+	int32_t best_score = -0x7fffffff;
+	unsigned best_score_i = 0;
+	unsigned best_score_len = 0;
+	struct char_spec *spec0, *spec1;
+
+	struct choice choices[MAX_CHOICES];
+	unsigned choices_cnt = 0;
+	unsigned prefix_len = pat->prefix_length;
+
+	j = MIN(prefix_len + pat->length, MAXPATLEN);
+	for(i=0;i<j;i++) {
+		const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len];
+		if ((p&CLI_MATCH_WILDCARD) != CLI_MATCH_CHAR)
+			break;
+		patc[i] = (uint8_t)p;
+	}
+	if (i == j) {
+		/* all static, use add_static it has better heuristics for this
+		 * case */
+		return filter_add_static(m, patc, j, pat->virname);
+	}
+	cli_perf_log_count(TRIE_ORIG_LEN, j > 8 ? 8 : j);
+	/* transform AC characters into our representation */
+	for (i=0;i<j && !stop; i++) {
+		struct char_spec *spec = &chars[i];
+		const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len];
+		spec->alt = NULL;
+		switch (p & CLI_MATCH_WILDCARD) {
+			case CLI_MATCH_CHAR:
+				spec->start = spec->end = (uint8_t)p;
+				spec->step  = 1;
+				break;
+			case CLI_MATCH_IGNORE:
+				spec->start = 0x00;
+				spec->end   = 0xff;
+				spec->step  = 1;
+				break;
+			case CLI_MATCH_ALTERNATIVE:
+				assert(pat->alttable);
+				assert(altcnt < pat->alt);
+				assert(pat->alttable[altcnt]);
+				if (pat->alttable[altcnt++]->chmode) {
+					spec->start = 0;
+					spec->end = pat->alttable[altcnt-1]->num - 1;
+					spec->step = 1;
+					spec->alt = pat->alttable[altcnt-1];
+					break;
+				}
+				/* alternates that are not in chmode are not handled: the pattern is cut here */
+				stop = 1;
+				break;
+			case CLI_MATCH_NIBBLE_HIGH:
+				spec->start = (p & 0xf0);
+				spec->end   = spec->start | 0x0f;
+				spec->step  = 1;
+				break;
+			case CLI_MATCH_NIBBLE_LOW:
+				spec->start = (p & 0xf);
+				spec->end   = 0xf0 | spec->start;
+				spec->step  = 0x10;
+				break;
+			default:
+				cli_errmsg("filtering: unknown wildcard character: %d\n", p);
+				return -1;
+		}
+	}
+	if (stop) --i;
+	j = i;
+	if (j < 2) {
+		if (stop)
+			cli_warnmsg("Don't know how to create filter for: %s\n",pat->virname);
+		else
+			cli_warnmsg("Subpattern too short: %s\n", pat->virname);
+		return -1;
+	}
+
+	for(i=0;i<j-1;i++) {
+		int32_t num_iter;
+		/* new qgrams added to the filter */
+		spec0 = &chars[i];
+		spec1 = &chars[i+1];
+		num_iter = spec_iter(spec0) * spec_iter(spec1);
+
+		if (num_iter >= 0x100) {
+			if (num_iter == 0x10000)
+				char_badness[i] = reject;
+			else
+				char_badness[i] = avoid_anywhere;
+		} else {
+			int8_t binary = 0;
+			enum badness scor = accept;
+			for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
+				for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
+					unsigned char c0 = spec_ith_char(spec0, k0);
+					unsigned char c1 = spec_ith_char(spec1, k1);
+					if ((!c0 && !c1) || (c0 == 0xff && c1 == 0xff)) {
+						scor = avoid_first;
+						break;
+					}
+					if (c0 == c1) {
+						scor = dontlike;
+						break;
+					}
+					if ((c0 < 32 || c0 > 127) && (c1 < 32 || c1 >127))
+						binary = 1;
+				}
+			}
+			if (scor == accept && binary) {
+				/* slightly favor binary */
+				scor = like;
+			}
+			char_badness[i] = scor;
+		}
+	}
+
+	/* try to choose best subpattern */
+
+	/* calculating the score for all possible i start pos
+	 * and all possible length is too slow, so choose best among N choices
+	 * only */
+	for (i=0;i<j-1 && choices_cnt < MAX_CHOICES;i++) {
+		enum badness base0 = like, base1 = like;
+		unsigned kend = MIN(j-1, (i + MAXSOPATLEN)&~1), k;
+		int ki = -0xff;
+		/* add 2 scores: pattern with max length, one where we stop at
+		 * first negative, and one we stop at last positive, but never
+		 * include reject */
+		assert(kend-1 < j-1);
+		if (char_badness[i]  == reject)
+			continue;
+		if ((char_badness[i] == avoid_anywhere || char_badness[i] == avoid_first)
+				&& choices_cnt > 0)
+			/* if we have another choice don't choose this */
+			continue;
+		while ((kend > i+3) && char_badness[kend-1] == reject) kend--;
+		for (k=i;k<kend;k++) {
+			enum badness badness = char_badness[k];
+			if (badness < accept) {
+				if (badness == reject) {
+					/* this is a never pick */
+					kend = k;
+					break;
+				}
+				if (badness == avoid_first && k != i)
+					badness = dontlike;
+				if (k == i && badness == avoid_anywhere)
+					badness = avoid_first;
+				if (ki == -0xff)
+					ki = k;
+			}
+			base0 = MIN(base0, badness);
+			if (ki == -0xff)
+				base1 = MIN(base1, badness);
+		}
+		add_choice(choices, &choices_cnt, i, kend, base0);
+		if (ki > (int)i) {
+			/* ki|ki+1|??| */
+			/* try subpattern from after the wildcard */
+			i = ki;
+		}
+		/* if score is positive, it replaces a negative choice */
+	}
+	for(l=0;l<choices_cnt;l++) {
+		int32_t score;
+		unsigned kend;
+		unsigned k;
+
+		i = choices[l].begin;
+		kend = i + choices[l].len;
+		score = 0;
+
+		for(k = i; k < kend-1; k++) {
+			unsigned p = k - i;
+			int32_t iscore, score_end;
+			assert(k < j);
+			get_score(char_badness[k], p, m, &chars[k], &chars[k+1],
+				  &iscore, &score_end);
+			/* give more importance to the score of the characters
+			 * at the beginning */
+			/* TODO: tune magic number here */
+			if (p < 6) {
+				iscore *= (6-p);
+				score_end *= (6-p);
+			}
+			score += iscore;
+			if (score + score_end > best_score) {
+				/* we may have negative scores, so truncating
+				 * the pattern could actually get us a higher
+				 * score */
+				best_score = score + score_end;
+				best_score_len = p + 2;
+				best_score_i = i;
+				assert(i + best_score_len <= j);
+			}
+		}
+	}
+
+	/* test for "no candidates" first: best_score is still at its initial value then, so the score test would hide this message */
+	if (choices_cnt == 0) {
+		cli_warnmsg("filter rejecting %s because there are no viable choices\n", pat->virname);
+		return -1;
+	}
+	if (best_score <= -0x7fffffff) {
+		cli_warnmsg("filter rejecting %s due to very bad score: %ld\n", pat->virname, (long)best_score);
+		return -1;
+	}
+	assert(best_score_len >= 2);
+	cli_dbgmsg("filter %s score: %ld, %u (+ %u)\n", pat->virname, (long)best_score, best_score_i, best_score_len);
+	/* Shift-Or like preprocessing */
+	assert(1 < best_score_len);
+	for (i=0;i < best_score_len-1;i++) {
+		spec0 = &chars[best_score_i + i];
+		spec1 = &chars[best_score_i + i + 1];
+		/* use overlapping little-endian 2-grams, overlapping because match can start
+		 * at any position (including odd) */
+
+		for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
+			for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
+				unsigned char c0 = spec_ith_char(spec0, k0);
+				unsigned char c1 = spec_ith_char(spec1, k1);
+				if (!c0 && !c1 && !i) {
+					cli_warnmsg("filter: subsignature begins with zero: %s\n",pat->virname);
+				}
+				filter_set_atpos(m, i, c0 | (c1<<8));
+			}
+		}
+	}
+
+	/* spec0/spec1 still describe the last q-gram of the chosen subpattern: mark it as a possible pattern end */
+	j  = best_score_len - 2;
+	for (k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
+		for (k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
+			unsigned char c0 = spec_ith_char(spec0, k0);
+			unsigned char c1 = spec_ith_char(spec1, k1);
+			if (!c0 && !c1) {
+				cli_dbgmsg("filter: subsignature ends with zero: %s\n",pat->virname);
+			}
+			filter_set_end(m, j, c0 | (c1<<8));
+		}
+	}
+	return j+2;
+}
+
+static const struct match_len_info {
+	uint8_t shortest;
+	uint8_t longest;
+} match_len[256] = {
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{7,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{8,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{7,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
+	{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{9,9},
+	{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8},
+	{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{6,8},
+	{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8},
+	{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{7,8},
+	{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8},
+	{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{6,8},
+	{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8},
+	{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{8,8},
+	{2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{5,7},
+	{2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{6,7},
+	{2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{5,7},
+	{2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{7,7},
+	{2,6},{3,6},{2,6},{4,6},{2,6},{3,6},{2,6},{5,6},
+	{2,6},{3,6},{2,6},{4,6},{2,6},{3,6},{2,6},{6,6},
+	{2,5},{3,5},{2,5},{4,5},{2,5},{3,5},{2,5},{5,5},
+	{2,4},{3,4},{2,4},{4,4},{2,3},{3,3},{2,2},{0,0}
+};
+/* a 0 bit at position p of the index (presumably the match state byte) stands for a match of length p+2: 11110011 means that we may have a match of length min 4, max 5 */
+/* Scan data for the first position where the filter reports a possible pattern end.
+ * Returns 0 and stores the offset of that q-gram's first byte in inf->first_match, or returns -1 (inf untouched). */
+__hot__ int filter_search_ext(const struct filter *m, const unsigned char *data, unsigned long len, struct filter_match_info *inf)
+{
+	size_t j;
+	uint8_t state = ~0;
+	const uint8_t *B = m->B;
+	const uint8_t *End = m->end;
+
+	if (len < 2) return -1;
+	/* look for first match */
+	for (j=0; j < len-1;j++) {
+		uint8_t match_state_end;
+		const uint16_t q0 = cli_readint16( &data[j] );
+
+		state = (state << 1) | B[q0];
+		match_state_end = state | End[q0];
+		if (match_state_end != 0xff) {
+			inf->first_match = j;
+			return 0;
+		}
+	}
+	/* no match, inf is invalid */
+	return -1;
+}
+
+/* this is like a FSM, with multiple active states at the same time.
+ * each 0 bit in "state" means an active state, when a q-gram is encountered
+ * we determine what states can remain active (the transition rules are bit-masks).
+ * Returns -1 if the filter never fires, else the firing offset minus MAXSOPATLEN (clamped at 0) */
+long filter_search(const struct filter *m, const unsigned char *data, unsigned long len)
+{
+	size_t j;
+	uint8_t state = ~0;
+	const uint8_t *B = m->B;
+	const uint8_t *End = m->end;
+
+	/* we use 2-grams, must be higher than 1 */
+	if(len < 2) return -1;
+	/* Shift-Or like search algorithm */
+	for(j=0;j < len-1; j++) {
+		const uint16_t q0 = cli_readint16( &data[j] );
+		uint8_t match_end;
+		state = (state << 1) | B[q0];
+		/* state marks with a 0 bit all active states
+		 * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern
+		 * if we got two 0's at matching positions, it means we encountered a pattern's end */
+		match_end = state | End[q0];
+		if(match_end != 0xff) {
+
+			/* if state is reachable, and this q-gram can finish a pattern, assume match */
+			/* to reduce false positives check if qgram can finish the pattern */
+			/* return position of probable match: */
+			/* the exact length of the matching pattern is not computed here, so step back by the
+			 * longest pattern the filter can hold (MAXSOPATLEN), without going below offset 0 */
+			return j >= MAXSOPATLEN  ? j - MAXSOPATLEN : 0;
+		}
+	}
+	/* no match */
+	return -1;
+}
diff --git a/libclamav/uniq.h b/libclamav/filtering.h
similarity index 52%
copy from libclamav/uniq.h
copy to libclamav/filtering.h
index 5503182..f03a572 100644
--- a/libclamav/uniq.h
+++ b/libclamav/filtering.h
@@ -1,9 +1,9 @@
 /*
- *  md5 based hashtab
+ *  A fast filter for static patterns.
  *
  *  Copyright (C) 2008 Sourcefire, Inc.
  *
- *  Authors: aCaB <acab at clamav.net>
+ *  Authors: Török Edvin
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License version 2 as
@@ -19,29 +19,24 @@
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  *  MA 02110-1301, USA.
  */
-
-#ifndef _UNIQ_H
-#define _UNIQ_H
-
+#ifndef FILTER_H
+#define FILTER_H
 #include "cltypes.h"
-
-struct UNIQMD5 {
-  struct UNIQMD5 *next;
-  uint32_t count;
-  uint8_t md5[16];
-  char name[33];
+struct filter {
+	uint8_t B[65536];
+	uint8_t end[65536];
+	unsigned long m;
 };
 
-struct uniq {
-  struct UNIQMD5 *md5s;
-  uint32_t items;
-  uint32_t idx[256];
+struct filter_match_info {
+	unsigned long first_match;
 };
 
-struct uniq *uniq_init(uint32_t);
-void uniq_free(struct uniq *);
-uint32_t uniq_add(struct uniq *, const char *, uint32_t, char **);
-uint32_t uniq_get(struct uniq *, const char *, uint32_t, char **);
-
+struct cli_ac_patt;
+void filter_init(struct filter *m);
+long filter_search(const struct filter *m, const unsigned char *data, unsigned long len);
+int filter_search_ext(const struct filter *m, const unsigned char *data, unsigned long len, struct filter_match_info *inf);
+int  filter_add_static(struct filter *m, const unsigned char *pattern, unsigned long len, const char *name);
+int  filter_add_acpatt(struct filter *m, const struct cli_ac_patt *pat);
 
 #endif
diff --git a/libclamav/libclamav.map b/libclamav/libclamav.map
index 5fca6d4..5e382dc 100644
--- a/libclamav/libclamav.map
+++ b/libclamav/libclamav.map
@@ -115,8 +115,10 @@ CLAMAV_PRIVATE {
     cli_bm_init;
     cli_bm_scanbuff;
     cli_bm_free;
+    cli_scanbuff;
     html_screnc_decode;
     mpool_create;
+    mpool_calloc;
     mpool_destroy;
     mpool_free;
     mpool_getstats;
diff --git a/libclamav/others.h b/libclamav/others.h
index f772319..61bc160 100644
--- a/libclamav/others.h
+++ b/libclamav/others.h
@@ -331,6 +331,18 @@ void cli_errmsg(const char *str, ...);
 #define UNLIKELY(cond) (cond)
 #endif
 
+#if defined (__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+#define __hot__ __attribute__((hot)) /* only defined this way for GCC >= 4.3 */
+#else
+#define __hot__ /* expands to nothing otherwise */
+#endif
+
+#ifdef __GNUC__
+#define always_inline __attribute__((always_inline))
+#else
+#define always_inline /* expands to nothing for non-GNU compilers */
+#endif
+
 #define cli_dbgmsg (!UNLIKELY(cli_debug_flag)) ? (void)0 : cli_dbgmsg_internal
 
 #ifdef __GNUC__
diff --git a/libclamav/perflogging.c b/libclamav/perflogging.c
new file mode 100644
index 0000000..2e25230
--- /dev/null
+++ b/libclamav/perflogging.c
@@ -0,0 +1,148 @@
+/*
+ *  Gather statistics from performance sensitive code.
+ *
+ *  Copyright (C) 2008 Sourcefire, Inc.
+ *
+ *  Authors: Török Edvin
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ *  MA 02110-1301, USA.
+ */
+#ifdef HAVE_CONFIG_H
+#include "clamav-config.h"
+#endif
+
+#include "perflogging.h"
+#include <stdio.h>
+#ifdef CLI_PERF_LOGGING
+/* per-thread state and counters; the type is spelled out because implicit int is not valid since C99 and the header declares these as int */
+__thread int last_flushed = 0;
+__thread int cli_perf_registered = 0;
+__thread uint64_t cli_perf_sum_tls[__LAST_SUMABLE];
+__thread uint64_t cli_perf_count_tls[__LAST_COUNTABLE][256];
+/* process-wide totals, the per-thread counters are added to them by cli_perf_flush() */
+uint64_t cli_perf_sum[__LAST_SUMABLE];
+uint64_t cli_perf_count[__LAST_COUNTABLE][256];
+
+static pthread_key_t thread_exit_key;
+int pthread_key_create(pthread_key_t *key, void (*destr_function) (void *)); 
+/* destructor passed to pthread_key_create() for thread_exit_key; arg is not used */
+static void cli_perf_thread_exit(void* arg)
+{
+	/* save counters into global */
+	cli_perf_flush();
+}
+/* constructor: creates the key whose destructor flushes a thread's counters (the return value is not checked) */
+void __attribute__((constructor)) __cli_perf_init(void)
+{
+	pthread_key_create(&thread_exit_key, cli_perf_thread_exit);
+}
+/* destructor: flushes (and prints) the counters of the thread that runs it */
+void __attribute__((destructor)) __cli_perf_exit(void)
+{
+	cli_perf_thread_exit(NULL);
+}
+/* dummy only provides a non-NULL value for the key set in cli_perf_register() */
+static int dummy;
+void cli_perf_register(void)
+{
+	/* set a fake key, so that destructor gets called */
+	pthread_setspecific(thread_exit_key, &dummy);
+	cli_perf_registered = 1;
+}
+
+static const char *perf_log_names_sum[__LAST_SUMABLE] = { /* printed labels, indexed by enum perf_log_sumable */
+	"raw scanned",
+	"filter scanned",
+	"AC scanned",
+	"BM scanned"
+};
+
+static const char *perf_log_names_cnt[__LAST_COUNTABLE] = { /* printed labels, indexed by enum perf_log_countable */
+	"trie bytes scanned",
+	"filter position load",
+	"filter end load",
+	"trie pattern original length"
+};
+/* for each counter: the cli_perf_sum[] entry its percentage is computed against, or NONE for no percentage */
+#define NONE __LAST_SUMABLE
+static const enum perf_log_sumable perf_log_percent[__LAST_SUMABLE] = {
+	NONE,
+	RAW_BYTES_SCANNED,
+	RAW_BYTES_SCANNED,
+	RAW_BYTES_SCANNED,
+};
+/* indexed by enum perf_log_countable, but the values are perf_log_sumable indices, hence the element type */
+static const enum perf_log_sumable perf_log_percent_cnt[__LAST_COUNTABLE] = {
+	RAW_BYTES_SCANNED,
+	NONE,
+	NONE,
+	NONE,
+};
+/* Print the process-wide sums (in MB) and the non-zero count buckets to stdout, with percentages where
+ * a reference sum is configured. It is called from cli_perf_flush() while cli_perf_log_mutex is held. */
+static void cli_perf_print(void)
+{
+	enum perf_log_sumable i;
+	enum perf_log_countable j;
+	unsigned k;
+
+	const double MEGA = 1024*1024.0;
+
+	/* in multiscan mode multiple threads can output, so output a unique id
+	 * here (%p requires a void pointer argument) */
+	printf("PERF: %p\n", (void *)&cli_perf_registered);
+	for(i=0;i<__LAST_SUMABLE;i++) {
+		printf("PERF: %s: %g MB", perf_log_names_sum[i], cli_perf_sum[i] / MEGA);
+		if (perf_log_percent[i] != NONE)
+			printf("(%6.3f%%)", 100.0*cli_perf_sum[i] / cli_perf_sum[perf_log_percent[i]]);
+		printf("\n");
+	}
+	printf("\n");
+	for(j=0;j<__LAST_COUNTABLE;j++) {
+		printf("PERF: %s: ", perf_log_names_cnt[j]);
+		for (k=0;k<256;k++)
+			if (cli_perf_count[j][k]) {
+				printf(" %u -> %llu", k, (unsigned long long)cli_perf_count[j][k]); /* uint64_t need not be uintmax_t, so no %ju */
+				if (perf_log_percent_cnt[j] != NONE)
+					printf("(%6.3f%%)", 100.0*cli_perf_count[j][k] / cli_perf_sum[perf_log_percent_cnt[j]]);
+			}
+		printf("\n");
+	}
+	printf("\n");
+}
+/* cli_perf_log_mutex serializes merging the counters and printing them */
+static pthread_mutex_t cli_perf_log_mutex = PTHREAD_MUTEX_INITIALIZER;
+void cli_perf_flush(void)
+{
+	unsigned i, j;
+
+	pthread_mutex_lock(&cli_perf_log_mutex);
+
+	for (i = 0; i < __LAST_SUMABLE; i++) { /* add this thread's sums to the totals, then reset them */
+		cli_perf_sum[i] += cli_perf_sum_tls[i];
+		cli_perf_sum_tls[i] = 0;
+	}
+
+	for (i = 0; i < __LAST_COUNTABLE; i++) { /* same for the 256 buckets of every counter */
+		for (j = 0; j < 256; j++) {
+			cli_perf_count[i][j] += cli_perf_count_tls[i][j];
+			cli_perf_count_tls[i][j] = 0;
+		}
+	}
+
+	cli_perf_print(); /* the totals are printed on every flush */
+	pthread_mutex_unlock(&cli_perf_log_mutex);
+}
+#endif
diff --git a/libclamav/perflogging.h b/libclamav/perflogging.h
new file mode 100644
index 0000000..07959d2
--- /dev/null
+++ b/libclamav/perflogging.h
@@ -0,0 +1,101 @@
+/*
+ *  Gather statistics from performance sensitive code.
+ *
+ *  Copyright (C) 2008 Sourcefire, Inc.
+ *
+ *  Authors: Török Edvin
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ *  MA 02110-1301, USA.
+ */
+#ifndef PERFLOGGING_H
+#define PERFLOGGING_H
+
+/* This is a compile-time selectable, default-off module that logs certain
+ * statistics, such as which tries are used and how efficient filtering is.
+ * It must have as little overhead as possible. */
+
+//#define CLI_PERF_LOGGING
+#ifdef CLI_PERF_LOGGING
+
+#ifndef __GNUC__
+#error "Performance logging requires GNU C compatible compiler"
+#else /* TODO: maybe we need a GCC version check too here */
+#include <assert.h>
+#include <pthread.h>
+#include "cltypes.h"
+
+enum perf_log_sumable { /* running totals; each value indexes cli_perf_sum[] and cli_perf_sum_tls[] */
+	RAW_BYTES_SCANNED, /* also the counter cli_perf_enter() watches to decide when to flush */
+	FILTER_BYTES_SCANNED,
+  AC_SCANNED,
+	BM_SCANNED,
+	__LAST_SUMABLE /* number of entries above; used as an array size, not a counter */
+};
+
+enum perf_log_countable { /* per-event counters; each value selects a row of cli_perf_count[][256] and cli_perf_count_tls[][256] */
+	TRIE_SCANNED,
+	FILTER_LOAD,
+	FILTER_END_LOAD,
+	TRIE_ORIG_LEN,
+	__LAST_COUNTABLE /* number of entries above; used as an array size, not a counter */
+};
+
+/* Per-thread state (GNU __thread storage); the definitions are expected to live in perflogging.c. */
+extern __thread int last_flushed;
+extern __thread int cli_perf_registered;
+extern __thread uint64_t cli_perf_sum_tls[__LAST_SUMABLE];
+extern __thread uint64_t cli_perf_count_tls[__LAST_COUNTABLE][256];
+/* Process-wide totals; cli_perf_flush() adds the per-thread values into these. */
+extern uint64_t cli_perf_sum[__LAST_SUMABLE];
+extern uint64_t cli_perf_count[__LAST_COUNTABLE][256];
+
+void cli_perf_register(void);
+void cli_perf_flush(void);
+
+static inline void cli_perf_enter(void)
+{	/* prologue of the logging helpers: call cli_perf_register() while unregistered, flush once RAW_BYTES_SCANNED grew by >100 MB */
+	if (!cli_perf_registered) cli_perf_register();
+	if (cli_perf_sum_tls[RAW_BYTES_SCANNED] - last_flushed <= 100*1024*1024)
+		return;
+	cli_perf_flush();
+	last_flushed = cli_perf_sum_tls[RAW_BYTES_SCANNED];
+}
+
+static inline void cli_perf_log_add(enum perf_log_sumable kind, uint64_t add)
+{	/* adds `add` to the calling thread's total for `kind` */
+	cli_perf_enter(); /* runs first, so a flush triggered here does not yet include `add` */
+	assert( kind < __LAST_SUMABLE);
+	cli_perf_sum_tls[kind] += add;
+}
+
+static inline void cli_perf_log_count2(enum perf_log_countable kind, uint8_t event, uint64_t cnt)
+{	/* adds `cnt` to the calling thread's bucket `event` (0..255) of countable `kind` */
+	cli_perf_enter(); /* runs first, so a flush triggered here does not yet include `cnt` */
+	assert( kind < __LAST_COUNTABLE);
+	cli_perf_count_tls[kind][event] += cnt;
+}
+
+static inline void cli_perf_log_count(enum perf_log_countable kind, uint8_t event)
+{	/* records a single occurrence of `event` for `kind` */
+	cli_perf_log_count2(kind, event, 1);
+}
+
+#endif /* __GNUC__ */
+
+#else /* !CLI_PERF_LOGGING: only cli_perf_log_count() gets a no-op stand-in here */
+#define cli_perf_log_count(a,b) do {} while(0)
+#endif /* CLI_PERF_LOGGING */
+
+#endif /* PERFLOGGING_H */
diff --git a/libclamav/regex_list.c b/libclamav/regex_list.c
index cb737ec..03ae07d 100644
--- a/libclamav/regex_list.c
+++ b/libclamav/regex_list.c
@@ -64,104 +64,6 @@ static int add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_le
 static int add_static_pattern(struct regex_matcher *matcher, char* pattern);
 /* ---------- */
 
-/* ----- shift-or filtering -------------- */
-
-#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val) & 0x1f)))
-#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val) & 0x1f)))
-
-static void SO_init(struct filter *m)
-{
-	memset(m->B, ~0, sizeof(m->B));
-	memset(m->end, ~0, sizeof(m->end));
-	memset(m->end_fast, ~0, sizeof(m->end_fast));
-}
-
-/* because we use uint32_t */
-#define MAXSOPATLEN 32
-
-/* merge another pattern into the filter
- * add('abc'); add('bcd'); will match [ab][bc][cd] */
-static int SO_preprocess_add(struct filter *m, const unsigned char *pattern, size_t len)
-{
-	uint16_t q;
-	uint8_t j;
-
-	/* cut length, and make it modulo 2 */
-	if(len > MAXSOPATLEN) {
-		len = MAXSOPATLEN;
-	} else {
-		/* we use 2-grams, must be multiple of 2 */
-		len = len & ~1;
-	}
-	if(!len)
-		return 0;
-
-	/* Shift-Or like preprocessing */
-	for(j=0;j < len-1;j++) {
-		/* use overlapping 2-grams. We need them overlapping because matching can start at any position */
-		q = cli_readint16( &pattern[j] );
-		m->B[q] &= ~(1 << j);
-	}
-	/* we use variable length patterns, use last character to mark pattern end,
-	 * can lead to false positives.*/
-	/* mark that at state j, the q-gram q can end the pattern */
-	if(j) {
-		j--;
-		m->end[q] &= ~(1 << j);
-		m->end_fast[pattern[j+1]] &= ~(1<<j);
-	}
-	return 0;
-}
-
-/* this is like a FSM, with multiple active states at the same time.
- * each bit in "state" means an active state, when a char is encountered
- * we determine what states can remain active.
- * The FSM transition rules are expressed as bit-masks */
-long SO_search(const struct filter *m, const unsigned char *data, unsigned long len)
-{
-	size_t j;
-	uint32_t state = ~0;
-	const uint32_t *B = m->B;
-	const uint32_t *End = m->end;
-	const uint32_t *EndFast = m->end_fast;
-
-	/* cut length, and make it modulo 2 */
-	if(len > MAXSOPATLEN) {
-		len = MAXSOPATLEN;
-	} else {
-		/* we use 2-grams, must be multiple of 2 */
-		len = len & ~1;
-	}
-	if(!len) return -1;
-	/* Shift-Or like search algorithm */
-	for(j=0;j < len-1; j++) {
-		const uint16_t q0 = cli_readint16( &data[j] );
-		uint32_t match_end;
-		state = (state << 1) | B[q0];
-		/* state marks with a 0 bit all active states
-		 * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern
-		 * if we got two 0's at matching positions, it means we encountered a pattern's end */
-		match_end = state | EndFast[data[j+1]];
-		if((match_end != 0xffffffff) && (state | End[q0]) !=  0xffffffff) {
-			/* note: we rely on short-circuit eval here, we only evaluate and fetch End[q0], if
-			 * end_fast has matched. This reduces cache pressure on End[], and allows us to keep the working
-			 * set inside L2 */
-
-			/* if state is reachable, and this character can finish a pattern, assume match */
-			/* to reduce false positives check if qgram can finish the pattern */
-			/* return position of probable match */
-			/* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the
-			 * longest pattern that could match */
-			return j >= MAXSOPATLEN  ? j - MAXSOPATLEN : 0;
-		}
-	}
-	/* no match */
-	return -1;
-}
-
-/* ----------------------------------------------------------- */
-
-
 #define MATCH_SUCCESS 0
 #define MATCH_FAILED  -1
 
@@ -295,7 +197,7 @@ int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* di
 		if(!bufrev)
 			return CL_EMEM;
 		reverse_string(bufrev);
-		rc = SO_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1;
+		rc = filter_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1;
 		if(rc == -1) {
 			free(buffer);
 			free(bufrev);
@@ -368,8 +270,8 @@ int init_regex_list(struct regex_matcher* matcher)
 	if((rc = cli_bm_init(&matcher->sha256_hashes))) {
 		return rc;
 	}
-	SO_init(&matcher->filter);
-	SO_init(&matcher->sha256_filter);
+	filter_init(&matcher->filter);
+	filter_init(&matcher->sha256_filter);
 	return CL_SUCCESS;
 }
 
@@ -432,7 +334,7 @@ static int add_hash(struct regex_matcher *matcher, char* pattern, const char fl)
 		return CL_EMEM;
 	}
 	*pat->virname = fl;
-	SO_preprocess_add(&matcher->sha256_filter, pat->pattern, pat->length);
+	filter_add_static(&matcher->sha256_filter, pat->pattern, pat->length, "regex");
 	if((rc = cli_bm_addpatt(&matcher->sha256_hashes, pat))) {
 		cli_errmsg("add_hash: failed to add BM pattern\n");
 		free(pat->pattern);
@@ -646,7 +548,7 @@ static int add_newsuffix(struct regex_matcher *matcher, struct regex_list *info,
 		mpool_free(matcher->mempool, new);
 		return ret;
 	}
-	SO_preprocess_add(&matcher->filter, (const unsigned char*)suffix, len);
+	filter_add_static(&matcher->filter, (const unsigned char*)suffix, len, "regex");
 	return CL_SUCCESS;
 }
 
diff --git a/libclamav/regex_list.h b/libclamav/regex_list.h
index c21f829..971792c 100644
--- a/libclamav/regex_list.h
+++ b/libclamav/regex_list.h
@@ -27,17 +27,11 @@
 #include "phishcheck.h"
 #include "readdb.h"
 #include "matcher.h"
+#include "filtering.h"
 #include <zlib.h> /* for gzFile */
 
 #include "mpool.h"
 
-struct filter {
-	uint32_t B[65536];
-	uint32_t end_fast[256];
-	uint32_t end[65536];
-	unsigned long m;
-};
-
 struct regex_list_ht {
 	struct regex_list *head;
 	struct regex_list *tail;
@@ -69,7 +63,6 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio
 void regex_list_cleanup(struct regex_matcher* matcher);
 void regex_list_done(struct regex_matcher* matcher);
 int is_regex_ok(struct regex_matcher* matcher);
-long SO_search(const struct filter *m, const unsigned char *data, unsigned long len);
 
 #endif
 
diff --git a/unit_tests/check_matchers.c b/unit_tests/check_matchers.c
index 9fbc129..33915ca 100644
--- a/unit_tests/check_matchers.c
+++ b/unit_tests/check_matchers.c
@@ -30,6 +30,7 @@
 #include "../libclamav/matcher.h"
 #include "../libclamav/matcher-ac.h"
 #include "../libclamav/matcher-bm.h"
+#include "../libclamav/others.h"
 #include "../libclamav/default.h"
 #include "checks.h"
 
@@ -46,19 +47,44 @@ static const struct ac_testdata_s {
     { "abdcabcddabccadbbdbacb", "6463{2-3}64646162(63|64|65)6361*6462????6261{-1}6362", "Test_5" },
     { "abcdefghijkabcdefghijk", "62????65666768*696a6b6162{2-3}656667[1-3]6b", "Test_6" },
     { "abcadbabcadbabcacb", "6?6164?26?62{3}?26162?361", "Test_7" },
+    /* testcase for filter bug: it was checking only first 32 chars, and last
+     * maxpatlen */
+    { "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1dddddddddddddddddddd5\1\1\1\1\1\1\1\1\1\1\1\1\1","6464646464646464646464646464646464646464(35|36)","Test_8"},
 
     { NULL, NULL, NULL}
 };
 
-START_TEST (test_ac_scanbuff) {
+
+static cli_ctx ctx;
+static const char *virname = NULL;
+static void setup(void) /* checked fixture: gives the test a new engine whose root[0] is a freshly allocated matcher */
+{
 	struct cli_matcher *root;
+	virname = NULL;
+	ctx.virname = &virname; /* expose the file-level virname through the shared ctx */
+	ctx.engine = cl_engine_new();
+	fail_unless(!!ctx.engine, "cl_engine_new() failed");
+	root = (struct cli_matcher *) mpool_calloc(ctx.engine->mempool, 1, sizeof(struct cli_matcher));
+	fail_unless(root != NULL, "root == NULL");
+#ifdef USE_MPOOL
+	root->mempool = ctx.engine->mempool; /* the matcher uses the engine's pool */
+#endif
+
+	ctx.engine->root[0] = root;
+}
+
+static void teardown(void) /* fixture counterpart of setup(): releases the engine stored in ctx */
+{
+	cl_engine_free((struct cl_engine*)ctx.engine); /* cast presumably drops a const qualifier on ctx.engine — TODO confirm */
+}
+
+START_TEST (test_ac_scanbuff) {
 	struct cli_ac_data mdata;
-	const char *virname = NULL;
+	struct cli_matcher *root;
 	unsigned int i;
 	int ret;
 
-
-    root = (struct cli_matcher *) cli_calloc(1, sizeof(struct cli_matcher));
+    root = ctx.engine->root[0];
     fail_unless(root != NULL, "root == NULL");
     root->ac_only = 1;
 
@@ -68,6 +94,7 @@ START_TEST (test_ac_scanbuff) {
     ret = cli_ac_init(root, CLI_DEFAULT_AC_MINDEPTH, CLI_DEFAULT_AC_MAXDEPTH);
     fail_unless(ret == CL_SUCCESS, "cli_ac_init() failed");
 
+
     for(i = 0; ac_testdata[i].data; i++) {
 	ret = cli_parse_add(root, ac_testdata[i].virname, ac_testdata[i].hexsig, 0, 0, NULL, 0, NULL, 0);
 	fail_unless(ret == CL_SUCCESS, "cli_parse_add() failed");
@@ -83,14 +110,13 @@ START_TEST (test_ac_scanbuff) {
 	ret = cli_ac_scanbuff(ac_testdata[i].data, strlen(ac_testdata[i].data), &virname, NULL, NULL, root, &mdata, 0, 0, -1, NULL, AC_SCAN_VIR, NULL);
 	fail_unless_fmt(ret == CL_VIRUS, "cli_ac_scanbuff() failed for %s", ac_testdata[i].virname);
 	fail_unless_fmt(!strncmp(virname, ac_testdata[i].virname, strlen(ac_testdata[i].virname)), "Dataset %u matched with %s", i, virname);
+
+	ret = cli_scanbuff(ac_testdata[i].data, strlen(ac_testdata[i].data), &ctx, 0, NULL);
+	fail_unless_fmt(ret == CL_VIRUS, "cli_scanbuff() failed for %s", ac_testdata[i].virname);
+	fail_unless_fmt(!strncmp(virname, ac_testdata[i].virname, strlen(ac_testdata[i].virname)), "Dataset %u matched with %s", i, virname);
     }
 
     cli_ac_freedata(&mdata);
-    cli_ac_free(root);
-#ifdef USE_MPOOL
-    mpool_destroy(root->mempool);
-#endif
-    free(root);
 }
 END_TEST
 
@@ -100,7 +126,7 @@ START_TEST (test_bm_scanbuff) {
 	int ret;
 
 
-    root = (struct cli_matcher *) cli_calloc(1, sizeof(struct cli_matcher));
+    root = ctx.engine->root[0];
     fail_unless(root != NULL, "root == NULL");
 
 #ifdef USE_MPOOL
@@ -119,11 +145,6 @@ START_TEST (test_bm_scanbuff) {
     ret = cli_bm_scanbuff("blah\xde\xad\xbe\xef", 12, &virname, root, 0, 0, -1);
     fail_unless(ret == CL_VIRUS, "cli_bm_scanbuff() failed");
     fail_unless(!strncmp(virname, "Sig2", 4), "Incorrect signature matched in cli_bm_scanbuff()\n");
-    cli_bm_free(root);
-#ifdef USE_MPOOL
-    mpool_destroy(root->mempool);
-#endif
-    free(root);
 }
 END_TEST
 
@@ -133,6 +154,7 @@ Suite *test_matchers_suite(void)
     TCase *tc_matchers;
     tc_matchers = tcase_create("matchers");
     suite_add_tcase(s, tc_matchers);
+    tcase_add_checked_fixture (tc_matchers, setup, teardown);
     tcase_add_test(tc_matchers, test_ac_scanbuff);
     tcase_add_test(tc_matchers, test_bm_scanbuff);
     return s;

-- 
Debian repository for ClamAV