[Forensics-changes] [yara] 391/415: Avoid duplicated fibers in yr_re_exec
Hilko Bengen
bengen at moszumanska.debian.org
Thu Apr 3 05:43:27 UTC 2014
This is an automated email from the git hooks/post-receive script.
bengen pushed a commit to branch debian
in repository yara.
commit 2bf181fc4e192e12a120eb8076211ee945522dbf
Author: Victor M. Alvarez <plusvic at gmail.com>
Date: Wed Feb 5 11:44:21 2014 +0100
Avoid duplicated fibers in yr_re_exec
This makes some regexps faster and avoid consuming too much memory on certain cases.
---
libyara/re.c | 411 ++++++++++++++++++++++++++++++++++++--------------------
libyara/rules.c | 36 ++---
2 files changed, 290 insertions(+), 157 deletions(-)
diff --git a/libyara/re.c b/libyara/re.c
index 863a1c2..cb2e452 100644
--- a/libyara/re.c
+++ b/libyara/re.c
@@ -42,8 +42,7 @@ order to avoid confusion with operating system threads.
#include "re.h"
-#define MAX_RE_STACK 1024
-
+#define RE_MAX_STACK 1024
#define RE_SCAN_LIMIT 4096
#define EMIT_FLAGS_BACKWARDS 1
@@ -58,7 +57,7 @@ typedef struct _RE_FIBER
{
uint8_t* ip;
int32_t sp;
- uint16_t stack[MAX_RE_STACK];
+ uint16_t stack[RE_MAX_STACK];
struct _RE_FIBER* prev;
struct _RE_FIBER* next;
@@ -955,6 +954,36 @@ int yr_re_emit_code(
}
+int _yr_re_alloc_storage(
+ RE_THREAD_STORAGE** storage)
+{
+ #ifdef WIN32
+ *storage = TlsGetValue(thread_storage_key);
+ #else
+ *storage = pthread_getspecific(thread_storage_key);
+ #endif
+
+ if (*storage == NULL)
+ {
+ *storage = yr_malloc(sizeof(RE_THREAD_STORAGE));
+
+ if (*storage == NULL)
+ return ERROR_INSUFICIENT_MEMORY;
+
+ (*storage)->fiber_pool.head = NULL;
+ (*storage)->fiber_pool.tail = NULL;
+
+ #ifdef WIN32
+ TlsSetValue(thread_storage_key, *storage);
+ #else
+ pthread_setspecific(thread_storage_key, *storage);
+ #endif
+ }
+
+ return ERROR_SUCCESS;
+}
+
+
RE_FIBER* _yr_re_fiber_create(
RE_FIBER_LIST* fiber_pool)
{
@@ -983,10 +1012,20 @@ RE_FIBER* _yr_re_fiber_create(
return fiber;
}
+
+//
+// _yr_re_fiber_append
+//
+// Appends 'fiber' to 'fiber_list'
+//
+
void _yr_re_fiber_append(
- RE_FIBER* fiber,
- RE_FIBER_LIST* fiber_list)
+ RE_FIBER_LIST* fiber_list,
+ RE_FIBER* fiber)
{
+ assert(fiber->prev == NULL);
+ assert(fiber->next == NULL);
+
fiber->prev = fiber_list->tail;
if (fiber_list->tail != NULL)
@@ -1002,6 +1041,69 @@ void _yr_re_fiber_append(
}
+//
+// _yr_re_fiber_exists
+//
+// Verifies if a fiber with the same properties (ip, sp, and stack values)
+// than 'target_fiber' exists in 'fiber_list'. The list is iterated from
+// the start until 'last_fiber' (inclusive). Fibers past 'last_fiber' are not
+// taken into account.
+//
+
+int _yr_re_fiber_exists(
+ RE_FIBER_LIST* fiber_list,
+ RE_FIBER* target_fiber,
+ RE_FIBER* last_fiber)
+{
+ RE_FIBER* fiber = fiber_list->head;
+
+ int equal_stacks;
+ int i;
+
+
+ if (last_fiber == NULL)
+ return FALSE;
+
+ while (fiber != last_fiber->next)
+ {
+ if (fiber->ip == target_fiber->ip &&
+ fiber->sp == target_fiber->sp)
+ {
+ equal_stacks = TRUE;
+
+ for (i = 0; i <= fiber->sp; i++)
+ {
+ if (fiber->stack[i] != target_fiber->stack[i])
+ {
+ equal_stacks = FALSE;
+ break;
+ }
+ }
+
+ if (equal_stacks)
+ return TRUE;
+ }
+
+ fiber = fiber->next;
+ }
+
+ return FALSE;
+}
+
+
+//
+// _yr_re_fiber_split
+//
+// Duplicates 'fiber' in 'fiber_list', if 'fiber_list' was:
+//
+// f1 -> f2 -> f3 -> f4
+//
+// Splitting f2 will result in:
+//
+// f1 -> f2 -> f2 -> f3 -> f4
+//
+//
+
RE_FIBER* _yr_re_fiber_split(
RE_FIBER* fiber,
RE_FIBER_LIST* fiber_list,
@@ -1047,9 +1149,9 @@ RE_FIBER* _yr_re_fiber_split(
//
RE_FIBER* _yr_re_fiber_kill(
- RE_FIBER* fiber,
RE_FIBER_LIST* fiber_list,
- RE_FIBER_LIST* fiber_pool)
+ RE_FIBER_LIST* fiber_pool,
+ RE_FIBER* fiber)
{
RE_FIBER* next_fiber = fiber->next;
@@ -1086,9 +1188,9 @@ RE_FIBER* _yr_re_fiber_kill(
//
void _yr_re_fiber_kill_tail(
- RE_FIBER* fiber,
RE_FIBER_LIST* fiber_list,
- RE_FIBER_LIST* fiber_pool)
+ RE_FIBER_LIST* fiber_pool,
+ RE_FIBER* fiber)
{
RE_FIBER* prev_fiber = fiber->prev;
@@ -1111,18 +1213,91 @@ void _yr_re_fiber_kill_tail(
}
-#define prolog \
- if (count >= max_count) \
- { \
- fiber = _yr_re_fiber_kill(fiber, &fibers, &storage->fiber_pool); \
- break; \
- } \
+//
+// _yr_re_fiber_kill_tail
+//
+// Kills all fibers from in the fiber list.
+//
+
+void _yr_re_fiber_kill_all(
+ RE_FIBER_LIST* fiber_list,
+ RE_FIBER_LIST* fiber_pool)
+{
+ if (fiber_list->head != NULL)
+ _yr_re_fiber_kill_tail(fiber_list, fiber_pool, fiber_list->head);
+}
+
+
+//
+// _yr_re_fiber_sync
+//
+// Executes a fiber until reaching an "matching" instruction. A "matching"
+// instruction is one that actually reads a byte from the input and performs
+// some matching. If the fiber reaches a split instruction, the new fiber is
+// also synced.
+//
+
+void _yr_re_fiber_sync(
+ RE_FIBER_LIST* fiber_list,
+ RE_FIBER_LIST* fiber_pool,
+ RE_FIBER* fiber_to_sync)
+{
+ RE_FIBER* fiber;
+ RE_FIBER* last;
+ RE_FIBER* prev;
+ RE_FIBER* new_fiber;
+
+ fiber = fiber_to_sync;
+ prev = fiber_to_sync->prev;
+ last = fiber_to_sync->next;
+
+ while(fiber != last)
+ {
+ switch(*fiber->ip)
+ {
+ case RE_OPCODE_SPLIT_A:
+ new_fiber = _yr_re_fiber_split(fiber, fiber_list, fiber_pool);
+ new_fiber->ip += *(int16_t*)(fiber->ip + 1);
+ fiber->ip += 3;
+ break;
+
+ case RE_OPCODE_SPLIT_B:
+ new_fiber = _yr_re_fiber_split(fiber, fiber_list, fiber_pool);
+ new_fiber->ip += 3;
+ fiber->ip += *(int16_t*)(fiber->ip + 1);
+ break;
+
+ case RE_OPCODE_JUMP:
+ fiber->ip += *(int16_t*)(fiber->ip + 1);
+ break;
+
+ case RE_OPCODE_JNZ:
+ fiber->stack[fiber->sp]--;
+ if (fiber->stack[fiber->sp] > 0)
+ fiber->ip += *(int16_t*)(fiber->ip + 1);
+ else
+ fiber->ip += 3;
+ break;
+
+ case RE_OPCODE_PUSH:
+ fiber->stack[++fiber->sp] = *(uint16_t*)(fiber->ip + 1);
+ fiber->ip += 3;
+ break;
+
+ case RE_OPCODE_POP:
+ fiber->sp--;
+ fiber->ip++;
+ break;
+
+ default:
+ if (_yr_re_fiber_exists(fiber_list, fiber, prev))
+ fiber = _yr_re_fiber_kill(fiber_list, fiber_pool, fiber);
+ else
+ fiber = fiber->next;
+ }
+ }
+}
-#define epilog \
- if (match) \
- fiber = fiber->next; \
- else\
- fiber = _yr_re_fiber_kill(fiber, &fibers, &storage->fiber_pool) \
//
// yr_re_exec
@@ -1141,6 +1316,12 @@ void _yr_re_fiber_kill_tail(
// RE_MATCH_CALLBACK_FUNC callback - Callback function
// void* callback_args - Callback argument
//
+// Returns:
+// Integer indicating the number of matching bytes, including 0 when
+// matching an empty regexp. Negative values indicate:
+// -1 No match
+// -2 Insuficient memory
+// -3 Fatal error
int yr_re_exec(
uint8_t* code,
@@ -1158,7 +1339,7 @@ int yr_re_exec(
RE_FIBER_LIST fibers;
RE_THREAD_STORAGE* storage;
RE_FIBER* fiber;
- RE_FIBER* new_fiber;
+ RE_FIBER* next_fiber;
int count;
int max_count;
@@ -1166,43 +1347,25 @@ int yr_re_exec(
int character_size;
int result = -1;
- #ifdef WIN32
- storage = TlsGetValue(thread_storage_key);
- #else
- storage = pthread_getspecific(thread_storage_key);
- #endif
-
- if (storage == NULL)
- {
- storage = yr_malloc(sizeof(RE_THREAD_STORAGE));
-
- if (storage == NULL)
- return ERROR_INSUFICIENT_MEMORY;
-
- storage->fiber_pool.head = NULL;
- storage->fiber_pool.tail = NULL;
-
- #ifdef WIN32
- TlsSetValue(thread_storage_key, storage);
- #else
- pthread_setspecific(thread_storage_key, storage);
- #endif
- }
+ if (_yr_re_alloc_storage(&storage) != ERROR_SUCCESS)
+ return -2;
if (flags & RE_FLAGS_WIDE)
character_size = 2;
else
character_size = 1;
+ max_count = min(input_size, RE_SCAN_LIMIT);
+ input = input_data;
+ count = 0;
+
fiber = _yr_re_fiber_create(&storage->fiber_pool);
fiber->ip = code;
fibers.head = fiber;
fibers.tail = fiber;
- input = input_data;
- count = 0;
- max_count = min(input_size, RE_SCAN_LIMIT);
+ _yr_re_fiber_sync(&fibers, &storage->fiber_pool, fiber);
while (fibers.head != NULL)
{
@@ -1212,27 +1375,61 @@ int yr_re_exec(
{
ip = fiber->ip;
+ if (*ip == RE_OPCODE_MATCH ||
+ *ip == RE_OPCODE_MATCH_AT_START ||
+ *ip == RE_OPCODE_MATCH_AT_END)
+ {
+ if ((*ip == RE_OPCODE_MATCH_AT_START &&
+ input_size - 1 > count - character_size) ||
+ (*ip == RE_OPCODE_MATCH_AT_END &&
+ input_size > count))
+ {
+ fiber = _yr_re_fiber_kill(&fibers, &storage->fiber_pool, fiber);
+ continue;
+ }
+
+ result = count;
+
+ if (flags & RE_FLAGS_EXHAUSTIVE)
+ {
+ if (flags & RE_FLAGS_BACKWARDS)
+ callback(input + character_size, count, flags, callback_args);
+ else
+ callback(input_data, count, flags, callback_args);
+
+ fiber = _yr_re_fiber_kill(&fibers, &storage->fiber_pool, fiber);
+ }
+ else
+ {
+ _yr_re_fiber_kill_tail(&fibers, &storage->fiber_pool, fiber);
+ fiber = NULL;
+ }
+
+ continue;
+ }
+
+ if (count >= max_count)
+ {
+ fiber = _yr_re_fiber_kill(&fibers, &storage->fiber_pool, fiber);
+ continue;
+ }
+
switch(*ip)
{
+ case RE_OPCODE_ANY:
+ match = (*input != 0x0A || flags & RE_FLAGS_DOT_ALL);
+ fiber->ip += 1;
+ break;
+
case RE_OPCODE_LITERAL:
- prolog;
if (flags & RE_FLAGS_NO_CASE)
match = lowercase[*input] == lowercase[*(ip + 1)];
else
match = (*input == *(ip + 1));
fiber->ip += 2;
- epilog;
- break;
-
- case RE_OPCODE_ANY:
- prolog;
- match = (*input != 0x0A || flags & RE_FLAGS_DOT_ALL);
- fiber->ip += 1;
- epilog;
break;
case RE_OPCODE_MASKED_LITERAL:
- prolog;
value = *(int16_t*)(ip + 1) & 0xFF;
mask = *(int16_t*)(ip + 1) >> 8;
@@ -1242,136 +1439,64 @@ int yr_re_exec(
match = ((*input & mask) == value);
fiber->ip += 3;
- epilog;
break;
case RE_OPCODE_CLASS:
- prolog;
if (flags & RE_FLAGS_NO_CASE)
match = CHAR_IN_CLASS(*input, ip + 1) ||
CHAR_IN_CLASS(altercase[*input], ip + 1);
else
match = CHAR_IN_CLASS(*input, ip + 1);
fiber->ip += 33;
- epilog;
break;
case RE_OPCODE_WORD_CHAR:
- prolog;
match = (isalnum(*input) || *input == '_');
fiber->ip += 1;
- epilog;
break;
case RE_OPCODE_NON_WORD_CHAR:
- prolog;
match = (!isalnum(*input) && *input != '_');
fiber->ip += 1;
- epilog;
break;
case RE_OPCODE_SPACE:
- prolog;
match = (*input == ' ' || *input == '\t');
fiber->ip += 1;
- epilog;
break;
case RE_OPCODE_NON_SPACE:
- prolog;
match = (*input != ' ' && *input != '\t');
fiber->ip += 1;
- epilog;
break;
case RE_OPCODE_DIGIT:
- prolog;
match = isdigit(*input);
fiber->ip += 1;
- epilog;
break;
case RE_OPCODE_NON_DIGIT:
- prolog;
match = !isdigit(*input);
fiber->ip += 1;
- epilog;
- break;
-
- case RE_OPCODE_SPLIT_A:
- new_fiber = _yr_re_fiber_split(fiber, &fibers, &storage->fiber_pool);
- new_fiber->ip += *(int16_t*)(ip + 1);
- fiber->ip += 3;
- break;
-
- case RE_OPCODE_SPLIT_B:
- new_fiber = _yr_re_fiber_split(fiber, &fibers, &storage->fiber_pool);
- new_fiber->ip += 3;
- fiber->ip += *(int16_t*)(ip + 1);
- break;
-
- case RE_OPCODE_JUMP:
- fiber->ip = ip + *(int16_t*)(ip + 1);
- break;
-
- case RE_OPCODE_JNZ:
- fiber->stack[fiber->sp]--;
- if (fiber->stack[fiber->sp] > 0)
- fiber->ip = ip + *(int16_t*)(ip + 1);
- else
- fiber->ip += 3;
- break;
-
- case RE_OPCODE_PUSH:
- fiber->stack[++fiber->sp] = *(uint16_t*)(ip + 1);
- fiber->ip += 3;
- break;
-
- case RE_OPCODE_POP:
- fiber->sp--;
- fiber->ip++;
- break;
-
- case RE_OPCODE_MATCH:
- case RE_OPCODE_MATCH_AT_START:
- case RE_OPCODE_MATCH_AT_END:
-
- if ((*ip == RE_OPCODE_MATCH_AT_START &&
- input_size - 1 > count - character_size) ||
- (*ip == RE_OPCODE_MATCH_AT_END &&
- input_size > count))
- {
- fiber = _yr_re_fiber_kill(fiber, &fibers, &storage->fiber_pool);
- break;
- }
-
- result = count;
-
- if (flags & RE_FLAGS_EXHAUSTIVE)
- {
- if (flags & RE_FLAGS_BACKWARDS)
- callback(input + character_size, count, flags, callback_args);
- else
- callback(input_data, count, flags, callback_args);
-
- fiber = _yr_re_fiber_kill(fiber, &fibers, &storage->fiber_pool);
- }
- else
- {
- _yr_re_fiber_kill_tail(fiber, &fibers, &storage->fiber_pool);
- fiber = NULL;
- }
-
break;
default:
assert(FALSE);
}
+
+ if (!match)
+ {
+ fiber = _yr_re_fiber_kill(&fibers, &storage->fiber_pool, fiber);
+ continue;
+ }
+
+ next_fiber = fiber->next;
+ _yr_re_fiber_sync(&fibers, &storage->fiber_pool, fiber);
+ fiber = next_fiber;
}
- if (fibers.head != NULL &&
- flags & RE_FLAGS_WIDE && *(input + 1) != 0)
- _yr_re_fiber_kill_tail(fibers.head, &fibers, &storage->fiber_pool);
+ if (flags & RE_FLAGS_WIDE && *(input + 1) != 0)
+ _yr_re_fiber_kill_all(&fibers, &storage->fiber_pool);
if (flags & RE_FLAGS_BACKWARDS)
input -= character_size;
@@ -1380,11 +1505,13 @@ int yr_re_exec(
count += character_size;
- if ((flags & RE_FLAGS_SCAN) && count < max_count)
+ if (flags & RE_FLAGS_SCAN && count < max_count)
{
fiber = _yr_re_fiber_create(&storage->fiber_pool);
fiber->ip = code;
- _yr_re_fiber_append(fiber, &fibers);
+
+ _yr_re_fiber_append(&fibers, fiber);
+ _yr_re_fiber_sync(&fibers, &storage->fiber_pool, fiber);
}
}
diff --git a/libyara/rules.c b/libyara/rules.c
index d240053..a9ed9f0 100644
--- a/libyara/rules.c
+++ b/libyara/rules.c
@@ -152,6 +152,8 @@ inline int _yr_scan_wicompare(
// characteristics of the code generated for this kind of strings and do the
// matching in a faster way.
//
+// See return values in yr_re_exec (re.c)
+//
#define MAX_FAST_HEX_RE_STACK 300
@@ -267,7 +269,7 @@ int _yr_scan_fast_hex_re_exec(
assert(sp < MAX_FAST_HEX_RE_STACK);
if (sp >= MAX_FAST_HEX_RE_STACK)
- return -2;
+ return -3;
code_stack[sp] = ip + 11;
input_stack[sp] = next_input;
@@ -685,11 +687,15 @@ int _yr_scan_verify_re_match(
NULL);
}
- if (forward_matches == -2)
- return ERROR_INTERNAL_FATAL_ERROR;
-
- if (forward_matches == -1)
- return ERROR_SUCCESS;
+ switch(forward_matches)
+ {
+ case -1:
+ return ERROR_SUCCESS;
+ case -2:
+ return ERROR_INSUFICIENT_MEMORY;
+ case -3:
+ return ERROR_INTERNAL_FATAL_ERROR;
+ }
if (forward_matches == 0 && ac_match->backward_code == NULL)
return ERROR_SUCCESS;
@@ -1025,13 +1031,13 @@ int yr_rules_scan_mem_block(
{
if (ac_match->backtrack <= i)
{
- _yr_scan_verify_match(
- ac_match,
- data,
- data_size,
- i - ac_match->backtrack,
- matches_arena,
- fast_scan_mode);
+ FAIL_ON_ERROR(_yr_scan_verify_match(
+ ac_match,
+ data,
+ data_size,
+ i - ac_match->backtrack,
+ matches_arena,
+ fast_scan_mode));
}
ac_match = ac_match->next;
@@ -1063,13 +1069,13 @@ int yr_rules_scan_mem_block(
{
if (ac_match->backtrack <= data_size)
{
- _yr_scan_verify_match(
+ FAIL_ON_ERROR(_yr_scan_verify_match(
ac_match,
data,
data_size,
data_size - ac_match->backtrack,
matches_arena,
- fast_scan_mode);
+ fast_scan_mode));
}
ac_match = ac_match->next;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/forensics/yara.git
More information about the forensics-changes
mailing list