[Forensics-changes] [yara] 270/415: Improve yr_re_execute with a non-recursive algorithm
Hilko Bengen
bengen at moszumanska.debian.org
Thu Apr 3 05:43:14 UTC 2014
This is an automated email from the git hooks/post-receive script.
bengen pushed a commit to branch debian
in repository yara.
commit c5b3b65a32a0a0917a39376e832ece2fa352c69f
Author: Victor Manuel Alvarez <vmalvarez at virustotal.com>
Date: Mon Dec 2 23:39:15 2013 +0100
Improve yr_re_execute with a non-recursive algorithm
---
libyara/re.c | 511 +++++++++++++++++++++++++++--------------------------------
1 file changed, 232 insertions(+), 279 deletions(-)
diff --git a/libyara/re.c b/libyara/re.c
index 5596c0f..0efeeb3 100644
--- a/libyara/re.c
+++ b/libyara/re.c
@@ -41,7 +41,6 @@ order to avoid confusion with operating system threads.
#include "re.h"
-#define MAX_RE_FIBERS 1024
#define MAX_RE_STACK 1024
#define RE_SCAN_LIMIT 65535
@@ -53,55 +52,30 @@ order to avoid confusion with operating system threads.
#define min(x, y) ((x < y) ? (x) : (y))
#endif
-// Each fiber has an associated stack, which is used by
-// PUSH, POP and JNZ
-typedef struct _RE_FIBER_DATA
+typedef struct _RE_FIBER
{
- int stack_top;
+ uint8_t* ip;
+ int32_t sp;
uint16_t stack[MAX_RE_STACK];
- struct _RE_FIBER_DATA* next;
- struct _RE_FIBER_DATA* prev;
-
-} RE_FIBER_DATA;
-
-
-// Stacks are allocated as needed, and freed stacks are kept in
-// a pool for later re-use.
-
-typedef struct _RE_FIBER_DATA_POOL
-{
- RE_FIBER_DATA* free;
- RE_FIBER_DATA* used;
-
-} RE_FIBER_DATA_POOL;
-
-
-// A fiber is described by its current instruction pointer and
-// its fiber_data.
-
-typedef struct _RE_FIBER
-{
- uint8_t* ip;
- RE_FIBER_DATA* fiber_data;
+ struct _RE_FIBER* prev;
+ struct _RE_FIBER* next;
} RE_FIBER;
typedef struct _RE_FIBER_LIST
{
- int count;
- RE_FIBER items[MAX_RE_FIBERS];
+ RE_FIBER* head;
+ RE_FIBER* tail;
} RE_FIBER_LIST;
typedef struct _RE_THREAD_STORAGE
{
- RE_FIBER_LIST list1;
- RE_FIBER_LIST list2;
- RE_FIBER_DATA_POOL fiber_data_pool;
+ RE_FIBER_LIST fiber_pool;
} RE_THREAD_STORAGE;
@@ -168,8 +142,8 @@ int yr_re_finalize()
int yr_re_finalize_thread()
{
- RE_FIBER_DATA* fiber_data;
- RE_FIBER_DATA* next_fiber_data;
+ RE_FIBER* fiber;
+ RE_FIBER* next_fiber;
RE_THREAD_STORAGE* storage;
#ifdef WIN32
@@ -180,13 +154,13 @@ int yr_re_finalize_thread()
if (storage != NULL)
{
- fiber_data = storage->fiber_data_pool.free;
+ fiber = storage->fiber_pool.head;
- while (fiber_data != NULL)
+ while (fiber != NULL)
{
- next_fiber_data = fiber_data->next;
- yr_free(fiber_data);
- fiber_data = next_fiber_data;
+ next_fiber = fiber->next;
+ yr_free(fiber);
+ fiber = next_fiber;
}
yr_free(storage);
@@ -426,12 +400,10 @@ int _yr_re_emit(
int split_size;
int inst_size;
int jmp_size;
- int jmp_offset;
RE_NODE* left;
RE_NODE* right;
- uint16_t idx;
int16_t* split_offset_addr;
int16_t* jmp_offset_addr;
uint8_t* instruction_addr;
@@ -851,188 +823,174 @@ int yr_re_emit_code(
}
-RE_FIBER_DATA* _yr_re_alloc_fiber_data(
- RE_FIBER_DATA_POOL* pool)
+RE_FIBER* _yr_re_fiber_create(
+ RE_FIBER_LIST* fiber_pool)
{
- RE_FIBER_DATA* fiber_data;
+ RE_FIBER* fiber;
- if (pool->free != NULL)
+ if (fiber_pool->head != NULL)
{
- fiber_data = pool->free;
- pool->free = fiber_data->next;
-
- if (pool->free != NULL)
- pool->free->prev = NULL;
+ fiber = fiber_pool->head;
+ fiber_pool->head = fiber->next;
+ if (fiber_pool->tail == fiber)
+ fiber_pool->tail = NULL;
}
else
{
- fiber_data = yr_malloc(sizeof(RE_FIBER_DATA));
+ fiber = yr_malloc(sizeof(RE_FIBER));
}
- fiber_data->stack_top = -1;
- fiber_data->prev = NULL;
-
- if (pool->used != NULL)
- pool->used->prev = fiber_data;
-
- fiber_data->next = pool->used;
- pool->used = fiber_data;
+ if (fiber != NULL)
+ {
+ fiber->ip = NULL;
+ fiber->sp = -1;
+ fiber->next = NULL;
+ fiber->prev = NULL;
+ }
- return fiber_data;
+ return fiber;
}
-
-RE_FIBER_DATA* _yr_re_clone_fiber_data(
- RE_FIBER_DATA* fiber_data,
- RE_FIBER_DATA_POOL* pool)
+void _yr_re_fiber_append(
+ RE_FIBER* fiber,
+ RE_FIBER_LIST* fiber_list)
{
- RE_FIBER_DATA* clon;
- int i;
+ fiber->prev = fiber_list->tail;
- if (fiber_data == NULL)
- return NULL;
+ if (fiber_list->tail != NULL)
+ fiber_list->tail->next = fiber;
- clon = _yr_re_alloc_fiber_data(pool);
- clon->stack_top = fiber_data->stack_top;
+ fiber_list->tail = fiber;
- for (i = 0; i < clon->stack_top; i++)
- clon->stack[i] = fiber_data->stack[i];
+ if (fiber_list->head == NULL)
+ fiber_list->head = fiber;
- return clon;
+ assert(fiber_list->tail->next == NULL);
+ assert(fiber_list->head->prev == NULL);
}
-void _yr_re_free_fiber_data(
- RE_FIBER_DATA* fiber_data,
- RE_FIBER_DATA_POOL* pool)
+RE_FIBER* _yr_re_fiber_split(
+ RE_FIBER* fiber,
+ RE_FIBER_LIST* fiber_list,
+ RE_FIBER_LIST* fiber_pool)
{
- if (fiber_data == NULL)
- return;
+ RE_FIBER* new_fiber;
+ int32_t i;
- if (pool->used == fiber_data)
- pool->used = fiber_data->next;
+ new_fiber = _yr_re_fiber_create(fiber_pool);
- if (fiber_data->prev != NULL)
- fiber_data->prev->next = fiber_data->next;
+ if (new_fiber == NULL)
+ return NULL;
- if (fiber_data->next != NULL)
- fiber_data->next->prev = fiber_data->prev;
+ new_fiber->sp = fiber->sp;
+ new_fiber->ip = fiber->ip;
- fiber_data->next = pool->free;
+ for (i = 0; i < fiber->sp; i++)
+ new_fiber->stack[i] = fiber->stack[i];
- if (pool->free != NULL)
- pool->free->prev = fiber_data;
+ new_fiber->next = fiber->next;
+ new_fiber->prev = fiber;
- pool->free = fiber_data;
- fiber_data->prev = NULL;
-}
+ if (fiber->next != NULL)
+ fiber->next->prev = new_fiber;
+ fiber->next = new_fiber;
-int _yr_re_fiber_exists(
- RE_FIBER_LIST* fibers,
- uint8_t* ip)
-{
- int i;
+ if (fiber_list->tail == fiber)
+ fiber_list->tail = new_fiber;
- for (i = 0; i < fibers->count; i++)
- if (fibers->items[i].ip == ip)
- return TRUE;
+ assert(fiber_list->tail->next == NULL);
+ assert(fiber_list->head->prev == NULL);
- return FALSE;
+ return new_fiber;
}
-void _yr_re_add_fiber(
- RE_FIBER_LIST* fibers,
- RE_THREAD_STORAGE* storage,
- uint8_t* ip,
- uint8_t* input,
- RE_FIBER_DATA* fiber_data)
+//
+// _yr_re_fiber_kill
+//
+// Kills a given fiber by removing it from the fiber list and putting it
+// in the fiber pool.
+//
+
+RE_FIBER* _yr_re_fiber_kill(
+ RE_FIBER* fiber,
+ RE_FIBER_LIST* fiber_list,
+ RE_FIBER_LIST* fiber_pool)
{
- RE_FIBER_DATA* new_fiber_data;
+ RE_FIBER* next_fiber = fiber->next;
- uint16_t counter_index;
- int16_t jmp_offset;
+ if (fiber->prev != NULL)
+ fiber->prev->next = next_fiber;
- if (_yr_re_fiber_exists(fibers, ip))
- {
- _yr_re_free_fiber_data(fiber_data, &storage->fiber_data_pool);
- return;
- }
+ if (next_fiber != NULL)
+ next_fiber->prev = fiber->prev;
- switch(*ip)
- {
- case RE_OPCODE_JUMP:
- jmp_offset = *(int16_t*)(ip + 1);
- _yr_re_add_fiber(fibers, storage, ip + jmp_offset, input, fiber_data);
- break;
+ if (fiber_pool->tail != NULL)
+ fiber_pool->tail->next = fiber;
- case RE_OPCODE_JNZ:
- jmp_offset = *(int16_t*)(ip + 1);
- fiber_data->stack[fiber_data->stack_top]--;
+ if (fiber_list->tail == fiber)
+ fiber_list->tail = fiber->prev;
- if (fiber_data->stack[fiber_data->stack_top] > 0)
- _yr_re_add_fiber(fibers, storage, ip + jmp_offset, input, fiber_data);
- else
- _yr_re_add_fiber(fibers, storage, ip + 3, input, fiber_data);
- break;
+ if (fiber_list->head == fiber)
+ fiber_list->head = next_fiber;
- case RE_OPCODE_PUSH:
- if (fiber_data == NULL)
- fiber_data = _yr_re_alloc_fiber_data(&storage->fiber_data_pool);
+ fiber->next = NULL;
+ fiber->prev = fiber_pool->tail;
+ fiber_pool->tail = fiber;
- fiber_data->stack[++fiber_data->stack_top] = *(uint16_t*)(ip + 1);
- _yr_re_add_fiber(fibers, storage, ip + 3, input, fiber_data);
- break;
+ if (fiber_pool->head == NULL)
+ fiber_pool->head = fiber;
- case RE_OPCODE_POP:
- fiber_data->stack_top--;
- if (fiber_data->stack_top == -1)
- {
- _yr_re_free_fiber_data(fiber_data, &storage->fiber_data_pool);
- fiber_data = NULL;
- }
- _yr_re_add_fiber(fibers, storage, ip + 1, input, fiber_data);
- break;
+ return next_fiber;
+}
- case RE_OPCODE_SPLIT_A:
- jmp_offset = *(int16_t*)(ip + 1);
- new_fiber_data = _yr_re_clone_fiber_data(
- fiber_data, &storage->fiber_data_pool);
+//
+// _yr_re_fiber_kill_tail
+//
+// Kills all fibers from the given one up to the end of the fiber list.
+//
- _yr_re_add_fiber(
- fibers, storage, ip + 3, input, fiber_data);
- _yr_re_add_fiber(
- fibers, storage, ip + jmp_offset, input, new_fiber_data);
- break;
+void _yr_re_fiber_kill_tail(
+ RE_FIBER* fiber,
+ RE_FIBER_LIST* fiber_list,
+ RE_FIBER_LIST* fiber_pool)
+{
+ RE_FIBER* prev_fiber = fiber->prev;
- case RE_OPCODE_SPLIT_B:
- jmp_offset = *(int16_t*)(ip + 1);
+ if (prev_fiber != NULL)
+ prev_fiber->next = NULL;
- new_fiber_data = _yr_re_clone_fiber_data(
- fiber_data, &storage->fiber_data_pool);
+ fiber->prev = fiber_pool->tail;
- _yr_re_add_fiber(fibers, storage, ip + jmp_offset, input, fiber_data);
- _yr_re_add_fiber(fibers, storage, ip + 3, input, new_fiber_data);
- break;
+ if (fiber_pool->tail != NULL)
+ fiber_pool->tail->next = fiber;
- default:
- assert(fibers->count < MAX_RE_FIBERS);
- fibers->items[fibers->count].ip = ip;
- fibers->items[fibers->count].fiber_data = fiber_data;
- fibers->count++;
- }
+ fiber_pool->tail = fiber_list->tail;
+ fiber_list->tail = prev_fiber;
+
+ if (fiber_list->head == fiber)
+ fiber_list->head = NULL;
+
+ if (fiber_pool->head == NULL)
+ fiber_pool->head = fiber;
}
-#define swap_fibers(x, y) \
- { \
- RE_FIBER_LIST* tmp; \
- tmp = x; \
- x = y; \
- y = tmp; \
- }
+#define prolog \
+ if (count >= max_count) \
+ { \
+ fiber = _yr_re_fiber_kill(fiber, &fibers, &storage->fiber_pool); \
+ break; \
+ } \
+
+#define epilog \
+ if (match) \
+ fiber = fiber->next; \
+ else\
+ fiber = _yr_re_fiber_kill(fiber, &fibers, &storage->fiber_pool) \
//
// yr_re_exec
@@ -1060,21 +1018,19 @@ int yr_re_exec(
RE_MATCH_CALLBACK_FUNC callback,
void* callback_args)
{
- size_t i;
uint8_t* ip;
uint8_t* input;
uint8_t mask;
uint8_t value;
+ RE_FIBER_LIST fibers;
RE_THREAD_STORAGE* storage;
- RE_FIBER_LIST* fibers;
- RE_FIBER_LIST* next_fibers;
- RE_FIBER_DATA* fiber_data;
+ RE_FIBER* fiber;
+ RE_FIBER* new_fiber;
- int fiber_idx;
- int j;
+ int count;
+ int max_count;
int match;
- char character;
int character_size;
int result = -1;
@@ -1091,8 +1047,8 @@ int yr_re_exec(
if (storage == NULL)
return ERROR_INSUFICIENT_MEMORY;
- storage->fiber_data_pool.free = NULL;
- storage->fiber_data_pool.used = NULL;
+ storage->fiber_pool.head = NULL;
+ storage->fiber_pool.tail = NULL;
#ifdef WIN32
TlsSetValue(thread_storage_key, storage);
@@ -1101,49 +1057,50 @@ int yr_re_exec(
#endif
}
- input = input_data;
- fibers = &storage->list1;
- next_fibers = &storage->list2;
-
if (flags & RE_FLAGS_WIDE)
character_size = 2;
else
character_size = 1;
- fibers->count = 0;
- next_fibers->count = 0;
+ fiber = _yr_re_fiber_create(&storage->fiber_pool);
+ fiber->ip = code;
- // Create the initial execution fiber starting at the provided the beginning
- // of the provided code. The fiber data is initially NULL and will be created
- // dynamically when the first PUSH instruction is found.
+ fibers.head = fiber;
+ fibers.tail = fiber;
- _yr_re_add_fiber(fibers, storage, code, input, NULL);
+ input = input_data;
+ count = 0;
+ max_count = min(input_size, RE_SCAN_LIMIT);
- for (i = 0; i < min(input_size, RE_SCAN_LIMIT); i += character_size)
+ while (fibers.head != NULL)
{
- if ((flags & RE_FLAGS_SCAN) &&
- !(flags & RE_FLAGS_START_ANCHORED))
- _yr_re_add_fiber(fibers, storage, code, input, NULL);
-
- if (fibers->count == 0)
- break;
+ fiber = fibers.head;
- for(fiber_idx = 0; fiber_idx < fibers->count; fiber_idx++)
+ while(fiber != NULL)
{
- ip = fibers->items[fiber_idx].ip;
- fiber_data = fibers->items[fiber_idx].fiber_data;
+ ip = fiber->ip;
switch(*ip)
{
case RE_OPCODE_LITERAL:
+ prolog;
if (flags & RE_FLAGS_NO_CASE)
match = lowercase[*input] == lowercase[*(ip + 1)];
else
match = (*input == *(ip + 1));
- ip += 2;
+ fiber->ip += 2;
+ epilog;
+ break;
+
+ case RE_OPCODE_ANY:
+ prolog;
+ match = (*input != 0x0A || flags & RE_FLAGS_DOT_ALL);
+ fiber->ip += 1;
+ epilog;
break;
case RE_OPCODE_MASKED_LITERAL:
+ prolog;
value = *(int16_t*)(ip + 1) & 0xFF;
mask = *(int16_t*)(ip + 1) >> 8;
@@ -1152,76 +1109,120 @@ int yr_re_exec(
// which can't be case-insensitive.
match = ((*input & mask) == value);
- ip += 3;
+ fiber->ip += 3;
+ epilog;
break;
case RE_OPCODE_CLASS:
+ prolog;
if (flags & RE_FLAGS_NO_CASE)
match = CHAR_IN_CLASS(*input, ip + 1) ||
CHAR_IN_CLASS(altercase[*input], ip + 1);
else
match = CHAR_IN_CLASS(*input, ip + 1);
- ip += 33;
+ fiber->ip += 33;
+ epilog;
break;
case RE_OPCODE_WORD_CHAR:
+ prolog;
match = (isalnum(*input) || *input == '_');
- ip += 1;
+ fiber->ip += 1;
+ epilog;
break;
case RE_OPCODE_NON_WORD_CHAR:
+ prolog;
match = (!isalnum(*input) && *input != '_');
- ip += 1;
+ fiber->ip += 1;
+ epilog;
break;
case RE_OPCODE_SPACE:
+ prolog;
match = (*input == ' ' || *input == '\t');
- ip += 1;
+ fiber->ip += 1;
+ epilog;
break;
case RE_OPCODE_NON_SPACE:
+ prolog;
match = (*input != ' ' && *input != '\t');
- ip += 1;
+ fiber->ip += 1;
+ epilog;
break;
case RE_OPCODE_DIGIT:
+ prolog;
match = isdigit(*input);
- ip += 1;
+ fiber->ip += 1;
+ epilog;
break;
case RE_OPCODE_NON_DIGIT:
+ prolog;
match = !isdigit(*input);
- ip += 1;
+ fiber->ip += 1;
+ epilog;
break;
- case RE_OPCODE_ANY:
- match = (*input != 0x0A || flags & RE_FLAGS_DOT_ALL);
- ip += 1;
+ case RE_OPCODE_SPLIT_A:
+ new_fiber = _yr_re_fiber_split(fiber, &fibers, &storage->fiber_pool);
+ new_fiber->ip += *(int16_t*)(ip + 1);
+ fiber->ip += 3;
break;
- case RE_OPCODE_MATCH:
+ case RE_OPCODE_SPLIT_B:
+ new_fiber = _yr_re_fiber_split(fiber, &fibers, &storage->fiber_pool);
+ new_fiber->ip += 3;
+ fiber->ip += *(int16_t*)(ip + 1);
+ break;
+
+ case RE_OPCODE_JUMP:
+ fiber->ip = ip + *(int16_t*)(ip + 1);
+ break;
- match = FALSE;
+ case RE_OPCODE_JNZ:
+ fiber->stack[fiber->sp]--;
+ if (fiber->stack[fiber->sp] > 0)
+ fiber->ip = ip + *(int16_t*)(ip + 1);
+ else
+ fiber->ip += 3;
+ break;
- if (flags & RE_FLAGS_END_ANCHORED && i < input_size)
+ case RE_OPCODE_PUSH:
+ fiber->stack[++fiber->sp] = *(uint16_t*)(ip + 1);
+ fiber->ip += 3;
+ break;
+
+ case RE_OPCODE_POP:
+ fiber->sp--;
+ fiber->ip++;
+ break;
+
+ case RE_OPCODE_MATCH:
+
+ if (flags & RE_FLAGS_END_ANCHORED && count < input_size)
+ {
+ fiber = _yr_re_fiber_kill(fiber, &fibers, &storage->fiber_pool);
break;
+ }
- result = i;
+ result = count;
if (flags & RE_FLAGS_EXHAUSTIVE)
{
if (flags & RE_FLAGS_BACKWARDS)
- callback(input + character_size, i, flags, callback_args);
+ callback(input + character_size, count, flags, callback_args);
else
- callback(input_data, i, flags, callback_args);
+ callback(input_data, count, flags, callback_args);
+
+ fiber = _yr_re_fiber_kill(fiber, &fibers, &storage->fiber_pool);
}
else
{
- // As we are forcing a jump out of the loop fiber_idx
- // won't be incremented. Let's do it before exiting.
-
- fiber_idx++;
- goto _exit_loop;
+ _yr_re_fiber_kill_tail(fiber, &fibers, &storage->fiber_pool);
+ fiber = NULL;
}
break;
@@ -1229,33 +1230,8 @@ int yr_re_exec(
default:
assert(FALSE);
}
-
- if (match)
- _yr_re_add_fiber(
- next_fibers,
- storage,
- ip,
- input + character_size,
- fiber_data);
- else
- _yr_re_free_fiber_data(
- fiber_data,
- &storage->fiber_data_pool);
}
- _exit_loop:
-
- // Free the fiber data for any remaining fiber that didn't
- // survived for the next step.
-
- for(; fiber_idx < fibers->count; fiber_idx++)
- _yr_re_free_fiber_data(
- fibers->items[fiber_idx].fiber_data,
- &storage->fiber_data_pool);
-
- swap_fibers(fibers, next_fibers);
- next_fibers->count = 0;
-
if (flags & RE_FLAGS_WIDE && *(input + 1) != 0)
break;
@@ -1264,40 +1240,17 @@ int yr_re_exec(
else
input += character_size;
- } //for (i = 0; i < min(input_size, RE_SCAN_LIMIT) ...
+ count++;
- if (!(flags & RE_FLAGS_END_ANCHORED) || i == input_size)
- {
- for(fiber_idx = 0; fiber_idx < fibers->count; fiber_idx++)
+ if ((flags & RE_FLAGS_SCAN) && !(flags & RE_FLAGS_START_ANCHORED) &&
+ count < max_count)
{
- if (*fibers->items[fiber_idx].ip != RE_OPCODE_MATCH)
- continue;
-
- if (flags & RE_FLAGS_EXHAUSTIVE)
- {
- if (flags & RE_FLAGS_BACKWARDS)
- callback(
- input + character_size, i, flags, callback_args);
- else
- callback(
- input_data, i, flags, callback_args);
- }
- else
- {
- result = i;
- break;
- }
+ fiber = _yr_re_fiber_create(&storage->fiber_pool);
+ fiber->ip = code;
+ _yr_re_fiber_append(fiber, &fibers);
}
}
- for(fiber_idx = 0; fiber_idx < fibers->count; fiber_idx++)
- _yr_re_free_fiber_data(
- fibers->items[fiber_idx].fiber_data,
- &storage->fiber_data_pool);
-
- // Ensure that every fiber data was released
- assert(storage->fiber_data_pool.used == NULL);
-
return result;
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/forensics/yara.git
More information about the forensics-changes
mailing list