[Forensics-changes] [yara] 277/368: Implement Aho-Corasick automaton with interleaved arrays.
Hilko Bengen
bengen at moszumanska.debian.org
Sat Jul 1 10:30:47 UTC 2017
This is an automated email from the git hooks/post-receive script.
bengen pushed a commit to annotated tag v3.5.0
in repository yara.
commit 0db16d3639140c0b6a7a6d0de06e5e2622e90c04
Author: plusvic <plusvic at gmail.com>
Date: Mon May 16 12:21:42 2016 +0200
Implement Aho-Corasick automaton with interleaved arrays.
This improves both the performance and the memory footprint.
---
libyara/ahocorasick.c | 746 +++++++++++++++++++++++--------------
libyara/arena.c | 18 +-
libyara/atoms.c | 16 +
libyara/compiler.c | 60 ++-
libyara/exception.h | 1 +
libyara/include/yara/ahocorasick.h | 43 ++-
libyara/include/yara/compiler.h | 3 +-
libyara/include/yara/types.h | 260 +++++++------
libyara/parser.c | 45 +--
libyara/rules.c | 75 ++--
tests/test-alignment.c | 4 +-
11 files changed, 754 insertions(+), 517 deletions(-)
diff --git a/libyara/ahocorasick.c b/libyara/ahocorasick.c
index a9017b6..42e0214 100644
--- a/libyara/ahocorasick.c
+++ b/libyara/ahocorasick.c
@@ -25,8 +25,6 @@ limitations under the License.
#include <yara/mem.h>
-#define MAX_TABLE_BASED_STATES_DEPTH 1
-
typedef struct _QUEUE_NODE
{
@@ -140,85 +138,8 @@ int _yr_ac_queue_is_empty(
}
-YR_AC_STATE* _yr_ac_next_transition(
- YR_AC_STATE* state,
- YR_AC_STATE_TRANSITION* transition)
-{
- int i;
- YR_AC_TABLE_BASED_STATE* table_based_state;
-
- if (state->depth <= MAX_TABLE_BASED_STATES_DEPTH)
- {
- table_based_state = (YR_AC_TABLE_BASED_STATE*) state;
-
- for (i = transition->input + 1; i < 256; i++)
- {
- if (table_based_state->transitions[i].state != NULL)
- {
- transition->state = table_based_state->transitions[i].state;
- transition->input = i;
- transition->next = NULL;
- return transition->state;
- }
- }
- }
- else
- {
- if (transition->next != NULL)
- {
- transition->state = transition->next->state;
- transition->input = transition->next->input;
- transition->next = transition->next->next;
- return transition->state;
- }
- }
-
- return NULL;
-}
-
-
-YR_AC_STATE* _yr_ac_first_transition(
- YR_AC_STATE* state,
- YR_AC_STATE_TRANSITION* transition)
-{
- int i;
-
- YR_AC_LIST_BASED_STATE* list_based_state;
- YR_AC_TABLE_BASED_STATE* table_based_state;
-
- if (state->depth <= MAX_TABLE_BASED_STATES_DEPTH)
- {
- table_based_state = (YR_AC_TABLE_BASED_STATE*) state;
-
- for (i = 0; i < 256; i++)
- {
- if (table_based_state->transitions[i].state != NULL)
- {
- transition->state = table_based_state->transitions[i].state;
- transition->input = i;
- transition->next = NULL;
- return transition->state;
- }
- }
- }
- else
- {
- list_based_state = (YR_AC_LIST_BASED_STATE*) state;
-
- if (list_based_state->transitions != NULL)
- {
- transition->state = list_based_state->transitions->state;
- transition->input = list_based_state->transitions->input;
- transition->next = list_based_state->transitions->next;
- return transition->state;
- }
- }
-
- return NULL;
-}
-
//
-// yr_ac_next_state
+// _yr_ac_next_state
//
// Given an automaton state and an input symbol, returns the new state
// after reading the input symbol.
@@ -231,138 +152,92 @@ YR_AC_STATE* _yr_ac_first_transition(
// Pointer to the next automaton state.
//
-YR_AC_STATE* yr_ac_next_state(
+YR_AC_STATE* _yr_ac_next_state(
YR_AC_STATE* state,
uint8_t input)
{
- YR_AC_STATE_TRANSITION* transition;
+ YR_AC_STATE* next_state = state->first_child;
- if (state->depth <= MAX_TABLE_BASED_STATES_DEPTH)
- {
- return ((YR_AC_TABLE_BASED_STATE*) state)->transitions[input].state;
- }
- else
+ while (next_state != NULL)
{
- transition = ((YR_AC_LIST_BASED_STATE*) state)->transitions;
-
- while (transition != NULL)
- {
- if (transition->input == input)
- return transition->state;
+ if (next_state->input == input)
+ return next_state;
- transition = transition->next;
- }
-
- return NULL;
+ next_state = next_state->siblings;
}
+
+ return NULL;
}
//
-// _yr_ac_create_state
+// _yr_ac_state_create
//
// Creates a new automaton state, the automaton will transition from
// the given state to the new state after reading the input symbol.
//
// Args:
-// YR_ARENA* arena - Automaton's arena
// YR_AC_STATE* state - Origin state
-// uint8_t input - Input symbol
+// uint8_t input - Input symbol
//
// Returns:
// YR_AC_STATE* pointer to the newly allocated state or NULL in case
// of error.
-YR_AC_STATE* _yr_ac_create_state(
- YR_ARENA* arena,
+YR_AC_STATE* _yr_ac_state_create(
YR_AC_STATE* state,
uint8_t input)
{
- int result;
- YR_AC_STATE* new_state;
- YR_AC_LIST_BASED_STATE* list_based_state;
- YR_AC_TABLE_BASED_STATE* table_based_state;
- YR_AC_STATE_TRANSITION* new_transition;
-
- if (state->depth < MAX_TABLE_BASED_STATES_DEPTH)
- {
- result = yr_arena_allocate_struct(
- arena,
- sizeof(YR_AC_TABLE_BASED_STATE),
- (void**) &new_state,
- offsetof(YR_AC_TABLE_BASED_STATE, failure),
- offsetof(YR_AC_TABLE_BASED_STATE, matches),
- EOL);
- }
- else
- {
- result = yr_arena_allocate_struct(
- arena,
- sizeof(YR_AC_LIST_BASED_STATE),
- (void**) &new_state,
- offsetof(YR_AC_LIST_BASED_STATE, failure),
- offsetof(YR_AC_LIST_BASED_STATE, matches),
- offsetof(YR_AC_LIST_BASED_STATE, transitions),
- EOL);
- }
+ YR_AC_STATE* new_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE));
- if (result != ERROR_SUCCESS)
+ if (new_state == NULL)
return NULL;
- if (state->depth <= MAX_TABLE_BASED_STATES_DEPTH)
- {
- result = yr_arena_make_relocatable(
- arena,
- state,
- offsetof(YR_AC_TABLE_BASED_STATE, transitions[input]),
- EOL);
+ new_state->input = input;
+ new_state->depth = state->depth + 1;
+ new_state->matches = NULL;
+ new_state->failure = NULL;
+ new_state->t_table_slot = 0;
+ new_state->first_child = NULL;
+ new_state->siblings = state->first_child;
+ state->first_child = new_state;
- if (result != ERROR_SUCCESS)
- return NULL;
+ return new_state;
+}
- table_based_state = (YR_AC_TABLE_BASED_STATE*) state;
- table_based_state->transitions[input].state = new_state;
- }
- else
- {
- result = yr_arena_allocate_struct(
- arena,
- sizeof(YR_AC_STATE_TRANSITION),
- (void**) &new_transition,
- offsetof(YR_AC_STATE_TRANSITION, state),
- offsetof(YR_AC_STATE_TRANSITION, next),
- EOL);
- if (result != ERROR_SUCCESS)
- return NULL;
+//
+// _yr_ac_state_destroy
+//
- list_based_state = (YR_AC_LIST_BASED_STATE*) state;
+int _yr_ac_state_destroy(
+ YR_AC_STATE* state)
+{
+ YR_AC_STATE* child_state = state->first_child;
- new_transition->input = input;
- new_transition->state = new_state;
- new_transition->next = list_based_state->transitions;
- list_based_state->transitions = new_transition;
+ while (child_state != NULL)
+ {
+ YR_AC_STATE* next_child_state = child_state->siblings;
+ _yr_ac_state_destroy(child_state);
+ child_state = next_child_state;
}
- new_state->depth = state->depth + 1;
+ yr_free(state);
- return new_state;
+ return ERROR_SUCCESS;
}
//
-// yr_ac_create_failure_links
+// _yr_ac_create_failure_links
//
// Create failure links for each automaton state. This function must
// be called after all the strings have been added to the automaton.
//
-int yr_ac_create_failure_links(
- YR_ARENA* arena,
+int _yr_ac_create_failure_links(
YR_AC_AUTOMATON* automaton)
{
- YR_AC_STATE_TRANSITION transition;
-
YR_AC_STATE* current_state;
YR_AC_STATE* failure_state;
YR_AC_STATE* temp_state;
@@ -383,13 +258,13 @@ int yr_ac_create_failure_links(
// Push root's children and set their failure link to root.
- state = _yr_ac_first_transition(root_state, &transition);
+ state = root_state->first_child;
while (state != NULL)
{
FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
state->failure = root_state;
- state = _yr_ac_next_transition(root_state, &transition);
+ state = state->siblings;
}
// Traverse the trie in BFS order calculating the failure link
@@ -414,9 +289,7 @@ int yr_ac_create_failure_links(
current_state->matches = root_state->matches;
}
- transition_state = _yr_ac_first_transition(
- current_state,
- &transition);
+ transition_state = current_state->first_child;
while (transition_state != NULL)
{
@@ -425,9 +298,8 @@ int yr_ac_create_failure_links(
while (1)
{
- temp_state = yr_ac_next_state(
- failure_state,
- transition.input);
+ temp_state = _yr_ac_next_state(
+ failure_state, transition_state->input);
if (temp_state != NULL)
{
@@ -463,9 +335,7 @@ int yr_ac_create_failure_links(
}
} // while(1)
- transition_state = _yr_ac_next_transition(
- current_state,
- &transition);
+ transition_state = transition_state->siblings;
}
} // while(!__yr_ac_queue_is_empty(&queue))
@@ -488,47 +358,50 @@ int _yr_ac_transitions_subset(
{
uint8_t set[32];
- memset(set, 0, 32);
+ YR_AC_STATE* state = s1->first_child;
- YR_AC_STATE_TRANSITION transition;
- YR_AC_STATE* state = _yr_ac_first_transition(s1, &transition);
+ memset(set, 0, 32);
while (state != NULL)
{
- set[transition.input / 8] |= 1 << transition.input % 8;
- state = _yr_ac_next_transition(s1, &transition);
+ set[state->input / 8] |= 1 << state->input % 8;
+ state = state->siblings;
}
- state = _yr_ac_first_transition(s2, &transition);
+ state = s2->first_child;
while (state != NULL)
{
- if (!(set[transition.input / 8] & 1 << transition.input % 8))
+ if (!(set[state->input / 8] & 1 << state->input % 8))
return FALSE;
- state = _yr_ac_next_transition(s2, &transition);
+ state = state->siblings;
}
return TRUE;
}
-int yr_ac_optimize_failure_links(
+//
+// _yr_ac_optimize_failure_links
+//
+// Removes unnecessary failure links.
+//
+
+int _yr_ac_optimize_failure_links(
YR_AC_AUTOMATON* automaton)
{
- YR_AC_STATE_TRANSITION transition;
-
QUEUE queue = { NULL, NULL};
// Push root's children.
YR_AC_STATE* root_state = automaton->root;
- YR_AC_STATE* state = _yr_ac_first_transition(root_state, &transition);
+ YR_AC_STATE* state = root_state->first_child;
while (state != NULL)
{
FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
- state = _yr_ac_next_transition(root_state, &transition);
+ state = state->siblings;
}
while (!_yr_ac_queue_is_empty(&queue))
@@ -538,24 +411,16 @@ int yr_ac_optimize_failure_links(
if (current_state->failure != root_state)
{
if (_yr_ac_transitions_subset(current_state, current_state->failure))
- {
current_state->failure = current_state->failure->failure;
- }
}
// Push childrens of current_state
+ state = current_state->first_child;
- YR_AC_STATE* transition_state = _yr_ac_first_transition(
- current_state,
- &transition);
-
- while (transition_state != NULL)
+ while (state != NULL)
{
- FAIL_ON_ERROR(_yr_ac_queue_push(&queue, transition_state));
-
- transition_state = _yr_ac_next_transition(
- current_state,
- &transition);
+ FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
+ state = state->siblings;
}
}
@@ -564,114 +429,225 @@ int yr_ac_optimize_failure_links(
//
-// yr_ac_create_automaton
+// _yr_ac_find_suitable_transition_table_slot
//
-// Creates a new automaton
+// Find a place within the transition table where the transitions for the given
+// state can be put. The function first searches for an unused slot to put the
+// failure link, then it checks if the slots corresponding to the state
+// transitions (wich are at offsets 1 to 256 relative to the failure link ) are
+// available too.
//
-int yr_ac_create_automaton(
- YR_ARENA* arena,
- YR_AC_AUTOMATON** automaton)
+int _yr_ac_find_suitable_transition_table_slot(
+ YR_AC_AUTOMATON* automaton,
+ YR_AC_STATE* state,
+ uint32_t* slot)
{
- int result;
- YR_AC_STATE* root_state;
+ YR_AC_STATE* child_state;
- result = yr_arena_allocate_struct(
- arena,
- sizeof(YR_AC_AUTOMATON),
- (void**) automaton,
- offsetof(YR_AC_AUTOMATON, root),
- EOL);
+ uint32_t i = automaton->t_table_unused_candidate;
- if (result != ERROR_SUCCESS)
- return result;
+ int first_unused = TRUE;
+ int found = FALSE;
- result = yr_arena_allocate_struct(
- arena,
- sizeof(YR_AC_TABLE_BASED_STATE),
- (void**) &root_state,
- offsetof(YR_AC_TABLE_BASED_STATE, failure),
- offsetof(YR_AC_TABLE_BASED_STATE, matches),
- EOL);
+ while (!found)
+ {
+ // Check if there is enough room in the table to hold 257 items
+ // (1 failure link + 256 transitions) starting at offset i. If there's
+ // no room double the table size.
- if (result != ERROR_SUCCESS)
- return result;
+ if (automaton->tables_size - i < 257)
+ {
+ size_t t_bytes_size = automaton->tables_size * sizeof(YR_AC_TRANSITION);
+ size_t m_bytes_size = automaton->tables_size * sizeof(YR_AC_MATCH*);
- (*automaton)->root = root_state;
+ automaton->t_table = yr_realloc(automaton->t_table, t_bytes_size * 2);
+ automaton->m_table = yr_realloc(automaton->m_table, m_bytes_size * 2);
- root_state->depth = 0;
- root_state->matches = NULL;
+ if (automaton->t_table == NULL || automaton->m_table == NULL)
+ return ERROR_INSUFICIENT_MEMORY;
- return result;
+ memset((uint8_t*) automaton->t_table + t_bytes_size, 0, t_bytes_size);
+ memset((uint8_t*) automaton->m_table + m_bytes_size, 0, m_bytes_size);
+
+ automaton->tables_size *= 2;
+ }
+
+ if (YR_AC_UNUSED_TRANSITION_SLOT(automaton->t_table[i]))
+ {
+ // A unused slot in the table has been found and could be a potential
+ // candidate. Let's check if table slots for the transitions are
+ // unused too.
+
+ found = TRUE;
+ child_state = state->first_child;
+
+ while (child_state != NULL)
+ {
+ if (YR_AC_USED_TRANSITION_SLOT(
+ automaton->t_table[child_state->input + i + 1]))
+ {
+ found = FALSE;
+ break;
+ }
+
+ child_state = child_state->siblings;
+ }
+
+ // If this is the first unused entry we found, use it as the first
+ // candidate in the next call to this function.
+
+ if (first_unused)
+ {
+ automaton->t_table_unused_candidate = found ? i + 1 : i;
+ first_unused = FALSE;
+ }
+
+ if (found)
+ *slot = i;
+ }
+
+ i++;
+ }
+
+ return ERROR_SUCCESS;
}
+//
+// _yr_ac_build_transition_table
+//
+// Builds the transition table for the automaton. The transition table (T) is a
+// large array of 64-bits integers. Each state in the automaton is represented
+// by an offset S within the array. The integer stored in T[S] is the failure
+// link for state S, it contains the offset for the next state when no valid
+// transition exists for the next input byte.
+//
+// At position T[S+1+B] (where B is a byte) we can find the transition (if any)
+// that must be followed from state S if the next input is B. The value in
+// T[S+1+B] contains the offset for next state or 0. The 0 means that no
+// valid transition exists from state S when next input is B, and the failure
+// link must be used instead.
+//
+// The transition table for state S starts at T[S] and spans the next 257
+// slots in the array (1 for the failure link and 256 for all the posible
+// transitions). But many of those slots are for invalid transitions, so
+// the transitions for multiple states can be interleaved as long as they don't
+// collide. For example, instead of having this transition table with state S1
+// and S2 separated by a large number of slots:
+//
+// S1 S2
+// +------+------+------+------+-- ~ --+------+------+------+-- ~ --+
+// | FLS1 | X | - | - | - | Y | FLS2 | Z | - |
+// +------+------+------+------+-- ~ --+------+------+------+-- ~ --+
+//
+// We can interleave the transitions for states S1 and S2 and get this other
+// transition table, which is more compact:
+//
+// S1 S2
+// +------+------+------+------+-- ~ --+------+
+// | FLS1 | X | FLS2 | Z | - | Y |
+// +------+------+------+------+-- ~ --+------+
+//
+// And how do we know that transition Z belongs to state S2 and not S1? Or that
+// transition Y belongs to S1 and not S2? Because each slot of the array not
+// only contains the offset for the state where the transition points to, it
+// also contains the offset of the transition relative to its owner state. So,
+// the value for the owner offset would be 1 for transitions X, because X
+// belongs to state S1 and it's located 1 position away from S1. The same occurs
+// for Z, it belongs to S2 and it's located one position away from S2 so its
+// owner offset is 1. If we are in S1 and next byte is 2, we are going to read
+// the transition at T[S1+1+2] which is Z. But we know that transition Z is not
+// a valid transition for state S1 because the owner offset for Z is 1 not 3.
+//
+// A more detailed description can be found in: http://goo.gl/lE6zG
-int yr_ac_add_string(
- YR_ARENA* arena,
- YR_AC_AUTOMATON* automaton,
- YR_STRING* string,
- YR_ATOM_LIST_ITEM* atom)
-{
- int result = ERROR_SUCCESS;
- int i;
+int _yr_ac_build_transition_table(
+ YR_AC_AUTOMATON* automaton)
+{
YR_AC_STATE* state;
- YR_AC_STATE* next_state;
- YR_AC_MATCH* new_match;
+ YR_AC_STATE* child_state;
+ YR_AC_STATE* root_state = automaton->root;
- // For each atom create the states in the automaton.
+ uint32_t slot;
- while (atom != NULL)
+ QUEUE queue = { NULL, NULL};
+
+ automaton->tables_size = 1024;
+
+ automaton->t_table = yr_malloc(
+ automaton->tables_size * sizeof(YR_AC_TRANSITION));
+
+ automaton->m_table = yr_malloc(
+ automaton->tables_size * sizeof(YR_AC_MATCH*));
+
+ if (automaton->t_table == NULL || automaton->t_table == NULL)
{
- state = automaton->root;
+ yr_free(automaton->t_table);
+ yr_free(automaton->m_table);
- for(i = 0; i < atom->atom_length; i++)
- {
- next_state = yr_ac_next_state(
- state, atom->atom[i]);
+ return ERROR_INSUFICIENT_MEMORY;
+ }
- if (next_state == NULL)
- {
- next_state = _yr_ac_create_state(
- arena,
- state,
- atom->atom[i]);
+ memset(automaton->t_table, 0,
+ automaton->tables_size * sizeof(YR_AC_TRANSITION));
- if (next_state == NULL)
- return ERROR_INSUFICIENT_MEMORY;
- }
+ memset(automaton->m_table, 0,
+ automaton->tables_size * sizeof(YR_AC_MATCH*));
- state = next_state;
- }
+ automaton->t_table[0] = YR_AC_MAKE_TRANSITION(0, 0, YR_AC_USED_FLAG);
+ automaton->m_table[0] = root_state->matches;
- result = yr_arena_allocate_struct(
- arena,
- sizeof(YR_AC_MATCH),
- (void**) &new_match,
- offsetof(YR_AC_MATCH, string),
- offsetof(YR_AC_MATCH, forward_code),
- offsetof(YR_AC_MATCH, backward_code),
- offsetof(YR_AC_MATCH, next),
- EOL);
+ // Index 0 is for root node. Unused indexes start at 1.
+ automaton->t_table_unused_candidate = 1;
- if (result == ERROR_SUCCESS)
- {
- new_match->backtrack = state->depth + atom->backtrack;
- new_match->string = string;
- new_match->forward_code = atom->forward_code;
- new_match->backward_code = atom->backward_code;
- new_match->next = state->matches;
- state->matches = new_match;
- }
- else
+ child_state = root_state->first_child;
+
+ while (child_state != NULL)
+ {
+ child_state->t_table_slot = child_state->input + 1;
+ automaton->t_table[child_state->input + 1] = YR_AC_MAKE_TRANSITION(
+ 0, child_state->input + 1, YR_AC_USED_FLAG);
+
+ FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state));
+ child_state = child_state->siblings;
+ }
+
+ while (!_yr_ac_queue_is_empty(&queue))
+ {
+ state = _yr_ac_queue_pop(&queue);
+
+ FAIL_ON_ERROR(_yr_ac_find_suitable_transition_table_slot(
+ automaton,
+ state,
+ &slot));
+
+ automaton->t_table[state->t_table_slot] |= ((uint64_t) slot << 32);
+
+ state->t_table_slot = slot;
+
+ automaton->t_table[slot] = YR_AC_MAKE_TRANSITION(
+ state->failure->t_table_slot, 0, YR_AC_USED_FLAG);
+
+ automaton->m_table[slot] = state->matches;
+
+ // Push childrens of current_state
+
+ child_state = state->first_child;
+
+ while (child_state != NULL)
{
- break;
- }
+ child_state->t_table_slot = slot + child_state->input + 1;
+ automaton->t_table[child_state->t_table_slot] = YR_AC_MAKE_TRANSITION(
+ 0, child_state->input + 1, YR_AC_USED_FLAG);
- atom = atom->next;
+ FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state));
+
+ child_state = child_state->siblings;
+ }
}
- return result;
+ return ERROR_SUCCESS;
}
@@ -688,20 +664,19 @@ void _yr_ac_print_automaton_state(
int i;
int child_count;
- YR_AC_STATE_TRANSITION transition;
YR_AC_MATCH* match;
YR_AC_STATE* child_state;
for (i = 0; i < state->depth; i++)
printf(" ");
- child_state = _yr_ac_first_transition(state, &transition);
+ child_state = state->first_child;
child_count = 0;
while(child_state != NULL)
{
child_count++;
- child_state = _yr_ac_next_transition(state, &transition);
+ child_state = child_state->siblings;
}
printf("%p childs:%d depth:%d failure:%p",
@@ -751,15 +726,216 @@ void _yr_ac_print_automaton_state(
printf("\n");
- child_state = _yr_ac_first_transition(state, &transition);
+ child_state = state->first_child;
while(child_state != NULL)
{
_yr_ac_print_automaton_state(child_state);
- child_state = _yr_ac_next_transition(state, &transition);
+ child_state = child_state->siblings;
+ }
+}
+
+
+//
+// yr_ac_automaton_create
+//
+// Creates a new automaton
+//
+
+int yr_ac_automaton_create(
+ YR_AC_AUTOMATON** automaton)
+{
+ YR_AC_AUTOMATON* new_automaton;
+ YR_AC_STATE* root_state;
+
+ new_automaton = yr_malloc(sizeof(YR_AC_AUTOMATON));
+ root_state = yr_malloc(sizeof(YR_AC_STATE));
+
+ if (new_automaton == NULL || root_state == NULL)
+ {
+ yr_free(new_automaton);
+ yr_free(root_state);
+
+ return ERROR_INSUFICIENT_MEMORY;
+ }
+
+ root_state->depth = 0;
+ root_state->matches = NULL;
+ root_state->failure = NULL;
+ root_state->first_child = NULL;
+ root_state->siblings = NULL;
+ root_state->t_table_slot = 0;
+
+ new_automaton->root = root_state;
+ new_automaton->m_table = NULL;
+ new_automaton->t_table = NULL;
+ new_automaton->tables_size = 0;
+
+ *automaton = new_automaton;
+
+ return ERROR_SUCCESS;
+}
+
+
+//
+// yr_ac_automaton_destroy
+//
+// Destroys automaton
+//
+
+int yr_ac_automaton_destroy(
+ YR_AC_AUTOMATON* automaton)
+{
+ _yr_ac_state_destroy(automaton->root);
+
+ yr_free(automaton->t_table);
+ yr_free(automaton->m_table);
+ yr_free(automaton);
+
+ return ERROR_SUCCESS;
+}
+
+
+//
+// yr_ac_add_string
+//
+// Adds a string to the automaton. This function is invoked once for each
+// string defined in the rules.
+//
+
+int yr_ac_add_string(
+ YR_AC_AUTOMATON* automaton,
+ YR_STRING* string,
+ YR_ATOM_LIST_ITEM* atom,
+ YR_ARENA* matches_arena)
+{
+ int result = ERROR_SUCCESS;
+ int i;
+
+ YR_AC_STATE* state;
+ YR_AC_STATE* next_state;
+ YR_AC_MATCH* new_match;
+
+ // For each atom create the states in the automaton.
+
+ while (atom != NULL)
+ {
+ state = automaton->root;
+
+ for (i = 0; i < atom->atom_length; i++)
+ {
+ next_state = _yr_ac_next_state(state, atom->atom[i]);
+
+ if (next_state == NULL)
+ {
+ next_state = _yr_ac_state_create(state, atom->atom[i]);
+
+ if (next_state == NULL)
+ return ERROR_INSUFICIENT_MEMORY;
+ }
+
+ state = next_state;
+ }
+
+ result = yr_arena_allocate_struct(
+ matches_arena,
+ sizeof(YR_AC_MATCH),
+ (void**) &new_match,
+ offsetof(YR_AC_MATCH, string),
+ offsetof(YR_AC_MATCH, forward_code),
+ offsetof(YR_AC_MATCH, backward_code),
+ offsetof(YR_AC_MATCH, next),
+ EOL);
+
+ if (result == ERROR_SUCCESS)
+ {
+ new_match->backtrack = state->depth + atom->backtrack;
+ new_match->string = string;
+ new_match->forward_code = atom->forward_code;
+ new_match->backward_code = atom->backward_code;
+ new_match->next = state->matches;
+ state->matches = new_match;
+ }
+ else
+ {
+ break;
+ }
+
+ atom = atom->next;
}
+
+ return result;
}
+
+//
+// yr_ac_compile
+//
+
+int yr_ac_compile(
+ YR_AC_AUTOMATON* automaton,
+ YR_ARENA* arena,
+ YR_AC_TABLES* tables)
+{
+ uint32_t i;
+
+ FAIL_ON_ERROR(_yr_ac_create_failure_links(automaton));
+ FAIL_ON_ERROR(_yr_ac_optimize_failure_links(automaton));
+ FAIL_ON_ERROR(_yr_ac_build_transition_table(automaton));
+
+ FAIL_ON_ERROR(yr_arena_reserve_memory(
+ arena,
+ automaton->tables_size * sizeof(tables->transitions[0]) +
+ automaton->tables_size * sizeof(tables->matches[0])));
+
+ FAIL_ON_ERROR(yr_arena_write_data(
+ arena,
+ automaton->t_table,
+ sizeof(YR_AC_TRANSITION),
+ (void**) &tables->transitions));
+
+ for (i = 1; i < automaton->tables_size; i++)
+ {
+ FAIL_ON_ERROR(yr_arena_write_data(
+ arena,
+ automaton->t_table + i,
+ sizeof(YR_AC_TRANSITION),
+ NULL));
+ }
+
+ FAIL_ON_ERROR(yr_arena_write_data(
+ arena,
+ automaton->m_table,
+ sizeof(YR_AC_MATCH*),
+ (void**) &tables->matches));
+
+ FAIL_ON_ERROR(yr_arena_make_relocatable(
+ arena,
+ tables->matches,
+ 0,
+ EOL));
+
+ for (i = 1; i < automaton->tables_size; i++)
+ {
+ void* ptr;
+
+ FAIL_ON_ERROR(yr_arena_write_data(
+ arena,
+ automaton->m_table + i,
+ sizeof(YR_AC_MATCH*),
+ (void**) &ptr));
+
+ FAIL_ON_ERROR(yr_arena_make_relocatable(
+ arena,
+ ptr,
+ 0,
+ EOL));
+ }
+
+ return ERROR_SUCCESS;
+}
+
+
//
// yr_ac_print_automaton
//
diff --git a/libyara/arena.c b/libyara/arena.c
index b987993..cc9fe6a 100644
--- a/libyara/arena.c
+++ b/libyara/arena.c
@@ -760,7 +760,9 @@ int yr_arena_write_string(
// yr_arena_append
//
// Appends source_arena to target_arena. This operation destroys source_arena,
-// after returning any pointer to source_arena is no longer valid.
+// after returning any pointer to source_arena is no longer valid. The data
+// from source_arena is guaranteed to be aligned to a 16 bytes boundary when
+// written to the source_arena
//
// Args:
// YR_ARENA* target_arena - Pointer to target the arena.
@@ -774,6 +776,20 @@ int yr_arena_append(
YR_ARENA* target_arena,
YR_ARENA* source_arena)
{
+ uint8_t padding_data[15];
+ size_t padding_size = 16 - target_arena->current_page->used % 16;
+
+ if (padding_size < 16)
+ {
+ memset(&padding_data, 0xCC, padding_size);
+
+ FAIL_ON_ERROR(yr_arena_write_data(
+ target_arena,
+ padding_data,
+ padding_size,
+ NULL));
+ }
+
target_arena->current_page->next = source_arena->page_list_head;
source_arena->page_list_head->prev = target_arena->current_page;
target_arena->current_page = source_arena->current_page;
diff --git a/libyara/atoms.c b/libyara/atoms.c
index d387f92..09ebca6 100644
--- a/libyara/atoms.c
+++ b/libyara/atoms.c
@@ -1107,6 +1107,22 @@ int yr_atoms_extract_from_re(
*atoms = _yr_atoms_list_concat(*atoms, case_insentive_atoms);
}
+ // No atoms has been extracted, let's add a zero-length atom.
+
+ if (*atoms == NULL)
+ {
+ *atoms = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
+
+ if (*atoms == NULL)
+ return ERROR_INSUFICIENT_MEMORY;
+
+ (*atoms)->atom_length = 0;
+ (*atoms)->backtrack = 0;
+ (*atoms)->forward_code = re->root_node->forward_code;
+ (*atoms)->backward_code = NULL;
+ (*atoms)->next = NULL;
+ }
+
return ERROR_SUCCESS;
}
diff --git a/libyara/compiler.c b/libyara/compiler.c
index d98e13d..4b1aa0f 100644
--- a/libyara/compiler.c
+++ b/libyara/compiler.c
@@ -80,9 +80,6 @@ YR_API int yr_compiler_create(
result = yr_arena_create(65536, 0, &new_compiler->re_code_arena);
if (result == ERROR_SUCCESS)
- result = yr_arena_create(65536, 0, &new_compiler->automaton_arena);
-
- if (result == ERROR_SUCCESS)
result = yr_arena_create(65536, 0, &new_compiler->externals_arena);
if (result == ERROR_SUCCESS)
@@ -92,9 +89,13 @@ YR_API int yr_compiler_create(
result = yr_arena_create(65536, 0, &new_compiler->metas_arena);
if (result == ERROR_SUCCESS)
- result = yr_ac_create_automaton(
- new_compiler->automaton_arena,
- &new_compiler->automaton);
+ result = yr_arena_create(65536, 0, &new_compiler->automaton_arena);
+
+ if (result == ERROR_SUCCESS)
+ result = yr_arena_create(65536, 0, &new_compiler->matches_arena);
+
+ if (result == ERROR_SUCCESS)
+ result = yr_ac_automaton_create(&new_compiler->automaton);
if (result == ERROR_SUCCESS)
{
@@ -121,10 +122,13 @@ YR_API void yr_compiler_destroy(
yr_arena_destroy(compiler->strings_arena);
yr_arena_destroy(compiler->code_arena);
yr_arena_destroy(compiler->re_code_arena);
- yr_arena_destroy(compiler->automaton_arena);
yr_arena_destroy(compiler->externals_arena);
yr_arena_destroy(compiler->namespaces_arena);
yr_arena_destroy(compiler->metas_arena);
+ yr_arena_destroy(compiler->automaton_arena);
+ yr_arena_destroy(compiler->matches_arena);
+
+ yr_ac_automaton_destroy(compiler->automaton);
yr_hash_table_destroy(
compiler->rules_table,
@@ -374,6 +378,7 @@ YR_API int yr_compiler_add_string(
}
}
+
int _yr_compiler_compile_rules(
YR_COMPILER* compiler)
{
@@ -412,10 +417,13 @@ int _yr_compiler_compile_rules(
sizeof(YR_EXTERNAL_VARIABLE),
NULL);
- // Create Aho-Corasick automaton's failure links.
- result = yr_ac_create_failure_links(
+ YR_AC_TABLES tables;
+
+ // Write Aho-Corasick automaton to arena.
+ result = yr_ac_compile(
+ compiler->automaton,
compiler->automaton_arena,
- compiler->automaton);
+ &tables);
if (result == ERROR_SUCCESS)
result = yr_arena_create(1024, 0, &arena);
@@ -428,7 +436,8 @@ int _yr_compiler_compile_rules(
offsetof(YARA_RULES_FILE_HEADER, rules_list_head),
offsetof(YARA_RULES_FILE_HEADER, externals_list_head),
offsetof(YARA_RULES_FILE_HEADER, code_start),
- offsetof(YARA_RULES_FILE_HEADER, automaton),
+ offsetof(YARA_RULES_FILE_HEADER, match_table),
+ offsetof(YARA_RULES_FILE_HEADER, transition_table),
EOL);
if (result == ERROR_SUCCESS)
@@ -442,18 +451,12 @@ int _yr_compiler_compile_rules(
rules_file_header->code_start = (uint8_t*) yr_arena_base_address(
compiler->code_arena);
- rules_file_header->automaton = (YR_AC_AUTOMATON*) yr_arena_base_address(
- compiler->automaton_arena);
+ rules_file_header->match_table = tables.matches;
+ rules_file_header->transition_table = tables.transitions;
}
if (result == ERROR_SUCCESS)
- result = yr_arena_append(
- arena,
- compiler->automaton_arena);
-
- if (result == ERROR_SUCCESS)
{
- compiler->automaton_arena = NULL;
result = yr_arena_append(
arena,
compiler->code_arena);
@@ -518,6 +521,22 @@ int _yr_compiler_compile_rules(
if (result == ERROR_SUCCESS)
{
compiler->sz_arena = NULL;
+ result = yr_arena_append(
+ arena,
+ compiler->automaton_arena);
+ }
+
+ if (result == ERROR_SUCCESS)
+ {
+ compiler->automaton_arena = NULL;
+ result = yr_arena_append(
+ arena,
+ compiler->matches_arena);
+ }
+
+ if (result == ERROR_SUCCESS)
+ {
+ compiler->matches_arena = NULL;
compiler->compiled_rules_arena = arena;
result = yr_arena_coalesce(arena);
}
@@ -552,7 +571,8 @@ YR_API int yr_compiler_get_rules(
yara_rules->externals_list_head = rules_file_header->externals_list_head;
yara_rules->rules_list_head = rules_file_header->rules_list_head;
- yara_rules->automaton = rules_file_header->automaton;
+ yara_rules->match_table = rules_file_header->match_table;
+ yara_rules->transition_table = rules_file_header->transition_table;
yara_rules->code_start = rules_file_header->code_start;
yara_rules->tidx_mask = 0;
diff --git a/libyara/exception.h b/libyara/exception.h
index 97e11f8..210a7db 100644
--- a/libyara/exception.h
+++ b/libyara/exception.h
@@ -91,6 +91,7 @@ typedef struct sigaction sa;
act.sa_flags = 0; /* SA_ONSTACK? */ \
sigemptyset(&oldmask); \
sigemptyset(&act.sa_mask); \
+ sigemptyset(&oldmask); \
pthread_sigmask(SIG_SETMASK, &act.sa_mask, &oldmask); \
sigaction(SIGBUS, &act, &oldact); \
int tidx = yr_get_tidx(); \
diff --git a/libyara/include/yara/ahocorasick.h b/libyara/include/yara/ahocorasick.h
index f8264fa..e4316ec 100644
--- a/libyara/include/yara/ahocorasick.h
+++ b/libyara/include/yara/ahocorasick.h
@@ -22,29 +22,50 @@ limitations under the License.
#include <yara/types.h>
-int yr_ac_create_automaton(
- YR_ARENA* arena,
+#define YR_AC_ROOT_STATE 0
+#define YR_AC_NEXT_STATE(t) (t >> 32)
+#define YR_AC_INVALID_TRANSITION(t, c) (((t) & 0xFFFF) != c)
+
+#define YR_AC_MAKE_TRANSITION(state, code, flags) \
+ ((uint64_t)((((uint64_t) state) << 32) | ((flags) << 16) | code))
+
+#define YR_AC_USED_FLAG 0x1
+
+#define YR_AC_USED_TRANSITION_SLOT(x) ((x) & (YR_AC_USED_FLAG << 16))
+#define YR_AC_UNUSED_TRANSITION_SLOT(x) (!YR_AC_USED_TRANSITION_SLOT(x))
+
+
+typedef struct ac_tables
+{
+ YR_AC_TRANSITION* transitions;
+ YR_AC_MATCH** matches;
+
+} YR_AC_TABLES;
+
+
+int yr_ac_automaton_create(
YR_AC_AUTOMATON** automaton);
+int yr_ac_automaton_destroy(
+ YR_AC_AUTOMATON* automaton);
+
+
int yr_ac_add_string(
- YR_ARENA* arena,
YR_AC_AUTOMATON* automaton,
YR_STRING* string,
- YR_ATOM_LIST_ITEM* atom);
-
+ YR_ATOM_LIST_ITEM* atom,
+ YR_ARENA* matches_arena);
-YR_AC_STATE* yr_ac_next_state(
- YR_AC_STATE* state,
- uint8_t input);
-
-int yr_ac_create_failure_links(
+int yr_ac_compile(
+ YR_AC_AUTOMATON* automaton,
YR_ARENA* arena,
- YR_AC_AUTOMATON* automaton);
+ YR_AC_TABLES* tables);
void yr_ac_print_automaton(
YR_AC_AUTOMATON* automaton);
+
#endif
diff --git a/libyara/include/yara/compiler.h b/libyara/include/yara/compiler.h
index 237309a..29d6f4d 100644
--- a/libyara/include/yara/compiler.h
+++ b/libyara/include/yara/compiler.h
@@ -61,11 +61,12 @@ typedef struct _YR_COMPILER
YR_ARENA* strings_arena;
YR_ARENA* code_arena;
YR_ARENA* re_code_arena;
- YR_ARENA* automaton_arena;
YR_ARENA* compiled_rules_arena;
YR_ARENA* externals_arena;
YR_ARENA* namespaces_arena;
YR_ARENA* metas_arena;
+ YR_ARENA* matches_arena;
+ YR_ARENA* automaton_arena;
YR_AC_AUTOMATON* automaton;
YR_HASH_TABLE* rules_table;
diff --git a/libyara/include/yara/types.h b/libyara/include/yara/types.h
index a56b956..0cf8241 100644
--- a/libyara/include/yara/types.h
+++ b/libyara/include/yara/types.h
@@ -37,68 +37,11 @@ typedef int32_t tidx_mask_t;
#define DECLARE_REFERENCE(type, name) \
union { type name; int64_t name##_; } YR_ALIGN(8)
-#pragma pack(push)
-#pragma pack(8)
#define NAMESPACE_TFLAGS_UNSATISFIED_GLOBAL 0x01
-typedef struct _YR_NAMESPACE
-{
- int32_t t_flags[MAX_THREADS]; // Thread-specific flags
- DECLARE_REFERENCE(char*, name);
-
-} YR_NAMESPACE;
-
-
-#define META_TYPE_NULL 0
-#define META_TYPE_INTEGER 1
-#define META_TYPE_STRING 2
-#define META_TYPE_BOOLEAN 3
-
-#define META_IS_NULL(x) \
- ((x) != NULL ? (x)->type == META_TYPE_NULL : TRUE)
-
-
-typedef struct _YR_META
-{
- int32_t type;
- YR_ALIGN(8) int64_t integer;
-
- DECLARE_REFERENCE(const char*, identifier);
- DECLARE_REFERENCE(char*, string);
-
-} YR_META;
-
-
-typedef struct _YR_MATCH
-{
- int64_t base;
- int64_t offset;
- int32_t length;
-
- union {
- uint8_t* data; // Confirmed matches use "data",
- int32_t chain_length; // unconfirmed ones use "chain_length"
- } YR_ALIGN(8);
-
- YR_ALIGN(8) struct _YR_MATCH* prev;
- YR_ALIGN(8) struct _YR_MATCH* next;
-
-} YR_MATCH;
-
-
-typedef struct _YR_MATCHES
-{
- int32_t count;
-
- DECLARE_REFERENCE(YR_MATCH*, head);
- DECLARE_REFERENCE(YR_MATCH*, tail);
-
-} YR_MATCHES;
-
-
#define STRING_GFLAGS_REFERENCED 0x01
#define STRING_GFLAGS_HEXADECIMAL 0x02
#define STRING_GFLAGS_NO_CASE 0x04
@@ -117,7 +60,6 @@ typedef struct _YR_MATCHES
#define STRING_GFLAGS_FIXED_OFFSET 0x8000
#define STRING_GFLAGS_GREEDY_REGEXP 0x10000
-
#define STRING_IS_HEX(x) \
(((x)->g_flags) & STRING_GFLAGS_HEXADECIMAL)
@@ -176,6 +118,97 @@ typedef struct _YR_MATCHES
((x)->matches[yr_get_tidx()])
+#define RULE_TFLAGS_MATCH 0x01
+
+#define RULE_GFLAGS_PRIVATE 0x01
+#define RULE_GFLAGS_GLOBAL 0x02
+#define RULE_GFLAGS_REQUIRE_EXECUTABLE 0x04
+#define RULE_GFLAGS_REQUIRE_FILE 0x08
+#define RULE_GFLAGS_NULL 0x1000
+
+#define RULE_IS_PRIVATE(x) \
+ (((x)->g_flags) & RULE_GFLAGS_PRIVATE)
+
+#define RULE_IS_GLOBAL(x) \
+ (((x)->g_flags) & RULE_GFLAGS_GLOBAL)
+
+#define RULE_IS_NULL(x) \
+ (((x)->g_flags) & RULE_GFLAGS_NULL)
+
+#define RULE_MATCHES(x) \
+ ((x)->t_flags[yr_get_tidx()] & RULE_TFLAGS_MATCH)
+
+
+#define META_TYPE_NULL 0
+#define META_TYPE_INTEGER 1
+#define META_TYPE_STRING 2
+#define META_TYPE_BOOLEAN 3
+
+#define META_IS_NULL(x) \
+ ((x) != NULL ? (x)->type == META_TYPE_NULL : TRUE)
+
+
+#define EXTERNAL_VARIABLE_TYPE_NULL 0
+#define EXTERNAL_VARIABLE_TYPE_FLOAT 1
+#define EXTERNAL_VARIABLE_TYPE_INTEGER 2
+#define EXTERNAL_VARIABLE_TYPE_BOOLEAN 3
+#define EXTERNAL_VARIABLE_TYPE_STRING 4
+#define EXTERNAL_VARIABLE_TYPE_MALLOC_STRING 5
+
+#define EXTERNAL_VARIABLE_IS_NULL(x) \
+ ((x) != NULL ? (x)->type == EXTERNAL_VARIABLE_TYPE_NULL : TRUE)
+
+
+#pragma pack(push)
+#pragma pack(8)
+
+
+typedef struct _YR_NAMESPACE
+{
+ int32_t t_flags[MAX_THREADS]; // Thread-specific flags
+ DECLARE_REFERENCE(char*, name);
+
+} YR_NAMESPACE;
+
+
+typedef struct _YR_META
+{
+ int32_t type;
+ YR_ALIGN(8) int64_t integer;
+
+ DECLARE_REFERENCE(const char*, identifier);
+ DECLARE_REFERENCE(char*, string);
+
+} YR_META;
+
+
+typedef struct _YR_MATCH
+{
+ int64_t base;
+ int64_t offset;
+ int32_t length;
+
+ union {
+ uint8_t* data; // Confirmed matches use "data",
+ int32_t chain_length; // unconfirmed ones use "chain_length"
+ } YR_ALIGN(8);
+
+ YR_ALIGN(8) struct _YR_MATCH* prev;
+ YR_ALIGN(8) struct _YR_MATCH* next;
+
+} YR_MATCH;
+
+
+typedef struct _YR_MATCHES
+{
+ int32_t count;
+
+ DECLARE_REFERENCE(YR_MATCH*, head);
+ DECLARE_REFERENCE(YR_MATCH*, tail);
+
+} YR_MATCHES;
+
+
typedef struct _YR_STRING
{
int32_t g_flags;
@@ -200,27 +233,6 @@ typedef struct _YR_STRING
} YR_STRING;
-#define RULE_TFLAGS_MATCH 0x01
-
-#define RULE_GFLAGS_PRIVATE 0x01
-#define RULE_GFLAGS_GLOBAL 0x02
-#define RULE_GFLAGS_REQUIRE_EXECUTABLE 0x04
-#define RULE_GFLAGS_REQUIRE_FILE 0x08
-#define RULE_GFLAGS_NULL 0x1000
-
-#define RULE_IS_PRIVATE(x) \
- (((x)->g_flags) & RULE_GFLAGS_PRIVATE)
-
-#define RULE_IS_GLOBAL(x) \
- (((x)->g_flags) & RULE_GFLAGS_GLOBAL)
-
-#define RULE_IS_NULL(x) \
- (((x)->g_flags) & RULE_GFLAGS_NULL)
-
-#define RULE_MATCHES(x) \
- ((x)->t_flags[yr_get_tidx()] & RULE_TFLAGS_MATCH)
-
-
typedef struct _YR_RULE
{
int32_t g_flags; // Global flags
@@ -239,18 +251,6 @@ typedef struct _YR_RULE
} YR_RULE;
-#define EXTERNAL_VARIABLE_TYPE_NULL 0
-#define EXTERNAL_VARIABLE_TYPE_FLOAT 1
-#define EXTERNAL_VARIABLE_TYPE_INTEGER 2
-#define EXTERNAL_VARIABLE_TYPE_BOOLEAN 3
-#define EXTERNAL_VARIABLE_TYPE_STRING 4
-#define EXTERNAL_VARIABLE_TYPE_MALLOC_STRING 5
-
-
-#define EXTERNAL_VARIABLE_IS_NULL(x) \
- ((x) != NULL ? (x)->type == EXTERNAL_VARIABLE_TYPE_NULL : TRUE)
-
-
typedef struct _YR_EXTERNAL_VARIABLE
{
int32_t type;
@@ -278,69 +278,64 @@ typedef struct _YR_AC_MATCH
} YR_AC_MATCH;
-typedef struct _YR_AC_STATE
-{
- int8_t depth;
-
- DECLARE_REFERENCE(struct _YR_AC_STATE*, failure);
- DECLARE_REFERENCE(YR_AC_MATCH*, matches);
-
-} YR_AC_STATE;
+typedef uint64_t YR_AC_TRANSITION;
+typedef YR_AC_TRANSITION* YR_AC_TRANSITION_TABLE;
+typedef YR_AC_MATCH** YR_AC_MATCH_TABLE;
-typedef struct _YR_AC_STATE_TRANSITION
+typedef struct _YARA_RULES_FILE_HEADER
{
- uint8_t input;
+ uint32_t version;
- DECLARE_REFERENCE(YR_AC_STATE*, state);
- DECLARE_REFERENCE(struct _YR_AC_STATE_TRANSITION*, next);
+ DECLARE_REFERENCE(YR_RULE*, rules_list_head);
+ DECLARE_REFERENCE(YR_EXTERNAL_VARIABLE*, externals_list_head);
+ DECLARE_REFERENCE(uint8_t*, code_start);
+ DECLARE_REFERENCE(YR_AC_MATCH_TABLE, match_table);
+ DECLARE_REFERENCE(YR_AC_TRANSITION_TABLE, transition_table);
-} YR_AC_STATE_TRANSITION;
+} YARA_RULES_FILE_HEADER;
+#pragma pack(pop)
-typedef struct _YR_AC_TABLE_BASED_STATE
-{
- int8_t depth;
- DECLARE_REFERENCE(YR_AC_STATE*, failure);
- DECLARE_REFERENCE(YR_AC_MATCH*, matches);
- DECLARE_REFERENCE(YR_AC_STATE*, state) transitions[256];
+//
+// Structs defined below are never stored in the compiled rules file
+//
-} YR_AC_TABLE_BASED_STATE;
+struct _YR_AC_STATE;
-typedef struct _YR_AC_LIST_BASED_STATE
-{
- int8_t depth;
- DECLARE_REFERENCE(YR_AC_STATE*, failure);
- DECLARE_REFERENCE(YR_AC_MATCH*, matches);
- DECLARE_REFERENCE(YR_AC_STATE_TRANSITION*, transitions);
+typedef struct _YR_AC_STATE
+{
+ uint8_t depth;
+ uint8_t input;
-} YR_AC_LIST_BASED_STATE;
+ uint32_t t_table_slot;
+ struct _YR_AC_STATE* failure;
+ struct _YR_AC_STATE* first_child;
+ struct _YR_AC_STATE* siblings;
-typedef struct _YR_AC_AUTOMATON
-{
- DECLARE_REFERENCE(YR_AC_STATE*, root);
+ YR_AC_MATCH* matches;
-} YR_AC_AUTOMATON;
+} YR_AC_STATE;
-typedef struct _YARA_RULES_FILE_HEADER
+typedef struct _YR_AC_AUTOMATON
{
- uint32_t version;
+ // Both m_table and t_table have the same number of elements, which is
+ // stored in tables_size.
- DECLARE_REFERENCE(YR_RULE*, rules_list_head);
- DECLARE_REFERENCE(YR_EXTERNAL_VARIABLE*, externals_list_head);
- DECLARE_REFERENCE(uint8_t*, code_start);
- DECLARE_REFERENCE(YR_AC_AUTOMATON*, automaton);
-
-} YARA_RULES_FILE_HEADER;
+ uint32_t tables_size;
+ uint32_t t_table_unused_candidate;
+ YR_AC_TRANSITION_TABLE t_table;
+ YR_AC_MATCH_TABLE m_table;
+ YR_AC_STATE* root;
-#pragma pack(pop)
+} YR_AC_AUTOMATON;
typedef struct _YR_RULES {
@@ -352,7 +347,8 @@ typedef struct _YR_RULES {
YR_ARENA* arena;
YR_RULE* rules_list_head;
YR_EXTERNAL_VARIABLE* externals_list_head;
- YR_AC_AUTOMATON* automaton;
+ YR_AC_TRANSITION_TABLE transition_table;
+ YR_AC_MATCH_TABLE match_table;
} YR_RULES;
diff --git a/libyara/parser.c b/libyara/parser.c
index db4fe58..7b849c2 100644
--- a/libyara/parser.c
+++ b/libyara/parser.c
@@ -282,7 +282,6 @@ int _yr_parser_write_string(
int* min_atom_quality)
{
SIZED_STRING* literal_string;
- YR_AC_MATCH* new_match;
YR_ATOM_LIST_ITEM* atom_list = NULL;
int result;
@@ -381,37 +380,11 @@ int _yr_parser_write_string(
if (result == ERROR_SUCCESS)
{
// Add the string to Aho-Corasick automaton.
-
- if (atom_list != NULL)
- {
- result = yr_ac_add_string(
- compiler->automaton_arena,
- compiler->automaton,
- *string,
- atom_list);
- }
- else
- {
- result = yr_arena_allocate_struct(
- compiler->automaton_arena,
- sizeof(YR_AC_MATCH),
- (void**) &new_match,
- offsetof(YR_AC_MATCH, string),
- offsetof(YR_AC_MATCH, forward_code),
- offsetof(YR_AC_MATCH, backward_code),
- offsetof(YR_AC_MATCH, next),
- EOL);
-
- if (result == ERROR_SUCCESS)
- {
- new_match->backtrack = 0;
- new_match->string = *string;
- new_match->forward_code = re->root_node->forward_code;
- new_match->backward_code = NULL;
- new_match->next = compiler->automaton->root->matches;
- compiler->automaton->root->matches = new_match;
- }
- }
+ result = yr_ac_add_string(
+ compiler->automaton,
+ *string,
+ atom_list,
+ compiler->matches_arena);
}
*min_atom_quality = yr_atoms_min_quality(atom_list);
@@ -574,8 +547,8 @@ YR_STRING* yr_parser_reduce_string_declaration(
if (yr_re_contains_dot_star(re))
{
yywarning(
- yyscanner,
- "%s contains .*, consider using .{N} with a reasonable value for N",
+ yyscanner,
+ "%s contains .*, consider using .{N} with a reasonable value for N",
identifier);
}
@@ -676,13 +649,13 @@ YR_STRING* yr_parser_reduce_string_declaration(
string);
if (compiler->last_result != ERROR_SUCCESS)
- goto _exit;
+ goto _exit;
}
if (min_atom_quality < 3 && compiler->callback != NULL)
{
yywarning(
- yyscanner,
+ yyscanner,
"%s is slowing down scanning%s",
string->identifier,
min_atom_quality < 2 ? " (critical!)" : "");
diff --git a/libyara/rules.c b/libyara/rules.c
index c655d90..96b6d15 100644
--- a/libyara/rules.c
+++ b/libyara/rules.c
@@ -220,74 +220,90 @@ int _yr_rules_scan_mem_block(
int timeout,
time_t start_time)
{
- YR_AC_MATCH* ac_match;
- YR_AC_STATE* next_state;
- YR_AC_STATE* current_state = rules->automaton->root;
+ YR_AC_TRANSITION_TABLE transition_table = rules->transition_table;
+ YR_AC_MATCH_TABLE match_table = rules->match_table;
+
+ YR_AC_MATCH* match;
+ YR_AC_TRANSITION transition;
size_t i = 0;
+ uint32_t state = YR_AC_ROOT_STATE;
+ uint16_t index;
while (i < block->size)
{
- ac_match = current_state->matches;
+ match = match_table[state];
- while (ac_match != NULL)
+ while (match != NULL)
{
- if (ac_match->backtrack <= i)
+ if (timeout > 0 && i % 4096 == 0)
+ {
+ if (difftime(time(NULL), start_time) > timeout)
+ return ERROR_SCAN_TIMEOUT;
+ }
+
+ if (match->backtrack <= i)
{
FAIL_ON_ERROR(yr_scan_verify_match(
context,
- ac_match,
+ match,
block->data,
block->size,
block->base,
- i - ac_match->backtrack));
+ i - match->backtrack));
}
- ac_match = ac_match->next;
+ match = match->next;
}
- next_state = yr_ac_next_state(current_state, block->data[i]);
+ index = block->data[i++] + 1;
+ transition = transition_table[state + index];
- while (next_state == NULL && current_state->depth > 0)
+ while (YR_AC_INVALID_TRANSITION(transition, index))
{
- current_state = current_state->failure;
- next_state = yr_ac_next_state(current_state, block->data[i]);
+ if (state != YR_AC_ROOT_STATE)
+ {
+ state = transition_table[state] >> 32;
+ transition = transition_table[state + index];
+ }
+ else
+ {
+ transition = 0;
+ break;
+ }
}
- if (next_state != NULL)
- current_state = next_state;
+ state = transition >> 32;
- i++;
-
- if (timeout > 0 && i % 256 == 0)
- {
- if (difftime(time(NULL), start_time) > timeout)
- return ERROR_SCAN_TIMEOUT;
- }
}
- ac_match = current_state->matches;
- while (ac_match != NULL)
+ match = match_table[state];
+
+ while (match != NULL)
{
- if (ac_match->backtrack <= block->size)
+ if (match->backtrack <= i)
{
FAIL_ON_ERROR(yr_scan_verify_match(
context,
- ac_match,
+ match,
block->data,
block->size,
block->base,
- block->size - ac_match->backtrack));
+ i - match->backtrack));
}
- ac_match = ac_match->next;
+ match = match->next;
}
+
+
return ERROR_SUCCESS;
}
+
+
YR_API int yr_rules_scan_mem_blocks(
YR_RULES* rules,
YR_MEMORY_BLOCK* block,
@@ -631,10 +647,11 @@ YR_API int yr_rules_load_stream(
header = (YARA_RULES_FILE_HEADER*)
yr_arena_base_address(new_rules->arena);
- new_rules->automaton = header->automaton;
new_rules->code_start = header->code_start;
new_rules->externals_list_head = header->externals_list_head;
new_rules->rules_list_head = header->rules_list_head;
+ new_rules->match_table = header->match_table;
+ new_rules->transition_table = header->transition_table;
new_rules->tidx_mask = 0;
FAIL_ON_ERROR(yr_mutex_create(&new_rules->mutex));
diff --git a/tests/test-alignment.c b/tests/test-alignment.c
index 15499a7..6f9fbcb 100644
--- a/tests/test-alignment.c
+++ b/tests/test-alignment.c
@@ -127,9 +127,9 @@ int main (int argc, char **argv)
CHECK_OFFSET(YR_AC_LIST_BASED_STATE, 16, matches);
CHECK_OFFSET(YR_AC_LIST_BASED_STATE, 24, transitions);
- CHECK_SIZE(YR_AC_AUTOMATON, 8);
+ CHECK_SIZE(YR_AC_AUTOMATON, 32);
- CHECK_SIZE(YARA_RULES_FILE_HEADER, 40);
+ CHECK_SIZE(YARA_RULES_FILE_HEADER, 56);
CHECK_OFFSET(YARA_RULES_FILE_HEADER, 8, rules_list_head);
CHECK_OFFSET(YARA_RULES_FILE_HEADER, 16, externals_list_head);
CHECK_OFFSET(YARA_RULES_FILE_HEADER, 24, code_start);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/forensics/yara.git
More information about the forensics-changes
mailing list