[Forensics-changes] [yara] 222/415: Fix a bug caused by noncontiguous regular expression code
Hilko Bengen
bengen at moszumanska.debian.org
Thu Apr 3 05:43:08 UTC 2014
This is an automated email from the git hooks/post-receive script.
bengen pushed a commit to branch debian
in repository yara.
commit a3703119d92ac738e9c05118b63847733199a704
Author: Victor M. Alvarez <plusvic at gmail.com>
Date: Wed Nov 13 09:01:18 2013 +0000
Fix a bug caused by noncontiguous regular expression code
---
libyara/atoms.c | 251 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
libyara/atoms.h | 7 +-
2 files changed, 244 insertions(+), 14 deletions(-)
diff --git a/libyara/atoms.c b/libyara/atoms.c
index 81e891f..bc6c6d5 100644
--- a/libyara/atoms.c
+++ b/libyara/atoms.c
@@ -23,12 +23,46 @@ hex string:
{ 01 02 03 04 05 ?? 06 07 08 [1-2] 09 0A }
In the above string, byte sequences 0102030405, 060708 and 090A are atoms.
-Similarly, in the regular expression:
+Similarly, in this regexp:
/abc.*ed[0-9]+fgh/
The strings "abc", "ed" and "fgh" are atoms.
+When searching for regexps/hex strings matching a file, YARA uses these
+atoms to find locations inside the file where the regexp/hex string could
+match. If the atom "abc" is found somewhere inside the file, there is a change
+for /abc.*ed[0-9]+fgh/ to match the file, if "abc" doesn't appear in the file
+there's no chance for the regexp to match. When the atom is found in the file
+YARA proceeds to fully evaluate the regexp/hex string to determine if it's
+actually a match.
+
+For each regexp/hex string YARA extracts one or more atoms. Sometimes a
+single atom is enough (like in the previous example "abc" is enough for finding
+/abc.*ed[0-9]+fgh/), but sometimes a single atom isn't enough like in the
+regexp /(abc|efg)/. In this case YARA must search for both "abc" AND "efg" and
+fully evaluate the regexp whenever one of those atoms is found.
+
+In the regexp /Look(at|into)this/ YARA can search for "Look", or search for
+"this", or search for both "at" and "into". This what we call an atoms tree,
+because it can be represented by the following tree structure:
+
+-OR
+ |- "Look"
+ |
+ |- AND
+ | |
+ | |- "at"
+ | - "into"
+ |
+ - "this"
+
+From an atom tree YARA chooses the best combination, trying to minimize the
+number of required atoms, but also using high quality atoms (long atoms with
+not too many zeroes and a bit of byte diversity). In the previous example YARA
+will end up using the "Look" atom alone, but in /a(bcd|efg)h/ atoms "bcd" and
+"efg" will be used because "a" and "h" are too short.
+
*/
#include <assert.h>
@@ -54,6 +88,24 @@ The strings "abc", "ed" and "fgh" are atoms.
} \
+//
+// _yr_atoms_quality
+//
+// Returns a numeric value indicating the quality of an atom. The quality
+// depends on some characteristics of the atom, including its length, number
+// of zeroes and number of unique distinct bytes. Atom 00 00 has a very low
+// quality, because it's only two bytes long and both bytes are zeroes. Atom
+// 01 01 01 01 is better but still not optimal, because the same byte is
+// repeated. Atom 01 02 03 04 is an optimal one.
+//
+// Args:
+// uint8_t* atom - Pointer to the atom's bytes.
+// int atom_length - Atom's length.
+//
+// Returns:
+// An integer indicating the atom's quality
+//
+
int _yr_atoms_quality(
uint8_t* atom,
int atom_length)
@@ -84,6 +136,11 @@ int _yr_atoms_quality(
return atom_length + unique_bytes - null_bytes;
}
+//
+// _yr_atoms_min_quality
+//
+// Returns the quality for the worst quality atom in a list.
+//
int _yr_atoms_min_quality(
ATOM_LIST_ITEM* atom_list)
@@ -111,6 +168,11 @@ int _yr_atoms_min_quality(
return min_quality;
}
+//
+// _yr_atoms_tree_node_create
+//
+// Creates a new node for an atoms tree.
+//
ATOM_TREE_NODE* _yr_atoms_tree_node_create(
uint8_t type)
@@ -130,6 +192,12 @@ ATOM_TREE_NODE* _yr_atoms_tree_node_create(
}
+//
+// _yr_atoms_tree_node_destroy
+//
+// Destroys a node from an atoms tree.
+//
+
void _yr_atoms_tree_node_destroy(
ATOM_TREE_NODE* node)
{
@@ -155,6 +223,12 @@ void _yr_atoms_tree_node_destroy(
}
+//
+// _yr_atoms_tree_node_append
+//
+// Appends a new child node to another atoms tree node.
+//
+
void _yr_atoms_tree_node_append(
ATOM_TREE_NODE* dest,
ATOM_TREE_NODE* node)
@@ -169,6 +243,12 @@ void _yr_atoms_tree_node_append(
}
+//
+// _yr_atoms_tree_destroy
+//
+// Destroys an atoms tree.
+//
+
void _yr_atoms_tree_destroy(
ATOM_TREE* atom_tree)
{
@@ -177,6 +257,12 @@ void _yr_atoms_tree_destroy(
}
+//
+// yr_atoms_list_destroy
+//
+// Destroys an atoms list.
+//
+
void yr_atoms_list_destroy(
ATOM_LIST_ITEM* list_head)
{
@@ -191,6 +277,13 @@ void yr_atoms_list_destroy(
}
}
+
+//
+// yr_atoms_list_destroy
+//
+// Concats two atoms lists.
+//
+
ATOM_LIST_ITEM* _yr_atoms_list_concat(
ATOM_LIST_ITEM* list1,
ATOM_LIST_ITEM* list2)
@@ -212,6 +305,13 @@ ATOM_LIST_ITEM* _yr_atoms_list_concat(
}
+//
+// _yr_atoms_choose
+//
+// Chooses which atoms from an atoms tree will be used to feed the
+// Aho-Corasick automaton, and puts them in a list.
+//
+
int _yr_atoms_choose(
ATOM_TREE_NODE* node,
ATOM_LIST_ITEM** choosen_atoms)
@@ -434,6 +534,14 @@ int _yr_atoms_case_insentive(
}
+//
+// _yr_atoms_wide
+//
+// For a given list of atoms returns another list with the corresponding
+// wide atoms. Wide atoms are just the original atoms with interleaved zeroes,
+// for example: 01 02 -> 01 00 02 00
+//
+
int _yr_atoms_wide(
ATOM_LIST_ITEM* atoms,
ATOM_LIST_ITEM** wide_atoms)
@@ -479,6 +587,13 @@ int _yr_atoms_wide(
}
+//
+// _yr_atoms_extract_from_re_node
+//
+// Extract atoms from a regular expression node. See description for
+// _yr_atoms_extract_from_re for more details.
+//
+
ATOM_TREE_NODE* _yr_atoms_extract_from_re_node(
RE_NODE* re_node,
ATOM_TREE* atom_tree,
@@ -494,6 +609,8 @@ ATOM_TREE_NODE* _yr_atoms_extract_from_re_node(
int new_quality;
int i;
+ uint8_t new_atom[MAX_ATOM_LENGTH];
+
switch(re_node->type)
{
case RE_NODE_LITERAL:
@@ -510,34 +627,37 @@ ATOM_TREE_NODE* _yr_atoms_extract_from_re_node(
if (current_leaf->atom_length < MAX_ATOM_LENGTH)
{
current_leaf->atom[current_leaf->atom_length] = re_node->value;
- current_leaf->recent_bytes[current_leaf->atom_length] = re_node->value;
+ current_leaf->recent_nodes[current_leaf->atom_length] = re_node;
current_leaf->atom_length++;
}
else
{
for (i = 1; i < MAX_ATOM_LENGTH; i++)
- current_leaf->recent_bytes[i - 1] = current_leaf->recent_bytes[i];
+ current_leaf->recent_nodes[i - 1] = current_leaf->recent_nodes[i];
- current_leaf->recent_bytes[MAX_ATOM_LENGTH - 1] = re_node->value;
+ current_leaf->recent_nodes[MAX_ATOM_LENGTH - 1] = re_node;
+
+ for (i = 0; i < MAX_ATOM_LENGTH; i++)
+ new_atom[i] = current_leaf->recent_nodes[i]->value;
quality = _yr_atoms_quality(
current_leaf->atom,
MAX_ATOM_LENGTH);
new_quality = _yr_atoms_quality(
- current_leaf->recent_bytes,
+ new_atom,
MAX_ATOM_LENGTH);
if (new_quality > quality)
{
for (i = 0; i < MAX_ATOM_LENGTH; i++)
- current_leaf->atom[i] = current_leaf->recent_bytes[i];
+ current_leaf->atom[i] = new_atom[i];
- current_leaf->forward_code =
- (uint8_t*) re_node->forward_code - 2 * (MAX_ATOM_LENGTH - 1);
+ current_leaf->forward_code = \
+ current_leaf->recent_nodes[0]->forward_code;
- current_leaf->backward_code =
- (uint8_t*) re_node->backward_code + 2 * (MAX_ATOM_LENGTH - 1);
+ current_leaf->backward_code = \
+ current_leaf->recent_nodes[0]->backward_code;
}
}
@@ -644,6 +764,19 @@ ATOM_TREE_NODE* _yr_atoms_extract_from_re_node(
return NULL;
}
+//
+// yr_atoms_extract_triplets
+//
+// On certain cases YARA can not extract long enough atoms from a regexp, but
+// can infer them. For example, in the hex string { 01 ?? 02 } the only explict
+// atoms are 01 and 02, and both of them are too short to be efficiently used.
+// However YARA can use simultaneously atoms 01 00 02, 01 01 02, 01 02 02,
+// 01 03 02, and so on up to 01 FF 02. Searching for 256 three-bytes atoms is
+// faster than searching for a single one-byte atom.
+//
+// This function extracts these three-bytes atoms from a regexp node if
+// possible.
+//
int yr_atoms_extract_triplets(
RE_NODE* re_node,
@@ -696,6 +829,42 @@ int yr_atoms_extract_triplets(
return ERROR_SUCCESS;
}
+ if (left_child->left->type == RE_NODE_LITERAL &&
+ (left_child->right->type == RE_NODE_MASKED_LITERAL))
+ {
+ int i;
+ int shift;
+
+ ATOM_LIST_ITEM* atom;
+
+ for (i = 0; i < 16; i++)
+ {
+ atom = yr_malloc(sizeof(ATOM_LIST_ITEM));
+
+ if (atom == NULL)
+ return ERROR_INSUFICIENT_MEMORY;
+
+ if (left_child->right->mask == 0xF0)
+ shift = 0;
+ else
+ shift = 4;
+
+ atom->atom[0] = left_child->left->value;
+ atom->atom[1] = left_child->right->value | (i << shift);
+ atom->atom[2] = re_node->right->value;
+
+ atom->atom_length = 3;
+ atom->forward_code = left_child->left->forward_code;
+ atom->backward_code = left_child->left->backward_code;
+ atom->backtrack = 0;
+ atom->next = *atoms;
+
+ *atoms = atom;
+ }
+
+ return ERROR_SUCCESS;
+ }
+
if (left_grand_child->type == RE_NODE_CONCAT &&
left_grand_child->right->type == RE_NODE_LITERAL &&
(left_child->right->type == RE_NODE_ANY))
@@ -725,9 +894,52 @@ int yr_atoms_extract_triplets(
return ERROR_SUCCESS;
}
+ if (left_grand_child->type == RE_NODE_CONCAT &&
+ left_grand_child->right->type == RE_NODE_LITERAL &&
+ (left_child->right->type == RE_NODE_MASKED_LITERAL))
+ {
+ int i;
+ int shift;
+
+ ATOM_LIST_ITEM* atom;
+
+ for (i = 0; i < 16; i++)
+ {
+ atom = yr_malloc(sizeof(ATOM_LIST_ITEM));
+
+ if (atom == NULL)
+ return ERROR_INSUFICIENT_MEMORY;
+
+ if (left_child->right->mask == 0xF0)
+ shift = 0;
+ else
+ shift = 4;
+
+ atom->atom[0] = left_grand_child->right->value;
+ atom->atom[1] = left_child->right->value | (i << shift);
+ atom->atom[2] = re_node->right->value;
+
+ atom->atom_length = 3;
+ atom->forward_code = left_grand_child->right->forward_code;
+ atom->backward_code = left_grand_child->right->backward_code;
+ atom->backtrack = 0;
+ atom->next = *atoms;
+
+ *atoms = atom;
+ }
+
+ return ERROR_SUCCESS;
+ }
+
return yr_atoms_extract_triplets(left_child, atoms);;
}
+//
+// _yr_atoms_extract_from_re
+//
+// Extract atoms from a regular expression.
+//
+
int yr_atoms_extract_from_re(
RE* re,
int flags,
@@ -753,17 +965,24 @@ int yr_atoms_extract_from_re(
if (atom_tree->root_node->children_head ==
atom_tree->root_node->children_tail)
{
+ // The root OR node has a single child, there's no need for the OR node so
+ // we proceed to destroy it and use its child as root.
+
temp = atom_tree->root_node;
atom_tree->root_node = atom_tree->root_node->children_head;
yr_free(temp);
}
+ // Choose the atoms that will be used.
min_atom_quality = _yr_atoms_choose(atom_tree->root_node, atoms);
_yr_atoms_tree_destroy(atom_tree);
if (min_atom_quality <= 2)
{
+ // Choosen atoms contain low quality ones, let's try infering some higher
+ // quality atoms.
+
yr_atoms_extract_triplets(re->root_node, &triplet_atoms);
if (min_atom_quality < _yr_atoms_min_quality(triplet_atoms))
@@ -805,6 +1024,12 @@ int yr_atoms_extract_from_re(
}
+//
+// yr_atoms_extract_from_string
+//
+// Extract atoms from a string.
+//
+
int yr_atoms_extract_from_string(
char* string,
int string_length,
@@ -882,6 +1107,12 @@ int yr_atoms_extract_from_string(
}
+//
+// yr_atoms_tree_node_print
+//
+// Prints an atom tree node. Used only for debugging purposes.
+//
+
void yr_atoms_tree_node_print(
ATOM_TREE_NODE* node)
{
diff --git a/libyara/atoms.h b/libyara/atoms.h
index 2e508fe..c3a18fc 100644
--- a/libyara/atoms.h
+++ b/libyara/atoms.h
@@ -27,18 +27,17 @@ limitations under the License.
#define ATOM_TREE_OR 3
-
-
typedef struct _ATOM_TREE_NODE
{
uint8_t type;
uint8_t atom_length;
uint8_t atom[MAX_ATOM_LENGTH];
- uint8_t recent_bytes[MAX_ATOM_LENGTH];
-
+
void* forward_code;
void* backward_code;
+ RE_NODE* recent_nodes[MAX_ATOM_LENGTH];
+
struct _ATOM_TREE_NODE* children_head;
struct _ATOM_TREE_NODE* children_tail;
struct _ATOM_TREE_NODE* next_sibling;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/forensics/yara.git
More information about the forensics-changes
mailing list