[Forensics-changes] [yara] 142/160: Fix an issue with certain regular expressions reporting matches longer than expected.
Hilko Bengen
bengen at moszumanska.debian.org
Sat Jul 1 10:29:27 UTC 2017
This is an automated email from the git hooks/post-receive script.
bengen pushed a commit to annotated tag v3.4.0
in repository yara.
commit 10da150de566e52886ebb5d29359c040096402e8
Author: Victor M. Alvarez <plusvic at gmail.com>
Date: Wed Jun 3 17:27:20 2015 +0200
Fix an issue with certain regular expressions reporting matches longer than expected.
While scanning a file with multiple 'a' characters, regexp /.(aa){1,3}/ should report matches up to 9 characters long, but it was reporting matches 11 characters long.
---
libyara/re.c | 126 ++++++++++++++++++++++++++++++++++++---------------------
libyara/scan.c | 39 +++++++++++++++++-
2 files changed, 118 insertions(+), 47 deletions(-)
diff --git a/libyara/re.c b/libyara/re.c
index 895ed5f..485b5c8 100644
--- a/libyara/re.c
+++ b/libyara/re.c
@@ -962,14 +962,25 @@ int _yr_re_emit(
case RE_NODE_RANGE:
- // Code for e1{n,m} looks like:
+ // Code for e{n,m} looks like:
//
- // code for e1 (n times)
- // push m-n
+ // code for e (repeated n times)
+ // push m-n-1
// L0: split L1, L2
- // L1: code for e1
- // jnztop L0
+ // L1: code for e
+ // jnz L0
// L2: pop
+ // split L3, L4
+ // L3: code for e
+ // L4:
+ //
+ // Instead of generating a loop with m-n iterations, we generate a loop
+ // with m-n-1 iterations and the last one is unrolled outside the loop.
+ // This is because re_node->backward_code pointers *must* point to code
+ // past the loop. If they point to code before the loop then when some atom
+ // contained inside "e" is found, the loop will be executed in both
+ // forward and backward code. This causes an overlap in forward and backward
+ // matches and the reported matching string will be longer than expected.
if (re_node->start > 0)
{
@@ -1000,57 +1011,80 @@ int _yr_re_emit(
}
}
- // m == n, no more code needed.
- if (re_node->end == re_node->start)
- break;
+ if (re_node->end > re_node->start + 1)
+ {
+ FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
+ arena,
+ RE_OPCODE_PUSH,
+ re_node->end - re_node->start - 1,
+ re_node->start == 0 ? &instruction_addr : NULL,
+ NULL,
+ &inst_size));
- FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
- arena,
- RE_OPCODE_PUSH,
- re_node->end - re_node->start,
- re_node->start == 0 ? &instruction_addr : NULL,
- NULL,
- &inst_size));
+ *code_size += inst_size;
- *code_size += inst_size;
+ FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
+ arena,
+ re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
+ 0,
+ NULL,
+ &split_offset_addr,
+ &split_size));
- FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
- arena,
- re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
- 0,
- NULL,
- &split_offset_addr,
- &split_size));
+ *code_size += split_size;
- *code_size += split_size;
+ FAIL_ON_ERROR(_yr_re_emit(
+ re_node->left,
+ arena,
+ flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE,
+ NULL,
+ &branch_size));
- FAIL_ON_ERROR(_yr_re_emit(
- re_node->left,
- arena,
- flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE,
- NULL,
- &branch_size));
+ *code_size += branch_size;
- *code_size += branch_size;
+ FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
+ arena,
+ RE_OPCODE_JNZ,
+ -(branch_size + split_size),
+ NULL,
+ &jmp_offset_addr,
+ &jmp_size));
- FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
- arena,
- RE_OPCODE_JNZ,
- -(branch_size + split_size),
- NULL,
- &jmp_offset_addr,
- &jmp_size));
+ *code_size += jmp_size;
+ *split_offset_addr = split_size + branch_size + jmp_size;
- *code_size += jmp_size;
- *split_offset_addr = split_size + branch_size + jmp_size;
+ FAIL_ON_ERROR(_yr_emit_inst(
+ arena,
+ RE_OPCODE_POP,
+ NULL,
+ &inst_size));
- FAIL_ON_ERROR(_yr_emit_inst(
- arena,
- RE_OPCODE_POP,
- NULL,
- &inst_size));
+ *code_size += inst_size;
+ }
+
+ if (re_node->end > re_node->start)
+ {
+ FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
+ arena,
+ re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
+ 0,
+ NULL,
+ &split_offset_addr,
+ &split_size));
+
+ *code_size += split_size;
+
+ FAIL_ON_ERROR(_yr_re_emit(
+ re_node->left,
+ arena,
+ flags | EMIT_DONT_SET_FORWARDS_CODE,
+ re_node->start == 0 && re_node->end == 1 ? &instruction_addr : NULL,
+ &branch_size));
+
+ *code_size += branch_size;
+ *split_offset_addr = split_size + branch_size;
+ }
- *code_size += inst_size;
break;
}
diff --git a/libyara/scan.c b/libyara/scan.c
index 8a6402b..58d4e56 100644
--- a/libyara/scan.c
+++ b/libyara/scan.c
@@ -222,7 +222,7 @@ int _yr_scan_fast_hex_re_exec(
}
else
{
- return matches;
+ return matches;
}
}
@@ -240,6 +240,7 @@ int _yr_scan_fast_hex_re_exec(
switch(*ip)
{
case RE_OPCODE_LITERAL:
+
if (*current_input == *(ip + 1))
{
matches++;
@@ -250,11 +251,14 @@ int _yr_scan_fast_hex_re_exec(
{
stop = TRUE;
}
+
break;
case RE_OPCODE_MASKED_LITERAL:
+
value = *(int16_t*)(ip + 1) & 0xFF;
mask = *(int16_t*)(ip + 1) >> 8;
+
if ((*current_input & mask) == value)
{
matches++;
@@ -265,15 +269,45 @@ int _yr_scan_fast_hex_re_exec(
{
stop = TRUE;
}
+
break;
case RE_OPCODE_ANY:
+
matches++;
current_input += increment;
ip += 1;
+
+ break;
+
+ case RE_OPCODE_SPLIT_B:
+
// This is what the code looks like after the SPLIT:
+ // split L3, L4 (3 bytes long)
+ // L3: any (1 byte long)
+ // L4: ...
+ //
+ // The opcode following the ANY is located at ip + 4
+
+ code_stack[sp] = ip + 4;
+ input_stack[sp] = current_input;
+ matches_stack[sp] = matches;
+ sp++;
+ ip += 3;
+
break;
case RE_OPCODE_PUSH:
+
// This is what the code looks like after the PUSH:
+ //
+ // push m-n-1 (3 bytes long)
+ // L0: split L1, L2 (3 bytes long)
+ // L1: any (1 byte long)
+ // jnz L0 (3 bytes long)
+ // L2: pop (1 byte long)
+ // ...
+
for (i = *(uint16_t*)(ip + 1); i > 0; i--)
{
if (flags & RE_FLAGS_BACKWARDS)
@@ -289,6 +323,8 @@ int _yr_scan_fast_hex_re_exec(
continue;
}
+ // The opcode following the POP is located at ip + 11
+
if ( *(ip + 11) != RE_OPCODE_LITERAL ||
(*(ip + 11) == RE_OPCODE_LITERAL &&
*(ip + 12) == *next_input))
@@ -304,6 +340,7 @@ int _yr_scan_fast_hex_re_exec(
sp++;
}
}
+
ip += 11;
break;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/forensics/yara.git
More information about the forensics-changes
mailing list