[Forensics-changes] [yara] 142/160: Fix an issue with certain regular expressions reporting matches longer than expected.

Hilko Bengen bengen at moszumanska.debian.org
Sat Jul 1 10:29:27 UTC 2017


This is an automated email from the git hooks/post-receive script.

bengen pushed a commit to annotated tag v3.4.0
in repository yara.

commit 10da150de566e52886ebb5d29359c040096402e8
Author: Victor M. Alvarez <plusvic at gmail.com>
Date:   Wed Jun 3 17:27:20 2015 +0200

    Fix an issue with certain regular expressions reporting matches longer than expected.
    
    When scanning a file that contains a long run of 'a' characters, the regexp /.(aa){1,3}/ should report matches up to 9 characters long, but it was reporting matches of 11 characters.
---
 libyara/re.c   | 126 ++++++++++++++++++++++++++++++++++++---------------------
 libyara/scan.c |  39 +++++++++++++++++-
 2 files changed, 118 insertions(+), 47 deletions(-)

diff --git a/libyara/re.c b/libyara/re.c
index 895ed5f..485b5c8 100644
--- a/libyara/re.c
+++ b/libyara/re.c
@@ -962,14 +962,25 @@ int _yr_re_emit(
 
   case RE_NODE_RANGE:
 
-    // Code for e1{n,m} looks like:
+    // Code for e{n,m} looks like:
     //
-    //            code for e1 (n times)
-    //            push m-n
+    //            code for e       (repeated n times)
+    //            push m-n-1
     //        L0: split L1, L2
-    //        L1: code for e1
-    //            jnztop L0
+    //        L1: code for e
+    //            jnz L0
     //        L2: pop
+    //            split L3, L4
+    //        L3: code for e
+    //        L4:
+    //
+    // Instead of generating a loop with m-n iterations, we generate a loop
+    // with m-n-1 iterations and the last one is unrolled outside the loop.
+    // This is because re_node->backward_code pointers *must* point to code
+    // past the loop. If they point to code before the loop then when some atom
+    // contained inside "e" is found, the loop will be executed in both
+    // forward and backward code. This causes an overlap in forward and backward
+    // matches and the reported matching string will be longer than expected.
 
     if (re_node->start > 0)
     {
@@ -1000,57 +1011,80 @@ int _yr_re_emit(
       }
     }
 
-    // m == n, no more code needed.
-    if (re_node->end == re_node->start)
-      break;
+    if (re_node->end > re_node->start + 1)
+    {
+      FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
+          arena,
+          RE_OPCODE_PUSH,
+          re_node->end - re_node->start - 1,
+          re_node->start == 0 ? &instruction_addr : NULL,
+          NULL,
+          &inst_size));
 
-    FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
-        arena,
-        RE_OPCODE_PUSH,
-        re_node->end - re_node->start,
-        re_node->start == 0 ? &instruction_addr : NULL,
-        NULL,
-        &inst_size));
+      *code_size += inst_size;
 
-    *code_size += inst_size;
+      FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
+          arena,
+          re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
+          0,
+          NULL,
+          &split_offset_addr,
+          &split_size));
 
-    FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
-        arena,
-        re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
-        0,
-        NULL,
-        &split_offset_addr,
-        &split_size));
+      *code_size += split_size;
 
-    *code_size += split_size;
+      FAIL_ON_ERROR(_yr_re_emit(
+          re_node->left,
+          arena,
+          flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE,
+          NULL,
+          &branch_size));
 
-    FAIL_ON_ERROR(_yr_re_emit(
-        re_node->left,
-        arena,
-        flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE,
-        NULL,
-        &branch_size));
+      *code_size += branch_size;
 
-    *code_size += branch_size;
+      FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
+          arena,
+          RE_OPCODE_JNZ,
+          -(branch_size + split_size),
+          NULL,
+          &jmp_offset_addr,
+          &jmp_size));
 
-    FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
-        arena,
-        RE_OPCODE_JNZ,
-        -(branch_size + split_size),
-        NULL,
-        &jmp_offset_addr,
-        &jmp_size));
+      *code_size += jmp_size;
+      *split_offset_addr = split_size + branch_size + jmp_size;
 
-    *code_size += jmp_size;
-    *split_offset_addr = split_size + branch_size + jmp_size;
+      FAIL_ON_ERROR(_yr_emit_inst(
+          arena,
+          RE_OPCODE_POP,
+          NULL,
+          &inst_size));
 
-    FAIL_ON_ERROR(_yr_emit_inst(
-        arena,
-        RE_OPCODE_POP,
-        NULL,
-        &inst_size));
+      *code_size += inst_size;
+    }
+
+    if (re_node->end > re_node->start)
+    {
+      FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
+          arena,
+          re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
+          0,
+          NULL,
+          &split_offset_addr,
+          &split_size));
+
+      *code_size += split_size;
+
+      FAIL_ON_ERROR(_yr_re_emit(
+          re_node->left,
+          arena,
+          flags | EMIT_DONT_SET_FORWARDS_CODE,
+          re_node->start == 0 && re_node->end == 1 ? &instruction_addr : NULL,
+          &branch_size));
+
+      *code_size += branch_size;
+      *split_offset_addr = split_size + branch_size;
+    }
 
-    *code_size += inst_size;
     break;
   }
 
diff --git a/libyara/scan.c b/libyara/scan.c
index 8a6402b..58d4e56 100644
--- a/libyara/scan.c
+++ b/libyara/scan.c
@@ -222,7 +222,7 @@ int _yr_scan_fast_hex_re_exec(
         }
         else
         {
-            return matches;
+          return matches;
         }
       }
 
@@ -240,6 +240,7 @@ int _yr_scan_fast_hex_re_exec(
       switch(*ip)
       {
         case RE_OPCODE_LITERAL:
+
           if (*current_input == *(ip + 1))
           {
             matches++;
@@ -250,11 +251,14 @@ int _yr_scan_fast_hex_re_exec(
           {
             stop = TRUE;
           }
+
           break;
 
         case RE_OPCODE_MASKED_LITERAL:
+
           value = *(int16_t*)(ip + 1) & 0xFF;
           mask = *(int16_t*)(ip + 1) >> 8;
+
           if ((*current_input & mask) == value)
           {
             matches++;
@@ -265,15 +269,45 @@ int _yr_scan_fast_hex_re_exec(
           {
             stop = TRUE;
           }
+
           break;
 
         case RE_OPCODE_ANY:
+
           matches++;
           current_input += increment;
           ip += 1;
+
+          break;
+
+        case RE_OPCODE_SPLIT_B:
+
+          // This is what the code looks like after the SPLIT:
+          //            split L3, L4    (3 bytes long)
+          //        L3: any             (1 byte long)
+          //        L4: ...
+          //
+          // The opcode following the ANY is located at ip + 4
+
+          code_stack[sp] = ip + 4;
+          input_stack[sp] = current_input;
+          matches_stack[sp] = matches;
+          sp++;
+          ip += 3;
+
           break;
 
         case RE_OPCODE_PUSH:
+
+          // This is what the code looks like after the PUSH:
+          //
+          //            push m-n-1        (3 bytes long)
+          //        L0: split L1, L2      (3 bytes long)
+          //        L1: any               (1 byte long)
+          //            jnz L0            (3 bytes long)
+          //        L2: pop               (1 byte long)
+          //            ...
+
           for (i = *(uint16_t*)(ip + 1); i > 0; i--)
           {
             if (flags & RE_FLAGS_BACKWARDS)
@@ -289,6 +323,8 @@ int _yr_scan_fast_hex_re_exec(
                 continue;
             }
 
+            // The opcode following the POP is located at ip + 11
+
             if ( *(ip + 11) != RE_OPCODE_LITERAL ||
                 (*(ip + 11) == RE_OPCODE_LITERAL &&
                  *(ip + 12) == *next_input))
@@ -304,6 +340,7 @@ int _yr_scan_fast_hex_re_exec(
               sp++;
             }
           }
+
           ip += 11;
           break;
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/forensics/yara.git



More information about the forensics-changes mailing list