[Forensics-changes] [yara] 24/368: Fix issue causing ungreedy regular expressions resulting in greedy matches

Sat Jul 1 10:30:07 UTC 2017

This is an automated email from the git hooks/post-receive script.

bengen pushed a commit to annotated tag v3.5.0
in repository yara.

commit 80589581a63e493cbc0301a4a30d045d36e532eb
Author: Victor M. Alvarez <plusvic at gmail.com>
Date:   Wed Jul 15 10:26:50 2015 +0200

    Fix issue causing ungreedy regular expressions resulting in greedy matches
    
    The regular expression /m.*?ssi/ when applied to "mississippi" was matching "mississi" instead of "missi". That's because the chosen atom "ssi" is located twice in the target string causing the regexp being applied backwards two times. The first time it matches "missi" and the second time it matches "mississi". The longer match was replacing the shortest one.
    
    Now the longer match replace the shortest one only if all quantifiers are greedy, if all are ungreedy the shortest match prevail. Mixing greedy and ungreedy quantifiers is forbidden.
---
 libyara/include/yara/re.h    | 18 +++++----
 libyara/include/yara/types.h |  4 ++
 libyara/parser.c             | 21 +++++++++++
 libyara/re_grammar.c         | 90 +++++++++++++++++++++++++++-----------------
 libyara/re_grammar.y         | 28 ++++++++++++--
 libyara/scan.c               | 18 ++++++---
 yara-python/tests.py         |  2 +
 7 files changed, 131 insertions(+), 50 deletions(-)

diff --git a/libyara/include/yara/re.h b/libyara/include/yara/re.h
index 5d1e237..d8895ec 100644
--- a/libyara/include/yara/re.h
+++ b/libyara/include/yara/re.h
@@ -72,14 +72,16 @@ limitations under the License.
 #define RE_OPCODE_JUMP                  0xC5
 
 
-#define RE_FLAGS_FAST_HEX_REGEXP          0x02
-#define RE_FLAGS_BACKWARDS                0x04
-#define RE_FLAGS_EXHAUSTIVE               0x08
-#define RE_FLAGS_WIDE                     0x10
-#define RE_FLAGS_NO_CASE                  0x20
-#define RE_FLAGS_SCAN                     0x40
-#define RE_FLAGS_DOT_ALL                  0x80
-#define RE_FLAGS_NOT_AT_START            0x100
+#define RE_FLAGS_FAST_HEX_REGEXP        0x02
+#define RE_FLAGS_BACKWARDS              0x04
+#define RE_FLAGS_EXHAUSTIVE             0x08
+#define RE_FLAGS_WIDE                   0x10
+#define RE_FLAGS_NO_CASE                0x20
+#define RE_FLAGS_SCAN                   0x40
+#define RE_FLAGS_DOT_ALL                0x80
+#define RE_FLAGS_NOT_AT_START          0x100
+#define RE_FLAGS_GREEDY                0x400
+#define RE_FLAGS_UNGREEDY              0x800
 
 
 typedef struct RE RE;
diff --git a/libyara/include/yara/types.h b/libyara/include/yara/types.h
index 62c3fec..9866c6e 100644
--- a/libyara/include/yara/types.h
+++ b/libyara/include/yara/types.h
@@ -115,6 +115,7 @@ typedef struct _YR_MATCHES
 #define STRING_GFLAGS_CHAIN_PART        0x2000
 #define STRING_GFLAGS_CHAIN_TAIL        0x4000
 #define STRING_GFLAGS_FIXED_OFFSET      0x8000
+#define STRING_GFLAGS_GREEDY_REGEXP     0x10000
 
 
 #define STRING_IS_HEX(x) \
@@ -132,6 +133,9 @@ typedef struct _YR_MATCHES
 #define STRING_IS_REGEXP(x) \
     (((x)->g_flags) & STRING_GFLAGS_REGEXP)
 
+#define STRING_IS_GREEDY_REGEXP(x) \
+    (((x)->g_flags) & STRING_GFLAGS_GREEDY_REGEXP)
+
 #define STRING_IS_FULL_WORD(x) \
     (((x)->g_flags) & STRING_GFLAGS_FULL_WORD)
 
diff --git a/libyara/parser.c b/libyara/parser.c
index f656988..c8e2841 100644
--- a/libyara/parser.c
+++ b/libyara/parser.c
@@ -531,6 +531,27 @@ YR_STRING* yr_parser_reduce_string_declaration(
     if (re->flags & RE_FLAGS_FAST_HEX_REGEXP)
       string_flags |= STRING_GFLAGS_FAST_HEX_REGEXP;
 
+    // Regular expressions in the strings section can't mix greedy and ungreedy
+    // quantifiers like .* and .*?. That's because these regular expressions can
+    // be matched forwards and/or backwards depending on the atom found, and we
+    // need the regexp to be all-greedy or all-ungreedy to be able to properly
+    // calculate the length of the match.
+
+    if ((re->flags & RE_FLAGS_GREEDY) &&
+        (re->flags & RE_FLAGS_UNGREEDY))
+    {
+      compiler->last_result = ERROR_INVALID_REGULAR_EXPRESSION;
+
+      yr_compiler_set_error_extra_info(compiler,
+          "greedy and ungreedy quantifiers can't be mixed in a regular "
+          "expression");
+
+      goto _exit;
+    }
+
+    if (re->flags & RE_FLAGS_GREEDY)
+      string_flags |= STRING_GFLAGS_GREEDY_REGEXP;
+
     if (yr_re_contains_dot_star(re))
     {
       snprintf(
diff --git a/libyara/re_grammar.c b/libyara/re_grammar.c
index a44d708..da9e3b7 100644
--- a/libyara/re_grammar.c
+++ b/libyara/re_grammar.c
@@ -470,10 +470,10 @@ static const yytype_int8 yyrhs[] =
 /* YYRLINE[YYN] -- source line where rule number YYN was defined.  */
 static const yytype_uint16 yyrline[] =
 {
-       0,    88,    88,    93,    97,   101,   110,   126,   130,   141,
-     148,   157,   164,   173,   183,   194,   204,   215,   219,   225,
-     231,   237,   246,   250,   256,   264,   270,   276,   282,   288,
-     294,   300
+       0,    88,    88,    93,    97,   101,   110,   124,   128,   139,
+     149,   161,   171,   183,   196,   210,   223,   237,   241,   247,
+     253,   259,   268,   272,   278,   286,   292,   298,   304,   310,
+     316,   322
 };
 #endif
 
@@ -1465,9 +1465,7 @@ yyreduce:
   case 6:
 #line 111 "re_grammar.y"
     {
-        RE_NODE* node;
-
-        node = yr_re_node_create(RE_NODE_EMPTY, NULL, NULL);
+        RE_NODE* node = yr_re_node_create(RE_NODE_EMPTY, NULL, NULL);
 
         DESTROY_NODE_IF((yyval.re_node) == NULL, (yyvsp[(1) - (2)].re_node));
         ERROR_IF(node == NULL, ERROR_INSUFICIENT_MEMORY);
@@ -1479,14 +1477,14 @@ yyreduce:
     break;
 
   case 7:
-#line 127 "re_grammar.y"
+#line 125 "re_grammar.y"
     {
         (yyval.re_node) = (yyvsp[(1) - (1)].re_node);
       }
     break;
 
   case 8:
-#line 131 "re_grammar.y"
+#line 129 "re_grammar.y"
     {
         (yyval.re_node) = yr_re_node_create(RE_NODE_CONCAT, (yyvsp[(1) - (2)].re_node), (yyvsp[(2) - (2)].re_node));
 
@@ -1497,8 +1495,11 @@ yyreduce:
     break;
 
   case 9:
-#line 142 "re_grammar.y"
+#line 140 "re_grammar.y"
     {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_GREEDY;
+
          (yyval.re_node) = yr_re_node_create(RE_NODE_STAR, (yyvsp[(1) - (2)].re_node), NULL);
 
          DESTROY_NODE_IF((yyval.re_node) == NULL, (yyvsp[(1) - (2)].re_node));
@@ -1507,8 +1508,11 @@ yyreduce:
     break;
 
   case 10:
-#line 149 "re_grammar.y"
+#line 150 "re_grammar.y"
     {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_UNGREEDY;
+
          (yyval.re_node) = yr_re_node_create(RE_NODE_STAR, (yyvsp[(1) - (3)].re_node), NULL);
 
          DESTROY_NODE_IF((yyval.re_node) == NULL, (yyvsp[(1) - (3)].re_node));
@@ -1519,8 +1523,11 @@ yyreduce:
     break;
 
   case 11:
-#line 158 "re_grammar.y"
+#line 162 "re_grammar.y"
     {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_GREEDY;
+
          (yyval.re_node) = yr_re_node_create(RE_NODE_PLUS, (yyvsp[(1) - (2)].re_node), NULL);
 
          DESTROY_NODE_IF((yyval.re_node) == NULL, (yyvsp[(1) - (2)].re_node));
@@ -1529,8 +1536,11 @@ yyreduce:
     break;
 
   case 12:
-#line 165 "re_grammar.y"
+#line 172 "re_grammar.y"
     {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_UNGREEDY;
+
          (yyval.re_node) = yr_re_node_create(RE_NODE_PLUS, (yyvsp[(1) - (3)].re_node), NULL);
 
          DESTROY_NODE_IF((yyval.re_node) == NULL, (yyvsp[(1) - (3)].re_node));
@@ -1541,8 +1551,11 @@ yyreduce:
     break;
 
   case 13:
-#line 174 "re_grammar.y"
+#line 184 "re_grammar.y"
     {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_GREEDY;
+
          (yyval.re_node) = yr_re_node_create(RE_NODE_RANGE, (yyvsp[(1) - (2)].re_node), NULL);
 
          DESTROY_NODE_IF((yyval.re_node) == NULL, (yyvsp[(1) - (2)].re_node));
@@ -1554,8 +1567,11 @@ yyreduce:
     break;
 
   case 14:
-#line 184 "re_grammar.y"
+#line 197 "re_grammar.y"
     {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_UNGREEDY;
+
          (yyval.re_node) = yr_re_node_create(RE_NODE_RANGE, (yyvsp[(1) - (3)].re_node), NULL);
 
          DESTROY_NODE_IF((yyval.re_node) == NULL, (yyvsp[(1) - (3)].re_node));
@@ -1568,8 +1584,11 @@ yyreduce:
     break;
 
   case 15:
-#line 195 "re_grammar.y"
+#line 211 "re_grammar.y"
     {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_GREEDY;
+
          (yyval.re_node) = yr_re_node_create(RE_NODE_RANGE, (yyvsp[(1) - (2)].re_node), NULL);
 
          DESTROY_NODE_IF((yyval.re_node) == NULL, (yyvsp[(1) - (2)].re_node));
@@ -1581,8 +1600,11 @@ yyreduce:
     break;
 
   case 16:
-#line 205 "re_grammar.y"
+#line 224 "re_grammar.y"
     {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_UNGREEDY;
+
          (yyval.re_node) = yr_re_node_create(RE_NODE_RANGE, (yyvsp[(1) - (3)].re_node), NULL);
 
          DESTROY_NODE_IF((yyval.re_node) == NULL, (yyvsp[(1) - (3)].re_node));
@@ -1595,14 +1617,14 @@ yyreduce:
     break;
 
   case 17:
-#line 216 "re_grammar.y"
+#line 238 "re_grammar.y"
     {
          (yyval.re_node) = (yyvsp[(1) - (1)].re_node);
       }
     break;
 
   case 18:
-#line 220 "re_grammar.y"
+#line 242 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_WORD_BOUNDARY, NULL, NULL);
 
@@ -1611,7 +1633,7 @@ yyreduce:
     break;
 
   case 19:
-#line 226 "re_grammar.y"
+#line 248 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_NON_WORD_BOUNDARY, NULL, NULL);
 
@@ -1620,7 +1642,7 @@ yyreduce:
     break;
 
   case 20:
-#line 232 "re_grammar.y"
+#line 254 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_ANCHOR_START, NULL, NULL);
 
@@ -1629,7 +1651,7 @@ yyreduce:
     break;
 
   case 21:
-#line 238 "re_grammar.y"
+#line 260 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_ANCHOR_END, NULL, NULL);
 
@@ -1638,14 +1660,14 @@ yyreduce:
     break;
 
   case 22:
-#line 247 "re_grammar.y"
+#line 269 "re_grammar.y"
     {
          (yyval.re_node) = (yyvsp[(2) - (3)].re_node);
       }
     break;
 
   case 23:
-#line 251 "re_grammar.y"
+#line 273 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_ANY, NULL, NULL);
 
@@ -1654,7 +1676,7 @@ yyreduce:
     break;
 
   case 24:
-#line 257 "re_grammar.y"
+#line 279 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_LITERAL, NULL, NULL);
 
@@ -1665,7 +1687,7 @@ yyreduce:
     break;
 
   case 25:
-#line 265 "re_grammar.y"
+#line 287 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_WORD_CHAR, NULL, NULL);
 
@@ -1674,7 +1696,7 @@ yyreduce:
     break;
 
   case 26:
-#line 271 "re_grammar.y"
+#line 293 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_NON_WORD_CHAR, NULL, NULL);
 
@@ -1683,7 +1705,7 @@ yyreduce:
     break;
 
   case 27:
-#line 277 "re_grammar.y"
+#line 299 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_SPACE, NULL, NULL);
 
@@ -1692,7 +1714,7 @@ yyreduce:
     break;
 
   case 28:
-#line 283 "re_grammar.y"
+#line 305 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_NON_SPACE, NULL, NULL);
 
@@ -1701,7 +1723,7 @@ yyreduce:
     break;
 
   case 29:
-#line 289 "re_grammar.y"
+#line 311 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_DIGIT, NULL, NULL);
 
@@ -1710,7 +1732,7 @@ yyreduce:
     break;
 
   case 30:
-#line 295 "re_grammar.y"
+#line 317 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_NON_DIGIT, NULL, NULL);
 
@@ -1719,7 +1741,7 @@ yyreduce:
     break;
 
   case 31:
-#line 301 "re_grammar.y"
+#line 323 "re_grammar.y"
     {
          (yyval.re_node) = yr_re_node_create(RE_NODE_CLASS, NULL, NULL);
 
@@ -1731,7 +1753,7 @@ yyreduce:
 
 
 /* Line 1267 of yacc.c.  */
-#line 1735 "re_grammar.c"
+#line 1757 "re_grammar.c"
       default: break;
     }
   YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
@@ -1945,6 +1967,6 @@ yyreturn:
 }
 
 
-#line 309 "re_grammar.y"
+#line 331 "re_grammar.y"
 
 
diff --git a/libyara/re_grammar.y b/libyara/re_grammar.y
index 1f6b7df..3d91520 100644
--- a/libyara/re_grammar.y
+++ b/libyara/re_grammar.y
@@ -109,9 +109,7 @@ alternative
       }
     | alternative '|'
       {
-        RE_NODE* node;
-
-        node = yr_re_node_create(RE_NODE_EMPTY, NULL, NULL);
+        RE_NODE* node = yr_re_node_create(RE_NODE_EMPTY, NULL, NULL);
 
         DESTROY_NODE_IF($$ == NULL, $1);
         ERROR_IF(node == NULL, ERROR_INSUFICIENT_MEMORY);
@@ -140,6 +138,9 @@ concatenation
 repeat
     : single '*'
       {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_GREEDY;
+
          $$ = yr_re_node_create(RE_NODE_STAR, $1, NULL);
 
          DESTROY_NODE_IF($$ == NULL, $1);
@@ -147,6 +148,9 @@ repeat
       }
     | single '*' '?'
       {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_UNGREEDY;
+
          $$ = yr_re_node_create(RE_NODE_STAR, $1, NULL);
 
          DESTROY_NODE_IF($$ == NULL, $1);
@@ -156,6 +160,9 @@ repeat
       }
     | single '+'
       {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_GREEDY;
+
          $$ = yr_re_node_create(RE_NODE_PLUS, $1, NULL);
 
          DESTROY_NODE_IF($$ == NULL, $1);
@@ -163,6 +170,9 @@ repeat
       }
     | single '+' '?'
       {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_UNGREEDY;
+
          $$ = yr_re_node_create(RE_NODE_PLUS, $1, NULL);
 
          DESTROY_NODE_IF($$ == NULL, $1);
@@ -172,6 +182,9 @@ repeat
       }
     | single '?'
       {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_GREEDY;
+
          $$ = yr_re_node_create(RE_NODE_RANGE, $1, NULL);
 
          DESTROY_NODE_IF($$ == NULL, $1);
@@ -182,6 +195,9 @@ repeat
       }
     | single '?' '?'
       {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_UNGREEDY;
+
          $$ = yr_re_node_create(RE_NODE_RANGE, $1, NULL);
 
          DESTROY_NODE_IF($$ == NULL, $1);
@@ -193,6 +209,9 @@ repeat
       }
     | single _RANGE_
       {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_GREEDY;
+
          $$ = yr_re_node_create(RE_NODE_RANGE, $1, NULL);
 
          DESTROY_NODE_IF($$ == NULL, $1);
@@ -203,6 +222,9 @@ repeat
       }
     | single _RANGE_ '?'
       {
+         RE* re = yyget_extra(yyscanner);
+         re->flags |= RE_FLAGS_UNGREEDY;
+
          $$ = yr_re_node_create(RE_NODE_RANGE, $1, NULL);
 
          DESTROY_NODE_IF($$ == NULL, $1);
diff --git a/libyara/scan.c b/libyara/scan.c
index 58d4e56..39336cf 100644
--- a/libyara/scan.c
+++ b/libyara/scan.c
@@ -391,7 +391,8 @@ void _yr_scan_update_match_chain_length(
 
 int _yr_scan_add_match_to_list(
     YR_MATCH* match,
-    YR_MATCHES* matches_list)
+    YR_MATCHES* matches_list,
+    int replace_if_exists)
 {
   YR_MATCH* insertion_point = matches_list->tail;
 
@@ -402,7 +403,9 @@ int _yr_scan_add_match_to_list(
   {
     if (match->offset == insertion_point->offset)
     {
-      insertion_point->length = match->length;
+      if (replace_if_exists)
+        insertion_point->length = match->length;
+
       return ERROR_SUCCESS;
     }
 
@@ -563,7 +566,7 @@ int _yr_scan_verify_chained_string_match(
           match->next = NULL;
 
           FAIL_ON_ERROR(_yr_scan_add_match_to_list(
-              match, &string->matches[tidx]));
+              match, &string->matches[tidx], FALSE));
         }
 
         match = next_match;
@@ -599,7 +602,8 @@ int _yr_scan_verify_chained_string_match(
 
       FAIL_ON_ERROR(_yr_scan_add_match_to_list(
           new_match,
-          &matching_string->unconfirmed_matches[tidx]));
+          &matching_string->unconfirmed_matches[tidx],
+          FALSE));
     }
   }
 
@@ -692,7 +696,8 @@ int _yr_scan_match_callback(
 
       FAIL_ON_ERROR(_yr_scan_add_match_to_list(
           new_match,
-          &string->matches[tidx]));
+          &string->matches[tidx],
+          STRING_IS_GREEDY_REGEXP(string)));
     }
   }
 
@@ -724,6 +729,9 @@ int _yr_scan_verify_re_match(
   int backward_matches = -1;
   int flags = 0;
 
+  if (STRING_IS_GREEDY_REGEXP(ac_match->string))
+    flags |= RE_FLAGS_GREEDY;
+
   if (STRING_IS_FAST_HEX_REGEXP(ac_match->string))
     exec = _yr_scan_fast_hex_re_exec;
   else
diff --git a/yara-python/tests.py b/yara-python/tests.py
index 78b9d44..eba156b 100644
--- a/yara-python/tests.py
+++ b/yara-python/tests.py
@@ -94,6 +94,8 @@ RE_TESTS = [
   ('ab*c', 'ac', SUCCEED, 'ac'),
   ('ab*bc', 'abc', SUCCEED, 'abc'),
   ('ab*bc', 'abbc', SUCCEED, 'abbc'),
+  ('a.*bb', 'abbbb', SUCCEED, 'abbbb'),
+  ('a.*?bbb', 'abbbbbb', SUCCEED, 'abbb'),
   ('a.*c', 'ac', SUCCEED, 'ac'),
   ('a.*c', 'axyzc', SUCCEED, 'axyzc'),
   ('ab+c', 'abbc', SUCCEED, 'abbc'),

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/forensics/yara.git