From aae46fab1f41df0cce9da9fc3c17eea776e487b1 Mon Sep 17 00:00:00 2001
From: Cyrille Bagard <nocbos@gmail.com>
Date: Wed, 25 Oct 2023 00:13:29 +0200
Subject: Handle big allocations for strings in conditions with regular
 expressions.

---
 src/analysis/scan/grammar.y    | 43 +++++++++++++++---------------
 src/analysis/scan/tokens.l     | 59 ++++++++++++------------------------------
 tests/analysis/scan/fuzzing.py | 15 +++++++++++
 3 files changed, 53 insertions(+), 64 deletions(-)

diff --git a/src/analysis/scan/grammar.y b/src/analysis/scan/grammar.y
index 02e5973..f31a5d1 100644
--- a/src/analysis/scan/grammar.y
+++ b/src/analysis/scan/grammar.y
@@ -6,10 +6,10 @@
 
 
 /* Affiche un message d'erreur suite à l'analyse en échec. */
-static int yyerror(GContentScanner *, yyscan_t, GScanRule **, sized_string_t *, sized_string_t *, void/*GBytesPattern*/ **, char **, size_t *, size_t *, char *);
+static int yyerror(GContentScanner *, yyscan_t, GScanRule **, sized_string_t *, sized_string_t *, void/*GBytesPattern*/ **, char *);
 
 #define raise_error(msg) \
-    yyerror(scanner, yyscanner, built_rule, tmp_0, tmp_1, NULL, buf, allocated, used, msg)
+    yyerror(scanner, yyscanner, built_rule, tmp_0, tmp_1, NULL, msg)
 
 %}
 
@@ -103,13 +103,13 @@ typedef void *yyscan_t;
 
 %define api.pure full
 
-%parse-param { GContentScanner *scanner } { yyscan_t yyscanner } { GScanRule **built_rule } { sized_string_t *tmp_0} { sized_string_t *tmp_1} { void /*GBytesPattern*/ **built_pattern } { char **buf } { size_t *allocated } { size_t *used }
-%lex-param { yyscan_t yyscanner } { sized_string_t *tmp_0} { sized_string_t *tmp_1} { void/*GBytesPattern*/ **built_pattern } { char **buf } { size_t *allocated } { size_t *used }
+%parse-param { GContentScanner *scanner } { yyscan_t yyscanner } { GScanRule **built_rule } { sized_string_t *tmp_0} { sized_string_t *tmp_1} { void /*GBytesPattern*/ **built_pattern }
+%lex-param { yyscan_t yyscanner } { sized_string_t *tmp_0} { sized_string_t *tmp_1} { void/*GBytesPattern*/ **built_pattern }
 
 %code provides {
 
 #define YY_DECL \
-    int rost_lex(YYSTYPE *yylval_param, yyscan_t yyscanner, sized_string_t *tmp_0, sized_string_t *tmp_1, void/*GBytesPattern*/ **built_pattern, char **buf, size_t *allocated, size_t *used)
+    int rost_lex(YYSTYPE *yylval_param, yyscan_t yyscanner, sized_string_t *tmp_0, sized_string_t *tmp_1, void/*GBytesPattern*/ **built_pattern)
 
 YY_DECL;
 
@@ -172,7 +172,6 @@ YY_DECL;
 %token FALSE_           "false"
 %token SIGNED_INTEGER
 %token UNSIGNED_INTEGER
-%token STRING
 
 %token KB MB GB
 
@@ -244,7 +243,6 @@ YY_DECL;
 
 %type <signed_integer> SIGNED_INTEGER
 %type <unsigned_integer> UNSIGNED_INTEGER
-%type <sized_cstring> STRING
 
 %type <rule_flags> rule_flags
 %type <rule_flags> rule_flag
@@ -1033,11 +1031,23 @@ YY_DECL;
                     __converted = $1 * 1073741824;
                     $$ = g_scan_literal_expression_new(LVT_UNSIGNED_INTEGER, &__converted);
                 }
-                | STRING
+                | PLAIN_TEXT
                 {
                     $$ = g_scan_literal_expression_new(LVT_STRING, &$1);
                 }
-                | STRING "[" cexpression "]"
+                | PLAIN_TEXT "[" cexpression "]"
+                {
+                    GScanExpression *__src;
+                    __src = g_scan_literal_expression_new(LVT_STRING, &$1);
+                    $$ = g_scan_set_item_new(__src, $3);
+                    g_object_unref(G_OBJECT(__src));
+                    g_object_unref(G_OBJECT($3));
+                }
+                | ESCAPED_TEXT
+                {
+                    $$ = g_scan_literal_expression_new(LVT_STRING, &$1);
+                }
+                | ESCAPED_TEXT "[" cexpression "]"
                 {
                     GScanExpression *__src;
                     __src = g_scan_literal_expression_new(LVT_STRING, &$1);
@@ -1735,7 +1745,7 @@ relational_expr : cexpression "<" cexpression  { $$ = g_scan_relational_operatio
 *                                                                             *
 ******************************************************************************/
 
-static int yyerror(GContentScanner *scanner, yyscan_t yyscanner, GScanRule **built_rule, sized_string_t *tmp_0, sized_string_t *tmp_1, void/*GBytesPattern*/ **built_pattern, char **buf, size_t *allocated, size_t *used, char *msg)
+static int yyerror(GContentScanner *scanner, yyscan_t yyscanner, GScanRule **built_rule, sized_string_t *tmp_0, sized_string_t *tmp_1, void/*GBytesPattern*/ **built_pattern, char *msg)
 {
 	printf("YYERROR line %d: %s\n", yyget_lineno(yyscanner), msg);
 
@@ -1765,9 +1775,6 @@ bool process_rules_definitions(GContentScanner *scanner, const char *text, size_
     sized_string_t tmp_0;                   /* Zone tampon #1              */
     sized_string_t tmp_1;                   /* Zone tampon #2              */
     void /*GBytesPattern*/ *built_pattern;           /* Motif en construction       */
-    char *buf;                              /* Zone de travail temporaire  */
-    size_t allocated;                       /* Taille de mémoire allouée   */
-    size_t used;                            /* Quantité utilisée           */
     yyscan_t lexstate;                      /* Gestion d'analyse lexicale  */
     YY_BUFFER_STATE state;                  /* Contexte d'analyse          */
     int status;                             /* Bilan d'une analyse         */
@@ -1784,17 +1791,11 @@ bool process_rules_definitions(GContentScanner *scanner, const char *text, size_
 
     built_pattern = NULL;
 
-    allocated = 256;
-    used = 0;
-
-    buf = malloc(allocated * sizeof(char));
-    buf[0] = '\0';
-
     rost_lex_init(&lexstate);
 
     state = rost__scan_bytes(text, length, lexstate);
 
-    status = yyparse(scanner, lexstate, &built_rule, &tmp_0, &tmp_1, &built_pattern, &buf, &allocated, &used);
+    status = yyparse(scanner, lexstate, &built_rule, &tmp_0, &tmp_1, &built_pattern);
 
     result = (status == EXIT_SUCCESS);
 
@@ -1805,8 +1806,6 @@ bool process_rules_definitions(GContentScanner *scanner, const char *text, size_
     exit_szstr(&tmp_0);
     exit_szstr(&tmp_1);
 
-    free(buf);
-
     return result;
 
 }
diff --git a/src/analysis/scan/tokens.l b/src/analysis/scan/tokens.l
index 1174ae7..11f5d9e 100644
--- a/src/analysis/scan/tokens.l
+++ b/src/analysis/scan/tokens.l
@@ -8,7 +8,6 @@
 
 %{
 
-//#include "manual.h"
 
 #include <assert.h>
 #include <stdbool.h>
@@ -16,8 +15,6 @@
 
 
 
-
-
 /******************************************************************************
 *                                                                             *
 *  Paramètres  : src = liste d'octets à traiter.                              *
@@ -296,20 +293,10 @@ static void rost_unescape_bytes(const char *src, size_t len, sized_string_t *out
 }
 
 
-
 #define PUSH_STATE(s) yy_push_state(s, yyscanner)
 #define POP_STATE     yy_pop_state(yyscanner)
 
 
-
-#define EXTEND_BUFFER_IF_NEEDED(extra)      \
-    if ((*used + extra) > *allocated)       \
-    {                                       \
-        *allocated *= 2;                    \
-        *buf = realloc(*buf, *allocated);   \
-    }
-
-
 %}
 
 
@@ -342,7 +329,6 @@ static void rost_unescape_bytes(const char *src, size_t len, sized_string_t *out
 %x bytes_regex_range
 
 %x condition
-%x strlit
 
 %x wait_for_colon
 
@@ -537,38 +523,27 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
 <condition>[mM][bB]             { return MB; }
 <condition>[gG][bB]             { return GB; }
 
-<condition>"\""                 {
-                                    *used = 0;
-                                    PUSH_STATE(strlit);
-                                }
+<condition>\"{str_not_escaped}+\"   {
+                                        yylval->sized_cstring.data = yytext + 1;
+                                        yylval->sized_cstring.len = yyleng - 2;
 
-<strlit>"\""                    {
-                                    POP_STATE; 
-                                    yylval->sized_cstring.data = *buf;
-                                    yylval->sized_cstring.len = *used;
-                                    return STRING;
-                                }
+                                        return PLAIN_TEXT;
+                                    }
 
-<strlit>"\\\""                  { EXTEND_BUFFER_IF_NEEDED(1); (*buf)[(*used)++] = '"'; }
-<strlit>"\\t"                   { EXTEND_BUFFER_IF_NEEDED(1); (*buf)[(*used)++] = '\t'; }
-<strlit>"\\r"                   { EXTEND_BUFFER_IF_NEEDED(1); (*buf)[(*used)++] = '\r'; }
-<strlit>"\\n"                   { EXTEND_BUFFER_IF_NEEDED(1); (*buf)[(*used)++] = '\n'; }
-<strlit>"\\\\"                  { EXTEND_BUFFER_IF_NEEDED(1); (*buf)[(*used)++] = '\\'; }
+<condition>\"{str_mixed}+\"         {
+                                        POP_STATE;
 
-<strlit>\\x[0-9a-fA-F]{2}       {
-                                    char __ch;
-                                    __ch = strtol(yytext + 2, NULL, 16);
-                                    EXTEND_BUFFER_IF_NEEDED(1);
-                                    (*buf)[(*used)++] = __ch;
-                                }
+                                        rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0);
 
-<strlit>[^\\\"]+                {
-                                    size_t __len;
-                                    __len = strlen(yytext);
-                                    EXTEND_BUFFER_IF_NEEDED(__len);
-                                    strcpy(&(*buf)[*used], yytext);
-                                    *used += __len;
-                                }
+#ifndef NDEBUG
+                                        /* Pour rendre plus lisibles les impressions de débogage */
+                                        tmp_0->data[tmp_0->len] = '\0';
+#endif
+
+                                        yylval->tmp_cstring = tmp_0;
+
+                                        return ESCAPED_TEXT;
+                                    }
 
 
 %{ /* Définitions communes pour la section "bytes:" */ %}
diff --git a/tests/analysis/scan/fuzzing.py b/tests/analysis/scan/fuzzing.py
index 044fe54..e26c496 100644
--- a/tests/analysis/scan/fuzzing.py
+++ b/tests/analysis/scan/fuzzing.py
@@ -177,3 +177,18 @@ rule test {
 '''
 
         self.check_rule_failure(rule)
+
+
+    def testAllocations(self):
+        """Handle big allocations for strings in conditions with regular expressions."""
+
+        rule = '''
+rule test {
+
+   condition:
+      "%s" == "%s"
+
+}
+''' % ("0" * (256 * 2 + 8), "0" * (256 * 2 + 8))
+
+        self.check_rule_success(rule)
-- 
cgit v0.11.2-87-g4458