From 28ef52f37784817c6590cdafc94aa9b356123802 Mon Sep 17 00:00:00 2001
From: Cyrille Bagard <nocbos@gmail.com>
Date: Sun, 3 Mar 2024 12:29:53 +0100
Subject: Restore mixed hexadecimal pattern support.

---
 src/analysis/scan/tokens.l     | 253 ++++++++++++++++++++---------------------
 tests/analysis/scan/grammar.py | 202 ++++++++++++++++++++++++++++++++
 2 files changed, 327 insertions(+), 128 deletions(-)

diff --git a/src/analysis/scan/tokens.l b/src/analysis/scan/tokens.l
index ab881c1..e075cee 100644
--- a/src/analysis/scan/tokens.l
+++ b/src/analysis/scan/tokens.l
@@ -697,171 +697,171 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
 
 %{ /* Définition de motif en hexadécimal */ %}
 
-                   <bytes_value>"{" {
-                                        POP_STATE;
-                                        PUSH_STATE(bytes_hex);
-                                    }
-
-                     <bytes_hex>"}" { POP_STATE; }
+                       <bytes_value>"{" {
+                                            POP_STATE;
+                                            PUSH_STATE(bytes_hex); /* presumably swaps bytes_value for bytes_hex (macros defined elsewhere) */
+                                        }
 
-                     <bytes_hex>"[" {
-                                        PUSH_STATE(bytes_hex_range);
-                                        return HOOK_O;
-                                    }
+                         <bytes_hex>"}" { POP_STATE; }
 
-               <bytes_hex_range>"-" { return MINUS; }
+                         <bytes_hex>"[" {
+                                            PUSH_STATE(bytes_hex_range); /* the <bytes_hex_range> rules take over until "]" pops this state */
+                                            return HOOK_O;
+                                        }
 
-               <bytes_hex_range>"]" {
-                                        POP_STATE;
-                                        return HOOK_C;
-                                    }
+                   <bytes_hex_range>"-" { return MINUS; }
 
-                     <bytes_hex>"(" { return PAREN_O; }
+                   <bytes_hex_range>"]" {
+                                            POP_STATE;
+                                            return HOOK_C;
+                                        }
 
-                     <bytes_hex>")" { return PAREN_C; }
+                         <bytes_hex>"(" { return PAREN_O; }
 
-                     <bytes_hex>"|" { return PIPE; }
+                         <bytes_hex>")" { return PAREN_C; }
 
-                     <bytes_hex>"~" { return TILDE; }
+                         <bytes_hex>"|" { return PIPE; }
 
-   <bytes_hex>{hbyte}([ ]*{hbyte})* {
-                                        bool even;
-                                        size_t i;
-                                        bin_t byte;
-                                        bin_t value;
+                         <bytes_hex>"~" { return TILDE; }
 
-                                        tmp_0->len = 0;
+   <bytes_hex>{hbyte}([ ]*{hbyte})*[ ]* {
+                                            bool even;
+                                            size_t i;
+                                            bin_t byte;
+                                            bin_t value;
 
-                                        tmp_0->len = 0;
+                                            tmp_0->len = 0;
 
-                                        even = true;
+                                            even = true;
 
-                                        for (i = 0; i < yyleng; i++)
-                                        {
-                                            byte = yytext[i];
+                                            for (i = 0; i < yyleng; i++)
                                             {
-                                                case ' ':
-                                                    continue;
-                                                    break;
+                                                byte = yytext[i];
 
-                                                case '0' ... '9':
-                                                    value = (byte - '0');
-                                                    break;
+                                                switch (byte)
+                                                {
+                                                    case ' ':
+                                                        continue; /* spaces only separate digits; skip without touching 'even' */
+                                                        break;
 
-                                                case 'A' ... 'F':
-                                                    value = 0xa + (byte - 'A');
-                                                    break;
+                                                    case '0' ... '9':
+                                                        value = (byte - '0');
+                                                        break;
 
-                                                case 'a' ... 'f':
-                                                    value = 0xa + (byte - 'a');
-                                                    break;
+                                                    case 'A' ... 'F':
+                                                        value = 0xa + (byte - 'A');
+                                                        break;
 
-                                            }
+                                                    case 'a' ... 'f':
+                                                        value = 0xa + (byte - 'a');
+                                                        break;
 
-                                            if (even)
-                                                tmp_0->data[tmp_0->len] = (value << 4);
-                                            else
-                                                tmp_0->data[tmp_0->len++] |= value;
+                                                } /* NOTE(review): no default case; presumably {hbyte} (defined elsewhere) only admits hex digits */
 
-                                            even = !even;
+                                                if (even)
+                                                    tmp_0->data[tmp_0->len] = (value << 4); /* first digit: high nibble, length not advanced yet */
+                                                else
+                                                    tmp_0->data[tmp_0->len++] |= value; /* second digit: low nibble completes the byte */
 
-                                        }
+                                                even = !even;
+
+                                            }
 
-                                        assert(even);
+                                            assert(even); /* every byte must have received both of its digits */
 
 #ifndef NDEBUG
-                                        /* Pour rendre plus lisibles les impressions de débogage */
-                                        tmp_0->data[tmp_0->len] = '\0';
+                                            /* Makes the debug printouts easier to read */
+                                            tmp_0->data[tmp_0->len] = '\0';
 #endif
 
-                                        yylval->tmp_cstring = tmp_0;
-                                        return HEX_BYTES;
-
-                                    }
+                                            yylval->tmp_cstring = tmp_0;
+                                            return HEX_BYTES;
 
-   <bytes_hex>[\?]{2}([ ]*[\?]{2})* {
-                                        unsigned long long counter;
-                                        size_t i;
+                                        }
 
-                                        counter = 0;
+   <bytes_hex>[\?]{2}([ ]*[\?]{2})*[ ]* {
+                                            unsigned long long counter;
+                                            size_t i;
 
-                                        for (i = 0; i < yyleng; i++)
-                                            if (yytext[i] == '?')
-                                                counter++;
+                                            counter = 0;
 
-                                        assert(counter % 2 == 0);
+                                            for (i = 0; i < yyleng; i++) /* count the '?' characters; spaces are ignored */
+                                                if (yytext[i] == '?')
+                                                    counter++;
 
-                                        yylval->unsigned_integer = counter / 2;
-                                        return FULL_MASK;
+                                            assert(counter % 2 == 0);
 
-                                    }
+                                            yylval->unsigned_integer = counter / 2; /* two '?' per fully masked byte */
+                                            return FULL_MASK;
 
-   <bytes_hex>{mbyte}([ ]*{mbyte})* {
-                                        bool even;
-                                        size_t i;
-                                        bin_t byte;
-                                        bin_t value;
+                                        }
 
-                                        tmp_0->len = 0;
-                                        tmp_1->len = 0;
+   <bytes_hex>{mbyte}([ ]*{mbyte})*[ ]* {
+                                            bool even;
+                                            size_t i;
+                                            bin_t byte;
+                                            bin_t value;
 
-                                        even = true;
+                                            tmp_0->len = 0;
+                                            tmp_1->len = 0;
 
-                                        for (i = 0; i < yyleng; i++)
-                                        {
-                                            byte = yytext[i];
+                                            even = true;
 
-                                            switch (byte)
+                                            for (i = 0; i < yyleng; i++)
                                             {
-                                                case ' ':
-                                                    continue;
-                                                    break;
-
-                                                case '?':
-                                                    even = !even;
-                                                    continue;
-                                                    break;
-
-                                                case '0' ... '9':
-                                                    value = (byte - '0');
-                                                    break;
-
-                                                case 'A' ... 'F':
-                                                    value = 0xa + (byte - 'A');
-                                                    break;
-
-                                                case 'a' ... 'f':
-                                                    value = 0xa + (byte - 'a');
-                                                    break;
+                                                byte = yytext[i];
+
+                                                switch (byte)
+                                                {
+                                                    case ' ':
+                                                        continue; /* spaces only separate bytes; skip without touching 'even' */
+                                                        break;
+
+                                                    case '?':
+                                                        even = !even; /* a masked nibble uses its position but stores no value or mask */
+                                                        continue;
+                                                        break;
+
+                                                    case '0' ... '9':
+                                                        value = (byte - '0');
+                                                        break;
+
+                                                    case 'A' ... 'F':
+                                                        value = 0xa + (byte - 'A');
+                                                        break;
+
+                                                    case 'a' ... 'f':
+                                                        value = 0xa + (byte - 'a');
+                                                        break;
+
+                                                }
+
+                                                if (even)
+                                                {
+                                                    tmp_0->data[tmp_0->len++] = (value << 4);
+                                                    tmp_1->data[tmp_1->len++] = 0xf0; /* only the high nibble is significant */
+                                                }
+                                                else
+                                                {
+                                                    tmp_0->data[tmp_0->len++] = value;
+                                                    tmp_1->data[tmp_1->len++] = 0x0f; /* only the low nibble is significant */
+                                                }
+
+                                                even = !even;
 
                                             }
 
-                                            if (even)
-                                            {
-                                                tmp_0->data[tmp_0->len++] = (value << 4);
-                                                tmp_1->data[tmp_1->len++] = 0xf0;
-                                            }
-                                            else
-                                            {
-                                                tmp_0->data[tmp_0->len++] = value;
-                                                tmp_1->data[tmp_1->len++] = 0x0f;
-                                            }
-
-                                            even = !even;
-
-                                        }
-
 #ifndef NDEBUG
-                                        /* Pour rendre plus lisibles les impressions de débogage */
-                                        tmp_0->data[tmp_0->len] = '\0';
-                                        tmp_1->data[tmp_1->len] = '\0';
+                                            /* Makes the debug printouts easier to read */
+                                            tmp_0->data[tmp_0->len] = '\0';
+                                            tmp_1->data[tmp_1->len] = '\0';
 #endif
 
-                                        yylval->masked.tmp_values = tmp_0;
-                                        yylval->masked.tmp_masks = tmp_1;
-                                        return SEMI_MASK;
+                                            yylval->masked.tmp_values = tmp_0;
+                                            yylval->masked.tmp_masks = tmp_1;
+                                            return SEMI_MASK;
 
-                                    }
+                                        }
 
 
 %{ /* Définition d'expressions régulières */ %}
@@ -1185,17 +1185,14 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
 <bytes>\"{str_not_escaped}+ { HANDLE_UNCOMPLETED_TOKEN; }
 
 
-<bytes_hex>{hbyte}([ ]*{hbyte})*[ ]* { HANDLE_UNCOMPLETED_TOKEN; }
-<bytes_hex>{hbyte}([ ]*{hbyte})*[ ]*[0-9a-fA-F] { HANDLE_UNCOMPLETED_TOKEN; }
+<bytes_hex>{hbyte}([ ]*{hbyte})*[ ]*[0-9a-fA-F]/[^?] { /* lone trailing hex digit whose next character is not '?' */ HANDLE_UNCOMPLETED_TOKEN; }
 
 
-<bytes_hex>[\?]{2}([ ]*[\?]{2})*[ ]* { HANDLE_UNCOMPLETED_TOKEN; }
-<bytes_hex>[\?]{2}([ ]*[\?]{2})*[ ]*[\?] { HANDLE_UNCOMPLETED_TOKEN; }
+<bytes_hex>[\?]{2}([ ]*[\?]{2})*[ ]*[\?]/[^0-9a-fA-F] { /* lone trailing '?' whose next character is not a hex digit */ HANDLE_UNCOMPLETED_TOKEN; }
 
 
-<bytes_hex>{mbyte}([ ]*{mbyte})*[ ]* { HANDLE_UNCOMPLETED_TOKEN; }
-<bytes_hex>{mbyte}([ ]*{mbyte})*[ ]*\? { HANDLE_UNCOMPLETED_TOKEN; }
-<bytes_hex>{mbyte}([ ]*{mbyte})*[ ]*[0-9a-fA-F] { HANDLE_UNCOMPLETED_TOKEN; }
+<bytes_hex>{mbyte}([ ]*{mbyte})*[ ]*\?/[^?] { /* lone trailing '?' whose next character is not another '?' */ HANDLE_UNCOMPLETED_TOKEN; }
+<bytes_hex>{mbyte}([ ]*{mbyte})*[ ]*[0-9a-fA-F]/[^0-9a-fA-F] { /* lone trailing hex digit whose next character is not a hex digit */ HANDLE_UNCOMPLETED_TOKEN; }
 
 
 <bytes_regex>\\ { HANDLE_UNCOMPLETED_TOKEN; }
diff --git a/tests/analysis/scan/grammar.py b/tests/analysis/scan/grammar.py
index 3a8196a..14f67fa 100644
--- a/tests/analysis/scan/grammar.py
+++ b/tests/analysis/scan/grammar.py
@@ -276,6 +276,208 @@ rule test {
         self.check_rule_success(rule, cnt)
 
 
+    def testBackingUpHandlers(self):
+        """Ensure the handlers added to remove lexer backing up do not restrict the grammar."""
+
+        cnt = MemoryContent(b'AB12')  # bytes 0x41 0x42 0x31 0x32
+
+        # Uncompleted token in rule definition: '?? ?? '
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = { ?? ?? }
+
+   condition:
+      #a == 3
+
+}
+'''
+
+        self.check_rule_success(rule, content=cnt)
+
+        # Uncompleted token in rule definition: '?? '
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = { ?? 4? }
+
+   condition:
+      #a == 1
+
+}
+'''
+
+        self.check_rule_success(rule, content=cnt)
+
+        # Uncompleted token in rule definition: '?? ?'
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = { ?? ?2 }
+
+   condition:
+      #a == 2
+
+}
+'''
+
+        self.check_rule_success(rule, content=cnt)
+
+        # Uncompleted token in rule definition: '?? '
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = { ?? 42 }
+
+   condition:
+      #a == 1
+
+}
+'''
+
+        self.check_rule_success(rule, content=cnt)
+
+
+        # Uncompleted token in rule definition: '?1 ?'
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = { ?1 ?? }
+
+   condition:
+      #a == 2
+
+}
+'''
+
+        self.check_rule_success(rule, content=cnt)
+
+        # Uncompleted token in rule definition: '?1 4? '
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = { ?1 4? }
+
+   condition:
+      #a == 1
+
+}
+'''
+
+        self.check_rule_success(rule, content=cnt)
+
+        # Uncompleted token in rule definition: '?1 ?2 '
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = { ?1 ?2 }
+
+   condition:
+      #a == 2
+
+}
+'''
+
+        self.check_rule_success(rule, content=cnt)
+
+        # Uncompleted token in rule definition: '?1 4'
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = { ?1 42 }
+
+   condition:
+      #a == 1
+
+}
+'''
+
+        self.check_rule_success(rule, content=cnt)
+
+
+        # Uncompleted token in rule definition: '41 '
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = { 41 ?? }
+
+   condition:
+      #a == 1
+
+}
+'''
+
+        self.check_rule_success(rule, content=cnt)
+
+        # Uncompleted token in rule definition: '41 4'
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = { 41 4? }
+
+   condition:
+      #a == 1
+
+}
+'''
+
+        self.check_rule_success(rule, content=cnt)
+
+        # Uncompleted token in rule definition: '41 '
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = { 41 ?2 }
+
+   condition:
+      #a == 1
+
+}
+'''
+
+        self.check_rule_success(rule, content=cnt)
+
+        # Uncompleted token in rule definition: '41 42 '
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = { 41 42 }
+
+   condition:
+      #a == 1
+
+}
+'''
+
+        self.check_rule_success(rule, content=cnt)
+
+
+
+
 # TODO : test     <haystack> matches <regex>
 
 
-- 
cgit v0.11.2-87-g4458