From ea9be67a9ddec2b4b96b3114bab5f192e53c7911 Mon Sep 17 00:00:00 2001
From: Cyrille Bagard <nocbos@gmail.com>
Date: Sat, 24 Feb 2024 18:19:31 +0100
Subject: Rely on the ACISM tree to detect identical patterns.

---
 src/analysis/scan/patterns/backends/acism-int.h |  20 ++-
 src/analysis/scan/patterns/backends/acism.c     | 189 ++++++++++++++----------
 tests/analysis/scan/matches.py                  |  39 ++++-
 3 files changed, 169 insertions(+), 79 deletions(-)

diff --git a/src/analysis/scan/patterns/backends/acism-int.h b/src/analysis/scan/patterns/backends/acism-int.h
index af8080f..c4a72ca 100644
--- a/src/analysis/scan/patterns/backends/acism-int.h
+++ b/src/analysis/scan/patterns/backends/acism-int.h
@@ -47,10 +47,28 @@
 /* Définition d'une portion de cible */
 typedef struct _acism_source_t
 {
+    /**
+     * Champs renseignés dans g_acism_backend_setup_for().
+     */
+
     const uint8_t *atoms;                   /* Motif remarquable           */
     size_t len;                             /* Nombre d'octets considérés  */
 
-    uint32_t coverage[2];                   /* Départ et quantité de suivis*/
+    /**
+     * Champs renseignés dans g_acism_backend_build_trie().
+     */
+
+    bool is_first;                          /* Première instance rencontrée*/
+
+    union
+    {
+        uint32_t coverage[2];               /* Départ et quantité de suivis*/
+        struct
+        {
+            size_t first_source;            /* Indice de première source   */
+            uint32_t index;                 /* Position dans la liste      */
+        };
+    };
 
 } acism_source_t;
 
diff --git a/src/analysis/scan/patterns/backends/acism.c b/src/analysis/scan/patterns/backends/acism.c
index aa462a7..3f742fd 100644
--- a/src/analysis/scan/patterns/backends/acism.c
+++ b/src/analysis/scan/patterns/backends/acism.c
@@ -314,64 +314,43 @@ size_t g_acism_backend_get_atom_max_size(const GAcismBackend *backend)
 
 static void g_acism_backend_setup_for(GAcismBackend *backend, const uint8_t *pattern, size_t len, uint32_t tmp_id[2])
 {
-    size_t i;                               /* Boucle de parcours          */
-    int ret;                                /* Bilan d'une comparaison     */
+    size_t current;                         /* Indice de source courante   */
     acism_source_t *source;                 /* Définition à mémoriser      */
 
-    /*Recherche d'un motif déjà sollicité */
-
     /**
-     * '\x00\x00\x00\x00abcd1234' '\x01\x01\x01\x01abcd1234' peuvent en effet
-     * constituer deux cibles différentes, mais elles comportent normalement
-     * la même séquence atomique à rechercher : 'abcd1234'.
+     * Les motifs '\x00\x00\x00\x00abcd1234' et '\x01\x01\x01\x01abcd1234'
+     * peuvent constituer deux cibles différentes, mais ils comportent
+     * normalement la même séquence atomique à rechercher : 'abcd1234'.
+     *
+     * Chaque motif 'abcd1234' proposé est enregistré ici, et se verra in fine
+     * attribuer un identifiant propre. Le regroupement des motifs identiques
+     * et la constitution des identifiants finaux sont réalisés au moment de la
+     * construction de l'arborescence de recherche.
      */
 
-    for (i = 0; i < backend->sources_count; i++)
-    {
-        source = backend->sources + i;
+    /* Pré-inscription pour l'extérieur */
 
-        if (source->len != len)
-            continue;
+    current = backend->sources_count;
 
-        ret = memcmp(source->atoms, pattern, len);
+    tmp_id[0] = current;
+    tmp_id[1] = 0;  /* Non utilisé */
 
-        if (ret == 0)
-        {
-            tmp_id[0] = i;
-            tmp_id[1] = source->coverage[SOURCE_COVERAGE_COUNT];
+    /* Inscription en interne */
 
-            source->coverage[SOURCE_COVERAGE_COUNT]++;
-            break;
+    backend->sources = realloc(backend->sources, ++backend->sources_count * sizeof(acism_source_t));
 
-        }
+    source = &backend->sources[current];
 
-    }
+    source->atoms = pattern;
+    source->len = len;
 
-    /* Introduction d'un nouveau motif au besoin */
-
-    if (i == backend->sources_count)
-    {
-        backend->sources = realloc(backend->sources, ++backend->sources_count * sizeof(acism_source_t));
-
-        source = &backend->sources[backend->sources_count - 1];
-
-        source->atoms = pattern;
-        source->len = len;
-
-        source->coverage[SOURCE_COVERAGE_COUNT] = 1;
-
-        tmp_id[0] = i;
-        tmp_id[1] = 0;
-
-        backend->nchars += len;
+    backend->nchars += len;
 
 #ifdef __USE_BYTE_FREQ
-        for (i = 0; i < len; i++)
-            backend->frequencies[pattern[i]].frequency++;
+    for (i = 0; i < len; i++)
+        backend->frequencies[pattern[i]].frequency++;
 #endif
 
-    }
-
 }
 
 
@@ -516,13 +495,13 @@ static void g_acism_backend_define_codes(GAcismBackend *backend)
 static void g_acism_backend_build_trie(GAcismBackend *backend)
 {
     size_t i;                               /* Boucle de parcours #1       */
-    uint32_t current_start;                 /* Indice de gestionnaire      */
     acism_trie_node_t *next;                /* Prochain noeud disponible   */
     acism_trie_node_t *node;                /* Tête de parcours            */
     acism_source_t *source;                 /* Définition à mémoriser      */
     size_t k;                               /* Boucle de parcours #2       */
     acism_code_t code;                      /* Identifiant de symbole      */
     acism_trie_node_t *parent;              /* Sauvegarde d'un accès       */
+    uint32_t current_start;                 /* Indice de gestionnaire      */
 
     backend->nodes = calloc(backend->nchars + 1, sizeof(acism_trie_node_t));
 
@@ -532,8 +511,6 @@ static void g_acism_backend_build_trie(GAcismBackend *backend)
         backend->nodes[i].max_child_code = MIN_ACISM_CODE;
     }
 
-    current_start = 0;
-
     next = backend->nodes + 1;
 
     for (i = 0; i < backend->sources_count; i++)
@@ -542,13 +519,6 @@ static void g_acism_backend_build_trie(GAcismBackend *backend)
 
         source = &backend->sources[i];
 
-        /* Mise à jour de la couverture des gestionnaires de suivi */
-
-        source->coverage[SOURCE_COVERAGE_START] = current_start;
-        source->coverage[SOURCE_COVERAGE_END] += current_start;
-
-        current_start = source->coverage[SOURCE_COVERAGE_END];
-
         /* Parcours des noeuds contenus */
 
         for (k = 0; k < source->len && node->child != NULL; k++)
@@ -612,36 +582,91 @@ static void g_acism_backend_build_trie(GAcismBackend *backend)
 
         }
 
-        /* Creéation d'une nouvelle branche avec le reliquat */
-        for (; k < source->len; k++)
+        /* Si un atome (partiellement ?) identique a déjà été inscrit */
+        if (k == source->len)
+        {
+            /**
+             * L'atome déjà inscrit est plus long, et on se trouve ici dans le
+             * parcours de l'arborescence qui mène à sa conclusion, sans
+             * inscription d'une fin de parcours au point courant.
+             */
+            if (node->matched_atom == 0)
+                goto register_leaf;
+
+            /**
+             * Rattachement au motif strictement identique.
+             */
+            else
+            {
+                assert(backend->sources[node->matched_atom - 1].first);
+
+                source->is_first = false;
+                source->first_source = node->matched_atom - 1;
+                source->index = backend->sources[source->first_source].coverage[SOURCE_COVERAGE_COUNT]++;
+
+            }
+
+        }
+
+        else
         {
+            /* Creéation d'une nouvelle branche avec le reliquat */
+            for (; k < source->len; k++)
+            {
 #ifdef __USE_BYTE_FREQ
-            code = backend->codes_for_bytes[source->atoms[k]];
+                code = backend->codes_for_bytes[source->atoms[k]];
 #else
-            code = 1 + source->atoms[k];
+                code = 1 + source->atoms[k];
 #endif
 
-            next->parent = node;
-            next->suffix_link = node;
-            next->data = source->atoms[k];
-            next->code = code;
+                next->parent = node;
+                next->suffix_link = node;
+                next->data = source->atoms[k];
+                next->code = code;
+
+                node->child = next++;
 
-            node->child = next++;
+                if (code < node->min_child_code) node->min_child_code = code;
+                if (code > node->max_child_code) node->max_child_code = code;
+                node->children_count++;
 
-            if (code < node->min_child_code) node->min_child_code = code;
-            if (code > node->max_child_code) node->max_child_code = code;
-            node->children_count++;
+                node = node->child;
 
-            node = node->child;
+            }
 
-        }
+ register_leaf:
 
-        node->matched_atom = i + 1;
+            node->matched_atom = i + 1;
+
+            source->is_first = true;
+            source->coverage[SOURCE_COVERAGE_COUNT] = 1;
+
+        }
 
     }
 
     backend->nodes_used = next - backend->nodes;
 
+    /* Construction des bases pour identifiants */
+
+    current_start = 0;
+
+    for (i = 0; i < backend->sources_count; i++)
+    {
+        source = &backend->sources[i];
+
+        if (source->is_first)
+        {
+            source->coverage[SOURCE_COVERAGE_START] = current_start;
+
+            current_start += source->coverage[SOURCE_COVERAGE_COUNT];
+
+        }
+
+    }
+
+    assert(current_start == backend->source_count);
+
 }
 
 
@@ -776,9 +801,6 @@ static int compare_node_according_to_code_range(const acism_trie_node_t **a, con
         if (result == 0)
             result = sort_unsigned_long(_b->children_count, _a->children_count);
 
-
-
-
     }
 
     return result;
@@ -1118,10 +1140,12 @@ static void g_acism_backend_build_interleave_array(GAcismBackend *backend)
             base[0].match = 1;
             base[0].single_source = source->coverage[SOURCE_COVERAGE_COUNT] == 1 ? 1 : 0;
             base[0].atom_size = backend->sources[node->matched_atom - 1].len - 1;
+
             coverage = &backend->coverages[node->state_index * 2];
 
             coverage[SOURCE_COVERAGE_START] = source->coverage[SOURCE_COVERAGE_START];
-            coverage[SOURCE_COVERAGE_COUNT] = source->coverage[SOURCE_COVERAGE_COUNT];
+            coverage[SOURCE_COVERAGE_END] = coverage[SOURCE_COVERAGE_START] \
+                + source->coverage[SOURCE_COVERAGE_COUNT];
 
             for (iter = node->parent->suffix_link; iter != NULL; iter = iter->suffix_link)
             {
@@ -1232,14 +1256,28 @@ static patid_t g_acism_backend_build_plain_pattern_id(const GAcismBackend *backe
 {
     patid_t result;                         /* Identifiant à retourner     */
     acism_source_t *source;                 /* Motif d'origine concerné    */
+    acism_source_t *reference;              /* Renvoi vers un même motif   */
 
     assert(tmp_id[0] < backend->sources_count);
 
+    /**
+     * L'indicateur tmp_id[1] n'est pas utilisé ici.
+     */
+
     source = backend->sources + tmp_id[0];
 
-    result = source->coverage[SOURCE_COVERAGE_START] + tmp_id[1];
+    if (source->is_first)
+        result = source->coverage[SOURCE_COVERAGE_START];
+
+    else
+    {
+        reference = backend->sources + source->first_source;
+
+        assert(source->index < reference->coverage[SOURCE_COVERAGE_COUNT]);
 
-    assert(result < source->coverage[SOURCE_COVERAGE_END]);
+        result = reference->coverage[SOURCE_COVERAGE_START] + source->index;
+
+    }
 
     return result;
 
@@ -1262,10 +1300,7 @@ static size_t g_acism_backend_count_plain_pattern_ids(const GAcismBackend *backe
 {
     size_t result;                          /* Quantité à retourner        */
 
-    if (backend->sources_count == 0)
-        result = 0;
-    else
-        result = backend->sources[backend->sources_count - 1].coverage[SOURCE_COVERAGE_END];
+    result = backend->sources_count;
 
     return result;
 
diff --git a/tests/analysis/scan/matches.py b/tests/analysis/scan/matches.py
index 768531b..efcae4f 100644
--- a/tests/analysis/scan/matches.py
+++ b/tests/analysis/scan/matches.py
@@ -7,7 +7,7 @@ class TestRostMatchs(RostTestClass):
     """TestCases for the ROST pattern matching engine."""
 
     def testCountMatches(self):
-        """Count matches patterns."""
+        """Count matched patterns."""
 
         cnt = MemoryContent(b'aaa aaa bbb aaa')
 
@@ -25,3 +25,40 @@ rule test {
 '''
 
         self.check_rule_success(rule, cnt)
+
+
+    def testCountSameMatches(self):
+        """Count matches of similar patterns."""
+
+        cnt = MemoryContent(b'ABCDabcdABCDabcd')
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = "\x61\x62\x63\x64"
+      $b = "\x61\x62\x63\x64"
+
+   condition:
+      #a == 2 and #b == 2
+
+}
+'''
+
+        self.check_rule_success(rule, cnt)
+
+
+        rule = '''
+rule test {
+
+   bytes:
+      $a = "\x61\x62\x63\x64"
+      $b = "\x61\x62\x63"
+
+   condition:
+      #a == 2 and #b == 2
+
+}
+'''
+
+        self.check_rule_success(rule, cnt)
-- 
cgit v0.11.2-87-g4458