From ea9be67a9ddec2b4b96b3114bab5f192e53c7911 Mon Sep 17 00:00:00 2001 From: Cyrille Bagard Date: Sat, 24 Feb 2024 18:19:31 +0100 Subject: Rely on the ACISM tree to detect identical patterns. --- src/analysis/scan/patterns/backends/acism-int.h | 20 ++- src/analysis/scan/patterns/backends/acism.c | 189 ++++++++++++++---------- tests/analysis/scan/matches.py | 39 ++++- 3 files changed, 169 insertions(+), 79 deletions(-) diff --git a/src/analysis/scan/patterns/backends/acism-int.h b/src/analysis/scan/patterns/backends/acism-int.h index af8080f..c4a72ca 100644 --- a/src/analysis/scan/patterns/backends/acism-int.h +++ b/src/analysis/scan/patterns/backends/acism-int.h @@ -47,10 +47,28 @@ /* Définition d'une portion de cible */ typedef struct _acism_source_t { + /** + * Champs renseignés dans g_acism_backend_setup_for(). + */ + const uint8_t *atoms; /* Motif remarquable */ size_t len; /* Nombre d'octets considérés */ - uint32_t coverage[2]; /* Départ et quantité de suivis*/ + /** + * Champs renseignés dans g_acism_backend_build_trie(). + */ + + bool is_first; /* Première instance rencontrée*/ + + union + { + uint32_t coverage[2]; /* Départ et quantité de suivis*/ + struct + { + size_t first_source; /* Indice de première source */ + uint32_t index; /* Position dans la liste */ + }; + }; } acism_source_t; diff --git a/src/analysis/scan/patterns/backends/acism.c b/src/analysis/scan/patterns/backends/acism.c index aa462a7..3f742fd 100644 --- a/src/analysis/scan/patterns/backends/acism.c +++ b/src/analysis/scan/patterns/backends/acism.c @@ -314,64 +314,43 @@ size_t g_acism_backend_get_atom_max_size(const GAcismBackend *backend) static void g_acism_backend_setup_for(GAcismBackend *backend, const uint8_t *pattern, size_t len, uint32_t tmp_id[2]) { - size_t i; /* Boucle de parcours */ - int ret; /* Bilan d'une comparaison */ + size_t current; /* Indice de source courante */ acism_source_t *source; /* Définition à mémoriser */ - /*Recherche d'un motif déjà sollicité */ - /** - * '\x00\x00\x00\x00abcd1234' '\x01\x01\x01\x01abcd1234' peuvent en effet - * constituer deux cibles différentes, mais elles comportent normalement - * la même séquence atomique à rechercher : 'abcd1234'. + * Les motifs '\x00\x00\x00\x00abcd1234' et '\x01\x01\x01\x01abcd1234' + * peuvent constituer deux cibles différentes, mais ils comportent + * normalement la même séquence atomique à rechercher : 'abcd1234'. + * + * Chaque motif 'abcd1234' proposé est enregistré ici, et se verra in fine + * attribuer un identifiant propre. Le regroupement des motifs identiques + * et la constitution des identifiants finaux sont réalisés au moment de la + * construction de l'arborescence de recherche. */ - for (i = 0; i < backend->sources_count; i++) - { - source = backend->sources + i; + /* Pré-inscription pour l'extérieur */ - if (source->len != len) - continue; + current = backend->sources_count; - ret = memcmp(source->atoms, pattern, len); + tmp_id[0] = current; + tmp_id[1] = 0; /* Non utilisé */ - if (ret == 0) - { - tmp_id[0] = i; - tmp_id[1] = source->coverage[SOURCE_COVERAGE_COUNT]; + /* Inscription en interne */ - source->coverage[SOURCE_COVERAGE_COUNT]++; - break; + backend->sources = realloc(backend->sources, ++backend->sources_count * sizeof(acism_source_t)); - } + source = &backend->sources[current]; - } + source->atoms = pattern; + source->len = len; - /* Introduction d'un nouveau motif au besoin */ - - if (i == backend->sources_count) - { - backend->sources = realloc(backend->sources, ++backend->sources_count * sizeof(acism_source_t)); - - source = &backend->sources[backend->sources_count - 1]; - - source->atoms = pattern; - source->len = len; - - source->coverage[SOURCE_COVERAGE_COUNT] = 1; - - tmp_id[0] = i; - tmp_id[1] = 0; - - backend->nchars += len; + backend->nchars += len; #ifdef __USE_BYTE_FREQ - for (i = 0; i < len; i++) - backend->frequencies[pattern[i]].frequency++; + for (i = 0; i < len; i++) + backend->frequencies[pattern[i]].frequency++; #endif - } - } @@ -516,13 +495,13 @@ static void g_acism_backend_define_codes(GAcismBackend *backend) static void g_acism_backend_build_trie(GAcismBackend *backend) { size_t i; /* Boucle de parcours #1 */ - uint32_t current_start; /* Indice de gestionnaire */ acism_trie_node_t *next; /* Prochain noeud disponible */ acism_trie_node_t *node; /* Tête de parcours */ acism_source_t *source; /* Définition à mémoriser */ size_t k; /* Boucle de parcours #2 */ acism_code_t code; /* Identifiant de symbole */ acism_trie_node_t *parent; /* Sauvegarde d'un accès */ + uint32_t current_start; /* Indice de gestionnaire */ backend->nodes = calloc(backend->nchars + 1, sizeof(acism_trie_node_t)); @@ -532,8 +511,6 @@ static void g_acism_backend_build_trie(GAcismBackend *backend) backend->nodes[i].max_child_code = MIN_ACISM_CODE; } - current_start = 0; - next = backend->nodes + 1; for (i = 0; i < backend->sources_count; i++) @@ -542,13 +519,6 @@ static void g_acism_backend_build_trie(GAcismBackend *backend) source = &backend->sources[i]; - /* Mise à jour de la couverture des gestionnaires de suivi */ - - source->coverage[SOURCE_COVERAGE_START] = current_start; - source->coverage[SOURCE_COVERAGE_END] += current_start; - - current_start = source->coverage[SOURCE_COVERAGE_END]; - /* Parcours des noeuds contenus */ for (k = 0; k < source->len && node->child != NULL; k++) @@ -612,36 +582,91 @@ static void g_acism_backend_build_trie(GAcismBackend *backend) } - /* Creéation d'une nouvelle branche avec le reliquat */ - for (; k < source->len; k++) + /* Si un atome (partiellement ?) identique a déjà été inscrit */ + if (k == source->len) + { + /** + * L'atome déjà inscrit est plus long, et on se trouve ici dans le + * parcours de l'arborescence qui mène à sa conclusion, sans + * inscription d'une fin de parcours au point courant. + */ + if (node->matched_atom == 0) + goto register_leaf; + + /** + * Rattachement au motif strictement identique. + */ + else + { + assert(backend->sources[node->matched_atom - 1].first); + + source->is_first = false; + source->first_source = node->matched_atom - 1; + source->index = backend->sources[source->first_source].coverage[SOURCE_COVERAGE_COUNT]++; + + } + + } + + else { + /* Creéation d'une nouvelle branche avec le reliquat */ + for (; k < source->len; k++) + { #ifdef __USE_BYTE_FREQ - code = backend->codes_for_bytes[source->atoms[k]]; + code = backend->codes_for_bytes[source->atoms[k]]; #else - code = 1 + source->atoms[k]; + code = 1 + source->atoms[k]; #endif - next->parent = node; - next->suffix_link = node; - next->data = source->atoms[k]; - next->code = code; + next->parent = node; + next->suffix_link = node; + next->data = source->atoms[k]; + next->code = code; + + node->child = next++; - node->child = next++; + if (code < node->min_child_code) node->min_child_code = code; + if (code > node->max_child_code) node->max_child_code = code; + node->children_count++; - if (code < node->min_child_code) node->min_child_code = code; - if (code > node->max_child_code) node->max_child_code = code; - node->children_count++; + node = node->child; - node = node->child; + } - } + register_leaf: - node->matched_atom = i + 1; + node->matched_atom = i + 1; + + source->is_first = true; + source->coverage[SOURCE_COVERAGE_COUNT] = 1; + + } } backend->nodes_used = next - backend->nodes; + /* Construction des bases pour identifiants */ + + current_start = 0; + + for (i = 0; i < backend->sources_count; i++) + { + source = &backend->sources[i]; + + if (source->is_first) + { + source->coverage[SOURCE_COVERAGE_START] = current_start; + + current_start += source->coverage[SOURCE_COVERAGE_COUNT]; + + } + + } + + assert(current_start == backend->source_count); + } @@ -776,9 +801,6 @@ static int compare_node_according_to_code_range(const acism_trie_node_t **a, con if (result == 0) result = sort_unsigned_long(_b->children_count, _a->children_count); - - - } return result; @@ -1118,10 +1140,12 @@ static void g_acism_backend_build_interleave_array(GAcismBackend *backend) base[0].match = 1; base[0].single_source = source->coverage[SOURCE_COVERAGE_COUNT] == 1 ? 1 : 0; base[0].atom_size = backend->sources[node->matched_atom - 1].len - 1; + coverage = &backend->coverages[node->state_index * 2]; coverage[SOURCE_COVERAGE_START] = source->coverage[SOURCE_COVERAGE_START]; - coverage[SOURCE_COVERAGE_COUNT] = source->coverage[SOURCE_COVERAGE_COUNT]; + coverage[SOURCE_COVERAGE_END] = coverage[SOURCE_COVERAGE_START] \ + + source->coverage[SOURCE_COVERAGE_COUNT]; for (iter = node->parent->suffix_link; iter != NULL; iter = iter->suffix_link) { @@ -1232,14 +1256,28 @@ static patid_t g_acism_backend_build_plain_pattern_id(const GAcismBackend *backe { patid_t result; /* Identifiant à retourner */ acism_source_t *source; /* Motif d'origine concerné */ + acism_source_t *reference; /* Renvoi vers un même motif */ assert(tmp_id[0] < backend->sources_count); + /** + * L'indicateur tmp_id[1] n'est pas utilisé ici. + */ + source = backend->sources + tmp_id[0]; - result = source->coverage[SOURCE_COVERAGE_START] + tmp_id[1]; + if (source->is_first) + result = source->coverage[SOURCE_COVERAGE_START]; + + else + { + reference = backend->sources + source->first_source; + + assert(source->index < reference->coverage[SOURCE_COVERAGE_COUNT]); - assert(result < source->coverage[SOURCE_COVERAGE_END]); + result = reference->coverage[SOURCE_COVERAGE_START] + source->index; + + } return result; @@ -1262,10 +1300,7 @@ static size_t g_acism_backend_count_plain_pattern_ids(const GAcismBackend *backe { size_t result; /* Quantité à retourner */ - if (backend->sources_count == 0) - result = 0; - else - result = backend->sources[backend->sources_count - 1].coverage[SOURCE_COVERAGE_END]; + result = backend->sources_count; return result; diff --git a/tests/analysis/scan/matches.py b/tests/analysis/scan/matches.py index 768531b..efcae4f 100644 --- a/tests/analysis/scan/matches.py +++ b/tests/analysis/scan/matches.py @@ -7,7 +7,7 @@ class TestRostMatchs(RostTestClass): """TestCases for the ROST pattern matching engine.""" def testCountMatches(self): - """Count matches patterns.""" + """Count matched patterns.""" cnt = MemoryContent(b'aaa aaa bbb aaa') @@ -25,3 +25,40 @@ rule test { ''' self.check_rule_success(rule, cnt) + + + def testCountSameMatches(self): + """Count matches of similar patterns.""" + + cnt = MemoryContent(b'ABCDabcdABCDabcd') + + rule = ''' +rule test { + + bytes: + $a = "\x61\x62\x63\x64" + $b = "\x61\x62\x63\x64" + + condition: + #a == 2 and #b == 2 + +} +''' + + self.check_rule_success(rule, cnt) + + + rule = ''' +rule test { + + bytes: + $a = "\x61\x62\x63\x64" + $b = "\x61\x62\x63" + + condition: + #a == 2 and #b == 2 + +} +''' + + self.check_rule_success(rule, cnt) -- cgit v0.11.2-87-g4458