/* Chrysalide - Outil d'analyse de fichiers binaires
* bitap.c - méthode de recherche basée sur l'algorithme Bitap
*
* Copyright (C) 2022 Cyrille Bagard
*
* This file is part of Chrysalide.
*
* Chrysalide is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Chrysalide is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Chrysalide. If not, see <http://www.gnu.org/licenses/>.
*/
#include "bitap.h"
#include
#include
#include
#include
#include "bitap-int.h"
#include "../../../../core/logs.h"
//#include "../../matches/bytes.h"
/* ----------------------- IMPLEMENTATION OF A NEW APPROACH ------------------------ */
/* Initialise the class of Bitap-based search backends. */
static void g_bitap_backend_class_init(GBitapBackendClass *);
/* Initialise an instance of a Bitap-based search backend. */
static void g_bitap_backend_init(GBitapBackend *);
/* Drop all the external references. */
static void g_bitap_backend_dispose(GBitapBackend *);
/* Release all the memory held by an instance. */
static void g_bitap_backend_finalize(GBitapBackend *);
/* ---------------------- IMPLEMENTATION OF THE CLASS FUNCTIONS --------------------- */
/* Report the maximal size of the searched byte sequences. */
size_t g_bitap_backend_get_atom_max_size(const GBitapBackend *);
/* Register a plain byte string to look for into the engine. */
static patid_t g_bitap_backend_enroll_plain_pattern(GBitapBackend *, GScanContext *, const uint8_t *, size_t);
/* Walk through a binary content, looking for patterns. */
static void g_bitap_backend_run_scan(const GBitapBackend *, GScanContext *, GBinContent *);
/* Print a few facts about the elements which have been set up. */
static void g_bitap_backend_output_stats(const GBitapBackend *);
/* --------------------- OPTIMISATIONS FOR THE AVX2 ARCHITECTURE -------------------- */
/* Append a new blank group of strings to a set of AVX2 groups. */
static void extend_grouped_strings_avx2(grouped_strings_avx2_t ***, size_t *);
/* Register a plain byte string to look for into the engine (AVX2 flavour). */
static patid_t enroll_plain_pattern_avx2(GBitapBackend *, GScanContext *, const bin_t *, size_t);
/* Walk through a binary content, looking for patterns (AVX2 flavour). */
static void run_scan_avx2(const GBitapBackend *, GScanContext *, GBinContent *);
/* ------------------- OPTIMISATIONS FOR THE AVX-512 ARCHITECTURE ------------------- */
/* Append a new blank group of strings to a set of AVX-512 groups. */
static void extend_grouped_strings_avx512(grouped_strings_avx512_t ***, size_t *);
/* Register a plain byte string to look for into the engine (AVX-512 flavour). */
static patid_t enroll_plain_pattern_avx512(GBitapBackend *, GScanContext *, const bin_t *, size_t);
/* Walk through a binary content, looking for patterns (AVX-512 flavour). */
static void run_scan_avx512(const GBitapBackend *, GScanContext *, GBinContent *);
/* ---------------------------------------------------------------------------------- */
/*                          IMPLEMENTATION OF A NEW APPROACH                          */
/* ---------------------------------------------------------------------------------- */
/* Define the GBitapBackend type, registered with G_TYPE_ENGINE_BACKEND as its parent type. */
G_DEFINE_TYPE(GBitapBackend, g_bitap_backend, G_TYPE_ENGINE_BACKEND);
/******************************************************************************
 *
 *  Parameters  : klass = class to initialise.
 *
 *  Description : Initialise the class of Bitap-based search backends.
 *
 *  Return      : -
 *
 *  Remarks     : -
 *
 ******************************************************************************/
static void g_bitap_backend_class_init(GBitapBackendClass *klass)
{
    GObjectClass *gobject_class;            /* GObject view of the class   */
    GEngineBackendClass *engine_class;      /* Parent class view           */

    /* GObject life cycle hooks */

    gobject_class = G_OBJECT_CLASS(klass);

    gobject_class->finalize = (GObjectFinalizeFunc)g_bitap_backend_finalize;
    gobject_class->dispose = (GObjectFinalizeFunc/* ! */)g_bitap_backend_dispose;

    /* Virtual methods of the engine backend */

    engine_class = G_ENGINE_BACKEND_CLASS(klass);

    engine_class->output = (output_backend_stats_fc)g_bitap_backend_output_stats;
    engine_class->run_scan = (run_backend_scan_fc)g_bitap_backend_run_scan;
    engine_class->enroll_plain = (enroll_plain_into_backend_fc)g_bitap_backend_enroll_plain_pattern;
    engine_class->get_max_size = (get_backend_atom_max_size_fc)g_bitap_backend_get_atom_max_size;

}
/******************************************************************************
 *
 *  Parameters  : backend = instance to initialise.
 *
 *  Description : Initialise an instance of a Bitap-based search backend.
 *
 *  Return      : -
 *
 *  Remarks     : -
 *
 ******************************************************************************/
static void g_bitap_backend_init(GBitapBackend *backend)
{
    /**
     * Nothing is set up explicitly here.
     *
     * NOTE(review): the enrolling code relies on the group counters starting
     * at 0; presumably the instance memory is provided zeroed by GObject —
     * to be confirmed.
     */

}
/******************************************************************************
 *
 *  Parameters  : backend = GLib object instance to process.
 *
 *  Description : Drop all the external references.
 *
 *  Return      : -
 *
 *  Remarks     : -
 *
 ******************************************************************************/
static void g_bitap_backend_dispose(GBitapBackend *backend)
{
    GObjectClass *parent;                   /* Parent class to chain up to */

    /* The backend holds no reference of its own: only chain up */

    parent = G_OBJECT_CLASS(g_bitap_backend_parent_class);

    parent->dispose(G_OBJECT(backend));

}
/******************************************************************************
 *
 *  Parameters  : backend = GLib object instance to process.
 *
 *  Description : Release all the memory held by an instance.
 *
 *  Return      : -
 *
 *  Remarks     : The groups of strings built while enrolling patterns (one
 *                aligned_alloc() per group, plus one realloc()'ed array of
 *                pointers per manager) are freed here; they were previously
 *                leaked.
 *
 ******************************************************************************/
static void g_bitap_backend_finalize(GBitapBackend *backend)
{
    size_t i;                               /* Loop index                  */

    /* Groups built for the AVX2 flavour */

    for (i = 0; i < backend->manager_avx2.count_8; i++)
        free(backend->manager_avx2.strings_8[i]);

    free(backend->manager_avx2.strings_8);

    backend->manager_avx2.strings_8 = NULL;
    backend->manager_avx2.count_8 = 0;

    /* Groups built for the AVX-512 flavour */

    for (i = 0; i < backend->manager_avx512.count_8; i++)
        free(backend->manager_avx512.strings_8[i]);

    free(backend->manager_avx512.strings_8);

    backend->manager_avx512.strings_8 = NULL;
    backend->manager_avx512.count_8 = 0;

    /* Chain up to the parent class */

    G_OBJECT_CLASS(g_bitap_backend_parent_class)->finalize(G_OBJECT(backend));

}
/******************************************************************************
 *
 *  Parameters  : -
 *
 *  Description : Create a search backend relying on the Bitap algorithm.
 *
 *  Return      : Backend which has been built.
 *
 *  Remarks     : -
 *
 ******************************************************************************/
GEngineBackend *g_bitap_backend_new(void)
{
    /* No construction property is needed */
    return G_ENGINE_BACKEND(g_object_new(G_TYPE_BITAP_BACKEND, NULL));

}
/* ---------------------------------------------------------------------------------- */
/* IMPLEMENTATION DES FONCTIONS DE CLASSE */
/* ---------------------------------------------------------------------------------- */
/******************************************************************************
 *
 *  Parameters  : backend = search engine to query.
 *
 *  Description : Report the maximal size of the searched byte sequences.
 *
 *  Return      : Value of BITAP_ATOM_SIZE.
 *
 *  Remarks     : The backend argument is not read.
 *
 ******************************************************************************/
size_t g_bitap_backend_get_atom_max_size(const GBitapBackend *backend)
{
    /* The limit is a compile-time property shared by every instance */
    return BITAP_ATOM_SIZE;

}
/******************************************************************************
 *
 *  Parameters  : backend = search engine to update.
 *                context = context of the analysis to run.
 *                plain   = plain byte string to register.
 *                len     = length of this string.
 *
 *  Description : Register a plain byte string to look for into the engine.
 *
 *  Return      : Identifier provided by the selected implementation.
 *
 *  Remarks     : The selection is hard-wired: the AVX-512 implementation is
 *                always used, the AVX2 one is never reached.
 *
 ******************************************************************************/
static patid_t g_bitap_backend_enroll_plain_pattern(GBitapBackend *backend, GScanContext *context, const uint8_t *plain, size_t len)
{
    const int use_avx2 = 0;                 /* Hard-wired implementation   */

    if (use_avx2)
        return enroll_plain_pattern_avx2(backend, context, plain, len);

    return enroll_plain_pattern_avx512(backend, context, plain, len);

}
/******************************************************************************
 *
 *  Parameters  : backend = search engine to use.
 *                context = place where the results get recorded.
 *                content = binary data to analyse.
 *
 *  Description : Walk through a binary content, looking for patterns.
 *
 *  Return      : -
 *
 *  Remarks     : The calling thread is pinned to its current CPU for the
 *                duration of the scan; the previous affinity mask is put
 *                back afterwards (it used to be left restricted for good).
 *                As before, no scan is run if the pinning cannot be set up.
 *                The implementation is hard-wired to the AVX-512 one.
 *
 ******************************************************************************/
static void g_bitap_backend_run_scan(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
    cpu_set_t old_mask;                     /* Affinity before the scan    */
    cpu_set_t new_mask;                     /* Affinity during the scan    */
    unsigned int cpu;                       /* Current processor           */
    int ret;                                /* Result of a call            */

    /* Remember the current affinity in order to restore it later */

    ret = sched_getaffinity(0, sizeof(cpu_set_t), &old_mask);

    if (ret != 0)
    {
        LOG_ERROR_N("sched_getaffinity");
        goto exit;
    }

    /* Pin the thread to the CPU it is currently running on */

    ret = getcpu(&cpu, NULL);

    if (ret != 0)
    {
        LOG_ERROR_N("getcpu");
        goto exit;
    }

    CPU_ZERO(&new_mask);
    CPU_SET(cpu, &new_mask);

    ret = sched_setaffinity(0, sizeof(cpu_set_t), &new_mask);

    if (ret != 0)
    {
        LOG_ERROR_N("sched_setaffinity");
        goto exit;
    }

    /* Run the scan itself */

    if (0)
        run_scan_avx2(backend, context, content);
    else
        run_scan_avx512(backend, context, content);

    /* Give back its original affinity to the thread */

    ret = sched_setaffinity(0, sizeof(cpu_set_t), &old_mask);

    if (ret != 0)
        LOG_ERROR_N("sched_setaffinity");

 exit:

    ;

}
/******************************************************************************
 *
 *  Parameters  : backend = search engine to query.
 *
 *  Description : Print a few facts about the elements which have been set up.
 *
 *  Return      : -
 *
 *  Remarks     : For now only a fixed placeholder line is written to the
 *                standard output; the backend argument is not read.
 *
 ******************************************************************************/
static void g_bitap_backend_output_stats(const GBitapBackend *backend)
{
    fputs("hello here!\n", stdout);

}
/* ---------------------------------------------------------------------------------- */
/* OPTIMISATIONS POUR ARCHITECTURE AVX2 */
/* ---------------------------------------------------------------------------------- */
/**
* Cf. https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX,AVX2
*/
/******************************************************************************
 *
 *  Parameters  : strings = set of groups built so far. [OUT]
 *                count   = current number of groups. [OUT]
 *
 *  Description : Append a new blank group of strings to a set of AVX2 groups.
 *
 *  Return      : -
 *
 *  Remarks     : The new group offers 32 free slots. As the function cannot
 *                report a failure and as callers use the new group right
 *                away, running out of memory is logged and aborts the
 *                process instead of dereferencing a NULL pointer.
 *
 ******************************************************************************/
static void extend_grouped_strings_avx2(grouped_strings_avx2_t ***strings, size_t *count)
{
    const size_t alignment = 256;           /* Alignment of a group        */
    size_t size;                            /* Rounded allocation size     */
    grouped_strings_avx2_t *group;          /* New blank group             */
    grouped_strings_avx2_t **extended;      /* Resized array of groups     */
    size_t i;                               /* Loop index                  */

    /**
     * C11 requires the size provided to aligned_alloc() to be a multiple of
     * the requested alignment: round the size of the structure up.
     */

    size = (sizeof(grouped_strings_avx2_t) + alignment - 1) & ~(alignment - 1);

    /* Definition of a new blank element */

    group = aligned_alloc(alignment, size);

    if (group == NULL)
    {
        LOG_ERROR_N("aligned_alloc");
        abort();
    }

    /* All bits set: no byte value belongs to any pattern yet */
    for (i = 0; i < 256; i++)
        group->pattern_masks[i] = _mm256_set1_epi8(~0);

    group->found_masks = _mm256_set1_epi8(~0);

    group->R = _mm256_set1_epi8(~1);

    for (i = 0; i < 32; i++)
    {
        group->m[i] = 0;
        group->found_id[i] = INVALID_PATTERN_ID;
    }

    group->available = 32;
    group->used = 0;

    /* Registration; the previous array remains valid until success is known */

    extended = realloc(*strings, (*count + 1) * sizeof(grouped_strings_avx2_t *));

    if (extended == NULL)
    {
        LOG_ERROR_N("realloc");
        abort();
    }

    extended[*count] = group;

    *strings = extended;
    (*count)++;

}
/******************************************************************************
 *
 *  Parameters  : backend = search engine to update.
 *                context = context of the analysis to run.
 *                plain   = plain byte string to register.
 *                plen    = length of this string.
 *
 *  Description : Register a plain byte string to look for into the engine.
 *
 *  Return      : Identifier allocated for the pattern, or
 *                INVALID_PATTERN_ID if the length cannot be handled.
 *
 *  Remarks     : Each pattern owns one 8-bit lane of the 256-bit vectors of
 *                its group: bit i of the lane tracks the i-th byte of the
 *                pattern and bit plen flags a complete match. The length
 *                thus has to lie in [1, 7]; other values used to produce a
 *                truncated (or undefined) shift and bogus matches.
 *
 ******************************************************************************/
static patid_t enroll_plain_pattern_avx2(GBitapBackend *backend, GScanContext *context, const bin_t *plain, size_t plen)
{
    patid_t result;                         /* Identifier to return        */
    grouped_strings_avx2_t ***strings;      /* Targeted set of groups      */
    size_t *count;                          /* Size of this set            */
    grouped_strings_avx2_t *last;           /* Last group, to be filled    */
    size_t n;                               /* Lane index inside the group */
    size_t i;                               /* Loop index                  */
    uint8_t *lanes;                         /* Byte view of a vector       */

    /* Bit plen has to exist inside an 8-bit lane; reject before using a slot */

    if (plen == 0 || plen >= 8)
        return INVALID_PATTERN_ID;

    /* Selection of the relevant working set */

    strings = &backend->manager_avx2.strings_8;
    count = &backend->manager_avx2.count_8;

    /* Make some room if there is no group yet or if the last one is full */

    if (*count == 0 || (*strings)[*count - 1]->used == (*strings)[*count - 1]->available)
        extend_grouped_strings_avx2(strings, count);

    last = (*strings)[*count - 1];

    /* Integration of the new string */

    n = last->used++;

    last->m[n] = plen;

    result = g_scan_context_get_new_pattern_id(context);

    last->found_id[n] = result;

    lanes = (uint8_t *)&last->found_masks;
    lanes[n] = (uint8_t)(1u << plen);

    /* Clear bit i in the mask of each byte value found at position i */

    for (i = 0; i < plen; i++)
    {
        lanes = (uint8_t *)(last->pattern_masks + plain[i]);
        lanes[n] &= (uint8_t)~(1u << i);
    }

    return result;

}
/******************************************************************************
 *
 *  Parameters  : backend = search engine to use.
 *                context = place where the results get recorded.
 *                content = binary data to analyse.
 *
 *  Description : Walk through a binary content, looking for patterns.
 *
 *  Return      : -
 *
 *  Remarks     : Shift-or (Bitap) search run on up to 32 patterns at once,
 *                one pattern per 8-bit lane. For each input byte the state
 *                is OR'ed with the mask of the byte value, shifted left by
 *                one bit (lane-wise addition with itself), then the bit
 *                flagging a complete match is tested for every lane.
 *
 *                This version replaces a hand-unrolled inline assembly
 *                experiment which loaded the same pattern mask three times,
 *                tested only the first and the final state of each run of
 *                10 bytes, bound ten outputs to one variable, never scanned
 *                the last bytes of the content and printed debug traces.
 *
 ******************************************************************************/
static void run_scan_avx2(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
    const group_manager_avx2_t *manager;    /* Simplified access           */
    phys_t dlen;                            /* Quantity of data            */
    vmpa2t pos;                             /* Targeted starting point     */
    const bin_t *data;                      /* Data to analyse             */
    __m256i zero;                           /* 256-bit constant 0          */
    size_t k;                               /* Loop index #1               */
    const grouped_strings_avx2_t *group;    /* Group being searched        */
    __m256i R;                              /* Current search state        */
    phys_t i;                               /* Loop index #2               */
    __m256i test;                           /* Match bits of the state     */
    __m256i status;                         /* Lanes with a full match     */
    uint32_t mask;                          /* One bit per matching lane   */
    size_t j;                               /* Loop index #3               */

    /* Various initialisations */

    manager = &backend->manager_avx2;

    dlen = g_binary_content_compute_size(content);

    g_binary_content_compute_start_pos(content, &pos);
    data = g_binary_content_get_raw_access(content, &pos, dlen);

    /* Nothing can be scanned without an access to the raw bytes */
    if (data == NULL)
        return;

    zero = _mm256_set1_epi8(0);

    /* Search of the strings shorter than 8 bytes */

    for (k = 0; k < manager->count_8; k++)
    {
        group = manager->strings_8[k];

        /* Bit 0 cleared in every lane: a match may begin at any position */
        R = _mm256_set1_epi8(~1);

        for (i = 0; i < dlen; i++)
        {
            R = _mm256_or_si256(R, group->pattern_masks[data[i]]);
            R = _mm256_add_epi8(R, R);

            /* A lane matches when its "found" bit has been cleared */
            test = _mm256_and_si256(R, group->found_masks);
            status = _mm256_cmpeq_epi8(test, zero);

            mask = (uint32_t)_mm256_movemask_epi8(status);

            /* Lanes beyond group->used are ignored */
            for (j = 0; mask != 0 && j < group->used; j++)
            {
                /* The pattern ends at offset i: it begins m bytes earlier */
                if (mask & 0x1)
                    g_scan_context_register_atom_match(context,
                                                       group->found_id[j],
                                                       i + 1 - group->m[j]);

                mask >>= 1;

            }

        }

    }

}
#if 0
#if 0
/******************************************************************************
 *
 *  Parameters  : backend = search engine to use.
 *                context = place where the results get recorded.
 *                content = binary data to analyse.
 *
 *  Description : Walk through a binary content, looking for patterns.
 *
 *  Return      : -
 *
 *  Remarks     : Compiled out variant: this definition sits inside the
 *                '#if 0' sections opened right above it. It processes the
 *                content by chunks of 4096 bytes, running every group over
 *                each chunk and saving the state of the group back into the
 *                manager. The handling of a match is an empty statement.
 *
 *                NOTE(review): when manager->count_8 is 0, nothing updates
 *                'i' and the outer loop never ends for a non-empty content.
 *
 ******************************************************************************/
static void run_scan_avx2(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
const group_manager_avx2_t *manager; /* Simplified access */
grouped_strings_avx2_t groups[10]; /* Copies for local accesses (unused) */
phys_t dlen; /* Quantity of data */
vmpa2t pos; /* Targeted starting point */
const bin_t *data; /* Data to analyse */
__m256i zero; /* 256-bit constant 0 */
size_t k; /* Loop index #1 */
grouped_strings_avx2_t group; /* Copy for local accesses */
__m256i R; /* Current results */
__m256i pre_shift_mask; /* Shift preparation (unused) */
phys_t i; /* Loop index #2 */
__m256i test; /* Match test */
__m256i status; /* Status of a comparison */
int mask; /* Quick access mask */
size_t j; /* Loop index #3 */
uint32_t leaves; /* Local copy of group.leaves, shifted along with mask */
int ret; /* Only set to 0 and printed at the end */
phys_t old_i; /* Start of the current chunk */
phys_t p; /* End (excluded) of the current chunk */
//return;
/* Various initialisations */
manager = &backend->manager_avx2;
dlen = g_binary_content_compute_size(content);
g_binary_content_compute_start_pos(content, &pos);
data = g_binary_content_get_raw_access(content, &pos, dlen);
zero = _mm256_set1_epi16(0);
/* Search of the strings shorter than 8 bytes */
printf(" --- manager->count_8: %zu\n", manager->count_8);
ret = 0;
//for (k = 0; k < manager->count_8; k++)
// memcpy(&groups[k], manager->strings_8[k], sizeof(grouped_strings_avx2_t));
/* 'i' is only advanced by the innermost loop below */
for (i = 0; i < dlen; )
{
//printf(" --- %llx\n", (unsigned long long)i);
/* Bounds of the current chunk: at most 4096 bytes, clamped to the end of the data */
p = i + 4096;
if (p > dlen)
p = dlen;
old_i = i;
printf("old_i: %llx\n", (unsigned long long)old_i);
for (k = 0; k < manager->count_8; k++)
{
/* Work on a local copy, resuming from the state saved in the group */
group = *manager->strings_8[k];
R = group.R;
for (i = old_i ; i < p; i++)
{
//group = &groups[k];
//printf(" k: %zu i: %llx\n", k, (unsigned long long)i);
//R = group.R;//_mm256_set1_epi8(~1);
/* State update: OR with the mask of the byte value, then lane-wise R + R */
R = _mm256_or_si256(R, group.pattern_masks[data[i]]);
R = _mm256_add_epi8(R, R);
test = _mm256_and_si256(R, group.found_masks);
#if 0
status = _mm256_cmpeq_epi8(test, zero);
mask = _mm256_movemask_epi8(status);
#else
//mask = _mm256_movemask_epi8(test) ^ 0xffffffff;
mask = _mm256_movemask_epi8(test);
#endif
/* At least one bit of the mask is cleared */
if (mask != 0xffffffff)
{
leaves = group.leaves;
for (j = 0; j < group.used; j++)
{
/* A cleared bit selects lane j */
if ((mask & 0x1) == 0)
{
if (leaves & 0x1) //group.leaves & (1u << j))
;//define_full_match_avx2(backend, context, content, &group, j, i + 1);
}
mask >>= 1;
leaves >>= 1;
}
}
/* The state is saved back into the manager after every byte */
group.R = R;//_mm256_set1_epi8(~1);
memcpy(manager->strings_8[k], &group, sizeof(grouped_strings_avx2_t));
}
}
}
printf("oh: %d\n", ret);
}
#else
/******************************************************************************
 *
 *  Parameters  : backend = search engine to use.
 *                context = place where the results get recorded.
 *                content = binary data to analyse.
 *
 *  Description : Walk through a binary content, looking for patterns.
 *
 *  Return      : -
 *
 *  Remarks     : Compiled out variant: although located in an '#else'
 *                branch, this definition remains inside the outer '#if 0'
 *                section. Each group is run over the whole content, first
 *                for the groups with 8-bit lanes, then for the groups with
 *                16-bit lanes (manager->strings_16).
 *
 ******************************************************************************/
static void run_scan_avx2(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
const group_manager_avx2_t *manager; /* Simplified access */
phys_t dlen; /* Quantity of data */
vmpa2t pos; /* Targeted starting point */
const bin_t *data; /* Data to analyse */
__m256i zero; /* 256-bit constant 0 */
size_t k; /* Loop index #1 */
grouped_strings_avx2_t group; /* Copy for local accesses */
__m256i R; /* Current results */
__m256i pre_shift_mask; /* Shift preparation (unused) */
phys_t i; /* Loop index #2 */
__m256i test; /* Match test */
__m256i status; /* Status of a comparison */
int mask; /* Quick access mask */
size_t j; /* Loop index #3 */
uint32_t leaves; /* Local copy of group.leaves, shifted along with mask */
int ret; /* Only set to 0 and printed after the first pass */
//return;
/* Various initialisations */
manager = &backend->manager_avx2;
dlen = g_binary_content_compute_size(content);
g_binary_content_compute_start_pos(content, &pos);
data = g_binary_content_get_raw_access(content, &pos, dlen);
zero = _mm256_set1_epi16(0);
/* Search of the strings shorter than 8 bytes */
printf(" --- manager->count_8: %zu\n", manager->count_8);
ret = 0;
for (k = 0; k < manager->count_8; k++)
{
/* Work on a local copy of the group */
memcpy(&group, manager->strings_8[k], sizeof(grouped_strings_avx2_t));
//printf(" --- group.used: %zu\n", group.used);
R = _mm256_set1_epi8(~1);
//pre_shift_mask = _mm256_set1_epi8(0xef);
for (i = 0; i < dlen; ++i)
{
//printf(" > %c\n", data[i]);
/* State update: OR with the mask of the byte value, then lane-wise R + R */
R = _mm256_or_si256(R, group.pattern_masks[data[i]]);
//printf("group pattern: %hhx\n", *((uint8_t *)&group.pattern_masks[data[i]]));
//printf("R: %hhx\n", *((uint8_t *)&R));
//R = _mm256_and_si256(R, pre_shift_mask);
//printf("R after and: %hhx\n", *((uint8_t *)&R));
R = _mm256_add_epi8(R, R);
//R = _mm256_slli_si256(R, 1);
//printf("R after shift: %hhx\n", *((uint8_t *)&R));
test = _mm256_and_si256(R, group.found_masks);
#if 0
status = _mm256_cmpeq_epi8(test, zero);
mask = _mm256_movemask_epi8(status);
#else
//mask = _mm256_movemask_epi8(test) ^ 0xffffffff;
mask = _mm256_movemask_epi8(test);
#endif
/* At least one bit of the mask is cleared */
if (mask != 0xffffffff)
{
leaves = group.leaves;
for (j = 0; j < group.used; j++)
{
/* A cleared bit selects lane j */
if ((mask & 0x1) == 0)
{
//assert((i + 1) >= group.m[j]);
/* Only the lanes flagged in 'leaves' are reported */
if (leaves & 0x1) //group.leaves & (1u << j))
define_full_match_avx2(backend, context, content, &group, j, i + 1);
//else
//{
// ret++;
//printf("%x\n", (unsigned int)i + 1);
//}
//else
// g_scan_context_register_sub_match(context, group.found_id[j], i + 1 - group.m[j]);
}
mask >>= 1;
leaves >>= 1;
}
}
}
}
printf("oh: %d\n", ret);
/* Search of the strings shorter than 16 bytes */
for (k = 0; k < manager->count_16; k++)
{
memcpy(&group, manager->strings_16[k], sizeof(grouped_strings_avx2_t));
R = _mm256_set1_epi16(~1);
for (i = 0; i < dlen; ++i)
{
/* Same state update as above, with 16-bit lanes */
R = _mm256_or_si256(R, group.pattern_masks[data[i]]);
R = _mm256_slli_epi16(R, 1);
test = _mm256_and_si256(R, group.found_masks);
status = _mm256_cmpeq_epi16(test, zero);
mask = _mm256_movemask_epi8(status);
if (mask != 0)
for (j = 0; j < group.used; j++)
{
/* Each 16-bit lane provides two bits to the byte-wise mask */
if (mask & 0x3)
{
assert((i + 1) >= group.m[j]);
if (group.leaves & (1llu << j))
define_full_match_avx2(backend, context, content, &group, j, i + 1);
else
;//g_scan_context_register_sub_match(context, group.found_id[j], i + 1 - group.m[j]);
}
mask >>= 2;
}
}
}
}
#endif
#endif
/* ---------------------------------------------------------------------------------- */
/* OPTIMISATIONS POUR ARCHITECTURE AVX512 */
/* ---------------------------------------------------------------------------------- */
/**
* Cf. https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX_512
* - https://agner.org/optimize/
* - https://uops.info/table.html
*/
/******************************************************************************
 *
 *  Parameters  : strings = set of groups built so far. [OUT]
 *                count   = current number of groups. [OUT]
 *
 *  Description : Append a new blank group of strings to a set of AVX-512
 *                groups.
 *
 *  Return      : -
 *
 *  Remarks     : The new group offers 64 free slots. As the function cannot
 *                report a failure and as callers use the new group right
 *                away, running out of memory is logged and aborts the
 *                process instead of dereferencing a NULL pointer.
 *
 ******************************************************************************/
static void extend_grouped_strings_avx512(grouped_strings_avx512_t ***strings, size_t *count)
{
    const size_t alignment = 0x1000;        /* Alignment of a group        */
    size_t size;                            /* Rounded allocation size     */
    grouped_strings_avx512_t *group;        /* New blank group             */
    grouped_strings_avx512_t **extended;    /* Resized array of groups     */
    size_t i;                               /* Loop index                  */

    /**
     * C11 requires the size provided to aligned_alloc() to be a multiple of
     * the requested alignment: round the size of the structure up.
     */

    size = (sizeof(grouped_strings_avx512_t) + alignment - 1) & ~(alignment - 1);

    /* Definition of a new blank element */

    group = aligned_alloc(alignment, size);

    if (group == NULL)
    {
        LOG_ERROR_N("aligned_alloc");
        abort();
    }

    /* All bits set: no byte value belongs to any pattern yet */
    for (i = 0; i < 256; i++)
        group->pattern_masks[i] = _mm512_set1_epi8(~0);

    group->found_masks = _mm512_set1_epi8(~0);

    group->R = _mm512_set1_epi8(~1);

    for (i = 0; i < 64; i++)
    {
        group->m[i] = 0;
        group->found_id[i] = INVALID_PATTERN_ID;
    }

    group->available = 64;
    group->used = 0;

    /* Registration; the previous array remains valid until success is known */

    extended = realloc(*strings, (*count + 1) * sizeof(grouped_strings_avx512_t *));

    if (extended == NULL)
    {
        LOG_ERROR_N("realloc");
        abort();
    }

    extended[*count] = group;

    *strings = extended;
    (*count)++;

}
/******************************************************************************
*                                                                             *
*  Parameters  : backend = search engine to update.                           *
*                context = context of the analysis to run.                    *
*                plain   = plain byte string to register.                     *
*                plen    = length of this string.                             *
*                                                                             *
*  Description : Registers a plain string to look for in the AVX512 engine.   *
*                                                                             *
*  Return      : Identifier of the pattern, or INVALID_PATTERN_ID if the      *
*                length can not be handled by an 8-bit lane.                  *
*                                                                             *
*  Remarks     : Each string owns one byte of every 512-bit mask; its         *
*                completion bit is bit #plen, hence 1 <= plen <= 7.           *
*                                                                             *
******************************************************************************/

static patid_t enroll_plain_pattern_avx512(GBitapBackend *backend, GScanContext *context, const bin_t *plain, size_t plen)
{
    enum { MAX_LANE_PATTERN_LEN = 7 };      /* Highest usable bit index    */

    patid_t result;                         /* Identifier to return        */
    grouped_strings_avx512_t ***strings;    /* Targeted array of groups    */
    size_t *count;                          /* Size of this array          */
    grouped_strings_avx512_t *last;         /* Last group, to be filled    */
    size_t n;                               /* Lane index inside the group */
    size_t i;                               /* Loop index                  */
    uint8_t *lanes;                         /* Byte view of a 512-bit mask */

    /**
     * (1 << plen) is stored into a single byte: with plen >= 8 it would be
     * truncated to 0, and a null completion mask is seen as a match on every
     * scanned byte. Such lengths are refused before any lane is consumed.
     */
    if (plen == 0 || plen > MAX_LANE_PATTERN_LEN)
        return INVALID_PATTERN_ID;

    /* Selection of the relevant working group */

    strings = &backend->manager_avx512.strings_8;
    count = &backend->manager_avx512.count_8;

    /* A new group is needed when there is none or when the last one is full */

    if (*count == 0)
        extend_grouped_strings_avx512(strings, count);

    else
    {
        last = (*strings)[*count - 1];

        if (last->used == last->available)
            extend_grouped_strings_avx512(strings, count);

    }

    last = (*strings)[*count - 1];

    /* Integration of the new string */

    n = last->used++;

    last->m[n] = plen;

    result = g_scan_context_get_new_pattern_id(context);

    last->found_id[n] = result;

    lanes = (uint8_t *)&last->found_masks;
    lanes[n] = (uint8_t)(1u << plen);

    /* Clear bit #i of the lane for the byte value expected at position i */
    for (i = 0; i < plen; i++)
    {
        lanes = (uint8_t *)(last->pattern_masks + plain[i]);
        lanes[n] &= (uint8_t)~(1u << i);
    }

    return result;

}
/******************************************************************************
*                                                                             *
*  Parameters  : backend = search engine to use.                              *
*                context = place where results get registered.                *
*                content = binary data to analyse.                            *
*                                                                             *
*  Description : Scans a binary content looking for registered patterns.      *
*                                                                             *
*  Return      : -                                                            *
*                                                                             *
*  Remarks     : One pass over the whole content is done for each group of    *
*                64 strings; a match is reported for lane #j when bit #j of   *
*                the computed mask is cleared.                                *
*                                                                             *
******************************************************************************/

static void run_scan_avx512(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
    const group_manager_avx512_t *manager;  /* Simplified access           */
    phys_t dlen;                            /* Amount of data              */
    vmpa2t pos;                             /* Targeted starting point     */
    const bin_t *data;                      /* Data to analyse             */
    size_t k;                               /* Loop over the groups        */
    grouped_strings_avx512_t group;         /* Copy for local accesses     */
    __m512i R;                              /* Current Bitap state         */
    __m512i found_masks;                    /* Completion bit of each lane */
    __mmask64 test_mask;                    /* All-ones reference mask     */
    const bin_t *iter;                      /* Byte being processed        */
    const bin_t *maxiter;                   /* End of the data             */
    __mmask64 mask;                         /* One status bit per lane     */
    size_t j;                               /* Loop over the lanes         */

    /**
     * The state above is deliberately not pinned to hard registers: rsi,
     * zmm21, zmm28 and k6 are call-clobbered, and a function gets called
     * inside the scan loop. The asm constraints are enough for the compiler
     * to allocate registers and to preserve them across this call.
     */

    /* Various initializations */

    manager = &backend->manager_avx512;

    dlen = g_binary_content_compute_size(content);

    g_binary_content_compute_start_pos(content, &pos);
    data = g_binary_content_get_raw_access(content, &pos, dlen);

    /* NOTE(review): presumably NULL is returned on failure -- to be confirmed */
    if (data == NULL)
        return;

    maxiter = data + dlen;

    /* Search of strings shorter than 8 bytes */

    for (k = 0; k < manager->count_8; k++)
    {
        memcpy(&group, manager->strings_8[k], sizeof(grouped_strings_avx512_t));

        asm volatile
        (
            /*
             * R = _mm512_set1_epi8(~1);
             */
            "movabs $0xfefefefefefefefe, %%rax ; "
            "vpbroadcastq %%rax, %[STATE] ; "

            /*
             * test_mask = all 64 bits set.
             */
            "movabs $0xffffffffffffffff, %%rax ; "
            "kmovq %%rax, %[KMASK] ; "

            /*
             * found_masks = group.found_masks;
             */
            "vmovdqa64 %[FOUND_SRC], %[FOUND_DST] ; "

            : [STATE] "=v"(R),
              [KMASK] "=Yk"(test_mask),
              [FOUND_DST] "=v"(found_masks)
            : [FOUND_SRC] "m"(group.found_masks)
            : "memory", "rax"
        );

        for (iter = data; iter < maxiter; iter++)
        {
            asm volatile goto
            (
                /*
                 * R = _mm512_or_si512(R, group.pattern_masks[*iter]);
                 *
                 * Figures noted by the original author:
                 * latency 1-9, throughput 0.5, 1-2 uops, 1*p05+1*p23.
                 */
                "vpord %[PATTERN], %[STATE], %[STATE] ; "

                /*
                 * R = _mm512_add_epi8(R, R);
                 *
                 * Adding a byte to itself shifts it left by one bit.
                 * Latency 1, throughput 0.5, 1 uop, 1*p05.
                 */
                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                /*
                 * k7 = _mm512_test_epi8_mask(R, found_masks);
                 *
                 * The test is run on a copy of the state (zmm12).
                 * Latency 3, throughput 1, 2 uops, 1*p23+1*p5.
                 */
                "vmovdqa64 %[STATE], %%zmm12 ; "
                "vptestmb %[FOUND], %%zmm12, %%k7 ; "

                /*
                 * The carry flag is set when no bit of the all-ones mask is
                 * missing from k7, i.e. when no lane is complete: the
                 * reporting code gets skipped in this case.
                 */
                "ktestq %[KMASK], %%k7 ; "
                "jc %l[next_iter] ; "

                /*
                 * mask = k7;
                 *
                 * Latency 3, throughput 1, 1 uop, 1*p5.
                 */
                "kmovq %%k7, %[MASK0] ; "

                : [STATE] "+v"(R),
                  [MASK0] "=r"(mask)
                : [PATTERN] "m"(group.pattern_masks[*iter]),
                  [FOUND] "v"(found_masks),
                  [KMASK] "Yk"(test_mask)
                : "memory", "k7", "zmm12"
                : next_iter
            );

            /* At least one lane is complete: one report per cleared bit */
            for (j = 0; j < group.used; j++)
            {
                if ((mask & 0x1) == 0)
                    g_scan_context_register_atom_match(context,
                                                       group.found_id[j],
                                                       (iter - data) + 1 - group.m[j]);

                mask >>= 1;

            }

 next_iter:

            /* A label has to be followed by a statement before C23 */
            ;

        }

    }

}
#if 0
/******************************************************************************
*                                                                             *
*  Parameters  : backend = search engine to use.                              *
*                context = place where results get registered.                *
*                content = binary data to analyse.                            *
*                                                                             *
*  Description : Scans a binary content looking for registered patterns.      *
*                                                                             *
*  Return      : -                                                            *
*                                                                             *
*  Remarks     : Disabled experimental variant: matches are only printed on   *
*                the standard output, the context is left untouched.          *
*                                                                             *
******************************************************************************/

static void run_scan_avx512____good_asm_perfs(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
    const group_manager_avx512_t *manager;  /* Simplified access           */
    phys_t dlen;                            /* Amount of data              */
    vmpa2t pos;                             /* Targeted starting point     */
    const bin_t *data;                      /* Data to analyse             */
    size_t k;                               /* Loop over the groups        */
    grouped_strings_avx512_t group;         /* Copy for local accesses     */
    __m512i found_masks;                    /* Completion bit of each lane */
    __m512i R;                              /* Current Bitap state         */
    const bin_t *iter;                      /* Byte being processed        */
    const bin_t *maxiter;                   /* End of the data             */
    __mmask64 mask;                         /* One status bit per lane     */
    size_t j;                               /* Loop over the lanes         */

    /**
     * No hard register is imposed for the state: the vector registers are
     * call-clobbered and printf() may be called inside the scan loop.
     */

    /* Various initializations */

    manager = &backend->manager_avx512;

    dlen = g_binary_content_compute_size(content);

    g_binary_content_compute_start_pos(content, &pos);
    data = g_binary_content_get_raw_access(content, &pos, dlen);

    /* Search of strings shorter than 8 bytes */

    printf(" --- manager512->count_8: %zu\n", manager->count_8);

    /* NOTE(review): presumably markers to spot the loop in a disassembly */
    asm volatile ("nop; nop; nop; nop; nop; nop; nop; ");
    asm volatile ("nop; nop; nop; nop; nop; nop; nop; ");

    maxiter = data + dlen;

    for (k = 0; k < manager->count_8; k++)
    {
        memcpy(&group, manager->strings_8[k], sizeof(grouped_strings_avx512_t));

        asm volatile
        (
            /*
             * R = _mm512_set1_epi8(~1);
             */
            "movabs $0xfefefefefefefefe, %%rax ; "
            "vpbroadcastq %%rax, %[STATE] ; "

            /*
             * found_masks = group.found_masks;
             */
            "vmovdqa64 %[FOUND_SRC], %[FOUND_DST] ; "

            : [STATE] "=v"(R),
              [FOUND_DST] "=v"(found_masks)
            : [FOUND_SRC] "m"(group.found_masks)
            : "memory", "rax"
        );

        for (iter = data; iter < maxiter; iter++)
        {
            asm volatile
            (
                /*
                 * R = _mm512_or_si512(R, group.pattern_masks[*iter]);
                 *
                 * Figures noted by the original author:
                 * latency 1-9, throughput 0.5, 1-2 uops, 1*p05+1*p23.
                 */
                "vpord %[PATTERN], %[STATE], %[STATE] ; "

                /*
                 * R = _mm512_add_epi8(R, R);
                 *
                 * Latency 1, throughput 0.5, 1 uop, 1*p05.
                 */
                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                /*
                 * k7 = _mm512_test_epi8_mask(R, found_masks);
                 *
                 * Latency 3, throughput 1, 2 uops, 1*p23+1*p5.
                 */
                "vptestmb %[FOUND], %[STATE], %%k7 ; "

                /*
                 * mask = k7;
                 *
                 * Latency 3, throughput 1, 1 uop, 1*p5.
                 */
                "kmovq %%k7, %[MASK0] ; "

                : [STATE] "+v"(R),
                  [MASK0] "=r"(mask)
                : [PATTERN] "v"(group.pattern_masks[*iter]),
                  [FOUND] "v"(found_masks)
                /* k7 is the mask register really written by the template */
                : "memory", "k7"
            );

            /* All bits set: no lane is complete for this byte */
            if (mask == 0xffffffffffffffffllu)
                continue;

            for (j = 0; j < group.used; j++)
            {
                if ((mask & 0x1) == 0)
                    printf(">> FOUND %zu @ %x !!!!!!!!!!!!!!\n", j, (unsigned int)(iter - data) + 1);

                mask >>= 1;

            }

        }

    }

}
/******************************************************************************
*                                                                             *
*  Parameters  : backend = search engine to use.                              *
*                context = place where results get registered.                *
*                content = binary data to analyse.                            *
*                                                                             *
*  Description : Scans a binary content looking for registered patterns.      *
*                                                                             *
*  Return      : -                                                            *
*                                                                             *
*  Remarks     : Disabled experimental variant relying on intrinsics only:    *
*                matches are printed, the context is left untouched.          *
*                                                                             *
******************************************************************************/

static void run_scan_avx512_best_test(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
    const group_manager_avx512_t *manager;  /* Simplified access           */
    phys_t dlen;                            /* Amount of data              */
    vmpa2t pos;                             /* Targeted starting point     */
    const bin_t *data;                      /* Data to analyse             */
    size_t k;                               /* Loop over the groups        */
    grouped_strings_avx512_t group;         /* Copy for local accesses     */
    __m512i R;                              /* Current Bitap state         */
    const bin_t *iter;                      /* Byte being processed        */
    const bin_t *maxiter;                   /* End of the data             */
    __mmask64 mask;                         /* One status bit per lane     */
    size_t j;                               /* Loop over the lanes         */

    /* Various initializations */

    manager = &backend->manager_avx512;

    dlen = g_binary_content_compute_size(content);

    g_binary_content_compute_start_pos(content, &pos);
    data = g_binary_content_get_raw_access(content, &pos, dlen);

    /* Search of strings shorter than 8 bytes */

    printf(" --- manager512->count_8: %zu\n", manager->count_8);

    maxiter = data + dlen;

    for (k = 0; k < manager->count_8; k++)
    {
        memcpy(&group, manager->strings_8[k], sizeof(grouped_strings_avx512_t));

        R = _mm512_set1_epi8(~1);

        for (iter = data; iter < maxiter; iter++)
        {
            /* vpord zmm, zmm, zmm: latency 1, 1*p05 (author's figures) */
            R = _mm512_or_si512(R, group.pattern_masks[*iter]);

            /* vpaddb zmm, zmm, zmm: latency 1, 1*p05; one-bit left shift */
            R = _mm512_add_epi8(R, R);

            /* vptestmb k, zmm, zmm: latency 3, 1*p5 */
            mask = _mm512_test_epi8_mask(R, group.found_masks);

            /* All bits set: no lane is complete for this byte */
            if (mask == 0xffffffffffffffffllu)
                continue;

            /* %p expects a pointer to void */
            printf("Ouhc: %p\n", (void *)&group);

            for (j = 0; j < group.used; j++)
            {
                if ((mask & 0x1) == 0)
                    printf(">> FOUND %zu @ %x !!!!!!!!!!!!!!\n", j, (unsigned int)(iter - data) + 1);

                mask >>= 1;

            }

        }

    }

}
/******************************************************************************
*                                                                             *
*  Parameters  : backend = search engine to use.                              *
*                context = place where results get registered.                *
*                content = binary data to analyse.                            *
*                                                                             *
*  Description : Scans a binary content looking for registered patterns.      *
*                                                                             *
*  Return      : -                                                            *
*                                                                             *
*  Remarks     : Disabled experimental variant: matches are only printed on   *
*                the standard output, the context is left untouched.          *
*                                                                             *
******************************************************************************/

static void run_scan_avx512__saved(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
    const group_manager_avx512_t *manager;  /* Simplified access           */
    phys_t dlen;                            /* Amount of data              */
    vmpa2t pos;                             /* Targeted starting point     */
    const bin_t *data;                      /* Data to analyse             */
    size_t k;                               /* Loop over the groups        */
    grouped_strings_avx512_t group;         /* Copy for local accesses     */
    __m512i R;                              /* Current Bitap state         */
    phys_t i;                               /* Loop over the data          */
    __mmask64 mask;                         /* One status bit per lane     */
    size_t j;                               /* Loop over the lanes         */

    /* Various initializations */

    manager = &backend->manager_avx512;

    dlen = g_binary_content_compute_size(content);

    g_binary_content_compute_start_pos(content, &pos);
    data = g_binary_content_get_raw_access(content, &pos, dlen);

    /* Search of strings shorter than 8 bytes */

    printf(" --- manager512->count_8: %zu\n", manager->count_8);

    for (k = 0; k < manager->count_8; k++)
    {
        memcpy(&group, manager->strings_8[k], sizeof(grouped_strings_avx512_t));

        R = _mm512_set1_epi8(~1);

        for (i = 0; i < dlen; i++)
        {
            /**
             * The mask of the current byte is merged at the beginning of the
             * iteration instead of fetching data[i + 1] at its end: the
             * computed states are the same, but neither data[0] of an empty
             * content nor the byte located after the last one is read.
             */

            /* vpord zmm, zmm, zmm: latency 1, 1*p05 (author's figures) */
            R = _mm512_or_si512(R, group.pattern_masks[data[i]]);

            /* vpaddb zmm, zmm, zmm: latency 1, 1*p05; one-bit left shift */
            R = _mm512_add_epi8(R, R);

            /* vptestmb k, zmm, zmm: latency 3, 1*p5 */
            mask = _mm512_test_epi8_mask(R, group.found_masks);

            /* All bits set: no lane is complete for this byte */
            if (mask == 0xffffffffffffffffllu)
                continue;

            for (j = 0; j < group.used; j++)
            {
                if ((mask & 0x1) == 0)
                    printf(">> FOUND %zu @ %x !!!!!!!!!!!!!!\n", j, (unsigned int)i + 1);

                mask >>= 1;

            }

        }

    }

}
#endif