/* Chrysalide - Outil d'analyse de fichiers binaires
 * bitap.c - méthode de recherche basée sur l'algorithme Bitap
 *
 * Copyright (C) 2022 Cyrille Bagard
 *
 *  This file is part of Chrysalide.
 *
 *  Chrysalide is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  Chrysalide is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with Foobar.  If not, see <http://www.gnu.org/licenses/>.
 */


#include "bitap.h"


#include <alloca.h>
#include <assert.h>
#include <sys/mman.h>
#include <sched.h>


#include "bitap-int.h"
#include "../../../../core/logs.h"
//#include "../../matches/bytes.h"



/* ---------------------- IMPLANTATION D'UNE NOUVELLE APPROCHE ---------------------- */


/* Initialise la classe des méthodes basée sur Bitmap. */
static void g_bitap_backend_class_init(GBitapBackendClass *);

/* Initialise une instance de méthodes basée sur Bitmap. */
static void g_bitap_backend_init(GBitapBackend *);

/* Supprime toutes les références externes. */
static void g_bitap_backend_dispose(GBitapBackend *);

/* Procède à la libération totale de la mémoire. */
static void g_bitap_backend_finalize(GBitapBackend *);



/* --------------------- IMPLEMENTATION DES FONCTIONS DE CLASSE --------------------- */


/* Indique la taille maximale des suites d'octets recherchées. */
size_t g_bitap_backend_get_atom_max_size(const GBitapBackend *);

/* Inscrit dans le moteur une chaîne de caractères à rechercher. */
static patid_t g_bitap_backend_enroll_plain_pattern(GBitapBackend *, GScanContext *, const uint8_t *, size_t);

/* Parcours un contenu binaire à la recherche de motifs. */
static void g_bitap_backend_run_scan(const GBitapBackend *, GScanContext *);

/* Imprime quelques faits quant aux éléments mis en place. */
static void g_bitap_backend_output_stats(const GBitapBackend *);



/* ---------------------- OPTIMISATIONS POUR ARCHITECTURE AVX2 ---------------------- */


#ifdef HAVE_AVX2

/* Indique la valeur portée par une expression rationnelle. */
static void extend_grouped_strings_avx2(grouped_strings_avx2_t ***, size_t *);

/* Inscrit dans le moteur une chaîne de caractères à rechercher. */
static patid_t enroll_plain_pattern_avx2(GBitapBackend *, GScanContext *, const bin_t *, size_t);

/* Parcours un contenu binaire à la recherche de motifs. */
static void run_scan_avx2(const GBitapBackend *, GScanContext *, const bin_t *, phys_t);

#endif



/* --------------------- OPTIMISATIONS POUR ARCHITECTURE AVX512 --------------------- */


#ifdef HAVE_AVX512_F

/* Indique la valeur portée par une expression rationnelle. */
static void extend_grouped_strings_avx512(grouped_strings_avx512_t ***, size_t *);

/* Inscrit dans le moteur une chaîne de caractères à rechercher. */
static patid_t enroll_plain_pattern_avx512(GBitapBackend *, GScanContext *, const bin_t *, size_t);

/* Parcours un contenu binaire à la recherche de motifs. */
static void run_scan_avx512(const GBitapBackend *, GScanContext *, const bin_t *, phys_t);

#endif



/* ---------------------------------------------------------------------------------- */
/*                        IMPLANTATION D'UNE NOUVELLE APPROCHE                        */
/* ---------------------------------------------------------------------------------- */


/* Indique le type défini pour un moteur de recherche pour données. */
G_DEFINE_TYPE(GBitapBackend, g_bitap_backend, G_TYPE_ENGINE_BACKEND);


/******************************************************************************
*                                                                             *
*  Paramètres  : klass = classe à initialiser.                                *
*                                                                             *
*  Description : Initialise la classe des méthodes basée sur Bitmap.          *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void g_bitap_backend_class_init(GBitapBackendClass *klass)
{
    GObjectClass *object;                   /* Autre version de la classe  */
    GEngineBackendClass *backend;           /* Version de classe parente   */

    object = G_OBJECT_CLASS(klass);

    object->dispose = (GObjectFinalizeFunc/* ! */)g_bitap_backend_dispose;
    object->finalize = (GObjectFinalizeFunc)g_bitap_backend_finalize;

    backend = G_ENGINE_BACKEND_CLASS(klass);

    backend->get_max_size = (get_backend_atom_max_size_fc)g_bitap_backend_get_atom_max_size;
    backend->enroll_plain = (enroll_plain_into_backend_fc)g_bitap_backend_enroll_plain_pattern;
    backend->run_scan = (run_backend_scan_fc)g_bitap_backend_run_scan;
    backend->output = (output_backend_stats_fc)g_bitap_backend_output_stats;

}


/******************************************************************************
*                                                                             *
*  Paramètres  : backend = instance à initialiser.                            *
*                                                                             *
*  Description : Initialise une instance de méthodes basée sur Bitmap.        *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void g_bitap_backend_init(GBitapBackend *backend)
{

}


/******************************************************************************
*                                                                             *
*  Paramètres  : backend = instance d'objet GLib à traiter.                   *
*                                                                             *
*  Description : Supprime toutes les références externes.                     *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void g_bitap_backend_dispose(GBitapBackend *backend)
{
    G_OBJECT_CLASS(g_bitap_backend_parent_class)->dispose(G_OBJECT(backend));

}


/******************************************************************************
*                                                                             *
*  Paramètres  : backend = instance d'objet GLib à traiter.                   *
*                                                                             *
*  Description : Procède à la libération totale de la mémoire.                *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void g_bitap_backend_finalize(GBitapBackend *backend)
{
    G_OBJECT_CLASS(g_bitap_backend_parent_class)->finalize(G_OBJECT(backend));

}


/******************************************************************************
*                                                                             *
*  Paramètres  : -                                                            *
*                                                                             *
*  Description : Crée une méthode de recherche basée sur l'algorithme Bitap.  *
*                                                                             *
*  Retour      : Méthode mise en place.                                       *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

GEngineBackend *g_bitap_backend_new(void)
{
    GBitapBackend *result;                  /* Structure à retourner       */

    result = g_object_new(G_TYPE_BITAP_BACKEND, NULL);

    return G_ENGINE_BACKEND(result);

}



/* ---------------------------------------------------------------------------------- */
/*                       IMPLEMENTATION DES FONCTIONS DE CLASSE                       */
/* ---------------------------------------------------------------------------------- */


/******************************************************************************
*                                                                             *
*  Paramètres  : backend = moteur de recherche à consulter.                   *
*                                                                             *
*  Description : Indique la taille maximale des suites d'octets recherchées.  *
*                                                                             *
*  Retour      : Valeur strictement positive.                                 *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

size_t g_bitap_backend_get_atom_max_size(const GBitapBackend *backend)
{
    size_t result;                          /* Taille à faire connaître    */

    result = BITAP_ATOM_SIZE;

    return result;

}


/******************************************************************************
*                                                                             *
*  Paramètres  : backend = moteur de recherche à manipuler.                   *
*                context = contexte de l'analyse à mener.                     *
*                plain   = chaîne de caractères classique à intégrer.         *
*                len     = taille de cette chaîne.                            *
*                                                                             *
*  Description : Inscrit dans le moteur une chaîne de caractères à rechercher.*
*                                                                             *
*  Retour      : Bilan de l'opération.                                        *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static patid_t g_bitap_backend_enroll_plain_pattern(GBitapBackend *backend, GScanContext *context, const uint8_t *plain, size_t len)
{
    patid_t result;                         /* Identifiant à retourner     */

#ifdef HAVE_AVX512_F
    if (0)
        result = enroll_plain_pattern_avx512(backend, context, plain, len);
    else
#endif

#ifdef HAVE_AVX2
    if (0)
        result = enroll_plain_pattern_avx2(backend, context, plain, len);
    else
#endif

        result = INVALID_PATTERN_ID;

    return result;

}


/******************************************************************************
*                                                                             *
*  Paramètres  : backend = moteur de recherche à manipuler.                   *
*                context = lieu d'enregistrement des résultats.               *
*                                                                             *
*  Description : Parcours un contenu binaire à la recherche de motifs.        *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void g_bitap_backend_run_scan(const GBitapBackend *backend, GScanContext *context)
{
    cpu_set_t old_mask;                     /* Cartographie des CPU #1     */
    int ret;                                /* Bilan d'un appel            */
    unsigned int cpu;                       /* Processeur courant          */
    cpu_set_t new_mask;                     /* Cartographie des CPU #2     */
    GBinContent *content;                   /* Contenu binaire manipulé    */
    phys_t dlen;                            /* Quantité de données         */
    vmpa2t pos;                             /* Point de départ ciblé       */
    const bin_t *data;                      /* Données à analyser          */

    ret = sched_getaffinity(0, sizeof(cpu_set_t), &old_mask);

    if (ret != 0)
    {
        LOG_ERROR_N("sched_getaffinity");
        goto exit;
    }

    ret = getcpu(&cpu, NULL);

    if (ret != 0)
    {
        LOG_ERROR_N("get_cpu");
        goto exit;
    }

    CPU_ZERO(&new_mask);
    CPU_SET(cpu, &new_mask);

    ret = sched_setaffinity(0, sizeof(cpu_set_t), &new_mask);

    if (ret != 0)
    {
        LOG_ERROR_N("sched_setaffinity");
        goto exit;
    }

    content = g_scan_context_get_content(context);

    dlen = g_binary_content_compute_size(content);

    g_binary_content_compute_start_pos(content, &pos);
    data = g_binary_content_get_raw_access(content, &pos, dlen);

    assert(data != NULL);




#ifdef HAVE_AVX512_F
    if (0)
        run_scan_avx512(backend, context, data, dlen);
    else
#endif

#ifdef HAVE_AVX2
    if (0)
        run_scan_avx2(backend, context, data, dlen);
    else
#endif

        ;

    g_object_unref(G_OBJECT(content));

 exit:

    ;

}


/******************************************************************************
*                                                                             *
*  Paramètres  : backend = moteur de recherche à consulter.                   *
*                                                                             *
*  Description : Imprime quelques faits quant aux éléments mis en place.      *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void g_bitap_backend_output_stats(const GBitapBackend *backend)
{
    printf("hello here!\n");

}



/* ---------------------------------------------------------------------------------- */
/*                        OPTIMISATIONS POUR ARCHITECTURE AVX2                        */
/* ---------------------------------------------------------------------------------- */


/**
 * Cf. https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX,AVX2
 */

#ifdef HAVE_AVX2

/******************************************************************************
*                                                                             *
*  Paramètres  : strings = ensemble de groupes constitués. [OUT]              *
*                count   = nombre de groupes courant. [OUT]                   *
*                                                                             *
*  Description : Indique la valeur portée par une expression rationnelle.     *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void extend_grouped_strings_avx2(grouped_strings_avx2_t ***strings, size_t *count)
{
    grouped_strings_avx2_t *new;            /* Zone supplémentaire         */
    size_t i;                               /* Boucle de parcours          */

    /* Définition d'un nouvel élément vierge */

    new = aligned_alloc(256, sizeof(grouped_strings_avx2_t));

    for (i = 0; i < 256; i++)
        new->pattern_masks[i] = _mm256_set1_epi8(~0);

    new->found_masks = _mm256_set1_epi8(~0);

    new->R = _mm256_set1_epi8(~1);

    for (i = 0; i < 32; i++)
    {
        new->m[i] = 0;

        new->found_id[i] = INVALID_PATTERN_ID;

    }

    new->available = 32;
    new->used = 0;

    /* Inscription */

    *strings = realloc(*strings, ++(*count) * sizeof(grouped_strings_avx2_t *));

    (*strings)[*count - 1] = new;

}


/******************************************************************************
*                                                                             *
*  Paramètres  : backend    = moteur de recherche à manipuler.                *
*                context    = contexte de l'analyse à mener.                  *
*                plain      = chaîne de caractères classique à intégrer.      *
*                plen       = taille de cette chaîne.                         *
*                                                                             *
*  Description : Inscrit dans le moteur une chaîne de caractères à rechercher.*
*                                                                             *
*  Retour      : Indice de résultats pour le motif.                           *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static patid_t enroll_plain_pattern_avx2(GBitapBackend *backend, GScanContext *context, const bin_t *plain, size_t plen)
{
    patid_t result;                         /* Identifiant à retourner     */
    grouped_strings_avx2_t ***strings;      /* Groupe de chaînes visé      */
    size_t *count;                          /* Taille de ce groupe         */
    grouped_strings_avx2_t *last;           /* Dernier groupe à remplir    */
    size_t n;                               /* Indice dans le groupe       */
    size_t i;                               /* Boucle de parcours          */
    __m256i *letter;                        /* Lettre à marquer            */

    /* Sélection du groupe de travail adéquat */

    strings = &backend->manager_avx2.strings_8;
    count = &backend->manager_avx2.count_8;

    /* Préparation de la place nécessaire */

    if (*count == 0)
    {
        extend_grouped_strings_avx2(strings, count);

        last = (*strings)[0];

    }

    else
    {
        last = (*strings)[*count - 1];

        if (last->used == last->available)
        {
            extend_grouped_strings_avx2(strings, count);
            last = (*strings)[*count - 1];
        }

    }

    /* Intégration d'une nouvelle chaîne */

    n = last->used++;

    last->m[n] = plen;

    result = 0; // FIXME g_scan_context_get_new_pattern_id(context);

    last->found_id[n] = result;

    ((uint8_t *)&last->found_masks)[n] = (1 << plen);

    for (i = 0; i < plen; i++)
    {
        letter = last->pattern_masks + plain[i];
        ((uint8_t *)letter)[n] &= ~(1 << i);
    }

    return result;

}


/******************************************************************************
*                                                                             *
*  Paramètres  : backend = moteur de recherche à manipuler.                   *
*                context = lieu d'enregistrement des résultats.               *
*                data    = données à analyser.                                *
*                dlen    = quantité de ces données.                           *
*                                                                             *
*  Description : Parcours un contenu binaire à la recherche de motifs.        *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void run_scan_avx2(const GBitapBackend *backend, GScanContext *context, const bin_t *data, phys_t dlen)
{
    const group_manager_avx2_t *manager;    /* Accès simplifié             */

    register __m256i zero asm("ymm11");                           /* Constante 0 sur 256 bits    */
    size_t k;                               /* Boucle de parcours #1       */
    grouped_strings_avx2_t group;           /* Copie pour accès locaux     */

    register __m256i R asm("ymm12");                              /* Résultats courants          */
    register __m256i found_masks asm("ymm10"); /* Vérifications accélérées  */

    //__m256i pre_shift_mask;                 /* Préparation de décalage     */
    //phys_t i;                               /* Boucle de parcours #2       */




    const bin_t *iter;
    const bin_t *maxiter;
    //phys_t i;                               /* Boucle de parcours #2       */

    volatile register __m256i xxxx;                           /* Test de correspondances     */


    __m256i test;                           /* Test de correspondances     */
    __m256i test2;                           /* Test de correspondances     */
    __m256i status;                         /* Statut d'une comparaison    */

    int masks[10];

    int mask;                               /* Masque d'accès rapide       */
    size_t j;                               /* Boucle de parcours #3       */


    int ret;

    //return;

    /* Initialisations diverses */

    manager = &backend->manager_avx2;

    zero = _mm256_set1_epi16(0);

    asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop;nop;");

    xxxx = _mm256_set1_epi8(~1);

    asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop;nop;");

    /* Recherches des chaînes de moins de 8 caractères */

    printf(" --- manager->count_8: %zu\n", manager->count_8);

    ret = 0;

    for (k = 0; k < manager->count_8; k++)
    {
        memcpy(&group, manager->strings_8[k], sizeof(grouped_strings_avx2_t));

        //printf(" --- group.used: %zu\n", group.used);


        asm volatile
        (
            /*
             * R = _mm256_set1_epi8(~1);
             *
             */

            "movabs         $0xfefefefefefefefe, %%rax ; "
            "vpbroadcastq   %%rax, %[STATE] ; "

            /*
             *
             */

            "vmovdqa %[FOUND_SRC], %[FOUND_DST] ; "

            : [STATE] "=v"(R),
              [FOUND_DST] "=v"(found_masks)
            : [FOUND_SRC] "m"(group.found_masks)
            : "memory", "rax"

         );




        //pre_shift_mask = _mm256_set1_epi8(0xef);

        maxiter = data + dlen;



        for (iter = data; (iter + 10) < maxiter; iter += 10)
        {

            //printf("--- %llx <-> %c\n", (unsigned long long)(iter - data), *iter);


            asm volatile
            (
#if 0

                /*
                 * R = _mm256_or_si256(R, group.pattern_masks[data[i]]);
                 *
                 * Latency : 1-9
                 * Throughput : 0.5
                 * #Uops : 1-2
                 * Port Usage : 1*p015+1*p23
                 *
                 */

                "vpor   %[PATTERN], %[STATE], %[STATE] ; "

#else

                /*
                 * %ymm = group.pattern_masks[data[i]];
                 *
                 * Latency : 5-8
                 * Throughput : 0.5
                 * #Uops : 1
                 * Port Usage : 1*p23
                 *
                 */

                "vmovdqa    %[PATTERN0], %%ymm0 ; "
                "vmovdqa    %[PATTERN1], %%ymm1 ; "
                "vmovdqa    %[PATTERN2], %%ymm2 ; "
                "vmovdqa    %[PATTERN3], %%ymm3 ; "
                "vmovdqa    %[PATTERN4], %%ymm4 ; "
                "vmovdqa    %[PATTERN5], %%ymm5 ; "
                "vmovdqa    %[PATTERN6], %%ymm6 ; "
                "vmovdqa    %[PATTERN7], %%ymm7 ; "
                "vmovdqa    %[PATTERN7], %%ymm8 ; "
                "vmovdqa    %[PATTERN7], %%ymm9 ; "

                /*
                 * R = _mm256_or_si256(R, %ymm);
                 *
                 * Latency : 1
                 * Throughput : 0.33
                 * #Uops : 1
                 * Port Usage : 1*p015
                 *
                 */

                "vpor   %%ymm0, %[STATE], %[STATE] ; "

#endif

                /*
                 * R = _mm256_add_epi8(R, R);
                 *
                 * Latency : 1
                 * Throughput : 0.3
                 * #Uops : 1
                 * Port Usage : 1*p015
                 *
                 */

                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                /*
                 * test = _mm256_and_si256(R, group.found_masks);
                 *
                 * Latency : 1
                 * Throughput : 0.33
                 * #Uops : 1
                 * Port Usage : 1*p015
                 *
                 */

                "vpand  %[FOUND], %[STATE], %%ymm0 ; "

                /* Déroulemets... */

                "vpor   %%ymm1, %[STATE], %[STATE] ; "
                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                "vpor   %%ymm2, %[STATE], %[STATE] ; "
                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                "vpor   %%ymm3, %[STATE], %[STATE] ; "
                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                "vpor   %%ymm4, %[STATE], %[STATE] ; "
                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                "vpor   %%ymm5, %[STATE], %[STATE] ; "
                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                "vpor   %%ymm6, %[STATE], %[STATE] ; "
                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                "vpor   %%ymm7, %[STATE], %[STATE] ; "
                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                "vpor   %%ymm8, %[STATE], %[STATE] ; "
                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                "vpor   %%ymm9, %[STATE], %[STATE] ; "
                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                "vpand  %[FOUND], %[STATE], %%ymm1 ; "
                "vpand  %[FOUND], %[STATE], %%ymm2 ; "
                "vpand  %[FOUND], %[STATE], %%ymm3 ; "
                "vpand  %[FOUND], %[STATE], %%ymm4 ; "
                "vpand  %[FOUND], %[STATE], %%ymm5 ; "
                "vpand  %[FOUND], %[STATE], %%ymm6 ; "
                "vpand  %[FOUND], %[STATE], %%ymm7 ; "
                "vpand  %[FOUND], %[STATE], %%ymm8 ; "
                "vpand  %[FOUND], %[STATE], %%ymm9 ; "





                /*
                 * status = _mm256_cmpeq_epi8(test, zero);
                 *
                 * Latency : 1
                 * Throughput : 0.5
                 * #Uops : 1
                 * Port Usage : 1*p01
                 *
                 */

                "vpcmpeqb   %%ymm0, %[NUL], %%ymm0 ; "

                /*
                 * mask = _mm256_movemask_epi8(status);
                 *
                 * Latency : <5
                 * Throughput : 1
                 * #Uops : 1
                 * Port Usage : 1*p0
                 *
                 */

                "vpmovmskb  %%ymm0, %[MASK0] ; "





                "vpcmpeqb   %%ymm1, %[NUL], %%ymm1 ; "
                "vpcmpeqb   %%ymm2, %[NUL], %%ymm2 ; "
                "vpcmpeqb   %%ymm3, %[NUL], %%ymm3 ; "
                "vpcmpeqb   %%ymm4, %[NUL], %%ymm4 ; "
                "vpcmpeqb   %%ymm5, %[NUL], %%ymm5 ; "
                "vpcmpeqb   %%ymm6, %[NUL], %%ymm6 ; "
                "vpcmpeqb   %%ymm7, %[NUL], %%ymm7 ; "
                "vpcmpeqb   %%ymm8, %[NUL], %%ymm8 ; "
                "vpcmpeqb   %%ymm9, %[NUL], %%ymm9 ; "


                "vpmovmskb  %%ymm1, %[MASK1] ; "
                "vpmovmskb  %%ymm2, %[MASK2] ; "
                "vpmovmskb  %%ymm3, %[MASK3] ; "
                "vpmovmskb  %%ymm4, %[MASK4] ; "
                "vpmovmskb  %%ymm5, %[MASK5] ; "
                "vpmovmskb  %%ymm6, %[MASK6] ; "
                "vpmovmskb  %%ymm7, %[MASK7] ; "
                "vpmovmskb  %%ymm8, %[MASK8] ; "
                "vpmovmskb  %%ymm9, %[MASK9] ; "










                //"vmovdqa  %%ymm7, %[OUTPUT] ; "

                //"vmovdqa  %%ymm8, %[OUTPUT2] ; "

                : [STATE] "+v"(R),
                  [OUTPUT] "=v"(test),
                  [OUTPUT2] "=v"(test2),
                  [MASK0] "=r"(mask),
                  [MASK1] "=r"(mask),
                  [MASK2] "=r"(mask),
                  [MASK3] "=r"(mask),
                  [MASK4] "=r"(mask),
                  [MASK5] "=r"(mask),
                  [MASK6] "=r"(mask),
                  [MASK7] "=r"(mask),
                  [MASK8] "=r"(mask),
                  [MASK9] "=r"(mask),
                  [NUL] "+v"(zero)
                : [PATTERN0] "m"(group./*manager->strings_8[k]->*/pattern_masks[*iter]),
                  [PATTERN1] "m"(group./*manager->strings_8[k]->*/pattern_masks[*(iter + 1)]),
                  [PATTERN2] "m"(group./*manager->strings_8[k]->*/pattern_masks[*(iter + 2)]),
                  [PATTERN3] "m"(group./*manager->strings_8[k]->*/pattern_masks[*(iter + 3)]),
                  [PATTERN4] "m"(group./*manager->strings_8[k]->*/pattern_masks[*(iter + 4)]),
                  [PATTERN5] "m"(group./*manager->strings_8[k]->*/pattern_masks[*(iter + 5)]),
                  [PATTERN6] "m"(group./*manager->strings_8[k]->*/pattern_masks[*(iter + 6)]),
                  [PATTERN7] "m"(group./*manager->strings_8[k]->*/pattern_masks[*(iter + 7)]),
                  [PATTERN8] "m"(group./*manager->strings_8[k]->*/pattern_masks[*(iter + 8)]),
                  [PATTERN9] "m"(group./*manager->strings_8[k]->*/pattern_masks[*(iter + 9)]),
                  [FOUND] "v"(found_masks)
                : "memory", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9"

             );


            /*
            printf("        test: %02hhx %02hhx %02hhx %02hhx   %02hhx %02hhx %02hhx %02hhx   ...   %02hhx %02hhx %02hhx %02hhx\n",
                   ((uint8_t *)&test)[0],
                   ((uint8_t *)&test)[1],
                   ((uint8_t *)&test)[2],
                   ((uint8_t *)&test)[3],
                   ((uint8_t *)&test)[4],
                   ((uint8_t *)&test)[5],
                   ((uint8_t *)&test)[6],
                   ((uint8_t *)&test)[7],
                   ((uint8_t *)&test)[16],
                   ((uint8_t *)&test)[17],
                   ((uint8_t *)&test)[18],
                   ((uint8_t *)&test)[19]);

            printf("       test2: %02hhx %02hhx %02hhx %02hhx   %02hhx %02hhx %02hhx %02hhx   ...   %02hhx %02hhx %02hhx %02hhx\n",
                   ((uint8_t *)&test2)[0],
                   ((uint8_t *)&test2)[1],
                   ((uint8_t *)&test2)[2],
                   ((uint8_t *)&test2)[3],
                   ((uint8_t *)&test2)[4],
                   ((uint8_t *)&test2)[5],
                   ((uint8_t *)&test2)[6],
                   ((uint8_t *)&test2)[7],
                   ((uint8_t *)&test2)[16],
                   ((uint8_t *)&test2)[17],
                   ((uint8_t *)&test2)[18],
                   ((uint8_t *)&test2)[19]);
            */

#if 0
            //printf(" > %c\n", data[i]);

            R = _mm256_or_si256(R, group.pattern_masks[*iter]);

            //printf("group pattern: %hhx\n", *((uint8_t *)&group.pattern_masks[data[i]]));

            //printf("R:             %hhx\n", *((uint8_t *)&R));

            //R = _mm256_and_si256(R, pre_shift_mask);

            //printf("R after and:   %hhx\n", *((uint8_t *)&R));

            R = _mm256_add_epi8(R, R);
            //R = _mm256_slli_si256(R, 1);

            //printf("R after shift: %hhx\n", *((uint8_t *)&R));

            test = _mm256_and_si256(R, group.found_masks);

#if 1
            status = _mm256_cmpeq_epi8(test, zero);

            mask = _mm256_movemask_epi8(status);
#else
            //mask = _mm256_movemask_epi8(test) ^ 0xffffffff;
            mask = _mm256_movemask_epi8(test);
#endif


#endif


            //printf("   mask : %x\n", mask);

            if (mask != 0)
                for (j = 0; j < group.used; j++)
                {
                    if ((mask & 0x1) == 1)
                    {
                        //assert((i + 1) >= group.m[j]);

                        /** TODO : update call
                        g_scan_context_register_atom_match(context,
                                                           group.found_id[j],
                                                           (iter - data) + 1 - group.m[j]);
                        **/

                    }

                    mask >>= 1;

                }

        }





#if 0
        for (; iter < maxiter; iter++)
        {

            //printf("--- %llx <-> %c\n", (unsigned long long)(iter - data), *iter);


            asm volatile
            (
                /*
                 * R = _mm256_or_si256(R, group.pattern_masks[data[i]]);
                 *
                 * Latency : 1
                 * Throughput : 0.33
                 * #Uops : 1
                 * Port Usage : 1*p015
                 *
                 */

                "vpor   %[PATTERN], %[STATE], %[STATE] ; "

                /*
                 * R = _mm256_add_epi8(R, R);
                 *
                 * Latency : 1
                 * Throughput : 0.3
                 * #Uops : 1
                 * Port Usage : 1*p015
                 *
                 */

                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                /*
                 * test = _mm256_and_si256(R, group.found_masks);
                 *
                 * Latency : 1
                 * Throughput : 0.33
                 * #Uops : 1
                 * Port Usage : 1*p015
                 *
                 */

                "vpand  %[FOUND], %[STATE], %%ymm7 ; "

                /*
                 * status = _mm256_cmpeq_epi8(test, zero);
                 *
                 * Latency : 1
                 * Throughput : 0.5
                 * #Uops : 1
                 * Port Usage : 1*p01
                 *
                 */

                "vpcmpeqb   %%ymm7, %[NUL], %%ymm8 ; "

                /*
                 * mask = _mm256_movemask_epi8(status);
                 *
                 * Latency : <5
                 * Throughput : 1
                 * #Uops : 1
                 * Port Usage : 1*p0
                 *
                 */

                "vpmovmskb  %%ymm8, %[MASK0] ; "


                //"vmovdqa  %%ymm7, %[OUTPUT] ; "

                //"vmovdqa  %%ymm8, %[OUTPUT2] ; "

                : [STATE] "+v"(R),
                  [OUTPUT] "=v"(test),
                  [OUTPUT2] "=v"(test2),
                  [MASK0] "=r"(mask),
                  [NUL] "+v"(zero)
                : [PATTERN] "m"(group./*manager->strings_8[k]->*/pattern_masks[*iter]),
                  [FOUND] "v"(found_masks)
                : "memory", "ymm7", "ymm8"

             );


            /*
            printf("        test: %02hhx %02hhx %02hhx %02hhx   %02hhx %02hhx %02hhx %02hhx   ...   %02hhx %02hhx %02hhx %02hhx\n",
                   ((uint8_t *)&test)[0],
                   ((uint8_t *)&test)[1],
                   ((uint8_t *)&test)[2],
                   ((uint8_t *)&test)[3],
                   ((uint8_t *)&test)[4],
                   ((uint8_t *)&test)[5],
                   ((uint8_t *)&test)[6],
                   ((uint8_t *)&test)[7],
                   ((uint8_t *)&test)[16],
                   ((uint8_t *)&test)[17],
                   ((uint8_t *)&test)[18],
                   ((uint8_t *)&test)[19]);

            printf("       test2: %02hhx %02hhx %02hhx %02hhx   %02hhx %02hhx %02hhx %02hhx   ...   %02hhx %02hhx %02hhx %02hhx\n",
                   ((uint8_t *)&test2)[0],
                   ((uint8_t *)&test2)[1],
                   ((uint8_t *)&test2)[2],
                   ((uint8_t *)&test2)[3],
                   ((uint8_t *)&test2)[4],
                   ((uint8_t *)&test2)[5],
                   ((uint8_t *)&test2)[6],
                   ((uint8_t *)&test2)[7],
                   ((uint8_t *)&test2)[16],
                   ((uint8_t *)&test2)[17],
                   ((uint8_t *)&test2)[18],
                   ((uint8_t *)&test2)[19]);
            */

#if 0
            //printf(" > %c\n", data[i]);

            R = _mm256_or_si256(R, group.pattern_masks[*iter]);

            //printf("group pattern: %hhx\n", *((uint8_t *)&group.pattern_masks[data[i]]));

            //printf("R:             %hhx\n", *((uint8_t *)&R));

            //R = _mm256_and_si256(R, pre_shift_mask);

            //printf("R after and:   %hhx\n", *((uint8_t *)&R));

            R = _mm256_add_epi8(R, R);
            //R = _mm256_slli_si256(R, 1);

            //printf("R after shift: %hhx\n", *((uint8_t *)&R));

            test = _mm256_and_si256(R, group.found_masks);

#if 1
            status = _mm256_cmpeq_epi8(test, zero);

            mask = _mm256_movemask_epi8(status);
#else
            //mask = _mm256_movemask_epi8(test) ^ 0xffffffff;
            mask = _mm256_movemask_epi8(test);
#endif


#endif


            //printf("   mask : %x\n", mask);

            if (mask != 0)
                for (j = 0; j < group.used; j++)
                {
                    if ((mask & 0x1) == 1)
                    {
                        //assert((i + 1) >= group.m[j]);

                        /** TODO : update call
                        g_scan_context_register_atom_match(context,
                                                           group.found_id[j],
                                                           (iter - data) + 1 - group.m[j]);
                        **/

                    }

                    mask >>= 1;

                }

        }

#endif


    }


}














#if 0


#if 0

/******************************************************************************
*                                                                             *
*  Paramètres  : backend = moteur de recherche à manipuler.                   *
*                context = lieu d'enregistrement des résultats.               *
*                content = données binaires à analyser.                       *
*                                                                             *
*  Description : Parcours un contenu binaire à la recherche de motifs.        *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void run_scan_avx2(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
    const group_manager_avx2_t *manager;    /* Accès simplifié             */

    grouped_strings_avx2_t groups[10];           /* Copie pour accès locaux     */


    phys_t dlen;                            /* Quantité de données         */
    vmpa2t pos;                             /* Point de départ ciblé       */
    const bin_t *data;                      /* Données à analyser          */
    __m256i zero;                           /* Constante 0 sur 256 bits    */
    size_t k;                               /* Boucle de parcours #1       */

    grouped_strings_avx2_t group;          /* Copie pour accès locaux     */
    __m256i R;                              /* Résultats courants          */
    __m256i pre_shift_mask;                 /* Préparation de décalage     */
    phys_t i;                               /* Boucle de parcours #2       */
    __m256i test;                           /* Test de correspondances     */
    __m256i status;                         /* Statut d'une comparaison    */
    int mask;                               /* Masque d'accès rapide       */
    size_t j;                               /* Boucle de parcours #3       */

    uint32_t leaves;
    int ret;


    phys_t old_i;
    phys_t p;

    //return;

    /* Initialisations diverses */

    manager = &backend->manager_avx2;

    dlen = g_binary_content_compute_size(content);

    g_binary_content_compute_start_pos(content, &pos);
    data = g_binary_content_get_raw_access(content, &pos, dlen);

    zero = _mm256_set1_epi16(0);

    /* Recherches des chaînes de moins de 8 caractères */

    printf(" --- manager->count_8: %zu\n", manager->count_8);

    ret = 0;

    //for (k = 0; k < manager->count_8; k++)
    //    memcpy(&groups[k], manager->strings_8[k], sizeof(grouped_strings_avx2_t));


    for (i = 0; i < dlen; )
    {

        //printf(" --- %llx\n", (unsigned long long)i);

        p = i + 4096;

        if (p > dlen)
            p = dlen;

        old_i = i;

        printf("old_i: %llx\n", (unsigned long long)old_i);

        for (k = 0; k < manager->count_8; k++)
        {

            group = *manager->strings_8[k];

            R = group.R;

            for (i = old_i ; i < p; i++)
            {

                //group = &groups[k];

                //printf(" k: %zu  i: %llx\n", k, (unsigned long long)i);

                //R = group.R;//_mm256_set1_epi8(~1);

                R = _mm256_or_si256(R, group.pattern_masks[data[i]]);

                R = _mm256_add_epi8(R, R);

                test = _mm256_and_si256(R, group.found_masks);

#if 0
                status = _mm256_cmpeq_epi8(test, zero);

                mask = _mm256_movemask_epi8(status);
#else
                //mask = _mm256_movemask_epi8(test) ^ 0xffffffff;
                mask = _mm256_movemask_epi8(test);
#endif

                if (mask != 0xffffffff)
                {
                    leaves = group.leaves;

                    for (j = 0; j < group.used; j++)
                    {
                        if ((mask & 0x1) == 0)
                        {
                            if (leaves & 0x1) //group.leaves & (1u << j))
                                ;//define_full_match_avx2(backend, context, content, &group, j, i + 1);

                        }

                        mask >>= 1;

                        leaves >>= 1;

                    }

                }

                group.R = R;//_mm256_set1_epi8(~1);

                memcpy(manager->strings_8[k], &group, sizeof(grouped_strings_avx2_t));

            }


        }

    }

    printf("oh: %d\n", ret);


}


#else



/******************************************************************************
*                                                                             *
*  Paramètres  : backend = moteur de recherche à manipuler.                   *
*                context = lieu d'enregistrement des résultats.               *
*                content = données binaires à analyser.                       *
*                                                                             *
*  Description : Parcours un contenu binaire à la recherche de motifs.        *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void run_scan_avx2(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
    const group_manager_avx2_t *manager;    /* Accès simplifié             */
    phys_t dlen;                            /* Quantité de données         */
    vmpa2t pos;                             /* Point de départ ciblé       */
    const bin_t *data;                      /* Données à analyser          */
    __m256i zero;                           /* Constante 0 sur 256 bits    */
    size_t k;                               /* Boucle de parcours #1       */
    grouped_strings_avx2_t group;           /* Copie pour accès locaux     */
    __m256i R;                              /* Résultats courants          */
    __m256i pre_shift_mask;                 /* Préparation de décalage     */
    phys_t i;                               /* Boucle de parcours #2       */
    __m256i test;                           /* Test de correspondances     */
    __m256i status;                         /* Statut d'une comparaison    */
    int mask;                               /* Masque d'accès rapide       */
    size_t j;                               /* Boucle de parcours #3       */

    uint32_t leaves;
    int ret;

    //return;

    /* Initialisations diverses */

    manager = &backend->manager_avx2;

    dlen = g_binary_content_compute_size(content);

    g_binary_content_compute_start_pos(content, &pos);
    data = g_binary_content_get_raw_access(content, &pos, dlen);

    zero = _mm256_set1_epi16(0);

    /* Recherches des chaînes de moins de 8 caractères */

    printf(" --- manager->count_8: %zu\n", manager->count_8);

    ret = 0;

    for (k = 0; k < manager->count_8; k++)
    {
        memcpy(&group, manager->strings_8[k], sizeof(grouped_strings_avx2_t));

        //printf(" --- group.used: %zu\n", group.used);

        R = _mm256_set1_epi8(~1);

        //pre_shift_mask = _mm256_set1_epi8(0xef);

        for (i = 0; i < dlen; ++i)
        {
            //printf(" > %c\n", data[i]);

            R = _mm256_or_si256(R, group.pattern_masks[data[i]]);

            //printf("group pattern: %hhx\n", *((uint8_t *)&group.pattern_masks[data[i]]));

            //printf("R:             %hhx\n", *((uint8_t *)&R));

            //R = _mm256_and_si256(R, pre_shift_mask);

            //printf("R after and:   %hhx\n", *((uint8_t *)&R));

            R = _mm256_add_epi8(R, R);
            //R = _mm256_slli_si256(R, 1);

            //printf("R after shift: %hhx\n", *((uint8_t *)&R));

            test = _mm256_and_si256(R, group.found_masks);

#if 0
            status = _mm256_cmpeq_epi8(test, zero);

            mask = _mm256_movemask_epi8(status);
#else
            //mask = _mm256_movemask_epi8(test) ^ 0xffffffff;
            mask = _mm256_movemask_epi8(test);
#endif

            if (mask != 0xffffffff)
            {
                leaves = group.leaves;

                for (j = 0; j < group.used; j++)
                {
                    if ((mask & 0x1) == 0)
                    {
                        //assert((i + 1) >= group.m[j]);

                        if (leaves & 0x1) //group.leaves & (1u << j))
                            define_full_match_avx2(backend, context, content, &group, j, i + 1);
                        //else
                        //{
                        //    ret++;
                            //printf("%x\n", (unsigned int)i + 1);
                        //}
                        //else
                            //  g_scan_context_register_sub_match(context, group.found_id[j], i + 1 - group.m[j]);

                    }

                    mask >>= 1;

                    leaves >>= 1;

                }

            }

        }

    }

    printf("oh: %d\n", ret);

    /* Recherches des chaînes de moins de 16 caractères */

    for (k = 0; k < manager->count_16; k++)
    {
        memcpy(&group, manager->strings_16[k], sizeof(grouped_strings_avx2_t));

        R = _mm256_set1_epi16(~1);

        for (i = 0; i < dlen; ++i)
        {
            R = _mm256_or_si256(R, group.pattern_masks[data[i]]);
            R = _mm256_slli_epi16(R, 1);

            test = _mm256_and_si256(R, group.found_masks);

            status = _mm256_cmpeq_epi16(test, zero);

            mask = _mm256_movemask_epi8(status);

            if (mask != 0)
                for (j = 0; j < group.used; j++)
                {
                    if (mask & 0x3)
                    {
                        assert((i + 1) >= group.m[j]);

                        if (group.leaves & (1llu << j))
                            define_full_match_avx2(backend, context, content, &group, j, i + 1);
                        else
                            ;//g_scan_context_register_sub_match(context, group.found_id[j], i + 1 - group.m[j]);

                    }

                    mask >>= 2;

                }

        }

    }

}

#endif



#endif

















#endif



/* ---------------------------------------------------------------------------------- */
/*                       OPTIMISATIONS POUR ARCHITECTURE AVX512                       */
/* ---------------------------------------------------------------------------------- */


/**
 * Cf. https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX_512
 *   - https://agner.org/optimize/
 *   - https://uops.info/table.html
 */

#ifdef HAVE_AVX512_F

/******************************************************************************
*                                                                             *
*  Paramètres  : strings = ensemble de groupes constitués. [OUT]              *
*                count   = nombre de groupes courant. [OUT]                   *
*                                                                             *
*  Description : Indique la valeur portée par une expression rationnelle.     *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void extend_grouped_strings_avx512(grouped_strings_avx512_t ***strings, size_t *count)
{
    grouped_strings_avx512_t *new;          /* Zone supplémentaire         */
    size_t i;                               /* Boucle de parcours          */

    /* Définition d'un nouvel élément vierge */

    new = aligned_alloc(0x1000, sizeof(grouped_strings_avx512_t));

    for (i = 0; i < 256; i++)
        new->pattern_masks[i] = _mm512_set1_epi8(~0);

    new->found_masks = _mm512_set1_epi8(~0);

    new->R = _mm512_set1_epi8(~1);

    for (i = 0; i < 64; i++)
    {
        new->m[i] = 0;

        new->found_id[i] = INVALID_PATTERN_ID;

    }

    new->available = 64;
    new->used = 0;

    /* Inscription */

    *strings = realloc(*strings, ++(*count) * sizeof(grouped_strings_avx512_t *));

    (*strings)[*count - 1] = new;

}


/******************************************************************************
*                                                                             *
*  Paramètres  : backend = moteur de recherche à manipuler.                   *
*                context = contexte de l'analyse à mener.                     *
*                plain   = chaîne de caractères classique à intégrer.         *
*                plen    = taille de cette chaîne.                            *
*                                                                             *
*  Description : Inscrit dans le moteur une chaîne de caractères à rechercher.*
*                                                                             *
*  Retour      : Indice de résultats pour le motif.                           *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static patid_t enroll_plain_pattern_avx512(GBitapBackend *backend, GScanContext *context, const bin_t *plain, size_t plen)
{
    patid_t result;                         /* Identifiant à retourner     */
    grouped_strings_avx512_t ***strings;    /* Groupe de chaînes visé      */
    size_t *count;                          /* Taille de ce groupe         */
    grouped_strings_avx512_t *last;         /* Dernier groupe à remplir    */
    size_t n;                               /* Indice dans le groupe       */
    size_t i;                               /* Boucle de parcours          */
    __m512i *letter;                        /* Lettre à marquer            */

    /* Sélection du groupe de travail adéquat */

    strings = &backend->manager_avx512.strings_8;
    count = &backend->manager_avx512.count_8;

    /* Préparation de la place nécessaire */

    if (*count == 0)
    {
        extend_grouped_strings_avx512(strings, count);

        last = (*strings)[0];

    }

    else
    {
        last = (*strings)[*count - 1];

        if (last->used == last->available)
        {
            extend_grouped_strings_avx512(strings, count);
            last = (*strings)[*count - 1];
        }

    }

    /* Intégration d'une nouvelle chaîne */

    n = last->used++;

    last->m[n] = plen;

    result = 0; // FIXME g_scan_context_get_new_pattern_id(context);

    last->found_id[n] = result;

    ((uint8_t *)&last->found_masks)[n] = (1 << plen);

    for (i = 0; i < plen; i++)
    {
        letter = last->pattern_masks + plain[i];
        ((uint8_t *)letter)[n] &= ~(1 << i);
    }

    return result;

}


/******************************************************************************
*                                                                             *
*  Paramètres  : backend = moteur de recherche à manipuler.                   *
*                context = lieu d'enregistrement des résultats.               *
*                data    = données à analyser.                                *
*                dlen    = quantité de ces données.                           *
*                                                                             *
*  Description : Parcours un contenu binaire à la recherche de motifs.        *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void run_scan_avx512(const GBitapBackend *backend, GScanContext *context, const bin_t *data, phys_t dlen)
{
    const group_manager_avx512_t *manager;  /* Accès simplifié             */

    //register __m512i zero asm("zmm19");                           /* Constante 0 sur 512 bits    */

    //__m512i shift8_mask;                    /* Masque pour décalage manuel */


    size_t k;                               /* Boucle de parcours #1       */
    /*__attribute__((aligned(0x1000)))*/ grouped_strings_avx512_t group;         /* Copie pour accès locaux     */
    //void *grpptr;
    //grouped_strings_avx512_t *_group;       /* Copie pour accès locaux     */

    int ret;


    register __m512i R asm("zmm28");                              /* Résultats courants          */
    register __m512i found_masks asm("zmm21"); /* Vérifications accélérées  */


    register __mmask64 test_mask asm("k6");


    register const bin_t *iter asm("rsi");
    register const bin_t *maxiter/* asm("rdi")*/;
    //phys_t i;                               /* Boucle de parcours #2       */


    //__m512i test;

    __mmask64 mask;                         /* Masque d'accès rapide       */
    size_t j;                               /* Boucle de parcours #3       */


    /* Initialisations diverses */

    manager = &backend->manager_avx512;




    /* Recherches des chaînes de moins de 8 caractères */

    //asm volatile ("nop; nop; nop; nop; nop; nop; nop; ");

    //zero = _mm512_set1_epi8(0);

    //asm volatile ("nop; nop; nop; nop; nop; nop; nop; ");

    //shift8_mask = _mm512_set1_epi8(0x7f);



#define WORK_ON_COPY

    for (k = 0; k < manager->count_8; k++)
    {
#ifdef WORK_ON_COPY
        memcpy(&group, manager->strings_8[k], sizeof(grouped_strings_avx512_t));

#else

        grpptr = alloca(sizeof(grouped_strings_avx512_t) + 0x1000);

        _group = grpptr + 0x1000 - (((unsigned long)grpptr) % 0x1000);

        //_group = manager->strings_8[k];

        memcpy(_group, manager->strings_8[k], sizeof(grouped_strings_avx512_t));

        ret = mlock(_group, sizeof(grouped_strings_avx512_t));

        printf("ret = %d\n", ret);
#endif



        //printf(" --- group %p  --  used: %zu (sz: %zu)\n", &group, group.used, sizeof(grouped_strings_avx512_t));
        //printf(" --- group.used: %zu (sz: %zu)\n", group.used, sizeof(grouped_strings_avx512_t));


        asm volatile
        (
            /*
             * R = _mm512_set1_epi8(~1);
             *
             */

            "movabs         $0xfefefefefefefefe, %%rax ; "
            "vpbroadcastq   %%rax, %[STATE] ; "

            "movabs         $0xffffffffffffffff, %%rax ; "
            "kmovq          %%rax, %[KMASK] ; "

            /*
             *
             */

            "vmovdqa64 %[FOUND_SRC], %[FOUND_DST] ; "

            : [STATE] "=v"(R),
              [KMASK] "=Yk"(test_mask),
              [FOUND_DST] "=v"(found_masks)
#ifdef WORK_ON_COPY
            : [FOUND_SRC] "m"(group.found_masks)
#else
            : [FOUND_SRC] "m"(_group->found_masks)
#endif
            : "memory", "rax"

         );







        //for (i = 0; i < dlen; i++)

        maxiter = data + dlen;

        for (iter = data; iter < maxiter; iter++)
        {

            //printf("--- %llx <-> %c\n", (unsigned long long)(iter - data), *iter);


            asm volatile goto
            (
                /*
                 * R = _mm512_or_si512(R, group.pattern_masks[*iter]);
                 *
                 * Latency : 1-9
                 * Throughput : 0.5
                 * #Uops : 1-2
                 * Port Usage : 1*p05+1*p23
                 *
                 */

                "vpord  %[PATTERN], %[STATE], %[STATE] ; "

                /*
                 * R = _mm512_add_epi8(R, R);
                 *
                 * Latency : 1
                 * Throughput : 0.5
                 * #Uops : 1
                 * Port Usage : 1*p05
                 *
                 */

                "vpaddb %[STATE], %[STATE], %[STATE] ; "

                /*
                 * mask = _mm512_test_epi8_mask(R, group.found_masks);
                 *
                 * Latency : 3
                 * Throughput : 1
                 * #Uops : 2
                 * Port Usage : 1*p23+1*p5
                 *
                 */

                /******************************
                 * Version 0

                ******************/

                //"vptestmb   %[FOUND], %[STATE], %%k7 ; "

                /******************************
                 * Version 1

                "vmovdqa64  %[STATE], %%zmm12 ; "

                "vptestmb   %[FOUND], %%zmm12, %%k7 ; "

                ******************/

                /******************************
                 * Version 2

                "vpandd     %[STATE], %[FOUND], %%zmm12 ; "

                "vpcmpneqb  %[NUL], %%zmm12, %%k7 ; "

                ******************/


                "vmovdqa64  %[STATE], %%zmm12 ; "

                "vptestmb   %[FOUND], %%zmm12, %%k7 ; "


                "ktestq %[KMASK], %%k7 ; "

                "jc %l[next_iter] ; "





                /*
                 * (suite)
                 *
                 * Latency : 3
                 * Throughput : 1
                 * #Uops : 1
                 * Port Usage : 1*p5
                 *
                 */

                "kmovq  %%k7, %[MASK0] ; "

                //"vmovdqa64  %%zmm12, %[OUTPUT] ; "

                //"nop; nop; nop; nop; nop; nop; nop; nop; "
                //"nop; nop; nop; nop; nop; nop; nop; nop; "

                : [STATE] "+v"(R),
                  //[OUTPUT] "=v"(test),
                  [MASK0] "=r"(mask)
                  //[NUL] "=v"(zero)
#ifdef WORK_ON_COPY
                : [PATTERN] "m"(group.pattern_masks[*iter]),
#else
                : [PATTERN] "m"(_group->pattern_masks[*iter]),
#endif
                  [FOUND] "v"(found_masks),
                  [KMASK] "Yk"(test_mask)
                : "memory", "k7", "zmm12"
                : next_iter

             );




            /*
            printf("  found mask: %hhx %hhx %hhx %hhx   %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&group.found_masks)[0],
                   ((uint8_t *)&group.found_masks)[1],
                   ((uint8_t *)&group.found_masks)[2],
                   ((uint8_t *)&group.found_masks)[3],
                   ((uint8_t *)&group.found_masks)[4],
                   ((uint8_t *)&group.found_masks)[5],
                   ((uint8_t *)&group.found_masks)[6],
                   ((uint8_t *)&group.found_masks)[7]);


            printf("        test: %hhx %hhx %hhx %hhx   %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&test)[0],
                   ((uint8_t *)&test)[1],
                   ((uint8_t *)&test)[2],
                   ((uint8_t *)&test)[3],
                   ((uint8_t *)&test)[4],
                   ((uint8_t *)&test)[5],
                   ((uint8_t *)&test)[6],
                   ((uint8_t *)&test)[7]);


            printf("  -> mask: 0x%llx\n", (unsigned long long)mask);
            */


#ifdef WORK_ON_COPY

            //if (mask != 0xffffffffffffffffllu)
                for (j = 0; j < group.used; j++)
                {
                    if ((mask & 0x1) == 0)
                    {
                        //assert((i + 1) >= group.m[j]);

                        /** TODO : update call
                        g_scan_context_register_atom_match(context,
                                                           group.found_id[j],
                                                           (iter - data) + 1 - group.m[j]);
                        **/

                    }

                    mask >>= 1;

                }

#else

#   error "WEFEF"

            if (mask != 0xffffffffffffffffllu)
                for (j = 0; j < _group->used; j++)
                {
                    if ((mask & 0x1) == 0)
                    {
                        //assert((i + 1) >= group.m[j]);

                        /** TODO : update call
                        g_scan_context_register_atom_match(context,
                                                           _group->found_id[j],
                                                           (iter - data) + 1 - _group->m[j]);
                        **/

                    }

                    mask >>= 1;

                }

#endif


            next_iter:

            //;

            //iter++;

        }

    }

}











#if 0











/******************************************************************************
*                                                                             *
*  Paramètres  : backend = moteur de recherche à manipuler.                   *
*                context = lieu d'enregistrement des résultats.               *
*                content = données binaires à analyser.                       *
*                                                                             *
*  Description : Parcours un contenu binaire à la recherche de motifs.        *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void run_scan_avx512____good_asm_perfs(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
    const group_manager_avx512_t *manager;  /* Accès simplifié             */
    phys_t dlen;                            /* Quantité de données         */
    vmpa2t pos;                             /* Point de départ ciblé       */
    const bin_t *data;                      /* Données à analyser          */


    //__m512i shift8_mask;                    /* Masque pour décalage manuel */


    size_t k;                               /* Boucle de parcours #1       */
    grouped_strings_avx512_t group;         /* Copie pour accès locaux     */

    register __m512i found_masks asm("zmm21"); /* Vérifications accélérées  */


    //register volatile __m512i zero/* asm("zmm19")*/;                           /* Constante 0 sur 512 bits    */
    register __m512i R asm("zmm28");                              /* Résultats courants          */

    //int counter;

    const bin_t *iter;
    const bin_t *maxiter;
    //phys_t i;                               /* Boucle de parcours #2       */


    __m512i test;

    __mmask64 mask;                         /* Masque d'accès rapide       */
    size_t j;                               /* Boucle de parcours #3       */


    //register __m512i z30 asm("zmm30");


    //return;


    //counter = 0;

    //return;

    /* Initialisations diverses */

    manager = &backend->manager_avx512;

    dlen = g_binary_content_compute_size(content);

    g_binary_content_compute_start_pos(content, &pos);
    data = g_binary_content_get_raw_access(content, &pos, dlen);

    /* Recherches des chaînes de moins de 8 caractères */

    printf(" --- manager512->count_8: %zu\n", manager->count_8);

    asm volatile ("nop; nop; nop; nop; nop; nop; nop; ");

    //zero = _mm512_set1_epi8(0);

    asm volatile ("nop; nop; nop; nop; nop; nop; nop; ");

    //shift8_mask = _mm512_set1_epi8(0x7f);


    for (k = 0; k < manager->count_8; k++)
    {
        memcpy(&group, manager->strings_8[k], sizeof(grouped_strings_avx512_t));




        //printf(" --- group %p  --  used: %zu (sz: %zu)\n", &group, group.used, sizeof(grouped_strings_avx512_t));
        //printf(" --- group.used: %zu (sz: %zu)\n", group.used, sizeof(grouped_strings_avx512_t));


        asm volatile
        (
            /*
             * R = _mm512_set1_epi8(~1);
             *
             */

            "movabs         $0xfefefefefefefefe, %%rax ; "
            "vpbroadcastq   %%rax, %[STATE] ; "

            /*
             *
             */

            "vmovdqa64 %[FOUND_SRC], %[FOUND_DST] ; "

            : [STATE] "=v"(R),
              [FOUND_DST] "=v"(found_masks)
            : [FOUND_SRC] "m"(group.found_masks)
            : "memory", "rax"

         );







        //for (i = 0; i < dlen; i++)

        maxiter = data + dlen;

        for (iter = data; iter < maxiter; iter++)
        {

            //printf("--- %llx <-> %c\n", (unsigned long long)(iter - data), *iter);


            asm volatile
            (

                /*
                 * R = _mm512_or_si512(R, group.pattern_masks[*iter]);
                 *
                 * Latency : 1-9
                 * Throughput : 0.5
                 * #Uops : 1-2
                 * Port Usage : 1*p05+1*p23
                 *
                 */

                "vpord  %[PATTERN], %[STATE], %[STATE] ; "

                /*
                 * R = _mm512_add_epi8(R, R);
                 *
                 * Latency : 1
                 * Throughput : 0.5
                 * #Uops : 1
                 * Port Usage : 1*p05
                 *
                 */

                "vpaddb   %[STATE], %[STATE], %[STATE] ; "

                /*
                 * mask = _mm512_test_epi8_mask(R, group.found_masks);
                 *
                 * Latency : 3
                 * Throughput : 1
                 * #Uops : 2
                 * Port Usage : 1*p23+1*p5
                 *
                 */

                /******************************
                 * Version 0

                ******************/

                "vptestmb   %[FOUND], %[STATE], %%k7 ; "

                /******************************
                 * Version 1

                "vmovdqa64  %[STATE], %%zmm12 ; "

                "vptestmb   %[FOUND], %%zmm12, %%k0 ; "

                ******************/

                /******************************
                 * Version 2

                "vpandd     %[STATE], %[FOUND], %%zmm12 ; "

                "vpcmpneqb  %[NUL], %%zmm12, %%k7 ; "

                ******************/

                /*
                 * (suite)
                 *
                 * Latency : 3
                 * Throughput : 1
                 * #Uops : 1
                 * Port Usage : 1*p5
                 *
                 */

                "kmovq  %%k7, %[MASK0] ; "

                //"vmovdqa64  %%zmm12, %[OUTPUT] ; "

                //"nop; nop; nop; nop; nop; nop; nop; nop; "
                //"nop; nop; nop; nop; nop; nop; nop; nop; "

                : [STATE] "+v"(R),
                  [OUTPUT] "=v"(test),
                  [MASK0] "=r"(mask)/*,
                  [NUL] "+v"(zero)*/
                : [PATTERN] "v"(group.pattern_masks[*iter]),
                  [FOUND] "v"(found_masks)
                : "memory", "k0", "zmm12"

             );




            /*
            printf("  found mask: %hhx %hhx %hhx %hhx   %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&group.found_masks)[0],
                   ((uint8_t *)&group.found_masks)[1],
                   ((uint8_t *)&group.found_masks)[2],
                   ((uint8_t *)&group.found_masks)[3],
                   ((uint8_t *)&group.found_masks)[4],
                   ((uint8_t *)&group.found_masks)[5],
                   ((uint8_t *)&group.found_masks)[6],
                   ((uint8_t *)&group.found_masks)[7]);


            printf("        test: %hhx %hhx %hhx %hhx   %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&test)[0],
                   ((uint8_t *)&test)[1],
                   ((uint8_t *)&test)[2],
                   ((uint8_t *)&test)[3],
                   ((uint8_t *)&test)[4],
                   ((uint8_t *)&test)[5],
                   ((uint8_t *)&test)[6],
                   ((uint8_t *)&test)[7]);


            printf("  -> mask: 0x%llx\n", (unsigned long long)mask);
            */

#if 0

            /*
            printf("           R: %hhx %hhx %hhx %hhx   %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&R)[0],
                   ((uint8_t *)&R)[1],
                   ((uint8_t *)&R)[2],
                   ((uint8_t *)&R)[3],
                   ((uint8_t *)&R)[4],
                   ((uint8_t *)&R)[5],
                   ((uint8_t *)&R)[6],
                   ((uint8_t *)&R)[7]);

            printf("  found mask: %hhx %hhx %hhx %hhx   %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&group.found_masks)[0],
                   ((uint8_t *)&group.found_masks)[1],
                   ((uint8_t *)&group.found_masks)[2],
                   ((uint8_t *)&group.found_masks)[3],
                   ((uint8_t *)&group.found_masks)[4],
                   ((uint8_t *)&group.found_masks)[5],
                   ((uint8_t *)&group.found_masks)[6],
                   ((uint8_t *)&group.found_masks)[7]);
            */

            /*

            printf("        test: %hhx %hhx %hhx %hhx   %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&test)[0],
                   ((uint8_t *)&test)[1],
                   ((uint8_t *)&test)[2],
                   ((uint8_t *)&test)[3],
                   ((uint8_t *)&test)[4],
                   ((uint8_t *)&test)[5],
                   ((uint8_t *)&test)[6],
                   ((uint8_t *)&test)[7]);

            */

#endif





#   define TEST_MASK 0xffffffffffffffffllu
#   define TEST_BIT 0


            //printf("mask: %llx\n", (unsigned long long)mask);


            if (mask != TEST_MASK)
            {
                //printf("mask: %llx\n", (unsigned long long)mask);

                //counter++;
                //printf("Ouhc: %p - %x\n", &group, *((uint8_t *)&mask));
                //printf("Ouhc: %x\n", 1);
                //asm("vzeroupper;");
                //printf("Ouhc: %hhx\n", R[0]);
                for (j = 0; j < group.used; j++)
                {
                    if ((mask & 0x1) == TEST_BIT)
                    {
                        //assert((i + 1) >= group.m[j]);

                        //printf(">> FOUND %zu @ %x !!!!!!!!!!!!!!\n", j, (unsigned int)i + 1);
                        printf(">> FOUND %zu @ %x !!!!!!!!!!!!!!\n", j, (unsigned int)(iter - data) + 1);


                    }

                    mask >>= 1;
                    //printf("> mask: %llx\n", (unsigned long long)mask);

                }



            }



        }

        //printf("%hhx\n", ((uint8_t *)&R)[0], ((uint8_t *)&mask)[0]);

    }

    //printf("counter=%d\n", counter);


}




/******************************************************************************
*                                                                             *
*  Paramètres  : backend = moteur de recherche à manipuler.                   *
*                context = lieu d'enregistrement des résultats.               *
*                content = données binaires à analyser.                       *
*                                                                             *
*  Description : Parcours un contenu binaire à la recherche de motifs.        *
*                                                                             *
*  Retour      : -                                                            *
*                                                                             *
*  Remarques   : -                                                            *
*                                                                             *
******************************************************************************/

static void run_scan_avx512_best_test(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
    const group_manager_avx512_t *manager;  /* Accès simplifié             */
    phys_t dlen;                            /* Quantité de données         */
    vmpa2t pos;                             /* Point de départ ciblé       */
    const bin_t *data;                      /* Données à analyser          */


    //__m512i shift8_mask;                    /* Masque pour décalage manuel */


    size_t k;                               /* Boucle de parcours #1       */
    grouped_strings_avx512_t group;         /* Copie pour accès locaux     */

    //register __m512i zero;                           /* Constante 0 sur 512 bits    */
    register __m512i R;                              /* Résultats courants          */

    //int counter;

    const bin_t *iter;
    const bin_t *maxiter;
    //phys_t i;                               /* Boucle de parcours #2       */


    //__m512i test;

    __mmask64 mask;                         /* Masque d'accès rapide       */
    size_t j;                               /* Boucle de parcours #3       */

    //return;


    //counter = 0;

    //return;

    /* Initialisations diverses */

    manager = &backend->manager_avx512;

    dlen = g_binary_content_compute_size(content);

    g_binary_content_compute_start_pos(content, &pos);
    data = g_binary_content_get_raw_access(content, &pos, dlen);

    /* Recherches des chaînes de moins de 8 caractères */

    printf(" --- manager512->count_8: %zu\n", manager->count_8);

    //zero = _mm512_set1_epi8(0);

    //shift8_mask = _mm512_set1_epi8(0x7f);



    for (k = 0; k < manager->count_8; k++)
    {
        memcpy(&group, manager->strings_8[k], sizeof(grouped_strings_avx512_t));

        //printf(" --- group %p  --  used: %zu (sz: %zu)\n", &group, group.used, sizeof(grouped_strings_avx512_t));
        //printf(" --- group.used: %zu (sz: %zu)\n", group.used, sizeof(grouped_strings_avx512_t));

        R = _mm512_set1_epi8(~1);



        /* vpord zmm, zmm, zmm : latence 1, 1*p05 */
        //R = _mm512_or_si512(R, group.pattern_masks[data[0]]);

        //for (i = 0; i < dlen; i++)

        maxiter = data + dlen;

        for (iter = data; iter < maxiter; iter++)
        {

            //printf("--- %llx <-> %c\n", (unsigned long long)(iter - data), *iter);


            //R = _mm512_or_si512(R, group.pattern_masks[data[i]]);
            R = _mm512_or_si512(R, group.pattern_masks[*iter]);


#if 1
            /* vpaddb zmm, zmm, zmm : latence 1, 1*p05 */
            R = _mm512_add_epi8(R, R);
#else
            /* vpandd zmm, zmm, zmm : latence 1, 1*p5 */
            R = _mm512_and_si512(R, shift8_mask);
            /* vpslldq zmm, zmm, imm8 : latence 1, 1*p5 */
            R = _mm512_bslli_epi128(R, 1);

#endif

            /*
            printf("           R: %hhx %hhx %hhx %hhx   %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&R)[0],
                   ((uint8_t *)&R)[1],
                   ((uint8_t *)&R)[2],
                   ((uint8_t *)&R)[3],
                   ((uint8_t *)&R)[4],
                   ((uint8_t *)&R)[5],
                   ((uint8_t *)&R)[6],
                   ((uint8_t *)&R)[7]);

            printf("  found mask: %hhx %hhx %hhx %hhx   %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&group.found_masks)[0],
                   ((uint8_t *)&group.found_masks)[1],
                   ((uint8_t *)&group.found_masks)[2],
                   ((uint8_t *)&group.found_masks)[3],
                   ((uint8_t *)&group.found_masks)[4],
                   ((uint8_t *)&group.found_masks)[5],
                   ((uint8_t *)&group.found_masks)[6],
                   ((uint8_t *)&group.found_masks)[7]);
            */

#if 1
            /* vptestmb k, zmm, zmm : latence 3, 1*p5 */
            mask = _mm512_test_epi8_mask(R, group.found_masks);


            //test = _mm512_add_epi64(R, zero);

            //mask = _mm512_test_epi8_mask(test, group.found_masks);





#   define TEST_MASK 0xffffffffffffffffllu
#   define TEST_BIT 0

            /* comparaison : != */


#else
            /* vpandd zmm, zmm, zmm : latence 1, 1*p05 */
            test = _mm512_and_si512(R, group.found_masks);


            printf("        test: %hhx %hhx %hhx %hhx   %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&test)[0],
                   ((uint8_t *)&test)[1],
                   ((uint8_t *)&test)[2],
                   ((uint8_t *)&test)[3],
                   ((uint8_t *)&test)[4],
                   ((uint8_t *)&test)[5],
                   ((uint8_t *)&test)[6],
                   ((uint8_t *)&test)[7]);

            /* vpmovb2m k, zmm : latence 3 (au lieu de 1 !?), 1*p0 */
            //mask = _mm512_movepi8_mask(test);

#   define TEST_MASK 0
#   define TEST_BIT 0


            //test = _mm512_popcnt_epi8(test);

#endif


            //printf("  final mask: %16llx\n", (unsigned long long)mask);



            //R = _mm512_or_si512(R, group.pattern_masks[data[i + 1]]);

#if 1


            if (mask != TEST_MASK)
            {
                //counter++;
                //printf("Ouhc: %p - %x\n", &group, *((uint8_t *)&mask));
                printf("Ouhc: %p\n", &group);
                //printf("Ouhc: %hhx\n", R[0]);
                for (j = 0; j < group.used; j++)
                {
                    if ((mask & 0x1) == TEST_BIT)
                    {
                        //assert((i + 1) >= group.m[j]);

                        //printf(">> FOUND %zu @ %x !!!!!!!!!!!!!!\n", j, (unsigned int)i + 1);
                        printf(">> FOUND %zu @ %x !!!!!!!!!!!!!!\n", j, (unsigned int)(iter - data) + 1);


                    }

                    mask >>= 1;

                }



            }


#else

            if (_mm512_reduce_or_epi64(test) != 0)
            {
                for (j = 0; j < group.used; j++)
                {
                    if (((uint8_t *)&test)[j] == 0)
                    {
                        //assert((i + 1) >= group.m[j]);

                        printf(">> FOUND %zu @ %x !!!!!!!!!!!!!!\n", j, (unsigned int)i + 1);

                    }


                }

            }

#endif


        }

        //printf("%hhx\n", ((uint8_t *)&R)[0], ((uint8_t *)&mask)[0]);

    }

    //printf("counter=%d\n", counter);


}





static void run_scan_avx512__saved(const GBitapBackend *backend, GScanContext *context, GBinContent *content)
{
    const group_manager_avx512_t *manager;  /* Accès simplifié             */
    phys_t dlen;                            /* Quantité de données         */
    vmpa2t pos;                             /* Point de départ ciblé       */
    const bin_t *data;                      /* Données à analyser          */


    __m512i shift8_mask;                    /* Masque pour décalage manuel */


    size_t k;                               /* Boucle de parcours #1       */
    grouped_strings_avx512_t group;         /* Copie pour accès locaux     */


    __m512i R;                              /* Résultats courants          */

    //int counter;

    phys_t i;                               /* Boucle de parcours #2       */


    __m512i test;

    __mmask64 mask;                         /* Masque d'accès rapide       */
    size_t j;                               /* Boucle de parcours #3       */



    //counter = 0;

    //return;

    /* Initialisations diverses */

    manager = &backend->manager_avx512;

    dlen = g_binary_content_compute_size(content);

    g_binary_content_compute_start_pos(content, &pos);
    data = g_binary_content_get_raw_access(content, &pos, dlen);

    /* Recherches des chaînes de moins de 8 caractères */

    printf(" --- manager512->count_8: %zu\n", manager->count_8);



    shift8_mask = _mm512_set1_epi8(0x7f);


    for (k = 0; k < manager->count_8; k++)
    {
        memcpy(&group, manager->strings_8[k], sizeof(grouped_strings_avx512_t));

        //printf(" --- group %p  --  used: %zu (sz: %zu)\n", &group, group.used, sizeof(grouped_strings_avx512_t));
        //printf(" --- group.used: %zu (sz: %zu)\n", group.used, sizeof(grouped_strings_avx512_t));

        R = _mm512_set1_epi8(~1);

        /* vpord zmm, zmm, zmm : latence 1, 1*p05 */
        R = _mm512_or_si512(R, group.pattern_masks[data[0]]);

        for (i = 0; i < dlen; i++)
        {

            /*
            printf("--- %llx <-> %c\n", (unsigned long long)i, data[i]);

            printf("  R: %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&R)[0],
                   ((uint8_t *)&R)[1],
                   ((uint8_t *)&R)[2],
                   ((uint8_t *)&R)[3]);

            printf("  mask: %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&group.pattern_masks[data[i]])[0],
                   ((uint8_t *)&group.pattern_masks[data[i]])[1],
                   ((uint8_t *)&group.pattern_masks[data[i]])[2],
                   ((uint8_t *)&group.pattern_masks[data[i]])[3]);
            */

            //R = _mm512_or_si512(R, group.pattern_masks[data[i]]);

            /*
            printf("  R: %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&R)[0],
                   ((uint8_t *)&R)[1],
                   ((uint8_t *)&R)[2],
                   ((uint8_t *)&R)[3]);
            */

#if 1
            /* vpaddb zmm, zmm, zmm : latence 1, 1*p05 */
            R = _mm512_add_epi8(R, R);
#else
            /* vpandd zmm, zmm, zmm : latence 1, 1*p5 */
            R = _mm512_and_si512(R, shift8_mask);
            /* vpslldq zmm, zmm, imm8 : latence 1, 1*p5 */
            R = _mm512_bslli_epi128(R, 1);

#endif

#if 1
            /* vptestmb k, zmm, zmm : latence 3, 1*p5 */
            mask = _mm512_test_epi8_mask(R, group.found_masks);
#else
            test = _mm512_and_si512(R, group.found_masks);
            test = _mm512_popcnt_epi8(test);

#endif

            /*
            printf("  found mask: %hhx %hhx %hhx %hhx\n",
                   ((uint8_t *)&group.found_masks)[0],
                   ((uint8_t *)&group.found_masks)[1],
                   ((uint8_t *)&group.found_masks)[2],
                   ((uint8_t *)&group.found_masks)[3]);

            printf("  final mask: %16llx\n", (unsigned long long)mask);
            */


            R = _mm512_or_si512(R, group.pattern_masks[data[i + 1]]);

#if 1

            if (mask != 0xffffffffffffffffllu)
            {
                //counter++;
                //printf("Ouhc: %p - %x\n", &group, *((uint8_t *)&mask));
                //printf("Ouhc: %p\n", &group);
                for (j = 0; j < group.used; j++)
                {
                    if ((mask & 0x1) == 0)
                    {
                        //assert((i + 1) >= group.m[j]);

                        printf(">> FOUND %zu @ %x !!!!!!!!!!!!!!\n", j, (unsigned int)i + 1);


                    }

                    mask >>= 1;

                }



            }


#else

            if (_mm512_reduce_or_epi64(test) != 0)
            {
                for (j = 0; j < group.used; j++)
                {
                    if (((uint8_t *)&test)[j] == 0)
                    {
                        //assert((i + 1) >= group.m[j]);

                        printf(">> FOUND %zu @ %x !!!!!!!!!!!!!!\n", j, (unsigned int)i + 1);

                    }


                }

            }

#endif


        }

        //printf("%hhx\n", ((uint8_t *)&R)[0], ((uint8_t *)&mask)[0]);

    }

    //printf("counter=%d\n", counter);


}
#endif



#endif