From 30e56f25772247d76c7775d32ccd5b5a0661d3bf Mon Sep 17 00:00:00 2001
From: Cyrille Bagard <nocbos@gmail.com>
Date: Sun, 18 Feb 2024 23:10:08 +0100
Subject: Read escaped strings faster.

---
 src/analysis/scan/tokens.l | 429 ++++++++++++++++++++++++++-------------------
 1 file changed, 247 insertions(+), 182 deletions(-)

diff --git a/src/analysis/scan/tokens.l b/src/analysis/scan/tokens.l
index 86e3e92..ab881c1 100644
--- a/src/analysis/scan/tokens.l
+++ b/src/analysis/scan/tokens.l
@@ -14,6 +14,31 @@
 #include <stdlib.h>
 
 
+/* Read head for conversions */
+typedef union _read_ptr_t
+{
+    const uint8_t *byte_pos;                /* Reading by 8-bit blocks     */
+    const uint16_t *hword_pos;              /* Reading by 16-bit blocks    */
+
+} read_ptr_t;
+
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+    /* MAKE_HWORD(ch1, ch2): uint16_t whose in-memory bytes are ch1 then ch2 */
+#   define MAKE_HWORD(ch1, ch2) ((uint16_t)(((ch2) << 8) | (ch1)))
+
+#elif __BYTE_ORDER == __BIG_ENDIAN
+
+#   define MAKE_HWORD(ch1, ch2) ((uint16_t)(((ch1) << 8) | (ch2)))
+
+#else
+
+    /* __PDP_ENDIAN and the like... */
+#   error "Congratulations! Your byte order is not supported!"
+
+#endif
+
+
 
 /******************************************************************************
 *                                                                             *
@@ -29,124 +54,143 @@
 *                                                                             *
 ******************************************************************************/
 
-static void rost_unescape_string_bytes(const char *src, size_t len, sized_string_t *out)
+static void rost_unescape_string(const char *src, size_t len, sized_string_t *out)
 {
-    size_t i;                               /* Boucle de parcours          */
+    read_ptr_t reader;                      /* Read head                   */
+    const bin_t *max;                       /* End of the scan             */
+    uint16_t half;                          /* Half word                   */
     bin_t byte;                             /* Octet à analyser            */
-    bin_t next;                             /* Octet suivant               */
+    bin_t *writer;                          /* Write head                  */
 
-    out->len = 0;
+    reader.byte_pos = (const uint8_t *)src;
+    max = reader.byte_pos + len;
 
-    for (i = 0; i < len; i++)
-    {
-        byte = src[i];
+    writer = out->bin_data;
 
-        switch (byte)
+    while (reader.byte_pos < max)
+    {
+        /**
+         * Reading two bytes at a time is not always meaningful: for "\nabc",
+         * the last read covers 'c"', including the '"' left out of the call.
+         * A pair not starting with a backslash hits default: one byte is used.
+         * NOTE(review): as default advances by one byte, the 16-bit loads can
+         * be unaligned and read one byte past len -- confirm this is safe.
+         */
+        switch (*reader.hword_pos)
         {
-            case '\\':
-
-                next = src[i + 1];
-
-                switch (next)
-                {
-                    case 'a':
-                        out->data[out->len++] = '\a';
-                        break;
-
-                    case 'b':
-                        out->data[out->len++] = '\b';
-                        break;
-
-                    case 't':
-                        out->data[out->len++] = '\t';
-                        break;
-
-                    case 'n':
-                        out->data[out->len++] = '\n';
-                        break;
-
-                    case 'v':
-                        out->data[out->len++] = '\v';
-                        break;
-
-                    case 'f':
-                        out->data[out->len++] = '\f';
-                        break;
-
-                    case 'r':
-                        out->data[out->len++] = '\r';
-                        break;
-
-                    case 'e':
-                        out->data[out->len++] = '\e';
-                        break;
-
-                    case '"':
-                        out->data[out->len++] = '\"';
-                        break;
+            case MAKE_HWORD('\\', 'a'):
+                reader.hword_pos++;
+                *writer++ = '\a';
+                break;
 
-                    case '\\':
-                        out->data[out->len++] = '\\';
-                        break;
+            case MAKE_HWORD('\\', 'b'):
+                reader.hword_pos++;
+                *writer++ = '\b';
+                break;
 
-                    case 'x':
+            case MAKE_HWORD('\\', 't'):
+                reader.hword_pos++;
+                *writer++ = '\t';
+                break;
 
-                        next = src[i + 2];
+            case MAKE_HWORD('\\', 'n'):
+                reader.hword_pos++;
+                *writer++ = '\n';
+                break;
 
-                        switch (next)
-                        {
-                            case '0' ... '9':
-                                out->data[out->len] = (next - '0');
-                                break;
+            case MAKE_HWORD('\\', 'v'):
+                reader.hword_pos++;
+                *writer++ = '\v';
+                break;
 
-                            case 'A' ... 'F':
-                                out->data[out->len] = 0xa + (next - 'A');
-                                break;
+            case MAKE_HWORD('\\', 'f'):
+                reader.hword_pos++;
+                *writer++ = '\f';
+                break;
 
-                            case 'a' ... 'f':
-                                out->data[out->len] = 0xa + (next - 'a');
-                                break;
+            case MAKE_HWORD('\\', 'r'):
+                reader.hword_pos++;
+                *writer++ = '\r';
+                break;
 
-                        }
+            case MAKE_HWORD('\\', 'e'):
+                reader.hword_pos++;
+                *writer++ = '\e';
+                break;
 
-                        out->data[out->len] <<= 4;
+            case MAKE_HWORD('\\', '"'):
+                reader.hword_pos++;
+                *writer++ = '\"';
+                break;
 
-                        next = src[i + 3];
+            case MAKE_HWORD('\\', '\\'):
+                reader.hword_pos++;
+                *writer++ = '\\';
+                break;
 
-                        switch (next)
-                        {
-                            case '0' ... '9':
-                                out->data[out->len] |= (next - '0');
-                                break;
+            case MAKE_HWORD('\\', 'x'):
+                reader.hword_pos++;
+
+                /**
+                 * The regular expressions leading to a call of this function
+                 * restrict the possible characters to three sets: digits,
+                 * uppercase letters and lowercase letters.
+                 *
+                 * Folding letters to lowercase reduces the possibilities to
+                 * two sets only, which simplifies the filtering rules: no
+                 * switch/case is required here!
+                 */
+
+                half = *reader.hword_pos++; /* Both hex digits at once */
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+                byte = (half & 0xff);
+#elif __BYTE_ORDER == __BIG_ENDIAN
+                byte = (half >> 8);
+#endif
 
-                            case 'A' ... 'F':
-                                out->data[out->len] |= 0xa + (next - 'A');
-                                break;
+                /* '0' ... '9' */
+                if (byte <= '9')
+                    *writer = (byte - '0');
 
-                            case 'a' ... 'f':
-                                out->data[out->len] |= 0xa + (next - 'a');
-                                break;
+                /* 'A' ... 'F' || 'a' ... 'f' */
+                else
+                {
+                    byte |= 0x20;          /* Fold ASCII to lowercase     */
+                    *writer = 0xa + (byte - 'a');
+                }
 
-                        }
+                *writer <<= 4;
 
-                        out->len++;
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+                byte = (half >> 8);
+#elif __BYTE_ORDER == __BIG_ENDIAN
+                byte = (half & 0xff);
+#endif
 
-                        i += 2;
-                        break;
+                /* '0' ... '9' */
+                if (byte <= '9')
+                    *writer++ |= (byte - '0');
 
+                /* 'A' ... 'F' || 'a' ... 'f' */
+                else
+                {
+                    byte |= 0x20;          /* Fold ASCII to lowercase     */
+                    *writer++ |= 0xa + (byte - 'a');
                 }
 
-                i++;
                 break;
 
             default:
-                out->data[out->len++] = byte;
+                *writer++ = *reader.byte_pos++;
                 break;
 
         }
 
     }
 
+    out->len = writer - out->bin_data;
+
 }
 
 
@@ -164,132 +208,153 @@ static void rost_unescape_string_bytes(const char *src, size_t len, sized_string
 *                                                                             *
 ******************************************************************************/
 
-static void rost_unescape_bytes(const char *src, size_t len, sized_string_t *out)
+static void rost_unescape_regex(const char *src, size_t len, sized_string_t *out)
 {
-    size_t i;                               /* Boucle de parcours          */
+    read_ptr_t reader;                      /* Read head                   */
+    const bin_t *max;                       /* End of the scan             */
+    uint16_t half;                          /* Half word                   */
     bin_t byte;                             /* Octet à analyser            */
-    bin_t next;                             /* Octet suivant               */
+    bin_t *writer;                          /* Write head                  */
 
-    out->len = 0;
+    reader.byte_pos = (const uint8_t *)src;
+    max = reader.byte_pos + len;
 
-    for (i = 0; i < len; i++)
-    {
-        byte = src[i];
+    writer = out->bin_data;
 
-        switch (byte)
+    while (reader.byte_pos < max)
+    {
+        /**
+         * Reading two bytes at a time is not always meaningful: for "\nabc",
+         * the last read covers 'c"', including the '"' left out of the call.
+         * A pair not starting with a backslash hits default: one byte is used.
+         * NOTE(review): as default advances by one byte, the 16-bit loads can
+         * be unaligned and read one byte past len -- confirm this is safe.
+         */
+        switch (*reader.hword_pos)
         {
-            case '\\':
-
-                next = src[i + 1];
-
-                switch (next)
-                {
-                    case 'a':
-                        out->data[out->len++] = '\a';
-                        break;
-
-                    case 'b':
-                        out->data[out->len++] = '\b';
-                        break;
-
-                    case 't':
-                        out->data[out->len++] = '\t';
-                        break;
-
-                    case 'n':
-                        out->data[out->len++] = '\n';
-                        break;
-
-                    case 'v':
-                        out->data[out->len++] = '\v';
-                        break;
-
-                    case 'f':
-                        out->data[out->len++] = '\f';
-                        break;
-
-                    case 'r':
-                        out->data[out->len++] = '\r';
-                        break;
-
-                    case 'e':
-                        out->data[out->len++] = '\e';
-                        break;
-
-                    case '"':
-                        out->data[out->len++] = '\"';
-                        break;
+            case MAKE_HWORD('\\', 'a'):
+                reader.hword_pos++;
+                *writer++ = '\a';
+                break;
 
-                    case '\\':
-                        out->data[out->len++] = '\\';
-                        break;
+            case MAKE_HWORD('\\', 'b'):
+                reader.hword_pos++;
+                *writer++ = '\b';
+                break;
 
-                    case 'x':
+            case MAKE_HWORD('\\', 't'):
+                reader.hword_pos++;
+                *writer++ = '\t';
+                break;
 
-                        next = src[i + 2];
+            case MAKE_HWORD('\\', 'n'):
+                reader.hword_pos++;
+                *writer++ = '\n';
+                break;
 
-                        switch (next)
-                        {
-                            case '0' ... '9':
-                                out->data[out->len] = (next - '0');
-                                break;
+            case MAKE_HWORD('\\', 'v'):
+                reader.hword_pos++;
+                *writer++ = '\v';
+                break;
 
-                            case 'A' ... 'F':
-                                out->data[out->len] = 0xa + (next - 'A');
-                                break;
+            case MAKE_HWORD('\\', 'f'):
+                reader.hword_pos++;
+                *writer++ = '\f';
+                break;
 
-                            case 'a' ... 'f':
-                                out->data[out->len] = 0xa + (next - 'a');
-                                break;
+            case MAKE_HWORD('\\', 'r'):
+                reader.hword_pos++;
+                *writer++ = '\r';
+                break;
 
-                        }
+            case MAKE_HWORD('\\', 'e'):
+                reader.hword_pos++;
+                *writer++ = '\e';
+                break;
 
-                        out->data[out->len] <<= 4;
+            case MAKE_HWORD('\\', '"'):
+                reader.hword_pos++;
+                *writer++ = '\"';
+                break;
 
-                        next = src[i + 3];
+            case MAKE_HWORD('\\', '\\'):
+                reader.hword_pos++;
+                *writer++ = '\\';
+                break;
 
-                        switch (next)
-                        {
-                            case '0' ... '9':
-                                out->data[out->len] |= (next - '0');
-                                break;
+            case MAKE_HWORD('\\', 'x'):
+                reader.hword_pos++;
+
+                /**
+                 * The regular expressions leading to a call of this function
+                 * restrict the possible characters to three sets: digits,
+                 * uppercase letters and lowercase letters.
+                 *
+                 * Folding letters to lowercase reduces the possibilities to
+                 * two sets only, which simplifies the filtering rules: no
+                 * switch/case is required here!
+                 */
+
+                half = *reader.hword_pos++; /* Both hex digits at once */
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+                byte = (half & 0xff);
+#elif __BYTE_ORDER == __BIG_ENDIAN
+                byte = (half >> 8);
+#endif
 
-                            case 'A' ... 'F':
-                                out->data[out->len] |= 0xa + (next - 'A');
-                                break;
+                /* '0' ... '9' */
+                if (byte <= '9')
+                    *writer = (byte - '0');
 
-                            case 'a' ... 'f':
-                                out->data[out->len] |= 0xa + (next - 'a');
-                                break;
+                /* 'A' ... 'F' || 'a' ... 'f' */
+                else
+                {
+                    byte |= 0x20;          /* Fold ASCII to lowercase     */
+                    *writer = 0xa + (byte - 'a');
+                }
 
-                        }
+                *writer <<= 4;
 
-                        out->len++;
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+                byte = (half >> 8);
+#elif __BYTE_ORDER == __BIG_ENDIAN
+                byte = (half & 0xff);
+#endif
 
-                        i += 2;
-                        break;
+                /* '0' ... '9' */
+                if (byte <= '9')
+                    *writer++ |= (byte - '0');
 
-                    case '{':
-                        out->data[out->len++] = '{';
-                        break;
+                /* 'A' ... 'F' || 'a' ... 'f' */
+                else
+                {
+                    byte |= 0x20;          /* Fold ASCII to lowercase     */
+                    *writer++ |= 0xa + (byte - 'a');
+                }
 
-                    case '}':
-                        out->data[out->len++] = '}';
-                        break;
+                break;
 
-                }
+            case MAKE_HWORD('\\', '{'):
+                reader.hword_pos++;
+                *writer++ = '{';
+                break;
 
-                i++;
+            case MAKE_HWORD('\\', '}'):
+                reader.hword_pos++;
+                *writer++ = '}';
                 break;
 
             default:
-                out->data[out->len++] = byte;
+                *writer++ = *reader.byte_pos++;
                 break;
 
         }
 
     }
 
+    out->len = writer - out->bin_data;
+
 }
 
 
@@ -394,7 +459,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
 <inc_path>\"{str_mixed}+\"          {
                                         POP_STATE;
 
-                                        rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0);
+                                        rost_unescape_string(yytext + 1, yyleng - 2, tmp_0);
 
 #ifndef NDEBUG
                                         /* Pour rendre plus lisibles les impressions de débogage */
@@ -514,7 +579,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
        <meta_value>\"{str_mixed}*\" {
                                         POP_STATE;
 
-                                        rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0);
+                                        rost_unescape_string(yytext + 1, yyleng - 2, tmp_0);
 
 #ifndef NDEBUG
                                         /* Pour rendre plus lisibles les impressions de débogage */
@@ -551,7 +616,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
                                     }
 
 <condition>\"{str_mixed}*\"         {
-                                        rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0);
+                                        rost_unescape_string(yytext + 1, yyleng - 2, tmp_0);
 
 #ifndef NDEBUG
                                         /* Pour rendre plus lisibles les impressions de débogage */
@@ -587,7 +652,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
 <bytes_value>\"{str_mixed}+\"       {
                                         POP_STATE;
 
-                                        rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0);
+                                        rost_unescape_string(yytext + 1, yyleng - 2, tmp_0);
 
 #ifndef NDEBUG
                                         /* Pour rendre plus lisibles les impressions de débogage */
@@ -812,7 +877,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
                    <bytes_regex>"." { return DOT; }
 
     <bytes_regex>({regular_chars})+ {
-                                        rost_unescape_bytes(yytext, yyleng, tmp_0);
+                                        rost_unescape_regex(yytext, yyleng, tmp_0);
 
                                         printf(" regular: '%s'\n", yytext);
 
-- 
cgit v0.11.2-87-g4458