From 30e56f25772247d76c7775d32ccd5b5a0661d3bf Mon Sep 17 00:00:00 2001 From: Cyrille Bagard Date: Sun, 18 Feb 2024 23:10:08 +0100 Subject: Read escaped strings faster. --- src/analysis/scan/tokens.l | 429 ++++++++++++++++++++++++++------------------- 1 file changed, 247 insertions(+), 182 deletions(-) diff --git a/src/analysis/scan/tokens.l b/src/analysis/scan/tokens.l index 86e3e92..ab881c1 100644 --- a/src/analysis/scan/tokens.l +++ b/src/analysis/scan/tokens.l @@ -14,6 +14,31 @@ #include +/* Tête de lecture pour conversions */ +typedef union _read_ptr_t +{ + const uint8_t *byte_pos; /* Lecture par blocs de 8 bits */ + const uint16_t *hword_pos; /* Lecture par blocs de 16 bits*/ + +} read_ptr_t; + + +#if __BYTE_ORDER == __LITTLE_ENDIAN + +# define MAKE_HWORD(ch1, ch2) ((uint16_t)(ch2 << 8 | ch1)) + +#elif __BYTE_ORDER == __BIG_ENDIAN + +# define MAKE_HWORD(ch1, ch2) ((uint16_t)(ch1 << 8 | ch2)) + +#else + + /* __PDP_ENDIAN et Cie... */ +# error "Congratulations! Your byte order is not supported!" 
+ +#endif + + /****************************************************************************** * * @@ -29,124 +54,143 @@ * * ******************************************************************************/ -static void rost_unescape_string_bytes(const char *src, size_t len, sized_string_t *out) +static void rost_unescape_string(const char *src, size_t len, sized_string_t *out) { - size_t i; /* Boucle de parcours */ + read_ptr_t reader; /* Tête de lecture */ + const bin_t *max; /* Fin du parcours */ + uint16_t half; /* Moitié de mot */ bin_t byte; /* Octet à analyser */ - bin_t next; /* Octet suivant */ + bin_t *writer; /* Tête d'écriture */ - out->len = 0; + reader.byte_pos = (const uint8_t *)src; + max = reader.byte_pos + len; - for (i = 0; i < len; i++) - { - byte = src[i]; + writer = out->bin_data; - switch (byte) + while (reader.byte_pos < max) + { + /** + * La lecture par groupes de deux octets n'est pas forcément toujours + * logique : pour "\nabc", la dernière lecture va considérer 'c"', + * incluant ainsi le caractère '"' qui a été écarté pour l'appel. + * + * Le code est cependant suffisamment souple pour ignorer le superflu. 
+ */ + switch (*reader.hword_pos) { - case '\\': - - next = src[i + 1]; - - switch (next) - { - case 'a': - out->data[out->len++] = '\a'; - break; - - case 'b': - out->data[out->len++] = '\b'; - break; - - case 't': - out->data[out->len++] = '\t'; - break; - - case 'n': - out->data[out->len++] = '\n'; - break; - - case 'v': - out->data[out->len++] = '\v'; - break; - - case 'f': - out->data[out->len++] = '\f'; - break; - - case 'r': - out->data[out->len++] = '\r'; - break; - - case 'e': - out->data[out->len++] = '\e'; - break; - - case '"': - out->data[out->len++] = '\"'; - break; + case MAKE_HWORD('\\', 'a'): + reader.hword_pos++; + *writer++ = '\a'; + break; - case '\\': - out->data[out->len++] = '\\'; - break; + case MAKE_HWORD('\\', 'b'): + reader.hword_pos++; + *writer++ = '\b'; + break; - case 'x': + case MAKE_HWORD('\\', 't'): + reader.hword_pos++; + *writer++ = '\t'; + break; - next = src[i + 2]; + case MAKE_HWORD('\\', 'n'): + reader.hword_pos++; + *writer++ = '\n'; + break; - switch (next) - { - case '0' ... '9': - out->data[out->len] = (next - '0'); - break; + case MAKE_HWORD('\\', 'v'): + reader.hword_pos++; + *writer++ = '\v'; + break; - case 'A' ... 'F': - out->data[out->len] = 0xa + (next - 'A'); - break; + case MAKE_HWORD('\\', 'f'): + reader.hword_pos++; + *writer++ = '\f'; + break; - case 'a' ... 'f': - out->data[out->len] = 0xa + (next - 'a'); - break; + case MAKE_HWORD('\\', 'r'): + reader.hword_pos++; + *writer++ = '\r'; + break; - } + case MAKE_HWORD('\\', 'e'): + reader.hword_pos++; + *writer++ = '\e'; + break; - out->data[out->len] <<= 4; + case MAKE_HWORD('\\', '"'): + reader.hword_pos++; + *writer++ = '\"'; + break; - next = src[i + 3]; + case MAKE_HWORD('\\', '\\'): + reader.hword_pos++; + *writer++ = '\\'; + break; - switch (next) - { - case '0' ... 
'9': - out->data[out->len] |= (next - '0'); - break; + case MAKE_HWORD('\\', 'x'): + reader.hword_pos++; + + /** + * Le jeu des expressions régulières qui amène à l'appel de + * cette fonction limite les caractères possibles à trois + * ensembles : chiffres et lettres en majuscules et minuscules. + * + * La bascule des lettres en minuscules ramène les possibles + * à deux ensembles uniquement, simplifiant ainsi les règles + * de filtrage : aucun switch case n'est ainsi requis ! + */ + + half = *reader.hword_pos++; + +#if __BYTE_ORDER == __LITTLE_ENDIAN + byte = (half & 0xff); +#elif __BYTE_ORDER == __BIG_ENDIAN + byte = (half >> 8); +#endif - case 'A' ... 'F': - out->data[out->len] |= 0xa + (next - 'A'); - break; + /* '0' ... '9' */ + if (byte <= '9') + *writer = (byte - '0'); - case 'a' ... 'f': - out->data[out->len] |= 0xa + (next - 'a'); - break; + /* 'A' ... 'F' || 'a' ... 'f' */ + else + { + byte |= 0x20; + *writer = 0xa + (byte - 'a'); + } - } + *writer <<= 4; - out->len++; +#if __BYTE_ORDER == __LITTLE_ENDIAN + byte = (half >> 8); +#elif __BYTE_ORDER == __BIG_ENDIAN + byte = (half & 0xff); +#endif - i += 2; - break; + /* '0' ... '9' */ + if (byte <= '9') + *writer++ |= (byte - '0'); + /* 'A' ... 'F' || 'a' ... 
'f' */ + else + { + byte |= 0x20; + *writer++ |= 0xa + (byte - 'a'); } - i++; break; default: - out->data[out->len++] = byte; + *writer++ = *reader.byte_pos++; break; } } + out->len = writer - out->bin_data; + } @@ -164,132 +208,153 @@ static void rost_unescape_string_bytes(const char *src, size_t len, sized_string * * ******************************************************************************/ -static void rost_unescape_bytes(const char *src, size_t len, sized_string_t *out) +static void rost_unescape_regex(const char *src, size_t len, sized_string_t *out) { - size_t i; /* Boucle de parcours */ + read_ptr_t reader; /* Tête de lecture */ + const bin_t *max; /* Fin du parcours */ + uint16_t half; /* Moitié de mot */ bin_t byte; /* Octet à analyser */ - bin_t next; /* Octet suivant */ + bin_t *writer; /* Tête d'écriture */ - out->len = 0; + reader.byte_pos = (const uint8_t *)src; + max = reader.byte_pos + len; - for (i = 0; i < len; i++) - { - byte = src[i]; + writer = out->bin_data; - switch (byte) + while (reader.byte_pos < max) + { + /** + * La lecture par groupes de deux octets n'est pas forcément toujours + * logique : pour "\nabc", la dernière lecture va considérer 'c"', + * incluant ainsi le caractère '"' qui a été écarté pour l'appel. + * + * Le code est cependant suffisamment souple pour ignorer le superflu. 
+ */ + switch (*reader.hword_pos) { - case '\\': - - next = src[i + 1]; - - switch (next) - { - case 'a': - out->data[out->len++] = '\a'; - break; - - case 'b': - out->data[out->len++] = '\b'; - break; - - case 't': - out->data[out->len++] = '\t'; - break; - - case 'n': - out->data[out->len++] = '\n'; - break; - - case 'v': - out->data[out->len++] = '\v'; - break; - - case 'f': - out->data[out->len++] = '\f'; - break; - - case 'r': - out->data[out->len++] = '\r'; - break; - - case 'e': - out->data[out->len++] = '\e'; - break; - - case '"': - out->data[out->len++] = '\"'; - break; + case MAKE_HWORD('\\', 'a'): + reader.hword_pos++; + *writer++ = '\a'; + break; - case '\\': - out->data[out->len++] = '\\'; - break; + case MAKE_HWORD('\\', 'b'): + reader.hword_pos++; + *writer++ = '\b'; + break; - case 'x': + case MAKE_HWORD('\\', 't'): + reader.hword_pos++; + *writer++ = '\t'; + break; - next = src[i + 2]; + case MAKE_HWORD('\\', 'n'): + reader.hword_pos++; + *writer++ = '\n'; + break; - switch (next) - { - case '0' ... '9': - out->data[out->len] = (next - '0'); - break; + case MAKE_HWORD('\\', 'v'): + reader.hword_pos++; + *writer++ = '\v'; + break; - case 'A' ... 'F': - out->data[out->len] = 0xa + (next - 'A'); - break; + case MAKE_HWORD('\\', 'f'): + reader.hword_pos++; + *writer++ = '\f'; + break; - case 'a' ... 'f': - out->data[out->len] = 0xa + (next - 'a'); - break; + case MAKE_HWORD('\\', 'r'): + reader.hword_pos++; + *writer++ = '\r'; + break; - } + case MAKE_HWORD('\\', 'e'): + reader.hword_pos++; + *writer++ = '\e'; + break; - out->data[out->len] <<= 4; + case MAKE_HWORD('\\', '"'): + reader.hword_pos++; + *writer++ = '\"'; + break; - next = src[i + 3]; + case MAKE_HWORD('\\', '\\'): + reader.hword_pos++; + *writer++ = '\\'; + break; - switch (next) - { - case '0' ... 
'9': - out->data[out->len] |= (next - '0'); - break; + case MAKE_HWORD('\\', 'x'): + reader.hword_pos++; + + /** + * Le jeu des expressions régulières qui amène à l'appel de + * cette fonction limite les caractères possibles à trois + * ensembles : chiffres et lettres en majuscules et minuscules. + * + * La bascule des lettres en minuscules ramène les possibles + * à deux ensembles uniquement, simplifiant ainsi les règles + * de filtrage : aucun switch case n'est ainsi requis ! + */ + + half = *reader.hword_pos++; + +#if __BYTE_ORDER == __LITTLE_ENDIAN + byte = (half & 0xff); +#elif __BYTE_ORDER == __BIG_ENDIAN + byte = (half >> 8); +#endif - case 'A' ... 'F': - out->data[out->len] |= 0xa + (next - 'A'); - break; + /* '0' ... '9' */ + if (byte <= '9') + *writer = (byte - '0'); - case 'a' ... 'f': - out->data[out->len] |= 0xa + (next - 'a'); - break; + /* 'A' ... 'F' || 'a' ... 'f' */ + else + { + byte |= 0x20; + *writer = 0xa + (byte - 'a'); + } - } + *writer <<= 4; - out->len++; +#if __BYTE_ORDER == __LITTLE_ENDIAN + byte = (half >> 8); +#elif __BYTE_ORDER == __BIG_ENDIAN + byte = (half & 0xff); +#endif - i += 2; - break; + /* '0' ... '9' */ + if (byte <= '9') + *writer++ |= (byte - '0'); - case '{': - out->data[out->len++] = '{'; - break; + /* 'A' ... 'F' || 'a' ... 
'f' */ + else + { + byte |= 0x20; + *writer++ |= 0xa + (byte - 'a'); + } - case '}': - out->data[out->len++] = '}'; - break; + break; - } + case MAKE_HWORD('\\', '{'): + reader.hword_pos++; + *writer++ = '{'; + break; - i++; + case MAKE_HWORD('\\', '}'): + reader.hword_pos++; + *writer++ = '}'; break; default: - out->data[out->len++] = byte; + *writer++ = *reader.byte_pos++; break; } } + out->len = writer - out->bin_data; + } @@ -394,7 +459,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]* \"{str_mixed}+\" { POP_STATE; - rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0); + rost_unescape_string(yytext + 1, yyleng - 2, tmp_0); #ifndef NDEBUG /* Pour rendre plus lisibles les impressions de débogage */ @@ -514,7 +579,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]* \"{str_mixed}*\" { POP_STATE; - rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0); + rost_unescape_string(yytext + 1, yyleng - 2, tmp_0); #ifndef NDEBUG /* Pour rendre plus lisibles les impressions de débogage */ @@ -551,7 +616,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]* } \"{str_mixed}*\" { - rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0); + rost_unescape_string(yytext + 1, yyleng - 2, tmp_0); #ifndef NDEBUG /* Pour rendre plus lisibles les impressions de débogage */ @@ -587,7 +652,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]* \"{str_mixed}+\" { POP_STATE; - rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0); + rost_unescape_string(yytext + 1, yyleng - 2, tmp_0); #ifndef NDEBUG /* Pour rendre plus lisibles les impressions de débogage */ @@ -812,7 +877,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]* "." { return DOT; } ({regular_chars})+ { - rost_unescape_bytes(yytext, yyleng, tmp_0); + rost_unescape_regex(yytext, yyleng, tmp_0); printf(" regular: '%s'\n", yytext); -- cgit v0.11.2-87-g4458