summary refs log tree commit diff
path: root/src/analysis
diff options
context:
space:
mode:
author Cyrille Bagard <nocbos@gmail.com> 2024-02-18 22:10:08 (GMT)
committer Cyrille Bagard <nocbos@gmail.com> 2024-02-18 22:10:08 (GMT)
commit 30e56f25772247d76c7775d32ccd5b5a0661d3bf (patch)
tree d8349cda5eee0d73156599e5bcc071c1ee709eb1 /src/analysis
parent eb217683eba08d51c089cd71860e3581f73ec073 (diff)
Read escaped strings faster.
Diffstat (limited to 'src/analysis')
-rw-r--r-- src/analysis/scan/tokens.l 429
1 files changed, 247 insertions, 182 deletions
diff --git a/src/analysis/scan/tokens.l b/src/analysis/scan/tokens.l
index 86e3e92..ab881c1 100644
--- a/src/analysis/scan/tokens.l
+++ b/src/analysis/scan/tokens.l
@@ -14,6 +14,31 @@
#include <stdlib.h>
+/* Tête de lecture pour conversions */
+typedef union _read_ptr_t
+{
+ const uint8_t *byte_pos; /* Lecture par blocs de 8 bits */
+ const uint16_t *hword_pos; /* Lecture par blocs de 16 bits*/
+
+} read_ptr_t;
+
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+
+# define MAKE_HWORD(ch1, ch2) ((uint16_t)(ch2 << 8 | ch1))
+
+#elif __BYTE_ORDER == __BIG_ENDIAN
+
+# define MAKE_HWORD(ch1, ch2) ((uint16_t)(ch1 << 8 | ch2))
+
+#else
+
+ /* __PDP_ENDIAN et Cie... */
+# error "Congratulations! Your byte order is not supported!"
+
+#endif
+
+
/******************************************************************************
* *
@@ -29,124 +54,143 @@
* *
******************************************************************************/
-static void rost_unescape_string_bytes(const char *src, size_t len, sized_string_t *out)
+static void rost_unescape_string(const char *src, size_t len, sized_string_t *out)
{
- size_t i; /* Boucle de parcours */
+ read_ptr_t reader; /* Tête de lecture */
+ const bin_t *max; /* Fin du parcours */
+ uint16_t half; /* Moitié de mot */
bin_t byte; /* Octet à analyser */
- bin_t next; /* Octet suivant */
+ bin_t *writer; /* Tête d'écriture */
- out->len = 0;
+ reader.byte_pos = (const uint8_t *)src;
+ max = reader.byte_pos + len;
- for (i = 0; i < len; i++)
- {
- byte = src[i];
+ writer = out->bin_data;
- switch (byte)
+ while (reader.byte_pos < max)
+ {
+ /**
+ * La lecture par groupes de deux octets n'est pas forcément toujours
+ * logique : pour "\nabc", la dernière lecture va considérer 'c"',
+ * incluant ainsi le caractère '"' qui a été écarté pour l'appel.
+ *
+ * Le code est cependant suffisamment souple pour ignorer le superflu.
+ */
+ switch (*reader.hword_pos)
{
- case '\\':
-
- next = src[i + 1];
-
- switch (next)
- {
- case 'a':
- out->data[out->len++] = '\a';
- break;
-
- case 'b':
- out->data[out->len++] = '\b';
- break;
-
- case 't':
- out->data[out->len++] = '\t';
- break;
-
- case 'n':
- out->data[out->len++] = '\n';
- break;
-
- case 'v':
- out->data[out->len++] = '\v';
- break;
-
- case 'f':
- out->data[out->len++] = '\f';
- break;
-
- case 'r':
- out->data[out->len++] = '\r';
- break;
-
- case 'e':
- out->data[out->len++] = '\e';
- break;
-
- case '"':
- out->data[out->len++] = '\"';
- break;
+ case MAKE_HWORD('\\', 'a'):
+ reader.hword_pos++;
+ *writer++ = '\a';
+ break;
- case '\\':
- out->data[out->len++] = '\\';
- break;
+ case MAKE_HWORD('\\', 'b'):
+ reader.hword_pos++;
+ *writer++ = '\b';
+ break;
- case 'x':
+ case MAKE_HWORD('\\', 't'):
+ reader.hword_pos++;
+ *writer++ = '\t';
+ break;
- next = src[i + 2];
+ case MAKE_HWORD('\\', 'n'):
+ reader.hword_pos++;
+ *writer++ = '\n';
+ break;
- switch (next)
- {
- case '0' ... '9':
- out->data[out->len] = (next - '0');
- break;
+ case MAKE_HWORD('\\', 'v'):
+ reader.hword_pos++;
+ *writer++ = '\v';
+ break;
- case 'A' ... 'F':
- out->data[out->len] = 0xa + (next - 'A');
- break;
+ case MAKE_HWORD('\\', 'f'):
+ reader.hword_pos++;
+ *writer++ = '\f';
+ break;
- case 'a' ... 'f':
- out->data[out->len] = 0xa + (next - 'a');
- break;
+ case MAKE_HWORD('\\', 'r'):
+ reader.hword_pos++;
+ *writer++ = '\r';
+ break;
- }
+ case MAKE_HWORD('\\', 'e'):
+ reader.hword_pos++;
+ *writer++ = '\e';
+ break;
- out->data[out->len] <<= 4;
+ case MAKE_HWORD('\\', '"'):
+ reader.hword_pos++;
+ *writer++ = '\"';
+ break;
- next = src[i + 3];
+ case MAKE_HWORD('\\', '\\'):
+ reader.hword_pos++;
+ *writer++ = '\\';
+ break;
- switch (next)
- {
- case '0' ... '9':
- out->data[out->len] |= (next - '0');
- break;
+ case MAKE_HWORD('\\', 'x'):
+ reader.hword_pos++;
+
+ /**
+ * Le jeu des expressions régulières qui amène à l'appel de
+ * cette fonction limite les caractères possibles à trois
+ * ensembles : chiffres et lettres en majuscules et minuscules.
+ *
+ * La bascule des lettres en minuscules ramène les possibles
+ * à deux ensembles uniquement, simplifiant ainsi les règles
+ * de filtrage : aucun switch case n'est ainsi requis !
+ */
+
+ half = *reader.hword_pos++;
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ byte = (half & 0xff);
+#elif __BYTE_ORDER == __BIG_ENDIAN
+ byte = (half >> 8);
+#endif
- case 'A' ... 'F':
- out->data[out->len] |= 0xa + (next - 'A');
- break;
+ /* '0' ... '9' */
+ if (byte <= '9')
+ *writer = (byte - '0');
- case 'a' ... 'f':
- out->data[out->len] |= 0xa + (next - 'a');
- break;
+ /* 'A' ... 'F' || 'a' ... 'f' */
+ else
+ {
+ byte |= 0x20;
+ *writer = 0xa + (byte - 'a');
+ }
- }
+ *writer <<= 4;
- out->len++;
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ byte = (half >> 8);
+#elif __BYTE_ORDER == __BIG_ENDIAN
+ byte = (half & 0xff);
+#endif
- i += 2;
- break;
+ /* '0' ... '9' */
+ if (byte <= '9')
+ *writer++ |= (byte - '0');
+ /* 'A' ... 'F' || 'a' ... 'f' */
+ else
+ {
+ byte |= 0x20;
+ *writer++ |= 0xa + (byte - 'a');
}
- i++;
break;
default:
- out->data[out->len++] = byte;
+ *writer++ = *reader.byte_pos++;
break;
}
}
+ out->len = writer - out->bin_data;
+
}
@@ -164,132 +208,153 @@ static void rost_unescape_string_bytes(const char *src, size_t len, sized_string
* *
******************************************************************************/
-static void rost_unescape_bytes(const char *src, size_t len, sized_string_t *out)
+static void rost_unescape_regex(const char *src, size_t len, sized_string_t *out)
{
- size_t i; /* Boucle de parcours */
+ read_ptr_t reader; /* Tête de lecture */
+ const bin_t *max; /* Fin du parcours */
+ uint16_t half; /* Moitié de mot */
bin_t byte; /* Octet à analyser */
- bin_t next; /* Octet suivant */
+ bin_t *writer; /* Tête d'écriture */
- out->len = 0;
+ reader.byte_pos = (const uint8_t *)src;
+ max = reader.byte_pos + len;
- for (i = 0; i < len; i++)
- {
- byte = src[i];
+ writer = out->bin_data;
- switch (byte)
+ while (reader.byte_pos < max)
+ {
+ /**
+ * La lecture par groupes de deux octets n'est pas forcément toujours
+ * logique : pour "\nabc", la dernière lecture va considérer 'c"',
+ * incluant ainsi le caractère '"' qui a été écarté pour l'appel.
+ *
+ * Le code est cependant suffisamment souple pour ignorer le superflu.
+ */
+ switch (*reader.hword_pos)
{
- case '\\':
-
- next = src[i + 1];
-
- switch (next)
- {
- case 'a':
- out->data[out->len++] = '\a';
- break;
-
- case 'b':
- out->data[out->len++] = '\b';
- break;
-
- case 't':
- out->data[out->len++] = '\t';
- break;
-
- case 'n':
- out->data[out->len++] = '\n';
- break;
-
- case 'v':
- out->data[out->len++] = '\v';
- break;
-
- case 'f':
- out->data[out->len++] = '\f';
- break;
-
- case 'r':
- out->data[out->len++] = '\r';
- break;
-
- case 'e':
- out->data[out->len++] = '\e';
- break;
-
- case '"':
- out->data[out->len++] = '\"';
- break;
+ case MAKE_HWORD('\\', 'a'):
+ reader.hword_pos++;
+ *writer++ = '\a';
+ break;
- case '\\':
- out->data[out->len++] = '\\';
- break;
+ case MAKE_HWORD('\\', 'b'):
+ reader.hword_pos++;
+ *writer++ = '\b';
+ break;
- case 'x':
+ case MAKE_HWORD('\\', 't'):
+ reader.hword_pos++;
+ *writer++ = '\t';
+ break;
- next = src[i + 2];
+ case MAKE_HWORD('\\', 'n'):
+ reader.hword_pos++;
+ *writer++ = '\n';
+ break;
- switch (next)
- {
- case '0' ... '9':
- out->data[out->len] = (next - '0');
- break;
+ case MAKE_HWORD('\\', 'v'):
+ reader.hword_pos++;
+ *writer++ = '\v';
+ break;
- case 'A' ... 'F':
- out->data[out->len] = 0xa + (next - 'A');
- break;
+ case MAKE_HWORD('\\', 'f'):
+ reader.hword_pos++;
+ *writer++ = '\f';
+ break;
- case 'a' ... 'f':
- out->data[out->len] = 0xa + (next - 'a');
- break;
+ case MAKE_HWORD('\\', 'r'):
+ reader.hword_pos++;
+ *writer++ = '\r';
+ break;
- }
+ case MAKE_HWORD('\\', 'e'):
+ reader.hword_pos++;
+ *writer++ = '\e';
+ break;
- out->data[out->len] <<= 4;
+ case MAKE_HWORD('\\', '"'):
+ reader.hword_pos++;
+ *writer++ = '\"';
+ break;
- next = src[i + 3];
+ case MAKE_HWORD('\\', '\\'):
+ reader.hword_pos++;
+ *writer++ = '\\';
+ break;
- switch (next)
- {
- case '0' ... '9':
- out->data[out->len] |= (next - '0');
- break;
+ case MAKE_HWORD('\\', 'x'):
+ reader.hword_pos++;
+
+ /**
+ * Le jeu des expressions régulières qui amène à l'appel de
+ * cette fonction limite les caractères possibles à trois
+ * ensembles : chiffres et lettres en majuscules et minuscules.
+ *
+ * La bascule des lettres en minuscules ramène les possibles
+ * à deux ensembles uniquement, simplifiant ainsi les règles
+ * de filtrage : aucun switch case n'est ainsi requis !
+ */
+
+ half = *reader.hword_pos++;
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ byte = (half & 0xff);
+#elif __BYTE_ORDER == __BIG_ENDIAN
+ byte = (half >> 8);
+#endif
- case 'A' ... 'F':
- out->data[out->len] |= 0xa + (next - 'A');
- break;
+ /* '0' ... '9' */
+ if (byte <= '9')
+ *writer = (byte - '0');
- case 'a' ... 'f':
- out->data[out->len] |= 0xa + (next - 'a');
- break;
+ /* 'A' ... 'F' || 'a' ... 'f' */
+ else
+ {
+ byte |= 0x20;
+ *writer = 0xa + (byte - 'a');
+ }
- }
+ *writer <<= 4;
- out->len++;
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ byte = (half >> 8);
+#elif __BYTE_ORDER == __BIG_ENDIAN
+ byte = (half & 0xff);
+#endif
- i += 2;
- break;
+ /* '0' ... '9' */
+ if (byte <= '9')
+ *writer++ |= (byte - '0');
- case '{':
- out->data[out->len++] = '{';
- break;
+ /* 'A' ... 'F' || 'a' ... 'f' */
+ else
+ {
+ byte |= 0x20;
+ *writer++ |= 0xa + (byte - 'a');
+ }
- case '}':
- out->data[out->len++] = '}';
- break;
+ break;
- }
+ case MAKE_HWORD('\\', '{'):
+ reader.hword_pos++;
+ *writer++ = '{';
+ break;
- i++;
+ case MAKE_HWORD('\\', '}'):
+ reader.hword_pos++;
+ *writer++ = '}';
break;
default:
- out->data[out->len++] = byte;
+ *writer++ = *reader.byte_pos++;
break;
}
}
+ out->len = writer - out->bin_data;
+
}
@@ -394,7 +459,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
<inc_path>\"{str_mixed}+\" {
POP_STATE;
- rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0);
+ rost_unescape_string(yytext + 1, yyleng - 2, tmp_0);
#ifndef NDEBUG
/* Pour rendre plus lisibles les impressions de débogage */
@@ -514,7 +579,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
<meta_value>\"{str_mixed}*\" {
POP_STATE;
- rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0);
+ rost_unescape_string(yytext + 1, yyleng - 2, tmp_0);
#ifndef NDEBUG
/* Pour rendre plus lisibles les impressions de débogage */
@@ -551,7 +616,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
}
<condition>\"{str_mixed}*\" {
- rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0);
+ rost_unescape_string(yytext + 1, yyleng - 2, tmp_0);
#ifndef NDEBUG
/* Pour rendre plus lisibles les impressions de débogage */
@@ -587,7 +652,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
<bytes_value>\"{str_mixed}+\" {
POP_STATE;
- rost_unescape_string_bytes(yytext + 1, yyleng - 2, tmp_0);
+ rost_unescape_string(yytext + 1, yyleng - 2, tmp_0);
#ifndef NDEBUG
/* Pour rendre plus lisibles les impressions de débogage */
@@ -812,7 +877,7 @@ bytes_fuzzy_id [\*A-Za-z_][\*A-Za-z0-9_]*
<bytes_regex>"." { return DOT; }
<bytes_regex>({regular_chars})+ {
- rost_unescape_bytes(yytext, yyleng, tmp_0);
+ rost_unescape_regex(yytext, yyleng, tmp_0);
printf(" regular: '%s'\n", yytext);