From 537ed969ad0305e70dc2d503dbc49df859892717 Mon Sep 17 00:00:00 2001
From: Cyrille Bagard
Date: Thu, 29 Sep 2016 23:21:26 +0200
Subject: Added partial support for Dalvik MUTF-8 encodings.

---
 ChangeLog                           |  6 ++
 src/format/mangling/dex/type_gram.y | 16 +++--
 src/format/mangling/dex/type_tok.l  | 89 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 99 insertions(+), 12 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 78421b8..332e6fa 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+16-09-29 Cyrille Bagard
+
+	* src/format/mangling/dex/type_gram.y:
+	* src/format/mangling/dex/type_tok.l:
+	Add partial support for Dalvik MUTF-8 encodings.
+
 16-09-28 Cyrille Bagard

 	* plugins/readdex/class.c:
diff --git a/src/format/mangling/dex/type_gram.y b/src/format/mangling/dex/type_gram.y
index 1176bd2..d5c3f26 100644
--- a/src/format/mangling/dex/type_gram.y
+++ b/src/format/mangling/dex/type_gram.y
@@ -23,6 +23,7 @@ bool demangle_dex_type(GDexDemangler *, const char *);

 #include "../../../analysis/types/basic.h"
 #include "../../../analysis/types/cse.h"
+#include "../../../common/extstr.h"


 }
@@ -45,7 +46,7 @@ bool demangle_dex_type(GDexDemangler *, const char *);


 %type type_descriptor field_type_descriptor non_array_field_type_descriptor full_class_name
-%type TEXT
+%type TEXT simple_name


 %{
@@ -87,18 +88,15 @@ non_array_field_type_descriptor:
     | L full_class_name SEMICOLON { $$ = $2; }

 full_class_name:
-    TEXT { $$ = g_class_enum_type_new(CET_CLASS, $1); }
-    | full_class_name SLASH TEXT {
+    simple_name { $$ = g_class_enum_type_new(CET_CLASS, $1); }
+    | full_class_name SLASH simple_name {
         $$ = g_class_enum_type_new(CET_CLASS, $3);
         g_data_type_set_namespace($$, $1);
         g_object_unref($1);
     }
-    | full_class_name DOLLAR TEXT {
-        $$ = g_class_enum_type_new(CET_CLASS, $3);
-        g_data_type_set_namespace($$, $1);
-        g_object_unref($1);
-    }
-
+simple_name:
+    TEXT { $$ = strdup($1); } /* copy: TEXT points at lexer-owned storage */
+    | simple_name TEXT { $$ = stradd($1, $2); }


 %%
diff --git a/src/format/mangling/dex/type_tok.l b/src/format/mangling/dex/type_tok.l
index 7b8a8d3..9c24085 100644
--- a/src/format/mangling/dex/type_tok.l
+++ b/src/format/mangling/dex/type_tok.l
@@ -10,10 +10,13 @@
 %option noyywrap
 %option yylineno
 %option nounput
-%option noinput
+ /*%option noinput*/

 %x string

+ASCII [A-Za-z0-9]
+SIMPLE {ASCII}|"$"|"-"|"_"
+
 %%

 "V" { return V; }
@@ -28,10 +31,90 @@
 "L" { BEGIN(string); return L; }
 "["* { type_lval.adeep = strlen(yytext); return ARRAY; }
 "/" { return SLASH; }
-"$" { return DOLLAR; }
 ";" { BEGIN(INITIAL); return SEMICOLON; }

-[A-Za-z0-9_-]* { type_lval.text = yytext; return TEXT; }
+{SIMPLE}* { type_lval.text = yytext; return TEXT; }
+
+. {
+    /*
+     * Fallback for a byte that no rule above matched: pair it with the
+     * following byte and emit both as a single TEXT token.  The pair is
+     * range-checked as the high and low halves of the code points named
+     * in the comments below.
+     *
+     * NOTE(review): MUTF-8 normally carries these code points as
+     * UTF-8-style multi-byte sequences rather than raw halves; confirm
+     * the expected byte layout against the Dalvik format documentation.
+     */
+
+    /* Static: TEXT hands the parser a pointer, as the yytext rule does. */
+    static char mutf8[3];
+
+    unsigned char first;
+    unsigned char next;
+    int raw;
+    int valid;
+
+    /* Compare bytes as unsigned: plain char may be signed. */
+    first = (unsigned char)yytext[0];
+
+    /* 0xd8 ... 0xdf open no supported range: stop before reading on. */
+    if (first >= 0xd8 && first <= 0xdf)
+    {
+        REJECT;
+    }
+
+    raw = input();
+
+    next = 0;
+    valid = 0;
+
+    /* End of input (EOF, or 0 with some flex releases) is never valid. */
+    if (raw > 0)
+    {
+        next = (unsigned char)raw;
+
+        if (first <= 0x1f)
+        {
+            /* U+00a1 ... U+1fff */
+            valid = (first != 0x00 || next >= 0xa1);
+        }
+        else if (first == 0x20)
+        {
+            /* U+2010 ... U+2027 / U+2030 ... */
+            valid = ((next >= 0x10 && next <= 0x27) || next >= 0x30);
+        }
+        else if (first <= 0xd7)
+        {
+            /* ... U+d7ff */
+            valid = 1;
+        }
+        else
+        {
+            /* U+e000 ... U+ffef */
+            valid = (first != 0xff || next <= 0xef);
+        }
+    }
+
+    /* U+10000 ... U+10ffff is not handled yet. */
+
+    if (!valid)
+    {
+        /*
+         * NOTE(review): input() already consumed a byte here and REJECT
+         * may not hand it back; a dedicated error token may be safer.
+         */
+        REJECT;
+    }
+
+    mutf8[0] = (char)first;
+    mutf8[1] = (char)next;
+    mutf8[2] = '\0';
+
+    type_lval.text = mutf8;
+    return TEXT;
+
+}

 %%

--
cgit v0.11.2-87-g4458