From 537ed969ad0305e70dc2d503dbc49df859892717 Mon Sep 17 00:00:00 2001
From: Cyrille Bagard <nocbos@gmail.com>
Date: Thu, 29 Sep 2016 23:21:26 +0200
Subject: Added partial support for Dalvik MUTF-8 encodings.

---
 ChangeLog                           |   6 ++
 src/format/mangling/dex/type_gram.y |  16 +++--
 src/format/mangling/dex/type_tok.l  | 115 +++++++++++++++++++++++++++++++++++-
 3 files changed, 125 insertions(+), 12 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 78421b8..332e6fa 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+16-09-29  Cyrille Bagard <nocbos@gmail.com>
+
+	* src/format/mangling/dex/type_gram.y:
+	* src/format/mangling/dex/type_tok.l:
+	Add partial support for Dalvik MUTF-8 encodings.
+
 16-09-28  Cyrille Bagard <nocbos@gmail.com>
 
 	* plugins/readdex/class.c:
diff --git a/src/format/mangling/dex/type_gram.y b/src/format/mangling/dex/type_gram.y
index 1176bd2..d5c3f26 100644
--- a/src/format/mangling/dex/type_gram.y
+++ b/src/format/mangling/dex/type_gram.y
@@ -23,6 +23,7 @@ bool demangle_dex_type(GDexDemangler *, const char *);
 
 #include "../../../analysis/types/basic.h"
 #include "../../../analysis/types/cse.h"
+#include "../../../common/extstr.h"
 
 }
 
@@ -45,7 +46,7 @@ bool demangle_dex_type(GDexDemangler *, const char *);
 
 %type <type> type_descriptor field_type_descriptor non_array_field_type_descriptor full_class_name
 
-%type <text> TEXT
+%type <text> TEXT simple_name
 
 
 %{
@@ -87,18 +88,15 @@ non_array_field_type_descriptor:
 	| L full_class_name SEMICOLON       { $$ = $2; }
 
 full_class_name:
-    TEXT                                { $$ = g_class_enum_type_new(CET_CLASS, $1); }
-    | full_class_name SLASH TEXT        {
+    simple_name                         { $$ = g_class_enum_type_new(CET_CLASS, $1); }
+    | full_class_name SLASH simple_name {
                                             $$ = g_class_enum_type_new(CET_CLASS, $3);
                                             g_data_type_set_namespace($$, $1);
                                             g_object_unref($1);
                                         }
-    | full_class_name DOLLAR TEXT       {
-                                            $$ = g_class_enum_type_new(CET_CLASS, $3);
-                                            g_data_type_set_namespace($$, $1);
-                                            g_object_unref($1);
-                                        }
-
+simple_name:                            /* name component built from one or more consecutive TEXT tokens */
+    TEXT                                { $$ = strdup($1); }  /* start from a private copy of the token text */
+    | simple_name TEXT                  { $$ = stradd($1, $2); }  /* NOTE(review): stradd() (common/extstr.h) presumably appends $2 to $1 - confirm */
 
 %%
 
diff --git a/src/format/mangling/dex/type_tok.l b/src/format/mangling/dex/type_tok.l
index 7b8a8d3..9c24085 100644
--- a/src/format/mangling/dex/type_tok.l
+++ b/src/format/mangling/dex/type_tok.l
@@ -10,10 +10,13 @@
 %option noyywrap
 %option yylineno
 %option nounput
-%option noinput
+ /*%option noinput*/
 
 %x string
 
+ASCII       [A-Za-z0-9]
+SIMPLE      {ASCII}|"$"|"-"|"_"
+
 %%
 
 "V"                     { return V; }
@@ -28,10 +31,116 @@
 "L"                     { BEGIN(string); return L; }
 "["*                    { type_lval.adeep = strlen(yytext); return ARRAY; }
 <string>"/"             { return SLASH; }
-<string>"$"             { return DOLLAR; }
 <string>";"             { BEGIN(INITIAL); return SEMICOLON; }
 
-<string>[A-Za-z0-9_-]*  { type_lval.text = yytext; return TEXT; }
+<string>{SIMPLE}*       { type_lval.text = yytext; return TEXT; }
+
+<string>.               {
+                            /*
+                             * Any byte not matched by an earlier <string> rule lands here:
+                             * it is paired with the next byte read through input(), the
+                             * pair is range-checked, and both bytes are handed to the
+                             * parser as a single TEXT token.
+                             */
+
+                            /* static: type_lval.text keeps pointing here after return */
+                            static char mutf8[4];
+                            unsigned char first;
+                            unsigned char next;
+
+                            /* compare as unsigned: plain char may be signed */
+                            first = (unsigned char)yytext[0];
+                            switch (first)
+                            {
+                                /* U+00a1 ... U+1fff */
+                                case 0x00 ... 0x1f:
+
+                                    next = input();
+
+                                    if (first == 0x00 && next < 0xa1)
+                                    {
+                                        REJECT;
+                                    }
+
+                                    else
+                                    {
+                                        mutf8[0] = (char)first;
+                                        mutf8[1] = next;
+                                        mutf8[2] = '\0';
+
+                                        type_lval.text = mutf8; return TEXT;
+
+                                    }
+
+                                    break;
+
+                                /* U+2010 ... U+2027 / U+2030 ... U+d7ff */
+                                case 0x20:
+
+                                    next = input();
+
+                                    switch (next)
+                                    {
+                                        case 0x10 ... 0x27:
+                                        case 0x30 ... 0xff:
+
+                                            mutf8[0] = (char)first;
+                                            mutf8[1] = next;
+                                            mutf8[2] = '\0';
+
+                                            type_lval.text = mutf8; return TEXT;
+                                            break;
+
+                                        default:
+                                            REJECT;
+                                            break;
+
+                                    }
+
+                                    break;
+
+                                /* ~ U+2030 ... U+d7ff */
+                                case 0x21 ... 0xd7:
+
+                                    next = input();
+
+                                    mutf8[0] = (char)first;
+                                    mutf8[1] = next;
+                                    mutf8[2] = '\0';
+
+                                    type_lval.text = mutf8; return TEXT;
+                                    break;
+
+                                /* U+e000 ... U+ffef */
+                                case 0xe0 ... 0xff:
+
+                                    next = input();
+
+                                    if (first == 0xff && next > 0xef)
+                                    {
+                                        REJECT;
+                                    }
+
+                                    else
+                                    {
+                                        mutf8[0] = (char)first;
+                                        mutf8[1] = next;
+                                        mutf8[2] = '\0';
+
+                                        type_lval.text = mutf8; return TEXT;
+
+                                    }
+
+                                    break;
+
+                                /* U+10000 ... U+10ffff: not handled yet */
+
+                                default:
+                                    REJECT;
+                                    break;
+
+                            }
 
+                        }
 
 %%
-- 
cgit v0.11.2-87-g4458