Diffstat (limited to 'tools/fuzzing/rost/convert.py')
-rw-r--r--  tools/fuzzing/rost/convert.py | 452 ++++++++++++++++++++++++++++++
 1 file changed, 452 insertions(+), 0 deletions(-)
diff --git a/tools/fuzzing/rost/convert.py b/tools/fuzzing/rost/convert.py
new file mode 100644
index 0000000..b0ed90c
--- /dev/null
+++ b/tools/fuzzing/rost/convert.py
@@ -0,0 +1,452 @@
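+"""Convert a Bison/Yacc grammar into a JSON grammar for the AFL++ Grammar-Mutator."""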
+
+import re
+import sys
+
+
+def define_PLAIN_TEXT(name, last):
+    """Create definition for the PLAIN_TEXT token."""
+
+    print(' "<%s>": [ ["\\\"", "<str_not_escaped>", "\\\""] ],' % name.lower())
+    print(' "<str_not_escaped>": [ ["<char>"], ["<char>", "<char>"], ["<char>", "<char>", "<char>"] ],')
+    print(' "<char>": [ ["0"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["A"], ["B"], ["C"], ["D"], ["E"], ["F"] ]%s' % (',' if not last else ''))
+
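+# Each define_* callback below prints ready-made JSON grammar entries; the
+# "last" flag drops the trailing comma after the final definition so that
+# the resulting JSON object stays valid.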
+
+def define_IDENTIFIER(name, last):
+    """Create definition for the RULE_IDENTIFIER token."""
+
+    print(' "<%s>": [ [ "<id>", "<id>", "<id>", "<idx>" ] ],' % name.lower())
+    print(' "<id>": [ ["a"], ["b"], ["c"], ["d"], ["e"], ["f"], ["g"], ["h"], ["i"], ["j"], ["k"], ["l"] ],')
+    print(' "<idx>": [ ["0"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"] ]%s' % (',' if not last else ''))
+
+
+def define_SIGNED_INTEGER(name, last):
+    """Create definition for the SIGNED_INTEGER token."""
+
+    print(' "<%s>": [ ["-", "<unsigned_integer>"] ]%s' % (name.lower(), ',' if not last else ''))
+
+
+def define_UNSIGNED_INTEGER(name, last):
+    """Create definition for the UNSIGNED_INTEGER token."""
+
+    print(' "<%s>": [ ["<fnumber>"], ["<number>", "<fnumber>"], ["<number>", "<fnumber>", "<fnumber>"] ],' % name.lower())
+    print(' "<number>": [ ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"] ],')
+    print(' "<fnumber>": [ ["0"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"] ]%s' % (',' if not last else ''))
+
+
+def define_BYTES_ID(name, last):
+    """Create definition for the BYTES_ID token."""
+
+    print(' "<%s>": [ ["$"], ["$*"], [ "$", "<id>", "<idx>" ], [ "$", "<id>", "*" ] ],' % name.lower())
+    print(' "<id>": [ ["a"], ["b"], ["c"], ["d"], ["e"], ["f"], ["g"], ["h"], ["i"], ["j"], ["k"], ["l"] ],')
+    print(' "<idx>": [ ["0"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"] ]%s' % (',' if not last else ''))
+
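+# The bytes identifiers cover plain, wildcarded and indexed forms ("$", "$*",
+# "$a0", "$a*"); the "#", "@", "!" and "~" variants below provide the matching
+# counter, start, length and end tokens with the same forms.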
+
+def define_BYTES_ID_COUNTER(name, last):
+    """Create definition for the BYTES_ID_COUNTER token."""
+
+    print(' "<%s>": [ ["#"], ["#*"], [ "#", "<id>", "<idx>" ], [ "#", "<id>", "*" ] ],' % name.lower())
+    print(' "<id>": [ ["a"], ["b"], ["c"], ["d"], ["e"], ["f"], ["g"], ["h"], ["i"], ["j"], ["k"], ["l"] ],')
+    print(' "<idx>": [ ["0"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"] ]%s' % (',' if not last else ''))
+
+
+def define_BYTES_ID_START(name, last):
+    """Create definition for the BYTES_ID_START token."""
+
+    print(' "<%s>": [ ["@"], ["@*"], [ "@", "<id>", "<idx>" ], [ "@", "<id>", "*" ] ],' % name.lower())
+    print(' "<id>": [ ["a"], ["b"], ["c"], ["d"], ["e"], ["f"], ["g"], ["h"], ["i"], ["j"], ["k"], ["l"] ],')
+    print(' "<idx>": [ ["0"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"] ]%s' % (',' if not last else ''))
+
+
+def define_BYTES_ID_LENGTH(name, last):
+    """Create definition for the BYTES_ID_LENGTH token."""
+
+    print(' "<%s>": [ ["!"], ["!*"], [ "!", "<id>", "<idx>" ], [ "!", "<id>", "*" ] ],' % name.lower())
+    print(' "<id>": [ ["a"], ["b"], ["c"], ["d"], ["e"], ["f"], ["g"], ["h"], ["i"], ["j"], ["k"], ["l"] ],')
+    print(' "<idx>": [ ["0"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"] ]%s' % (',' if not last else ''))
+
+
+def define_BYTES_ID_END(name, last):
+    """Create definition for the BYTES_ID_END token."""
+
+    print(' "<%s>": [ ["~"], ["~*"], [ "~", "<id>", "<idx>" ], [ "~", "<id>", "*" ] ],' % name.lower())
+    print(' "<id>": [ ["a"], ["b"], ["c"], ["d"], ["e"], ["f"], ["g"], ["h"], ["i"], ["j"], ["k"], ["l"] ],')
+    print(' "<idx>": [ ["0"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"] ]%s' % (',' if not last else ''))
+
+
+def define_HEX_BYTES(name, last):
+    """Create definition for the HEX_BYTES token."""
+
+    print(' "<%s>": [ ["<hex>", "<hex>"] ],' % name.lower())
+    print(' "<hex>": [ ["0"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["a"], ["b"], ["c"], ["d"], ["e"], ["f"] ]%s' % (',' if not last else ''))
+
+
+def define_FULL_MASK(name, last):
+    """Create definition for the FULL_MASK token."""
+
+    print(' "<%s>": [ ["?", "?"] ]%s' % (name.lower(), ',' if not last else ''))
+
+
+def define_SEMI_MASK(name, last):
+    """Create definition for the SEMI_MASK token."""
+
+    print(' "<%s>": [ ["?0"], ["1?"] ]%s' % (name.lower(), ',' if not last else ''))
+
+
+def define_KB(name, last):
+    """Create definition for the KB token."""
+
+    print(' "<%s>": [ ["kb"], ["Kb"], ["kB"], ["KB"] ]%s' % (name.lower(), ',' if not last else ''))
+
+
+def define_MB(name, last):
+    """Create definition for the MB token."""
+
+    print(' "<%s>": [ ["mb"], ["Mb"], ["mB"], ["MB"] ]%s' % (name.lower(), ',' if not last else ''))
+
+
+def define_GB(name, last):
+    """Create definition for the GB token."""
+
+    print(' "<%s>": [ ["gb"], ["Gb"], ["gB"], ["GB"] ]%s' % (name.lower(), ',' if not last else ''))
+
+
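+# Map each terminal token of the grammar to the callback able to emit its
+# definition; tokens sharing a callback get identical productions, and the
+# helper entries ("<id>", "<idx>", ...) are thus printed once per such token.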
+__lexer_tokens = {
+    'PLAIN_TEXT': define_PLAIN_TEXT,
+    'ESCAPED_TEXT': define_PLAIN_TEXT,
+    'RULE_IDENTIFIER': define_IDENTIFIER,
+    'INFO_KEY': define_PLAIN_TEXT,
+    'SIGNED_INTEGER': define_SIGNED_INTEGER,
+    'UNSIGNED_INTEGER': define_UNSIGNED_INTEGER,
+
+    'BYTES_ID': define_BYTES_ID,
+    'BYTES_FUZZY_ID': define_BYTES_ID,
+    'BYTES_ID_COUNTER': define_BYTES_ID_COUNTER,
+    'BYTES_FUZZY_ID_COUNTER': define_BYTES_ID_COUNTER,
+    'BYTES_ID_START': define_BYTES_ID_START,
+    'BYTES_FUZZY_ID_START': define_BYTES_ID_START,
+    'BYTES_ID_LENGTH': define_BYTES_ID_LENGTH,
+    'BYTES_FUZZY_ID_LENGTH': define_BYTES_ID_LENGTH,
+    'BYTES_ID_END': define_BYTES_ID_END,
+    'BYTES_FUZZY_ID_END': define_BYTES_ID_END,
+
+    'NAME': define_PLAIN_TEXT,
+    'HEX_BYTES': define_HEX_BYTES,
+    'FULL_MASK': define_FULL_MASK,
+    'SEMI_MASK': define_SEMI_MASK,
+    'REGEX_BYTES': define_PLAIN_TEXT,
+    'REGEX_CLASSES': define_PLAIN_TEXT,
+    'REGEX_RANGE': define_PLAIN_TEXT,
+    'KB': define_KB,
+    'MB': define_MB,
+    'GB': define_GB,
+}
+
+
+def remove_grammar_comments(grammar):
+    """Delete all the C code comments."""
+
+    # Cf. https://stackoverflow.com/questions/241327/remove-c-and-c-comments-using-python/241506#241506
+
+    def replacer(match):
+        s = match.group(0)
+        if s.startswith('/'):
+            return ' '  # note: a space and not an empty string
+        else:
+            return s
+
+    regex = re.compile(
+        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+        re.DOTALL | re.MULTILINE
+    )
+
+    return regex.sub(replacer, grammar)
+
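+# For instance, remove_grammar_comments('a /* c1 */ b // c2') keeps only 'a',
+# 'b' and whitespace; quoted literals such as "/*" are matched too, but left
+# untouched by the replacer.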
+
+def remove_grammar_actions(grammar):
+    """Delete all the C code handling tokens."""
+
+    remaining = ''
+
+    scope = 0
+    string = False
+
+    for ch in grammar:
+
+        if ch == '{' and not string:
+            scope += 1
+
+        elif ch == '}' and not string:
+            assert scope > 0
+            scope -= 1
+
+        elif scope == 0:
+            remaining += ch
+            if ch == '"':
+                string = not string
+
+    return remaining
+
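+# Note: remove_grammar_actions() only tracks strings outside actions; braces
+# are thus assumed to be balanced inside actions, including within the string
+# literals these actions may contain.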
+
+def is_upper(text):
+    """Tell whether a string is upper case."""
+
+    return text.upper() == text
+
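+# is_upper() tells the terminal token names (all caps, e.g. PLAIN_TEXT) apart
+# from the grammar rule names (lower case) while converting definitions.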
+
+def parse_rule_definition(grammar):
+    """Process the definition of one rule."""
+
+    result = []
+
+    regex = re.compile(r'(?<!")\|')
+
+    definitions = regex.split(grammar)
+
+    definitions = [ d.strip() for d in definitions ]
+
+    for d in definitions:
+
+        tokens = d.split()
+
+        converted = []
+
+        for t in tokens:
+
+            if not t.startswith('"') and is_upper(t):
+
+                if t not in __lexer_tokens:
+                    print('Missing def:', t, file=sys.stderr)
+                    sys.exit(1)
+
+                converted.append('"<%s>"' % t.lower())
+
+            else:
+
+                if t.startswith('"'):
+                    converted.append(t)
+                else:
+                    converted.append('"<%s>"' % t)
+
+        result.append(converted)
+
+    return result
+
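+# For instance, parse_rule_definition('PLAIN_TEXT | some_rule') yields the
+# list [ [ '"<plain_text>"' ], [ '"<some_rule>"' ] ], with one sub-list per
+# alternative of the rule.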
+
+def parse_rules(grammar):
+    """Process all the rules contained in the grammar."""
+
+    tree = {}
+
+    regex = re.compile(r'[\n\t ]*([^\n\t :]+)[\n\t ]*:([^;]+);')
+
+    rules = regex.findall(grammar)
+
+    first = True
+
+    for r in rules:
+
+        if first:
+            print(' "<START>": [ ["<%s>"] ],' % r[0])
+            first = False
+
+        definitions = parse_rule_definition(r[1])
+
+        tree[r[0]] = definitions
+
+    return tree
+
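+# The first rule found in the grammar provides the "<START>" entry printed at
+# the top of the output, presumably the entry point expected by the mutator.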
+
+def simplify_tree(tree):
+    """Remove nodes which are only links between two levels of nodes.
+
+    a = [ [b] ]
+    b = [ [c], [d] ]
+
+    -> replace a by b
+    """
+
+    # Examples: cexpression, modifier_arg
+
+    replaced = {}
+
+    for k, v in tree.items():
+
+        if len(v) == 1 and len(v[0]) == 1:
+
+            replaced['"<%s>"' % k] = v[0][0]
+
+    new_tree = {}
+
+    for k, v in tree.items():
+
+        name = '"<%s>"' % k
+
+        if name not in replaced:
+
+            new_v = []
+
+            for vv in v:
+
+                new_vv = vv
+
+                for rk, rv in replaced.items():
+                    new_vv = list(map(lambda x: x.replace(rk, rv), new_vv))
+
+                new_v.append(new_vv)
+
+            new_tree[k] = new_v
+
+    return new_tree
+
+
+def find_direct_parent_nodes(tree, name):
+    """Find all the rules with an alternative made only of the given rule."""
+
+    rules = []
+
+    name = '"<%s>"' % name
+
+    for k, v in tree.items():
+
+        for vv in v:
+
+            if len(vv) == 1 and vv[0] == name and k not in rules:
+
+                rules.append(k)
+
+    return rules
+
+
+def remove_indirect_left_recursion(tree):
+    """Remove all nodes which imply indirect left recursion.
+
+    a = b
+    b = a + c
+
+    -> a = a + c
+    """
+
+    # Examples: logical_expr, relational_expr, string_op, arithm_expr, intersection
+
+    replaced = {}
+
+    for k, v in tree.items():
+
+        parents = find_direct_parent_nodes(tree, k)
+
+        if len(parents) != 1:
+            continue
+
+        parent = parents[0]
+
+        for vv in v:
+
+            if len(vv) > 0 and vv[0] == '"<%s>"' % parent:
+                replaced[k] = v
+                break
+
+    new_tree = {}
+
+    for k, v in tree.items():
+
+        if k not in replaced:
+
+            new_v = []
+
+            for vv in v:
+
+                if len(vv) != 1:
+                    new_v.append(vv)
+
+                else:
+
+                    modified = False
+
+                    for rk, rv in replaced.items():
+                        if '"<%s>"' % rk == vv[0]:
+                            new_v += rv
+                            modified = True
+                            break
+
+                    if not modified:
+                        new_v.append(vv)
+
+            new_tree[k] = new_v
+
+    return new_tree
+
+
+def output_rules(tree):
+    """Output all the translated rules."""
+
+    for k, v in tree.items():
+
+        print(' "<%s>": [' % k, end='')
+
+        first = True
+
+        for d in v:
+
+            if not first:
+                print(',', end='')
+
+            if len(d) == 0:
+                print(' []', end='')
+
+            else:
+
+                print(' [', end='')
+
+                sub_first = True
+
+                for sub_d in d:
+
+                    if not sub_first:
+                        print(', ', end='')
+
+                    print(sub_d, end='')
+
+                    sub_first = False
+
+                print(']', end='')
+
+            first = False
+
+        print(' ],')
+
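+# Every rule printed by output_rules() ends with a trailing comma: the lexer
+# token definitions always follow, and the last one closes the JSON object.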
+
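+# Usage sketch (the grammar file name is illustrative):
+#
+#   python3 convert.py rost.y > rost.json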
+if __name__ == '__main__':
+
+    # Cf. https://github.com/AFLplusplus/Grammar-Mutator/blob/stable/doc/customizing-grammars.md
+
+    with open(sys.argv[1], 'r') as fd:
+        grammar = fd.read()
+
+    grammar = grammar.split('%%')[1]
+
+    grammar = remove_grammar_comments(grammar)
+
+    grammar = remove_grammar_actions(grammar)
+
+    print('{')
+
+    tree = parse_rules(grammar)
+
+    tree = simplify_tree(tree)
+
+    tree = remove_indirect_left_recursion(tree)
+
+    output_rules(tree)
+
+    count = len(__lexer_tokens)
+
+    for name, cb in __lexer_tokens.items():
+        cb(name, count == 1)
+        count -= 1
+
+    print('}')