Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ast2json #5

Merged
merged 6 commits into from
Mar 25, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
/pp.c
/out.go
/a.out
/pp.json
149 changes: 149 additions & 0 deletions ast2json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import sys
import re
import json

# This script converts the output of clang AST into a JSON file.
#
# Usage:
# clang -Xclang -ast-dump -fsyntax-only myfile.c | python ast2json.py
#
# Yes, there are many better ways to do this. However I chose this method
# because:
#
# 1. I need to separate the clang AST from the c2go conversion process so that
# the c2go program can ingest a reliable JSON file and not depend on clang or
# its different versions at all.
# 2. The clang API is not stable and trying to match up binaries with different
# versions and operating systems can be tricky and brittle.
# 3. This tool, in time, will become a better binary of some kind that produces
# much the same JSON output (so minimal changes to c2go.py).
# 4. I needed something quick and dirty to prove the complete toolchain and get
# it working on different versions of clang and different operating systems
# before we have enough information to really standardise the process.

# Maps a clang AST node type name (e.g. 'FunctionDecl') to the regular
# expression used to parse the remainder of that node's dump line.
#
# convert_lines_to_nodes() applies each pattern to the text that follows the
# tree-drawing prefix and the node type name, which is why every pattern
# starts with "^ " (the space separating the type name from its details).
# The named groups of a pattern become the keys of the resulting node dict,
# so renaming a group changes the JSON output. A node type with no entry
# here makes the script stop and print the lines that need a new pattern.
regex = {
'AlwaysInlineAttr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> always_inline",
'ArraySubscriptExpr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' (?P<tags>.*)",
'AsmLabelAttr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \"(?P<function>.+)\"",
'AvailabilityAttr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<os>\w+) (?P<version>[\d.]+) (?P<unknown1>[\d.]+) (?P<unknown2>[\d.]+) (?P<unknown3>\".*?\"|\w+) (?P<unknown4>\".*?\"|\w+)",
'BinaryOperator': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' '(?P<operator>.*?)'",
'BreakStmt': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>",
'BuiltinType': r'^ (?P<address>[0-9a-fx]+) \'(?P<name>.*)\'',
'CallExpr': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*?)\'',
'CharacterLiteral': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' (?P<value>\d+)",
'CompoundStmt': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>',
'ConstantArrayType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\' (?P<size>\d+)',
'CStyleCastExpr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' <(?P<kind>.*)>",
'DeclRefExpr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)'.*? (lvalue (?P<kind>\w+)|Function) (?P<address2>[0-9a-fx]+) '(?P<name>.*?)' '(?P<type2>.*?)'",
'DeclStmt': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>",
'DeprecatedAttr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \"(?P<message1>.*?)\" \"(?P<message2>.*?)\"",
'ElaboratedType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\' (?P<tags>.+)',
'FieldDecl': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<position2>[^ ]+) (?P<tags>.*?)(?P<name>\w+?) '(?P<type>.+?)'",
'FloatingLiteral': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*)' (?P<value>.+)",
'FormatAttr': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>(?P<tags> Implicit)? (?P<function>\w+) (?P<unknown1>\d+) (?P<unknown2>\d+)',
'ForStmt': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>",
'FunctionDecl': r"^ (?P<address>[0-9a-fx]+) (?P<prev>prev [0-9a-fx]+)? ?<(?P<position1>.*)> (?P<position2>[^ ]+)(?P<tags1> implicit)?(?P<tags2> used)? (?P<name>\w+) '(?P<type>.*)'(?P<tags3> extern)?",
'IfStmt': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>',
'ImplicitCastExpr': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*)\' <(?P<kind>.*)>',
'IntegerLiteral': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*)\' (?P<value>.+)',
'MemberExpr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' (?P<tags>.*?)(?P<name>\w+) (?P<address2>[0-9a-fx]+)",
'ParenExpr': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*?)\'',
'ParmVarDecl': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<position2>.+?)(?P<name> \w+)? '(?P<type>.*?)'(?P<type2>:'.*?')?",
'PointerType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\'',
'Record': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\'',
'RecordDecl': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<position2>[^ ]+) (?P<kind>struct|union) (?P<name>\w+)",
'RecordType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\'',
'ReturnStmt': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>',
'StringLiteral': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*)\'(?P<tags> lvalue)? (?P<value>.*)',
'TranslationUnitDecl': r'^ (?P<address>[0-9a-fx]+)',
'Typedef': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\'',
'TypedefDecl': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.+?)> (?P<position2><invalid sloc>|[^ ]+)(?P<tags>.*?) (?P<name>\w+) \'(?P<type>.*?)\'(?P<type2>:\'.*?\')?',
'TypedefType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\' (?P<tags>.+)',
'UnaryOperator': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)'(?P<tags1> lvalue)?(?P<tags2> prefix)?(?P<tags3> postfix)? '(?P<operator>.*?)'",
'VarDecl': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<position2>[^ ]+) (?P<name>.+) '(?P<type>.+?)'.*?(?P<tags>.*)",
'WhileStmt': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>",
}

def build_tree(nodes, depth):
    """Convert a flat, depth-prefixed list of nodes into a nested tree.

    Args:
        nodes: Sequence of ``[node_depth, node_dict]`` pairs in document
            order (a parent always precedes its descendants).
        depth: The depth that the root(s) of ``nodes`` are expected to have.

    Returns:
        A list of the node dicts found at ``depth``. Every dict that has
        descendants gains a ``'children'`` key holding the list of its
        direct children. The node dicts are modified in place.

    Raises:
        IndexError: If a node appears before any node at ``depth`` (for
            example when the input skips a nesting level), because it then
            has no parent to be attached to.
    """
    if not nodes:
        return []

    # Split the list into sections, treat each section as a tree with its own
    # root. A section starts at every node whose depth equals ``depth`` and
    # runs until the next such node.
    sections = []
    for node in nodes:
        if node[0] == depth:
            sections.append([node])
        elif sections:
            sections[-1].append(node)
        else:
            # Previously this surfaced as a bare "list index out of range".
            raise IndexError(
                "expected a node at depth %s but the first node is at "
                "depth %s: %r" % (depth, node[0], node[1]))

    results = []
    for section in sections:
        # Everything below the section root is handed to the next level.
        children = build_tree([n for n in section if n[0] > depth], depth + 1)
        result = section[0][1]

        # Leaf nodes deliberately get no 'children' key at all.
        if children:
            result['children'] = children

        results.append(result)

    return results

def read_ast():
    """Read the whole clang AST dump from stdin and return it as lines.

    ANSI colour (SGR) escape sequences are removed first so that the
    patterns applied later see plain text.

    Returns:
        The input split on ``"\\n"``. Input that ends with a newline
        therefore yields a trailing empty string.
    """
    stdin = sys.stdin.read()
    # ``[\d;]*`` rather than ``+``: the parameterless reset "ESC[m" is a valid
    # colour sequence too and must not be left behind in the text.
    uncolored = re.sub(r'\x1b\[[\d;]*m', '', stdin)
    return uncolored.split("\n")

def convert_lines_to_nodes(lines):
    """Parse clang AST dump lines into a flat list of ``[depth, node]`` pairs.

    Each non-blank line is split into its tree-drawing prefix, its node type
    name and the remainder. The remainder is matched against the pattern
    registered for that node type in the module-level ``regex`` table, and
    the named groups of the match form the node dict (plus a ``'node'`` key
    holding the node type name).

    Args:
        lines: Iterable of lines from the AST dump, already free of colour
            codes. It is iterated a second time when an unknown node type
            has to be reported.

    Returns:
        A list of ``[indent_level, node_dict]`` pairs in input order, where
        ``indent_level`` is half the length of the tree-drawing prefix.

    Exits the process with status 1 (after printing a diagnostic) when a
    line has no recognisable node type, when the node type has no pattern,
    or when the pattern does not match the line.
    """
    # Group 1 is the tree-drawing prefix, group 2 the node type name.
    prefix_and_type = re.compile(r'^([|\- `]*)(\w+)')

    nodes = []
    for line in lines:
        if not line.strip():
            continue

        # This will need to be handled more gracefully... I'm not even sure
        # what this means?
        if '<<<NULL>>>' in line:
            continue

        indent_and_type = prefix_and_type.search(line)
        if indent_and_type is None:
            print("Can not understand line '%s'" % line)
            sys.exit(1)

        node_type = indent_and_type.group(2)

        if node_type not in regex:
            print("There is no regex for '%s'." % node_type)
            print("I will print out all the lines so a regex can be created:\n")

            for candidate in lines:
                found = prefix_and_type.search(candidate)
                if found is not None and found.group(2) == node_type:
                    # Slice by this line's own prefix length: nodes of the
                    # same type can sit at different depths.
                    print(candidate[len(found.group(0)):])

            sys.exit(1)

        offset = len(indent_and_type.group(0))
        result = re.search(regex[node_type], line[offset:])
        if result is None:
            print("Can not understand line '%s'" % line)
            sys.exit(1)

        node = result.groupdict()
        node['node'] = node_type

        # Two prefix characters per nesting level; floor division keeps the
        # level an int on both Python 2 and Python 3.
        indent_level = len(indent_and_type.group(1)) // 2
        nodes.append([indent_level, node])

    return nodes

# Script body: read the clang AST dump from stdin, parse every line into a
# [depth, node] pair, then nest the pairs into a tree starting at depth 0.
lines = read_ast()
nodes = convert_lines_to_nodes(lines)
tree = build_tree(nodes, 0)

# Emit the tree as indented JSON on stdout, with object keys in sorted order.
print(json.dumps(tree, sort_keys=True, indent=2, separators=(',', ': ')))
Loading