# pyparser.py

###Parser used to create simple analytical representations of python code
#It parses the python into a bunch of nested dictionaries that contain 
#the arguments and statements inside various functions / classes.
#Thanks to the public domain "GardenSnake" parser, as I took a small
#portion of the token list and regex from there, and it helped me to 
#learn David Beazley's PLY from http://www.dabeaz.com/ply/

import sys
import ply.lex as lex
import ply.yacc as yacc
import pprint

###Options###
# Debug switches read by the lexer rules, the grammar actions and the helper
# functions further down in this file. All are off by default.
PARSER_PRINTTOKENS = False	#parser-side tracing: echoes indent tags and the statements handed to parse_stmt
PARSER_DEBUG =  False		#prints parser debug messages; also passed to yacc.yacc(debug=...)
LEXER_PRINTTOKENS = False	#prints the name of each token as the lexer rules match it
LEXER_DEBUG = False			#makes t_error raise on unknown symbols; also passed to lex.lex(debug=...)

####Lexer####
tokens = (
	# keywords (produced by t_NAME through the RESERVED table)
	"CLASS", "DEF", "IF", "ELIF", "ELSE", "AND", "OR", "FOR", "IN",
	"FROM", "IMPORT", "WHILE", "WITH", "AS", "TRY", "EXCEPT", "FINALLY",
	"PASS", "BREAK", "ASSERT", "YIELD", "PRINT", "GLOBAL", "RETURN",

	# identifiers and literals
	"NAME", "NUMBER", "STRING",

	# brackets, punctuation and operators
	"LPAR", "RPAR", "LSPAR", "RSPAR", "LCPAR", "RCPAR",
	"COLON", "EQ", "NEQ", "ASSIGN", "LT", "LTEQ", "GT", "GTEQ",
	"PLUS", "MINUS", "MULT", "DIV", "MOD", "COMMA", "DOT", "SEMICOLON",

	# layout
	"WS", "NEWLINE", "ENDMARKER", "INDENT",
)
def t_NUMBER(t):
    r"(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"								# (\d+(\.\d*)?|\.\d+) is for decimals, ([eE][-+]? \d+) is for ??? maybe hex?
    if (LEXER_PRINTTOKENS): print 'NUMBER',
    return t	

def t_STRING(t):
	r'(r?)((\"{3}(\\"|[^"])*\"{3})|(\'{3}(\\\'|[^\'])*\'{3})|(\"{1}(\\"|[^"])*\"{1})|(\'{1}(\\\'|[^\'])*\'{1}))'
	if (LEXER_PRINTTOKENS): print 'STRING',								#allow sets of 3 or 1 quotation marks to surround
	return t															# the string, but ignore escaped quotation marks

# Punctuation and operator tokens, declared as plain regex strings.
# NOTE(review): the first alternative of t_ASSIGN ("=") also matches the first
# character of "==", and p_equals accepts "ASSIGN ASSIGN" - which suggests "=="
# may reach the parser as two ASSIGN tokens rather than one EQ. Whether that
# happens depends on how PLY orders string rules - confirm against the PLY docs.
# NOTE(review): the "(is)" alternative of t_EQ overlaps with t_NAME's pattern,
# so which rule wins for the word "is" also depends on PLY's ordering - confirm.
t_COLON = r':'
t_EQ = r'==|(is)'
t_NEQ = r'!='
t_ASSIGN = r'=|\+=|-=|\*=|/='
t_LT = r'<'
t_LTEQ = r'<='
t_GT = r'>'
t_GTEQ = r'>='
t_PLUS = r'\+'
t_MINUS = r'-'
t_MULT = r'\*'
t_MOD = r'%'
t_DIV = r'/'
t_COMMA = r','
t_DOT = r'\.'
t_SEMICOLON = r';'

# Maps reserved words to their token type. t_NAME matches every identifier-like
# word and looks it up here; anything that is not listed stays a plain NAME.
RESERVED = {
	"class" : "CLASS",
	"def" : "DEF",
	"if" : "IF",
	"elif" : "ELIF",
	"else" : "ELSE",
	"return" : "RETURN",
	"and" : "AND",
	"or" : "OR",
	"for" : "FOR",
	"in" : "IN",
	"import" : "IMPORT",
	"while" : "WHILE",
	"with" : "WITH",
	"as" : "AS",
	"try" : "TRY",
	"except" : "EXCEPT",
	"finally" : "FINALLY",
	"pass" : "PASS",
	"break" : "BREAK",
	"assert" : "ASSERT",
	"yield" : "YIELD",
	"print" : "PRINT",
	"from" : "FROM",
	"global" : "GLOBAL",
	# "is" is a comparison. PLY tries function rules (t_NAME) before string
	# rules (t_EQ), so the "(is)" alternative of t_EQ never sees the word;
	# without this entry "x is y" is lexed as three NAMEs.
	"is" : "EQ"
}

def t_NAME(t):
    r'[a-zA-Z_][a-zA-Z0-9_]*'
    t.type = RESERVED.get(t.value, "NAME")
    if (LEXER_PRINTTOKENS): print 'NAME',
    return t

# Putting this before t_WS let it consume lines with only comments in
# them so the latter code never sees the WS part.  Not consuming the
# newline.  Needed for "if 1: #comment"

def t_comment(t):
    r"[ ]*\043[^\n]*"  # \043 is '#'
    if (LEXER_PRINTTOKENS): 'COMMENT',
    pass

#store indentation value for the line
def t_INDENT(t):
	r"(\t)|(\ {4})"
	t.value.replace(r"(\ {4})", "\t")									#convert serieses of four spaces to tabs
	if (LEXER_PRINTTOKENS): 'INDENT',
	if (t.lexer.paren_count == 0):
		return t

# Whitespace
def t_WS(t):
    r'\ '
    # A single space carries no meaning: returning None discards it.
    return None

# Don't generate newline tokens when inside of parenthesis, eg
#   a = (1,
#        2, 3)
def t_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)
    t.type = "NEWLINE"
    if (LEXER_PRINTTOKENS): print 'NEWLINE',
    if (t.lexer.paren_count == 0):
		return t

def t_LPAR(t):
	r'\('
	if (LEXER_PRINTTOKENS): print 'LPAR',
	t.lexer.paren_count += 1
	return t

def t_RPAR(t):
	r'\)'
	if (LEXER_PRINTTOKENS): print 'RPAR',
	t.lexer.paren_count -= 1
	return t

def t_LSPAR(t):
	r'\['
	if (LEXER_PRINTTOKENS): print 'LSPAR',
	t.lexer.paren_count += 1
	return t

def t_RSPAR(t):
	r'\]'
	if (LEXER_PRINTTOKENS): print 'RSPAR',
	t.lexer.paren_count -= 1
	return t

def t_LCPAR(t):
	r'\{'
	if (LEXER_PRINTTOKENS): print 'LCPAR',
	t.lexer.paren_count += 1
	return t

def t_RCPAR(t):
	r'\}'
	if (LEXER_PRINTTOKENS): print 'RCPAR',
	t.lexer.paren_count -= 1
	return t

def t_error(t):
	# Called by the lexer for a character no rule matches.
	# With LEXER_DEBUG on, the problem is raised as a SyntaxError; otherwise
	# the offending character is skipped silently.
	# (A second "Skipping ..." debug print used to follow the raise under the
	# same condition; it could never execute and has been removed.)
	if (LEXER_DEBUG): raise SyntaxError("Unknown symbol %r" % (t,))
	t.lexer.skip(1)
    
# Build the lexer from the t_* rules defined above in this module.
lexer = lex.lex(debug=LEXER_DEBUG)
# Bracket nesting depth; the bracket rules adjust it, and t_newline / t_INDENT
# only emit their tokens while it is zero.
lexer.paren_count = 0

#function to create a new token (not used)
def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    return tok

###Parser Stuff#########################################################
#below is a dictionary that stores data about the classes and functions.
#it stores with the following structure, where "base" is outside declared
# functions or classes, "statements" list contains all statements that
# aren't followed by indents, (excludes for, def, etc),  and "arguments" 
# list contains the arguments for the parent
#tree###class###function#statements##########
# tree[classname][funcname] -> {"statements": [...], ...}; the "base" class and
# the "base" function are stand-ins for code outside any class / function.
# Every function and class except base also carries an "arguments" list.
tree = {
	"base": {
		"base": {
			"statements": [],	#statements that don't precede indents in the parent
		},
	},
}

#tracks what class and function we're looking at
currentclass = "base"
currentfunc = "base"

#number of indents on the current and previous line, if current < last then it's dedented
numindents = 0
lastnumindents = 0

#statement type of the last line, for adding to the indentlayer list
laststmttype = ""

#the layers of indentation we're on, eg ["class", "def", "for", "if"], reducing by 1 for every dedent
indentlayer = []

#running IDs for the various branching statements so they don't override each other
tryid = exceptid = finallyid = 0
ifid = withid = forid = whileid = 0

# Precedence table for the grammar, lowest level first. NODOT, OPER and
# POSTDENT are not real tokens: they are only named by %prec in the grammar
# rules below.
precedence = (
	("left", "NUMBER", "STRING"),
	("left", "OR"),
	("left", "AND"),
	("left", "PLUS", "MINUS", "MULT", "DIV"),
	("left", "COMMA"),
	("left", "NODOT"),
	("left", "OPER", "DOT"),
	("left", "POSTDENT"),
)

#this rule encapsulates the entire code
def p_lines(p):
	'''
	lines		: lines indented
				| indented
	'''
	# No action: the per-line work happens in the "indented" rules, so this
	# rule only chains the lines of the input together.

#indentation tracker for each line. Also encapsulates literally every line recognisable by the parser
def p_indented(p):
	'''
	indented	: indent stmt NEWLINE
				| stmt NEWLINE
	'''
	# Handles one complete source line:
	#  1. counts the line's leading indents,
	#  2. on a dedent closes one open layer per lost indent (or, when no layer
	#     is open, leaves the current function and then the current class),
	#  3. hands the statement to parse_stmt so it lands in the tree.
	global numindents
	global lastnumindents
	global currentfunc
	global currentclass
	global laststmttype

	if (len(p) == 4):													#if there are indents
		indent, stmt = p[1], p[2]
	else:																#if there aren't indents
		indent, stmt = (), p[1]

	stmttype = str(stmt[0])												#remembered as laststmttype once the line is done

	#"indent" is (), a single "(indent)" string or a chain of nested tuples;
	# make_list flattens the latter two into one entry per indent
	numindents = len(make_list(indent)) if indent else 0

	relativeindents = numindents-lastnumindents
	if (relativeindents < 0): 											#if dedented
		for i in range(0, -relativeindents):
			if (len(indentlayer) > 0):
				del indentlayer[-1]										#we exit the highest indentation layer, no matter what it is
			elif (currentfunc != "base"):
				currentfunc = "base"
			elif (currentclass != "base"):
				currentclass = "base"

	parse_stmt(stmt)													#parse the statement AFTER indentations are calculated

	if (PARSER_DEBUG):
		if (numindents != lastnumindents or numindents == 0):
			#format() is applied to the string itself so this works both as a
			# print statement and as a print() call
			print("\t the indentation layers are now class {} func {} layers {}".format(currentclass, currentfunc, indentlayer))

	laststmttype = stmttype
	lastnumindents = numindents

#skips empty lines in the indentation tracker
def p_indented_emptyline(p):
	'''
	indented	: indent NEWLINE
				| NEWLINE
	'''
	# No action: a line holding nothing (or only indentation) adds nothing to
	# the tree and leaves the indentation bookkeeping untouched.

#statement tag for a statement with indents after it (they are ignored)
def p_stmt_postdent(p):
	'''
	stmt		: stmt indent %prec POSTDENT
	'''
	# Indents that trail a statement are dropped; the statement passes through.
	statement = p[1]
	p[0] = statement

#statement tag for one-line if statements (or other statements)
def p_stmt_oneline(p):
	'''
	stmt		: branch nonbranch
	'''
	# A branching statement with its body on the same line, eg "if x: return y".
	# The result is the pair (branch tuple, body statement).
	header, body = p[1], p[2]
	p[0] = (header, body)

#statement tag for things that require an indented submethod (classes, functions, for loops etc)
def p_stmt_branching(p):
	'''
	stmt		: branch
	'''
	# Tags the branch tuple so parse_stmt can tell it from a plain statement.
	header = p[1]
	p[0] = ("branch", header)

#statement tag for things to be added to the "statements" list
def p_stmt(p):
	'''
	stmt		: nonbranch
	'''
	# Tags a plain statement so parse_stmt adds it to a "statements" list.
	statement = p[1]
	p[0] = ("nonbranch", statement)

#tag for statements that don't require a new branch on the tree,
# can either be placed directly after branching statements in the same line or on its own in one line
def p_nonbranch(p):
	'''
	nonbranch	: assign
				| variable
				| constant
				| boring
				| import
				| print
				| global
				| return
	'''
	# Every alternative is a single symbol whose value is passed on unchanged.
	inner = p[1]
	p[0] = inner

#generic tag for all statements that will require a new tree branch
def p_branch(p):
	'''
	branch		: try
				| finally
				| def
				| if
				| for
				| while
				| with
				| except
	'''
	# Passes the branching statement through unchanged. Branching statements
	# should be specially structured as a tuple:
	#   (str(tagname), str(uniqueID), args, statements)
	#   (str(tagname), str(uniqueID), args)
	#   (str(tagname), str(uniqueID))	if no args or statements are necessary
	header = p[1]
	p[0] = header

#tag for try statements
def p_try(p):
	'''
	try			: TRY COLON
	'''
	# Yields ("try", "try_<n>") and advances the try counter.
	# This rule used to also write a node under the fixed key "try" into the
	# tree at reduction time. No other branch rule does that: the numbered
	# branch is created by grow_new_branch when parse_stmt handles the line, so
	# the extra node was a duplicate that every later "try" overwrote.
	global tryid
	p[0] = ("try", "try_"+str(tryid))
	tryid += 1

#tag for finally statements
def p_finally(p):
	'''
	finally		: FINALLY COLON
	'''
	# Yields ("finally", "finally_<n>") and advances the finally counter.
	# The counter is an int, so it must be converted before concatenation (the
	# old code raised TypeError); the "_" separator matches the other branch ids.
	global finallyid
	p[0] = ("finally", "finally_"+str(finallyid))
	finallyid += 1

#function or class definitions
def p_def(p):
	'''
	def 		: DEF nameprmtrs COLON
				| CLASS nameprmtrs COLON
	'''
	# nameprmtrs is a (name, parameters) pair; the result is
	# (keyword, name, parameters) with keyword "def" or "class".
	keyword = str(p[1])
	signature = p[2]
	p[0] = (keyword, str(signature[0]), signature[1])

#class definitions that don't specify any inheritance
def p_def_class(p):
	'''
	def			: CLASS NAME COLON
	'''
	# A class without a base-class list: same shape as p_def, empty arguments.
	keyword, name = str(p[1]), str(p[2])
	p[0] = (keyword, name, [])

#if / elif / else statements
def p_if(p):
	'''
	if 			: IF checks COLON
				| ELIF checks COLON
				| IF value COLON
				| ELIF value COLON
				| ELSE COLON
	'''
	# Yields (keyword, "<keyword>_<n>", condition). "else" has no condition and
	# gets an empty tuple. if, elif and else all draw from the same counter.
	global ifid
	symbols = len(p)
	if symbols == 4:
		keyword = str(p[1])
		p[0] = (keyword, keyword + "_" + str(ifid), p[2])
	elif symbols == 3:
		keyword = str(p[1])
		p[0] = (keyword, keyword + "_" + str(ifid), ())
	ifid += 1

#for statement duh
def p_for(p):
	'''
	for 		: FOR in COLON
	'''
	# "in" arrives as (keyword, target, iterable); the branch's argument is the
	# text "<target> in <iterable>".
	global forid
	clause = p[2]
	header = str(clause[1]) + " in " + str(clause[2])
	p[0] = ("for", "for_"+str(forid), header)
	forid += 1

#while loop statement duh
def p_while(p):
	'''
	while 		: WHILE checks COLON
	'''
	# Yields ("while", "while_<n>", condition) and advances the while counter.
	global whileid
	condition = p[2]
	p[0] = ("while", "while_"+str(whileid), condition)
	whileid += 1

#with tag
def p_with(p):
	'''
	with 		: WITH variable AS uname COLON
				| WITH variable COLON
	'''
	# The branch's argument is the text "<expr> as <name>", or just "<expr>"
	# when there is no "as" clause.
	global withid
	header = str(p[2])
	if len(p) == 6:
		header = header + " as " + str(p[4])
	p[0] = ("with", "with_" + str(withid), header)
	withid += 1

#exception tag
def p_except(p):
	'''
	except		: EXCEPT COLON
				| EXCEPT NAME COLON
				| EXCEPT argslist COLON
				| EXCEPT argslist AS argslist COLON
	'''
	# A bare "except:" yields a two-element tuple; with an exception spec the
	# spec is the third element, and with an "as" clause the third element is
	# the text "<spec> as <target>".
	global exceptid
	branchid = "except_"+str(exceptid)
	symbols = len(p)
	if symbols == 3:
		p[0] = ("except", branchid)
	elif symbols == 4:
		p[0] = ("except", branchid, p[2])
	else:
		p[0] = ("except", branchid, str(p[2])+" as "+str(p[4]))
	exceptid += 1

#for import tags at the beginning of the document
def p_import(p):
	'''
	import		: IMPORT valuelist
				| FROM variable IMPORT valuelist
				| IMPORT variable AS valuelist
				| FROM variable IMPORT variable AS valuelist
	'''
	# The result is simply every symbol of the matched form, in order:
	# 2, 4 or 6 elements depending on the alternative.
	symbols = len(p)
	if symbols == 3:
		p[0] = (p[1], p[2])
	elif symbols == 5:
		p[0] = (p[1], p[2], p[3], p[4])
	else:
		p[0] = (p[1], p[2], p[3], p[4], p[5], p[6])

#for print statements
def p_print(p):
	'''
	print		: PRINT valuelist
				| PRINT valuelist COMMA
	'''
	# Yields (keyword, [printed values]); a trailing comma is ignored.
	printed = make_list(p[2])
	p[0] = (p[1], printed)

#for global declarer thingies
def p_global(p):
	'''
	global		: GLOBAL NAME
	'''
	# Yields (keyword, declared name).
	keyword, name = p[1], p[2]
	p[0] = (keyword, name)

#for return statements
def p_return(p):
	'''
	return		: RETURN valuelist
	'''
	# Yields (keyword, returned value or values); the valuelist is left as is.
	keyword, values = p[1], p[2]
	p[0] = (keyword, values)

#tag used for running functions as well as for function/class definitions
def p_nameprmtrs(p):
	'''
	nameprmtrs	: uname prmtrs
	'''
	# A name directly followed by a bracketed parameter list, yielding
	# (name, parameters). Used for calls as well as for def / class headers.
	name, parameters = p[1], p[2]
	p[0] = (name, parameters)

#for checks surrounded by brackets
def p_checks_par(p):
	'''
	checks		: LPAR checks RPAR
	'''
	# The surrounding brackets are dropped; the inner checks pass through.
	inner = p[2]
	p[0] = inner

#for combining if statements
def p_checks(p):
	'''
	checks		: checks andor check
				| checks andor value
				| value andor value
				| check
	'''
	# A lone check passes through; two operands joined by and/or become
	# (connective, left, right).
	if len(p) != 4:
		p[0] = p[1]
		return
	left, connective, right = p[1], p[2], p[3]
	p[0] = (connective, left, right)

#a boolean statement like x > y
def p_check(p):
	'''
	check		: value equals value
				| in
	'''
	# A comparison becomes (comparator, left, right); an "in" test is already
	# in that shape and passes through.
	if len(p) != 4:
		p[0] = p[1]
		return
	left, comparator, right = p[1], p[2], p[3]
	p[0] = (comparator, left, right)

#something in something, eg for something in something or if something in something
def p_in(p):
	'''
	in			: value IN value
	'''
	# Yields (keyword, left, right); shared by "for x in y" and "if x in y".
	left, keyword, right = p[1], p[2], p[3]
	p[0] = (keyword, left, right)

#combines the indentations in nested tuples (to later be tallied up for each line)
def p_indent(p):
	'''
	indent		: indent INDENT
				| INDENT
	'''
	if (len(p) == 3):
		p[0] = (p[1], "(indent)")
	else:
		p[0] = ("(indent)")
	if (PARSER_PRINTTOKENS): print '(indent)',

#a list of arguments for use as parameters of some function
def p_prmtrs(p):
	'''
	prmtrs		: LPAR argslist RPAR
				| LPAR assign RPAR
				| tuple
	'''
	# The bracketed forms carry their content at p[2], the "tuple" form at
	# p[1]; either way it is flattened with make_list.
	content = p[2] if len(p) == 4 else p[1]
	p[0] = make_list(content)

#a list of arguments, mostly riding on the back of "valuelist"
def p_argslist(p):
	'''
	argslist	: argslist COMMA valuelist
				| argslist COMMA assign
				| valuelist COMMA assign
				| assign COMMA assign
	'''
	# The comma is dropped; the result is the pair (everything before, last item).
	before, last = p[1], p[3]
	p[0] = (before, last)

#a tuple in python, also can be reduced to a arglist 
def p_tuple(p):
	'''
	tuple		: LPAR valuelist RPAR
				| LPAR RPAR
	'''
	# "()" yields an empty tuple; otherwise the bracketed valuelist passes
	# through unchanged.
	is_empty = len(p) == 3
	p[0] = () if is_empty else p[2]

#a python-syntax list like l = ["jingle", 2, 4], uses valuelist rather than arglist because a list like "[2, hong= 1]" is not allowed
def p_pythonlist(p):
	'''
	pythonlist	: LSPAR valuelist RSPAR
				| LSPAR RSPAR
	'''
	# "[]" yields an empty list. A content value longer than one item is
	# flattened with make_list; anything else passes through unchanged.
	if (len(p) == 3):
		p[0] = []
		return

	content = p[2]
	try:
		has_several = len(content) > 1
	except TypeError:
		#a lone signed number arrives as a float (see p_number), which has no
		# len(); that used to crash the parser
		has_several = False

	if has_several:
		p[0] = make_list(content)
	else:
		p[0] = content

#dictionary which contains lookuplist between curly brackets
def p_dictionary(p):
	'''
	dictionary	: LCPAR pairlist RCPAR
				| LCPAR RCPAR
	'''
	# Always yields a dict: "{}" gives an empty dict (it used to give an empty
	# list), a single pair (":", key, value) gives {key: value}, and a chain of
	# pairs is converted with make_dict.
	if (len(p) == 3):
		p[0] = {}
	elif (p[2][0] != ":"):
		p[0] = make_dict(p[2], appenddict = {})
	else:
		p[0] = {p[2][1] : p[2][2]}

#list of key:value pairs
def p_pairlist(p):
	'''
	pairlist	: pairlist COMMA pair
				| pair
	'''
	# A lone pair passes through; otherwise the comma is dropped and the
	# result is (pairs so far, new pair).
	if len(p) != 4:
		p[0] = p[1]
		return
	so_far, new_pair = p[1], p[3]
	p[0] = (so_far, new_pair)

#a key:value pair in a dictionary
def p_pair(p):
	'''
	pair		: value COLON value
	'''
	# Yields (colon, key, value).
	key, colon, val = p[1], p[2], p[3]
	p[0] = (colon, key, val)

#list of values
def p_valuelist(p):
	'''
	valuelist	: valuelist COMMA value
				| value
	'''
	# A lone value passes through; otherwise the comma is dropped and the
	# result is (values so far, new value), nesting further with every item.
	if len(p) != 4:
		p[0] = p[1]
		return
	so_far, new_value = p[1], p[3]
	p[0] = (so_far, new_value)

#groups variables, constants and operations because mostly there is no distinction in usage
def p_value(p):
	'''
	value		: variable %prec NODOT
				| constant %prec NODOT
				| operation
	'''
	# Pass-through. "operation" is itself built from values, so the two rules
	# refer to each other; that is what lets operations be chained.
	inner = p[1]
	p[0] = inner

#generates operation tags for any type of operation (all are considered the same)
def p_operation(p):
	'''
	operation	: value operator value %prec OPER
	'''
	# Yields (operator, left, right); all operators are treated alike.
	left, op, right = p[1], p[2], p[3]
	p[0] = (op, left, right)

#variables can be NAMEs or they can be NAME[0]['dingle'] for lists, dicts, tuples etc
def p_variable(p):
	'''
	variable	: uname
				| uname accesslist
				| nameprmtrs
				| variable DOT variable
				| constant DOT variable
				| print DOT variable
	'''
	# One symbol: passed through (a name, or a call tuple from nameprmtrs).
	# Three symbols: attribute access, yielding (".", owner, attribute).
	# Two symbols: a name with its subscripts, glued together as text.
	symbols = len(p)
	if symbols == 2:
		p[0] = p[1]
	elif symbols == 4:
		p[0] = (".", p[1], p[3])
	else:
		p[0] = "".join([str(p[1]), str(p[2])])

#for variables preceded by - or + signs
def p_uname(p):
	'''
	uname		: NAME
				| MINUS NAME
				| PLUS NAME
	'''
	# A name, optionally with a sign glued to its front as text ("-x", "+x").
	if len(p) != 3:
		p[0] = p[1]
		return
	sign, name = str(p[1]), str(p[2])
	p[0] = sign + name

#a tag for access to lists, tuples and dicts (for example, the "[0][1]" part in "p[0][1] = bob")
def p_accesslist(p):
	'''
	accesslist	: accesslist accessor
				| accessor
	'''
	# Consecutive subscripts are concatenated as text, eg "[0]" + "[1]".
	if len(p) != 3:
		p[0] = p[1]
		return
	p[0] = "".join([str(p[1]), str(p[2])])

#this tag enables [:1], [1:], and [1:2] formats to be used to access lists
def p_accessor(p):
	'''
	accessor	: LSPAR value RSPAR
				| LSPAR pair RSPAR
				| LSPAR value COLON RSPAR
				| LSPAR COLON value RSPAR
	'''
	# Rebuilds the subscript as text. The open-ended slice forms ("[1:]",
	# "[:1]") have two inner symbols; the other forms have one, which is
	# rendered with str() whatever it is.
	inner = str(p[2])
	if len(p) == 5:
		inner = inner + str(p[3])
	p[0] = "[" + inner + "]"

#represents an assignment operation, eg: hours = seconds/360, size = 12, name = "helga", xspeed = yspeed
def p_assign(p):
	'''
	assign		: assign ASSIGN value
				| variable ASSIGN value
	'''
	# Yields (assignment operator, target, value). The target may itself be an
	# assignment, which covers chains such as "a = b = 1".
	target, op, assigned = p[1], p[2], p[3]
	p[0] = (op, target, assigned)

#groups NUMBERS, STRINGS, lists together as constants
def p_constant(p):
	'''
	constant	: number
				| STRING
				| pythonlist
				| tuple
				| dictionary
	'''
	# Pass-through: numbers, strings, lists, tuples and dicts are all treated
	# as constants.
	literal = p[1]
	p[0] = literal

#groups OR/AND because for analysis purposes at the moment they are identical (although parser could be extended for analysis of logic)
def p_andor(p):
	'''
	andor		: AND
				| OR
	'''
	# Pass-through: "and" and "or" are handled identically.
	connective = p[1]
	p[0] = connective

#for literals that are used on their own and just appear as lone statements on their own line
def p_boring(p):
	'''
	boring		: PASS
				| BREAK
				| YIELD
	'''
	# Keywords that make up a whole statement on their own; the value is the
	# keyword's text.
	keyword = p[1]
	p[0] = str(keyword)

#groups numbers and numbers with preceding + or -
def p_number(p):
	'''
	number		: NUMBER
				| MINUS NUMBER
				| PLUS NUMBER
	'''
	# A signed number is converted to a float (negated for "-"); an unsigned
	# number is passed through exactly as the lexer produced it.
	first = p[1]
	if first == '+':
		p[0] = float(p[2])
	elif first == '-':
		p[0] = -float(p[2])
	else:
		p[0] = first

#groups plus, minus, divide, multiply, exponents together cos we don't need them for analysis (not yet at least)
def p_operator(p):
	'''
	operator	: PLUS
				| MINUS
				| MULT
				| DIV
				| MOD
	'''
	# Pass-through: the arithmetic operators are not told apart for analysis.
	symbol = p[1]
	p[0] = symbol

#groups ==, >=, <=, !=, <, and >
def p_equals(p):
	'''
	equals		: EQ
				| NEQ
				| LT
				| LTEQ
				| GT
				| GTEQ
				| ASSIGN ASSIGN
	'''
	# Yields the comparison operator's text.
	# "==" reaches the parser as two ASSIGN tokens (the ASSIGN rule matches a
	# single "=" before the EQ rule is tried). The two are joined so the result
	# is "==" rather than "=", which would read like an assignment.
	if (len(p) == 3):
		p[0] = str(p[1]) + str(p[2])
	else:
		p[0] = p[1]

#error function for parser if it finds a bad tag
def p_error(p):
    if (PARSER_DEBUG): print "Error!", repr(p)

# Build the parser from the p_* rules and the precedence table defined above.
parser = yacc.yacc(debug=PARSER_DEBUG)

###Methods used by Parser###############################################
def make_list(p, appendlist=None):
	# Flattens a comma-separated series, held as left-nested pairs
	# ((a, b), c), into a flat python list by recursion.
	#
	# p          - the nested tuple, or a single value
	# appendlist - list to append to; a fresh list is used when omitted
	# returns the list that was appended to
	#
	# The last item is appended first, so the result is in reverse source
	# order: ((a, b), c) gives [c, b, a].
	# NOTE(review): the reversal looks unintended, but it is kept because
	# callers may rely on it - confirm before changing.
	if appendlist is None:
		appendlist = []

	if isinstance(p, tuple):
		size = len(p)
		if size == 2:													#a (rest, item) link of the series
			appendlist.append(p[1])
			appendlist = make_list(p[0], appendlist)
		elif size == 1 or size == 3:									#an argument token rather than a list token:
			appendlist.append(p[0])										# only its first element is recorded
		elif size != 0:
			#format() is applied to the string itself so this works both as a
			# print statement and as a print() call
			if PARSER_DEBUG: print("!! parser tried to make a list out of something that isn't a list! at {}".format(p))
	elif p is not None:
		#a single value. Only strings used to be kept, so signed numbers
		# (floats), lists and dicts silently vanished from the result.
		appendlist.append(p)
	return appendlist

def make_dict(p, appenddict=None):
	# Similar to make_list, but turns key:value pairs formatted like
	# (":", key, value) - alone or chained as (rest, pair) - into a dict
	# formatted like {key: value}.
	#
	# p          - a pair, or a left-nested chain of pairs
	# appenddict - dict to add to; a fresh dict is used when omitted
	# returns the dict that was added to
	if appenddict is None:
		appenddict = {}

	if isinstance(p, tuple):
		size = len(p)
		if size == 2:													#a (rest, pair) link of the chain
			pair = p[1]
			appenddict[pair[1]] = pair[2]
			appenddict = make_dict(p[0], appenddict)
		elif size == 3:													#a single key:value pair
			appenddict[p[1]] = p[2]
		else:
			#format() is applied to the string itself so this works both as a
			# print statement and as a print() call
			print("!! parser tried to make a dict out of something that isn't a dict! at {}".format(p))
	return appenddict

#finds the last layer of "unclosed" indents inside the tree, starting at currentclass/currentfunc
def find_unclosed_indentlayer():
	# Returns the tree node for the innermost open indentation layer.
	# Starts at tree[currentclass][currentfunc] and goes one branch deeper for
	# every entry of indentlayer. Entries named like the current function or
	# class are skipped because the starting node already accounts for them.
	node = tree[currentclass][currentfunc]
	for layer in indentlayer:
		if str(layer) in (currentfunc, currentclass):
			continue
		node = node[layer]
	return node
	
#creates a new tree branch at the deepest layer of unclosed indentation with the given arguments
def grow_new_branch(p):	#p is a tuple from a parser tag, where [0] = stmttype, [1] = stmtid, [2] = args, [3] = stmts
	global tree
	global currentclass
	global currentfunc
	stmttype = p[0]
	stmtid = p[1]
	args = []
	stmts = []
	if (len(p) > 2): args = p[2]
	if (len(p) > 3): stmts = p[3]

	if (stmttype == 'class'):											#classes are always at the root of the tree
		currentclass = stmtid
		tree[currentclass] = {"arguments" : args,						#make a class including a base function	
							  "base" : {"arguments" : [], "statements" : []}}
	elif (stmttype == 'def'):											#functions are always second from the tree root	
		currentfunc = stmtid
		tree[currentclass][currentfunc] = {"arguments" : args, "statements" : stmts}
	else:																#other branching statements may be anywhere above the tree root
		t = find_unclosed_indentlayer()
		t[stmtid] = {"arguments" : args,
					"statements": stmts}

	if (PARSER_DEBUG):
		print "\t\t branch called", stmtid, " was grown on the tree"

#adds statements to the currently open indent layer
def add_stmts_to_tree(stmts):
	# Adds one statement, or a list of statements, to the "statements" list of
	# the innermost open indentation layer, creating that list if the layer
	# does not have one yet.
	node = find_unclosed_indentlayer()
	bucket = node.setdefault("statements", [])
	if type(stmts) == list:
		bucket.extend(stmts)
	else:
		bucket.append(stmts)

#parses a statement and does the appropriate action to get it into the tree
#branching stmt should be in the format (type, (stmttype, stmtid, args, statements))
#nonbranching stmt will be in the format
def parse_stmt(stmt):
	# Puts one parsed statement into the tree. stmt is one of:
	#   ("branch", branchtuple)     - grow a branch; unless it is a def or a
	#                                 class, its id also becomes an open layer
	#   ("nonbranch", statement)    - add the statement to the open layer
	#   (branchtuple, statement)    - one-line form: grow the branch, add the
	#                                 statement inside it, then close it again
	global indentlayer

	#format() is applied to the string itself so this works both as a print
	# statement and as a print() call
	if (PARSER_PRINTTOKENS): print("found {}ing statement that reads {}".format(stmt[0], stmt[1]))

	if (stmt[0] == "branch"):
		grow_new_branch(stmt[1])
		if (stmt[1][0] != "def" and stmt[1][0] != "class"): 			#don't append if it's a function or class definition
			indentlayer.append(stmt[1][1])								#append the declaration's stmtid to the indentlist
	elif (stmt[0] == "nonbranch"):
		add_stmts_to_tree(stmt[1])
	else:																#this means it's a oneline statement
		grow_new_branch(stmt[0])										#add a new branch for the branching statement
		indentlayer.append(stmt[0][1])									#create a fake indent by appending the declaration's stmtid
		add_stmts_to_tree(stmt[1])										#add the statement to the branch we just created
		del indentlayer[-1]												#close the one-line branch we just created

###TEST
#~ while True:
	#~ with open("./example.py", "r") as f:
		#~ data = f.read();
	#~ try:
		#~ s = data
	#~ except EOFError:
		#~ break
	#~ parser.parse(s)
	#~ print("class\t\tfunc\t\tcontents")
	#~ pp = pprint.PrettyPrinter(indent=4)
	#~ pp.pprint(tree)
	#~ break

def parse_file(filename):
	# Parses the python source file at "filename" and returns the module-level
	# tree, which the grammar actions fill in while parsing.
	#
	# NOTE(review): the tree and the indentation state are module globals and
	# are not reset here, so results accumulate over repeated calls - confirm
	# that this is intended.
	with open(filename, "r") as f:
		data = f.read()
	#every statement in the grammar must be closed by a NEWLINE, so a final
	# line without a line break (or an empty file) would not parse
	if (not data.endswith("\n")):
		data += "\n"
	parser.parse(data)
	return tree