-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgrammar_parser.py
203 lines (157 loc) · 5.78 KB
/
grammar_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import argparse
import ast
from collections import Counter
class Rule(object):
    """One grammar rule together with the statistics recorded for it.

    The constructor stores every argument unchanged on the instance under
    the same name. ``expanded`` may be ``None`` (or empty); ``words`` and
    ``flat`` then yield an empty list / empty string respectively.
    """

    def __init__(self, num, short, expanded, subsequence_starts, subsequence_lengths,
                 occurrence, use, min_length, max_length, mean_length):
        # Identity and textual forms of the rule.
        self.num, self.short, self.expanded = num, short, expanded
        # Positions and lengths of the subsequences attached to the rule.
        self.subsequence_starts = subsequence_starts
        self.subsequence_lengths = subsequence_lengths
        # Frequency counters.
        self.occurrence, self.use = occurrence, use
        # Length statistics.
        self.min_length, self.max_length = min_length, max_length
        self.mean_length = mean_length

    @property
    def words(self):
        """Return the whitespace-separated tokens of ``expanded`` as a list."""
        if not self.expanded:
            return []
        return self.expanded.split()

    @property
    def flat(self):
        """Return ``expanded`` with every space character removed."""
        if not self.expanded:
            return ""
        return "".join(self.expanded.split(" "))

    def __str__(self):
        return "%d -> %s" % (self.num, self.short)

    def __repr__(self):
        # The repr is deliberately the same text as str().
        return str(self)
class GrammarParser(object):
    """Parse GrammarViz grammar files.

    The parser walks the file line by line through ``self.enumerator`` (an
    ``enumerate`` over the open file). ``line_num`` and ``line`` always hold
    the zero-based index and the raw text of the last line fetched with
    ``next_line`` so that callers can report where a failure happened.
    """

    # Separator between a rule name and its right-hand side.
    delim = " -> "

    def __init__(self, fpath):
        """Remember the path of the grammar file; nothing is opened yet."""
        self.fpath = fpath
        self.enumerator = None
        self.line_num = 0
        self.line = None
        # Initialised here as well as in parse() so get_next_rule_num() is
        # usable on a freshly constructed parser.
        self._next_rule_num = 0

    def parse(self):
        """Read ``self.fpath`` and return the populated ``Grammar``.

        The header, rule 0 and all remaining rules are parsed in that order
        while the file is open.
        """
        self._next_rule_num = 0
        with open(self.fpath) as f:
            self.enumerator = enumerate(f)
            grammar = self.parse_header()
            grammar.rule0 = self.parse_rule0()
            grammar.rules = self.parse_remaining_rules()
        return grammar

    def get_next_rule_num(self):
        """Return the next sequential rule number (starting at 0)."""
        next_num = self._next_rule_num
        self._next_rule_num += 1
        return next_num

    def parse_header(self):
        """Skip the first line and build a ``Grammar`` from three option lines.

        The three integers are passed on as window size, PAA size and
        alphabet size, in that order.
        """
        # Discard the filename line. Going through next_line() (instead of
        # the Python-2-only ``enumerator.next()``) also keeps line_num/line
        # in sync for error reporting.
        self.next_line()
        window_size = self.parse_next_option()
        paa_size = self.parse_next_option()
        alphabet_size = self.parse_next_option()
        return Grammar(window_size, paa_size, alphabet_size)

    def parse_remaining_rules(self):
        """Parse rules until the input is exhausted and return them as a list.

        Exhaustion is signalled by ``StopIteration`` from the underlying
        iterator; a rule that is cut off part-way is dropped.
        """
        rules = []
        while True:
            try:
                rules.append(self.parse_next_rule())
            except StopIteration:
                break
        return rules

    def next_line(self):
        """Advance to the next line, record it, and return its text.

        Raises:
            StopIteration: when no lines are left.
        """
        # The builtin next() works on Python 2.6+ and Python 3, unlike the
        # iterator's ``.next()`` method which only exists on Python 2.
        self.line_num, self.line = next(self.enumerator)
        return self.line

    def parse_next_option(self):
        """Read one line and return its last whitespace-separated token as int."""
        line = self.next_line()
        return int(line.split()[-1])

    def parse_n_options(self, n):
        """Read ``n`` consecutive option lines and return their int values."""
        return [self.parse_next_option() for _ in range(n)]

    def parse_rule0(self):
        """Parse rule 0, which has no expanded form or subsequence lists.

        One line is skipped, the rule line is read, and five option lines
        supply the numeric statistics.
        """
        self.next_line()  # discard comment line
        line = self.next_line()
        # Strip off quotes.
        # NOTE(review): lines read from the file keep their trailing newline,
        # so [1:-1] may drop the newline rather than the closing quote --
        # confirm against a real grammar file.
        short = line.split(self.delim)[1][1:-1]
        return Rule(self.get_next_rule_num(), short, None, [], [], *self.parse_n_options(5))

    def parse_next_rule(self):
        """Parse one regular rule.

        Consumes a skipped line, the rule line (short form and expanded
        form separated by a comma), a subsequence-starts line, a
        subsequence-lengths line, and five option lines.

        Raises:
            StopIteration: when the input ends before the rule is complete.
        """
        self.next_line()  # discard comment line
        line = self.next_line()
        part1, part2 = line.split(',')
        short = part1.split(self.delim)[1].replace("'", "")
        expanded = part2.split(':')[-1].replace("'", "").strip()
        # The list literals come straight from the file, so they are parsed
        # with ast.literal_eval rather than eval: it accepts only Python
        # literals and cannot execute arbitrary code from a crafted file.
        line = self.next_line()
        starts = ast.literal_eval(line.split(': ')[1])
        line = self.next_line()
        lengths = ast.literal_eval(line.split(': ')[1])
        return Rule(self.get_next_rule_num(), short, expanded, starts, lengths,
                    *self.parse_n_options(5))
class Grammar(object):
    """A grammar plus the three options it was generated with.

    Assigning to ``rules`` rebuilds two lookup structures keyed by rule
    number: ``rule_counts`` (rule number -> ``occurrence``) and an internal
    map from rule number to the rule object, which backs ``grammar[num]``.
    """

    def __init__(self, window_size, paa_size, alphabet_size):
        self.window_size = window_size
        self.paa_size = paa_size
        self.alphabet_size = alphabet_size
        self.rule0 = None
        self._rules = None
        self.rule_counts = Counter()
        self._rule_map = {}

    @staticmethod
    def from_file(fpath):
        """Parse the grammar file at ``fpath`` and return the result."""
        return GrammarParser(fpath).parse()

    @property
    def rules(self):
        """The rules last assigned, or ``None`` if none were assigned."""
        return self._rules

    @rules.setter
    def rules(self, rules):
        self._rules = rules
        # Always start from empty indexes, even when ``rules`` is None.
        self.rule_counts.clear()
        self._rule_map.clear()
        if rules is not None:
            for entry in rules:
                self.rule_counts[entry.num] = entry.occurrence
                self._rule_map[entry.num] = entry

    def topn(self, n):
        """Return up to ``n`` ``(rule, occurrence)`` pairs, most frequent first."""
        if not self.rule_counts or not self._rule_map:
            return []
        ranked = self.rule_counts.most_common(n)
        return [(self._rule_map[rule_num], freq) for rule_num, freq in ranked]

    def longest_rule(self):
        """Return the largest rule size (spaces in ``expanded`` plus one), or None."""
        current = self.rules
        if not current:
            return None
        sizes = [item.expanded.count(" ") + 1 for item in current]
        return max(sizes)

    def shortest_rule(self):
        """Return the smallest rule size (spaces in ``expanded`` plus one), or None."""
        current = self.rules
        if not current:
            return None
        sizes = [item.expanded.count(" ") + 1 for item in current]
        return min(sizes)

    @property
    def tag(self):
        """Compact label built from the three options, e.g. ``w100p4a3``."""
        return "w%dp%da%d" % (self.window_size, self.paa_size, self.alphabet_size)

    def __str__(self):
        current = self.rules
        if current:
            num_rules = len(current)
        else:
            num_rules = 0
        return "Grammar(window_size=%d, paa_size=%d, alphabet_size=%d) with %d rules" % (
            self.window_size, self.paa_size, self.alphabet_size, num_rules)

    def __repr__(self):
        # Same text as str().
        return str(self)

    def __iter__(self):
        # A missing or empty rule collection iterates as an empty list.
        return iter(self.rules or [])

    def __len__(self):
        # A missing or empty rule collection has length zero.
        return len(self.rules or ())

    def __getitem__(self, rule_num):
        """Look up a rule by its number; raises KeyError when unknown."""
        return self._rule_map[rule_num]
def make_parser():
    """Build the command-line parser: one positional ``grammar_file`` argument."""
    cli = argparse.ArgumentParser(description="Parse grammar files into Grammar objects")
    cli.add_argument("grammar_file", help="path to grammar file to parse")
    return cli
if __name__ == "__main__":
    # Script entry point: parse the grammar file named on the command line
    # and report where parsing stopped if it fails.
    cli_parser = make_parser()
    args = cli_parser.parse_args()
    # Build the GrammarParser here (instead of going through
    # Grammar.from_file) so the error handler can read its progress; the
    # handler previously referenced an undefined name and raised NameError.
    grammar_parser = GrammarParser(args.grammar_file)
    try:
        grammar = grammar_parser.parse()
    except Exception as e:
        # line_num is the enumerate() index of the last line read (it starts
        # at 0); line is that line's raw text, or None if nothing was read.
        print("Encountered error on Line %d: %s\nline: %s" % (
            grammar_parser.line_num, e, grammar_parser.line))