-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathreadxml.py
executable file
·64 lines (59 loc) · 2.25 KB
/
readxml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import argparse
import os
def xml_reader(gold_file, xml_file, output_filename):
    """Merge an XML corpus with its gold-label file and write it out as JSON.

    The gold file is read line by line; the first whitespace-separated field
    of a line is used as a token id and the second as the label for that id.
    The XML tree is walked three levels below the root (called document,
    sentence and token here, after the loop variables), and one JSON record
    is produced per sentence element:

    * ``original``   - token texts joined by single spaces, lower-cased.
      Whitespace inside a labelled token is collapsed to ``_`` so the token
      stays a single space-delimited word.
    * ``annotated``  - like ``original``, but every labelled token is
      replaced by its gold label (also lower-cased).
    * ``offsets``    - 0-based positions, in token order, of the labelled
      tokens within the sentence.
    * ``doc_offset`` - the ``id`` attribute of the sentence element.
    * ``stems``      - ``lemma`` attributes of the labelled tokens, in order.
    * ``pos``        - ``pos`` attributes of the labelled tokens, in order.

    A token is treated as labelled only if it has an ``id`` attribute that
    appears in the gold file *and* carries both ``lemma`` and ``pos``
    attributes.  Every other token is copied verbatim into both strings,
    exactly once.

    Args:
        gold_file: Path of the gold-label file.
        xml_file: Path of the XML corpus.
        output_filename: Output path without extension; ``.json`` is
            appended.

    Raises:
        IndexError: A non-blank gold line has fewer than two fields.
        KeyError: A sentence element has no ``id`` attribute.
    """
    # Kept function-local, as in the original, so the block is self-contained.
    import json
    import xml.etree.ElementTree as ET

    # Token id -> gold label.
    gold = {}
    with open(gold_file, 'r') as map_file:
        for line in map_file:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            gold[fields[0]] = fields[1]

    root = ET.parse(xml_file).getroot()
    final_json = []
    for doc in root:
        for sent in doc:
            words = []    # pieces of the 'original' string
            labels = []   # pieces of the 'annotated' string
            offsets = []
            stems = []
            pos_tags = []
            for index, tok in enumerate(sent):
                # Element.text is None for an empty element; treat as "".
                text = tok.text or ""
                attrs = tok.attrib
                token_id = attrs.get('id')
                # Decide up front whether the token is fully labelled.  The
                # previous try/except could fail half-way through and then
                # append the same token a second time in the fallback path.
                if token_id in gold and 'lemma' in attrs and 'pos' in attrs:
                    words.append("_".join(text.split()))
                    labels.append(gold[token_id])
                    offsets.append(index)
                    stems.append(attrs['lemma'])
                    pos_tags.append(attrs['pos'])
                else:
                    # NOTE(review): plain tokens keep their raw text, so a
                    # plain token containing whitespace would shift the
                    # word positions relative to 'offsets' - confirm the
                    # corpus never has such tokens.
                    words.append(text)
                    labels.append(text)
            final_json.append({
                'original': " ".join(words).strip().lower(),
                'annotated': " ".join(labels).strip().lower(),
                'offsets': offsets,
                'doc_offset': sent.attrib["id"],
                'stems': stems,
                'pos': pos_tags,
            })

    op_fname = str(output_filename) + ".json"
    with open(op_fname, 'w') as outfile:
        json.dump(final_json, outfile)
if __name__ == '__main__':
    # Command-line entry point: parse the file locations and run xml_reader,
    # writing <opdir>/<opname>.json.
    parser = argparse.ArgumentParser(description='reads xml and converts it to json with the dictionary information incorporated in it.')
    # The three file arguments have no usable default, so they are declared
    # required: a missing one now yields a usage error instead of a
    # TypeError raised later from os.path.join()/open().
    parser.add_argument('--goldfile', type=str, required=True,
                        help='gold file name')
    parser.add_argument('--xmlfile', type=str, required=True,
                        help='xml file name')
    parser.add_argument('--opname', type=str, required=True,
                        help='output file name')
    parser.add_argument('--opdir', type=str, default='./temp',
                        help='output dir path')
    args = parser.parse_args()
    # xml_reader opens the output path directly and does not create parent
    # directories, so make sure the target directory (by default ./temp)
    # exists first.  An empty --opdir means the current directory.
    if args.opdir:
        os.makedirs(args.opdir, exist_ok=True)
    xml_reader(args.goldfile, args.xmlfile, os.path.join(args.opdir, args.opname))