-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpeep_tar.py
125 lines (110 loc) · 4.82 KB
/
peep_tar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from lxml import etree
import tarfile
import sys
import collections as coll
import time
import random
import parsing_xml as px
from functools import partial
def article_name_dict(tar_obj):
    """
    Group the member names of a tar archive by article name.

    tar_obj is an object produced with tarfile.open(...); only its
    getnames() method is used.

    Returns a default dict with key article name ex. math.9203203
    (the second '/'-separated component of the member path)
    and value the list of member paths under it, in archive order:
    ['9203_001/math.9203203/sl2z.xml']

    Members whose path has no second component (ex. a bare top-level
    entry such as '9203_001') do not belong to any article and are skipped
    instead of raising IndexError.
    """
    article_dict = coll.defaultdict(list)
    for pathname in tar_obj.getnames():
        parts = pathname.split('/')
        if len(parts) < 2:
            # No article component to index by (ex. the root directory entry).
            continue
        article_dict[parts[1]].append(pathname)
    return article_dict
def tar_iter(tarpath, patt):
    """
    returns an iterator to the file objects of a tar zip that have a certain
    pattern in their names in Pair (fname, tar_fobj)

    tarpath: path of the tar archive to open
    patt: substring that a member name must contain to be yielded

    tar_fobj is whatever tar_file.extractfile(fname) returns. The archive is
    kept open only while the generator is running, so each tar_fobj should be
    consumed before the generator is exhausted or closed.

    A tarfile.ReadError is retried, up to 10 attempts in total, after a
    random wait of 5 to 15 seconds. A retry resumes after the members that
    were already yielded, so no pair is produced twice. If every attempt
    fails the iterator just ends (best effort) after printing the error.
    """
    max_attempts = 10
    already_yielded = 0
    for attempt in range(1, max_attempts + 1):
        try:
            with tarfile.open(tarpath) as tar_file:
                matching = [n for n in tar_file.getnames() if patt in n]
                # Skip what earlier attempts already handed to the caller.
                for fname in matching[already_yielded:]:
                    fobj = tar_file.extractfile(fname)
                    already_yielded += 1
                    yield (fname, fobj)
            return
        except tarfile.ReadError as ee:
            if attempt == max_attempts:
                # No further retry follows, so do not sleep for nothing.
                print(ee, "while opening tarfile: %s, giving up after %s attempts"
                      % (tarpath, max_attempts))
                return
            wait_delay = random.randint(5, 15)
            print(ee, "while opening tarfile: %s, waiting for %s" % (tarpath, wait_delay))
            time.sleep(wait_delay)
def _article_uid(member_path):
    """
    Return the second '/'-separated component of member_path
    (ex. '2003.00782' for '2003_003/2003.00782/MJD_Q8Cp_arxiv.xml'),
    or member_path itself when it has no such component.
    """
    parts = member_path.split('/')
    return parts[1] if len(parts) > 1 else member_path


def _first_name_with(names, *needles):
    """
    Return the first entry of names that contains every needle as a
    substring, or None when there is no such entry.
    """
    return next((n for n in names if all(s in n for s in needles)), None)


def tar(tarpath, *args):
    """
    Extract one article and its log from a LaTeXML-processed archive.

    A processed article and its logs look like:
    '2003_003/2003.00782/MJD_Q8Cp_arxiv.xml',
    '2003_003/2003.00782/latexml_commentary.txt',
    '2003_003/2003.00782/latexml_errors_mess.txt',
    tarpath: Is the address of a processed by LaTeXML .tar.gz file
    args may be:
    int: return the i-th xml file (IndexError if there is no such file)
    str: return the first xml file whose name contains the string
    nothing (or any other type): the first member of the archive is used
    as the xml file and no log is read

    Only args[0] is looked at. The log is the first '.txt' member that
    belongs to the same article as the selected xml file (same second path
    component), so the log always matches the article even when an article
    has several '.txt' files.

    Returns (log_xtract, xml_xtract):
    log_xtract: the log decoded as utf8, or None when no log was found
    xml_xtract: px.DefinitionsXML built from the xml file object, or None
    when no xml file was found
    """
    with tarfile.open(tarpath) as tar_file:
        names = tar_file.getnames()
        selector = args[0] if args else None
        if isinstance(selector, int):
            xmlname = [n for n in names if '.xml' in n][selector]
            # Pair the log by article, not by position: the i-th '.txt' is
            # not the i-th article's log when articles have several logs.
            logname = _first_name_with(names, _article_uid(xmlname), '.txt')
        elif isinstance(selector, str):
            # selector is something like '2003_003/2003.00782/MJD_Q8Cp_arxiv.xml',
            # its article uid should be 2003.00782
            xmlname = _first_name_with(names, selector, '.xml')
            if xmlname is None:
                print(f"No xml results for {selector} in {tarpath}")
            logname = _first_name_with(names, _article_uid(selector), '.txt')
            if logname is None:
                print(f"No txt results for {selector} in {tarpath}")
        else:
            # NOTE(review): the first member may be a directory entry rather
            # than an xml file; confirm this default is what callers want.
            xmlname = names[0] if names else None
            logname = None
        log_xtract = None if logname is None else \
            tar_file.extractfile(logname).read().decode('utf8')
        xml_xtract = None if xmlname is None else \
            px.DefinitionsXML(tar_file.extractfile(xmlname))
    return (log_xtract, xml_xtract)
if __name__ == '__main__':
    # Command-line entry point: open the given tar archive and, for every
    # article whose name contains the pattern, print either its commentary
    # log (-c), its errors log (-e) or its pretty-printed xml (default).
    import argparse
    parser = argparse.ArgumentParser(description='parsing xml commandline script')
    parser.add_argument('tarname', type=str, nargs=1,
            help='tar.gz file name to peep into. ex. math.9213435.tar.gz or '
                 '1803_343343.tar.gz')
    parser.add_argument('-p', '--pattern', type=str,
            help='pattern in the name of the article to decompress')
    parser.add_argument('-c', '--commentary', action='store_const', const=True,
            help='print the commentary file')
    parser.add_argument('-e', '--errors', action='store_const', const=True,
            help='print the errors file')
    args = parser.parse_args(sys.argv[1:])

    # An empty pattern is contained in every name, so it selects all articles.
    patt = args.pattern if args.pattern else ''

    # Substring identifying the wanted member; -c takes precedence over -e.
    if args.commentary:
        wanted, label = 'comment', 'commentary'
    elif args.errors:
        wanted, label = 'errors_mes', 'errors'
    else:
        wanted, label = '.xml', 'xml'

    with tarfile.open(args.tarname[0]) as tar_file:
        article_dict = article_name_dict(tar_file)
        for name, val in article_dict.items():
            if patt not in name:
                continue
            member = next((s for s in val if wanted in s), None)
            if member is None:
                # Report and move on rather than dying with StopIteration.
                print(f"No {label} file for {name} in {args.tarname[0]}",
                      file=sys.stderr)
                continue
            fobj = tar_file.extractfile(member)
            if wanted == '.xml':
                the_tree = etree.parse(fobj)
                print(etree.tostring(the_tree, pretty_print=True).decode('utf-8'))
            else:
                print(fobj.read().decode('utf-8'))