-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathraw_to_flattened.py
69 lines (52 loc) · 2.31 KB
/
raw_to_flattened.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# raw_to_flattened.py
import argparse
# usage: python3 raw_to_flattened.py infile outfile
# infile: filename of a multi-line query result txt.
# outfile: filename to write a flattened version of the input, where each
# record occupies exactly one line, consisting of the record's lines from the
# input file concatenated together with escaped '\\n' sequences.
# Two-character marker (a backslash followed by 'n') that is substituted for
# every real newline inside a record, so the whole record fits on one line.
sentinel_linemarker = "\\n"
def starts_with_digit(instring):
    """Report whether the first character of instring is a digit.

    An empty string yields False: its one-character slice is empty, and
    str.isdigit() is False for the empty string.
    """
    leading = instring[:1]
    return leading.isdigit()
def flatten_instream_to_outstream(instream, outstream):
    """Copy records from instream to outstream, one record per output line.

    A record begins at a line whose first character is a digit and extends
    up to (but not including) the next such line. Lines that precede the
    first record are treated as header and dropped. Within a record every
    newline is replaced by sentinel_linemarker, and each finished record is
    written followed by a single real newline. If the input contains no
    record at all, nothing is written.
    """
    pieces = []         # escaped lines of the record being assembled
    in_record = False   # becomes True once the header has been skipped

    while True:
        raw = instream.readline()
        if not raw:
            break  # end of file.

        begins_record = starts_with_digit(raw)
        if not in_record:
            if not begins_record:
                continue  # still reading the header; discard this line.
            in_record = True
        elif begins_record:
            # a new record starts here, so flush the one collected so far.
            outstream.write("".join(pieces) + "\n")
            pieces = []

        pieces.append(raw.replace("\n", sentinel_linemarker))

    # flush the last record, if any record was ever started.
    if in_record:
        outstream.write("".join(pieces) + "\n")
def flatten_by_filename(input_filename, output_filename):
    """Flatten the query-result file at input_filename into output_filename.

    Opens the input file for reading and the output file for writing
    (truncating it if it already exists), then hands both streams to
    flatten_instream_to_outstream.

    Args:
        input_filename: path of the multi-line query result text file.
        output_filename: path of the file to write, one record per line.

    Raises:
        OSError: if either file cannot be opened.
    """
    # The with-blocks close both handles even if opening the output file or
    # the flattening itself raises; the output is closed before the input.
    with open(input_filename, 'r') as infile:
        with open(output_filename, 'w') as outfile:
            flatten_instream_to_outstream(infile, outfile)
if __name__ == "__main__":
    # Command-line entry point: takes two positional filenames and flattens
    # the first file into the second.
    cli = argparse.ArgumentParser(description="raw_to_flattened")
    positional_help = (
        ("input_filename", "filename of l-n query result."),
        ("output_filename",
         "filename of output file (single line per record)."),
    )
    for arg_name, arg_help in positional_help:
        cli.add_argument(arg_name, metavar=arg_name, type=str, help=arg_help)
    parsed = cli.parse_args()
    flatten_by_filename(parsed.input_filename, parsed.output_filename)