-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy patheuroparl_extract.py
executable file
·76 lines (63 loc) · 2.41 KB
/
europarl_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
import sys
from collections import defaultdict, Counter
def defaultdict_set():
    """Return a new ``defaultdict`` whose missing keys map to empty sets.

    Written as a named function rather than a lambda bound to a name so
    that it has a meaningful ``__name__`` in tracebacks and reprs.
    """
    return defaultdict(set)
def main(args):
    """Extract the English lines that have a translation in every language.

    For each language code in ``args.languages`` the file pair
    ``europarl-v7.<lang>-en.en`` / ``europarl-v7.<lang>-en.<lang>`` inside
    ``args.dir`` is read in lockstep, and every non-empty line pair is
    recorded under its (stripped) English line.  English lines that ended
    up with at least one translation in every requested language are then
    written to ``extracted.multi.en`` in the current working directory,
    and their translations to ``extracted.multi.<lang>``.

    Args:
        args: Namespace-like object with ``languages`` (iterable of
            language codes) and ``dir`` (directory holding the Europarl
            files).

    Raises:
        FileNotFoundError: If one of the expected input files is missing.
            The check runs for all languages before anything is read.
    """
    # Repeated language codes would make the "all languages" test below
    # unsatisfiable and open the same output file twice, so keep only the
    # first occurrence of each code.
    languages = []
    for lang in args.languages:
        if lang not in languages:
            languages.append(lang)

    # Validate every input up front so a missing file is reported before
    # any (potentially long) processing starts.
    langqueue = []
    for lang in languages:
        prefix = os.path.join(args.dir, "europarl-v7.{}-en".format(lang))
        filename_lang = "{}.{}".format(prefix, lang)
        filename_en = "{}.en".format(prefix)
        for filename in (filename_lang, filename_en):
            if not os.path.exists(filename):
                raise FileNotFoundError(
                    "File not found: {}".format(filename)
                )
        langqueue.append((lang, filename_en, filename_lang))

    # English line -> language code -> set of distinct translations.
    europarl = defaultdict(lambda: defaultdict(set))
    for lang, filename_en, filename_lang in langqueue:
        print("Processing {}-en...".format(lang))
        # Explicit UTF-8 so the result does not depend on the locale.
        with open(filename_en, "r", encoding="utf-8") as f_en:
            with open(filename_lang, "r", encoding="utf-8") as f_lang:
                # zip() stops at the end of the shorter file.
                for line_en, line_lang in zip(f_en, f_lang):
                    line_en = line_en.strip()
                    line_lang = line_lang.strip()
                    if not line_en or not line_lang:
                        continue
                    europarl[line_en][lang].add(line_lang)

    print("Found {:8d} unique English lines.".format(len(europarl)))

    required = len(languages)
    multicount = sum(
        1 for translations in europarl.values()
        if len(translations) == required
    )
    print(
        "Found {:8d} lines that appear in all languages.".format(multicount)
    )

    outfile = {}
    try:
        for lang in languages + ["en"]:
            outfile[lang] = open(
                "extracted.multi.{}".format(lang), "w", encoding="utf-8"
            )
        for line, translations in europarl.items():
            if len(translations) < required:
                continue
            print(line, file=outfile["en"])
            # NOTE(review): an English line with several distinct
            # translations in one language produces several lines in that
            # language's file but only one in the English file, so the
            # outputs are not guaranteed to be line-aligned -- confirm
            # this is intended.
            for lang, lines_trans in translations.items():
                # Sorted so the output is identical from run to run
                # (set iteration order is not stable across runs).
                for line_trans in sorted(lines_trans):
                    print(line_trans, file=outfile[lang])
    finally:
        # Close whatever was opened, even if writing failed part-way.
        for f in outfile.values():
            f.close()
if __name__ == "__main__":
    # Command-line entry point: one or more language codes, plus an
    # optional directory in which the Europarl files are looked up.
    cli = argparse.ArgumentParser(
        description="",
        epilog="Languages of Europarl files are detected from their filenames.",
    )
    cli.add_argument(
        "languages",
        metavar="LANG",
        type=str,
        nargs="+",
        help="Language codes to include",
    )
    cli.add_argument(
        "--dir",
        type=str,
        default=".",
        help="Directory for Europarl files",
    )
    main(cli.parse_args())