-
Notifications
You must be signed in to change notification settings - Fork 64
/
Copy pathcorpus.py
75 lines (64 loc) · 2.31 KB
/
corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#! /usr/bin/env python
#-*- coding:utf-8 -*-
import codecs
import sys
import os
import json
from utils import DATA_PROCESSED_DIR, DATA_RAW_DIR, split_sentences
from rhyme import RhymeUtil
# File names of the raw corpora, in the order they are loaded.
# get_all_corpus() resolves each one against DATA_RAW_DIR when no processed
# JSON copy is available.
_corpus_list = [
    'qts_tab.txt',
    'qss_tab.txt',
    'qsc_tab.txt',
    'qtais_tab.txt',
    'yuan.all',
    'ming.all',
    'qing.all',
]
def _strip_parentheses(body):
    """Return body with every (...) span removed, or None if unbalanced."""
    left = body.find(u'(')
    while left >= 0:
        # Deliberately searches from the start of the string: a ')' that
        # precedes the first '(' (or no ')' at all, i.e. -1) means the
        # parentheses are malformed.
        right = body.find(u')')
        if right < left:
            return None
        body = body[:left] + body[right + 1:]
        left = body.find(u'(')
    # A closing parenthesis left over without an opener is malformed too.
    if body.find(u')') >= 0:
        return None
    return body


def _parse_corpus(raw_file, json_file):
    """Parse a tab-separated raw corpus file and cache the poems as JSON.

    The first line of raw_file is read as a header of column tags. Every
    following non-blank row becomes a dict holding:
      * 'source': the base name of raw_file,
      * one entry per non-'body' column, keyed by its header tag,
      * 'sentences': split_sentences() applied to the 'body' column after
        parenthesised spans have been removed.

    A row is dropped when it has no 'body' column, when its parentheses are
    unbalanced, or when any character of any sentence is rejected by
    RhymeUtil.has_char().

    Args:
        raw_file: path of the UTF-8 encoded raw corpus file to read.
        json_file: path the resulting list is written to with json.dump.

    Returns:
        The list of accepted poem dicts (the same data written to json_file).
    """
    # sys.stdout.write keeps the progress message and the final "Done" on
    # one console line without relying on the Python 2 print statement.
    sys.stdout.write("Parsing %s ... " % raw_file)
    sys.stdout.flush()
    rdict = RhymeUtil()
    source = os.path.basename(raw_file)
    data = []
    with codecs.open(raw_file, 'r', 'utf-8') as fin:
        tags = fin.readline().strip().split(u'\t')
        for raw_line in fin:
            # NOTE(review): strip() also removes leading/trailing tabs, so a
            # row whose first field is empty would shift columns -- confirm
            # against the data files before changing.
            line = raw_line.strip()
            if not line:
                # Skip blank rows instead of ending the parse, so one stray
                # empty line cannot silently drop the rest of the file.
                continue
            poem = {'source': source}
            # Reset per row so a row lacking a body column can never reuse
            # the body text of the previous poem.
            body = None
            # zip() ignores fields beyond the header instead of raising
            # IndexError on over-long rows.
            for tag, tok in zip(tags, line.split(u'\t')):
                if tag != 'body':
                    poem[tag] = tok
                else:
                    body = tok
            if body is None:
                continue
            body = _strip_parentheses(body)
            if body is None:
                continue
            poem['sentences'] = split_sentences(body)
            # Keep the poem only if every character is known to the rhyme
            # dictionary; all() stops at the first unknown character.
            if all(rdict.has_char(ch)
                   for sentence in poem['sentences']
                   for ch in sentence):
                data.append(poem)
    with codecs.open(json_file, 'w', 'utf-8') as fout:
        json.dump(data, fout)
    sys.stdout.write("Done (%d poems)\n" % len(data))
    return data
def get_all_corpus():
    """Load every corpus listed in _corpus_list and return them as one list.

    For each raw file name the processed JSON copy under DATA_PROCESSED_DIR
    is tried first. If that copy cannot be opened, or cannot be decoded as
    JSON, the raw file under DATA_RAW_DIR is parsed with _parse_corpus(),
    which also (re)writes the JSON copy.

    Returns:
        A list containing the entries of all corpora, in _corpus_list order.
    """
    corpus = []
    for raw in _corpus_list:
        # Swap only the extension; substring replacement would also rewrite
        # "all"/"txt" occurring inside the base name.
        json_name = os.path.splitext(raw)[0] + '.json'
        json_file = os.path.join(DATA_PROCESSED_DIR, json_name)
        try:
            with codecs.open(json_file, 'r', 'utf-8') as fin:
                data = json.load(fin)
        except (IOError, ValueError):
            # IOError: no readable cache yet. ValueError: the cache exists
            # but is truncated/corrupt (e.g. an interrupted earlier run), so
            # rebuild it rather than failing on every subsequent call.
            data = _parse_corpus(os.path.join(DATA_RAW_DIR, raw), json_file)
        # Extend only after data was obtained successfully. Doing this in a
        # `finally` clause would run on errors too and either re-add the
        # previous file's data or raise NameError, hiding the real failure.
        corpus.extend(data)
    return corpus
if __name__ == '__main__':
    # Script entry point: load (or build) all corpora and report the total
    # number of entries.
    print("Size of the entire corpus: %d" % len(get_all_corpus()))