forked from Disiok/poetry-seq2seq
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathquatrains.py
37 lines (29 loc) · 1012 Bytes
/
quatrains.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#! /usr/bin/env python
#-*- coding:utf-8 -*-
from utils import *
from corpus import get_all_corpus
from vocab import get_vocab
def is_quatrain(poem):
    """Return True if *poem* has the shape of a quatrain.

    A poem qualifies when it has exactly four sentences, the first
    sentence has length 5 or 7, and every remaining sentence has the same
    length as the first.  Poems whose ``'source'`` entry equals
    ``'qsc_tab.txt'`` are always rejected.

    poem -- mapping providing a ``'source'`` entry and a ``'sentences'``
            sequence of sized items (e.g. strings).
    """
    # Entries coming from this corpus file are excluded outright.
    # NOTE(review): presumably that file holds no quatrains -- confirm.
    if poem['source'] == 'qsc_tab.txt':
        return False

    sentences = poem['sentences']
    if len(sentences) != 4:
        return False

    width = len(sentences[0])
    if width not in (5, 7):
        return False

    # all() replaces the former reduce() fold: same result, short-circuits
    # on the first mismatch, and does not rely on reduce being a builtin.
    return all(len(sentence) == width for sentence in sentences[1:])
def get_quatrains():
    """Return a list of the corpus poems that are usable quatrains.

    A poem from ``get_all_corpus()`` is kept when ``is_quatrain`` accepts
    it and every character of every sentence is present in the
    character-to-index mapping returned (as the second item) by
    ``get_vocab()``.

    The result is always a concrete list, so callers can take ``len()`` of
    it or iterate it more than once.
    """
    # Only the second item of the vocab pair is needed, and only for
    # membership tests.
    _, ch2int = get_vocab()

    def quatrain_filter(poem):
        """Return True for a quatrain made only of in-vocabulary characters."""
        if not is_quatrain(poem):
            return False
        return all(ch in ch2int
                   for sentence in poem['sentences']
                   for ch in sentence)

    # Build the list explicitly rather than returning filter(...), which is
    # a lazy iterator (no len()) on Python 3.
    return [poem for poem in get_all_corpus() if quatrain_filter(poem)]
if __name__ == '__main__':
    # Script entry point: report how many quatrains the corpus yields.
    quatrains = get_quatrains()
    # A parenthesised single-argument print is valid syntax on both
    # Python 2 and Python 3 and emits the same text.
    print("Size of quatrains: %d" % len(quatrains))