-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconvert_epub_to_json.rb
195 lines (166 loc) · 6.24 KB
/
convert_epub_to_json.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
require 'nokogiri'
require 'json'
## Run this script inside the extracted epub folder,
## which should contain text files named text00001.html, text00002.html, etc.
# Flatten a Nokogiri element into a clean HTML string:
# - strips every attribute from <a> tags
# - unwraps <span> and nested <div> tags, keeping their contents
#   (looped because `replace` re-parses the HTML string, which can
#   re-introduce nested tags of the same kind)
# - rewraps <blockquote> contents in plain <div> tags
# Returns the inner HTML with CR/LF removed and surrounding whitespace
# stripped, or nil when the element is missing.
def clean_html(div)
  return nil if div.nil?
  div.xpath('.//a').each do |a_el|
    # remove all attributes; `attributes` returns a fresh hash each call,
    # so removing while iterating its keys is safe
    a_el.attributes.each_key { |attr_name| a_el.remove_attribute(attr_name) }
  end
  # remove all span tags, but keep the content
  until div.xpath('.//span').empty?
    div.xpath('.//span').each { |span| span.replace(span.inner_html) }
  end
  # unwrap nested div tags (block variable renamed so it no longer
  # shadows the `div` parameter)
  until div.xpath('.//div').empty?
    div.xpath('.//div').each { |inner| inner.replace(inner.inner_html) }
  end
  # blockquotes become plain divs
  until div.xpath('.//blockquote').empty?
    div.xpath('.//blockquote').each do |bq|
      bq.replace("<div>#{bq.inner_html}</div>")
    end
  end
  div.inner_html.delete("\n\r").strip
end
# Extract the plain text of a heading element, dropping homograph
# markers such as the <sup>1</sup> in "fret<sup>1</sup>".
# Returns nil when the element is missing; otherwise the text content
# with CR/LF removed and surrounding whitespace stripped.
def clean_heading(div)
  return nil unless div
  # drop every <sup> node; repeat until none remain
  until (sups = div.xpath('.//sup')).empty?
    sups.each(&:remove)
  end
  text = div.content
  text.gsub(/\n/, '').gsub(/\r/, '').strip
end
# for pretty printing
# doc = Nokogiri::XML(html,&:noblanks)
# puts doc
# Parse one dictionary entry (a single <hr/>-separated HTML fragment) and
# merge the result into `words`, a hash of word => array whose elements are
# either full entry hashes or headword strings acting as links.
#
# Assumed fragment layout (inferred from the xpath calls below — confirm
# against the actual epub markup): body > div[0] headword line,
# div[1] description blockquotes, div[2] origin.
def process(html, words)
  doc = Nokogiri::HTML(html,&:noblanks)
  doc = doc.xpath("/html/body")
  word_div = doc.xpath('./div')[0]
  description_div = doc.xpath('./div')[1]
  origin_div = doc.xpath('./div')[2]
  ## get word
  word_spans = word_div.xpath('./span')
  word = clean_heading(word_spans[0])
  pronunciation = clean_html(word_spans[1])
  notes = clean_html(word_spans[2])
  word_variant = nil
  # e.g. front line /ˈˌfrənt ˈˌlīn / frontline
  # maybe there are cases when its not word variant
  if notes && !notes.include?('‹')
    word_variant = notes
    notes = nil
    # register the variant spelling as a link back to the headword
    words[word_variant] ||= []
    words[word_variant].push(word)
  end
  unless description_div
    puts "No description for #{word}, #{html}"
    return
  end
  ## get description
  description_parts = description_div.xpath('./blockquote')
  # entries with several parts of speech label them "I.", "II.", ...
  # (the blockquote count could also be used to detect the numbering)
  with_numbers = description_parts[0].xpath("./span")[0]&.content == "I."
  word_description = {}
  # handle each part of speech
  description_parts.each do |description_part|
    block_spans = description_part.xpath("./span")
    if block_spans.count > 3
      raise "Too many block spans #{block_spans.count}, #{word}, #{block_spans.to_html}"
    end
    # with numbering, span[0] is the roman numeral, so shift by one
    part_of_speech = with_numbers ? block_spans[1] : block_spans[0]
    part_of_speech_notes = clean_html(with_numbers ? block_spans[2] : block_spans[1])
    description_part_div = description_part.xpath("./div")[0]
    description = clean_html(description_part_div)
    # !description - e.g. Friesland
    # part_of_speech.length > 30 - in Rome there is description - but it's bugged structure, and there is alien div
    if !description || (part_of_speech && part_of_speech.content.length > 30)
      description = clean_html(part_of_speech) + description.to_s
      part_of_speech = 'definition'
    end
    part_of_speech = part_of_speech.is_a?(String) ? part_of_speech : clean_heading(part_of_speech)
    if part_of_speech == 'derivatives'
      # TODO: pronunciation and part of speech are available nearby as well,
      # but storing them would require changing the link structure in the
      # resulting hash
      derivatives = description_part_div.xpath(".//div//b")
      # raise "#{description_part_div.to_html}\n#{derivatives.inspect}"
      # derivatives are added as links pointing back to the source word
      derivatives.each do |derivative|
        words[clean_heading(derivative)] ||= []
        words[clean_heading(derivative)].push(word)
      end
      next
    end
    if part_of_speech == 'symbol'
      # dont handle symbols
      next
    end
    part_of_speech_hash = { description: description }
    part_of_speech_hash[:notes] = part_of_speech_notes if part_of_speech_notes
    # one part of speech may carry several descriptions ("no obj" etc.)
    word_description[part_of_speech] ||= []
    word_description[part_of_speech] << part_of_speech_hash
  end
  ### get origin
  origin = nil
  if origin_div
    # remove '- origin'
    origin_div.xpath('./span').each do |span|
      if span.content.include?('– origin')
        span.remove
      end
    end
    origin = clean_html(origin_div)
  end
  word_hash = {
    word: word,
    description: word_description,
  }
  # optional fields are only added when present
  word_hash[:pronunciation] = pronunciation if pronunciation
  word_hash[:word_note] = notes if notes
  word_hash[:origin] = origin if origin
  words[word] ||= []
  words[word].push(word_hash)
end
# Driver: for each epub text file text00001.html .. text00019.html, split
# the file on <hr/> into per-entry fragments, parse them with `process`,
# and dump the accumulated dictionary to wordsN.js as a window-scoped
# JS object usable from a browser.
(1..19).each do |file_index|
  puts "Processing file #{file_index}"
  # File.read instead of shelling out to `cat` — portable and faster
  str = File.read("text#{format("%05d", file_index)}.html")
  arr = str.split("<hr/>")
  # strip the xhtml document wrapper from the first fragment
  arr[0].sub!("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\">\r\n<head><title></title>\r\n</head>\r\n<body>\r\n", '')
  # drop the trailing closing-tags fragment
  arr.pop
  words = {}
  # { word => [] | 'word_as_link' }
  arr.each.with_index do |html, word_index|
    if word_index % 500 == 0
      puts "Processing #{word_index} of #{arr.count} words"
    end
    begin
      process(html, words)
    rescue => e
      puts "Error on #{word_index} word, #{html}"
      raise e
    end
  end
  File.open("words#{file_index}.js", 'w') do |file|
    file.write("window['wordsDict#{file_index}'] = #{words.to_json}")
  end
  ## This variant is heavier in size, but the editor doesn't lag with it
  # File.open("words#{file_index}.js", 'w') { |file| file.write("window['wordsDict#{file_index}'] = #{JSON.pretty_generate(words)}") }
  puts "File written words#{file_index}.js"
end
puts "Finish"