-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathepub.py
205 lines (181 loc) · 7.88 KB
/
epub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import os, glob, re
import shutil, zipfile
from typing import List
from utils import EpubBase
class Epub(EpubBase):
def __init__(self, path: str, resume=False):
super(Epub, self).__init__(path)
self.catalog = {} # {id:title}
if resume:
self.resume()
def resume(self) -> None:
"""
resume from an existed dir, obtain chapters' id and title,
fill self.catalog
"""
l = glob.glob(os.path.join(self.path, 'chapter_*.xhtml'))
l = sorted(l, key=lambda x: int(x.split('_')[-1].split('.')[0]))
for f in l:
cid = int(f.split('_')[-1].split('.')[0])
with open(f, 'r') as _f:
title = re.findall('<title>(.*?)</title>', _f.read())[0]
self.catalog[cid] = title
def create_chapter(self,
cid: int,
title: str,
text: str or List[str],
html: bool = True,
full: bool = False) -> None:
''' create a chapter from text.
The text can be \n
(1) normal content text, i.e. a full chapter in string, set html=False;\n
(2) a string list, each element is a paragraph of a chapter;\n
(3) xhtml text, it can be text from a completed XHTML file (set html=True, full=True),
or text without XHTML head, meta, just keep body, such as
"<p>para1</p><p>para2<\p>" (html=True, full=False, default settings)
Args:
cid: chapter id in opf file, ncx file
title: chapter name
text: content
html: if True, text in html format, default is True
full: if True, text can be created a full XHTML file
'''
self.catalog[cid] = title
chapter = EpubChapter(title)
if isinstance(text, list) or not html:
content = chapter.from_text(text)
else:
content = chapter.from_html_text(text, full)
with open(os.path.join(self.path, f'chapter_{cid}.xhtml'),
'w',
encoding='utf-8') as f:
f.write(content)
def chapter_from_file(self, cid: str, title: str, filename: str) -> None:
''' create a chapter from a XHTML file
Args:
cid: chapter id in opf file, ncx file
title: chapter name
filename: XHTML file path
'''
if not os.path.exists(filename):
print(f'File: {filename} not exists!')
exit(1)
self.catalog[cid] = title
shutil.copy(filename, os.path.join(self.path, f'chapter_{cid}.xhtml'))
def create(self, clean: bool = True) -> None:
''' create a epub file
Args:
clean: if True, delete temp file (unzip epub file can obtain it), default True
'''
self.write_META_INF()
self.write_opf()
self.write_page()
self.write_catalog()
self.write_mimetype()
self.write_stylesheet()
self.write_toc()
self.compression(clean)
def compression(self, clean: bool = True) -> None:
file_list = os.listdir(self.path)
z = zipfile.ZipFile(self.path + '.epub', 'w', zipfile.ZIP_DEFLATED)
z.write(os.path.join(self.path, 'mimetype'),
'mimetype',
compress_type=zipfile.ZIP_STORED)
file_list.remove('mimetype')
for f in file_list:
if not os.path.isdir(os.path.join(self.path, f)):
z.write(os.path.join(self.path, f), f)
else:
fs = os.listdir(os.path.join(self.path, f))
z.write(os.path.join(self.path, f, fs[0]),
os.path.join(f, fs[0]))
if clean:
shutil.rmtree(self.path)
def write_opf(self) -> None:
''' write content.opf file'''
self.write_opf_head()
cont1 = ' <item href=\"catalog.xhtml\" id=\"catalog\" media-type=\"application/xhtml+xml\"/>\n' + \
' <item href=\"stylesheet.css\" id=\"css\" media-type=\"text/css\"/>\n' + \
' <item href=\"page.xhtml\" id=\"page\" media-type=\"application/xhtml+xml\"/>\n' + \
' <item href=\"toc.ncx\" media-type=\"application/x-dtbncx+xml\" id=\"ncx\"/>\n' + \
'</manifest>\n' + \
'\n' + \
'<spine toc=\"ncx\">\n' + \
' <itemref idref=\"page\"/>\n' + \
' <itemref idref=\"catalog\"/>\n'
with open(os.path.join(self.path, 'content.opf'),
'a',
encoding='utf-8') as f:
for cid in self.catalog.keys():
f.write(
f' <item href=\"chapter_{cid}.xhtml\" id=\"{cid}\" media-type=\"application/xhtml+xml\"/>\n'
)
if self.cover_img_path is not None:
f.write(
f' <item id="cover-image" href="cover.{self.suffix}" media-type="image/{self.media_type}"/>\n'
)
f.write(cont1)
for cid in self.catalog.keys():
f.write(f' <itemref idref=\"{cid}\"/>\n')
f.write('</spine>\n')
f.write('<guide>\n')
f.write(
' <reference href=\"catalog.xhtml\" type=\"toc\" title=\"目录\"/>\n'
)
f.write('</guide>\n')
f.write('</package>\n')
def write_catalog(self) -> None:
'''write catalog.xhtml file'''
self.write_catalog_head()
with open(os.path.join(self.path, 'catalog.xhtml'),
'a',
encoding='utf-8') as f:
for cid, title in self.catalog.items():
f.write(
f' <li class=\"catalog\"><a href=\"chapter_{cid}.xhtml\">{title}</a></li>\n'
)
f.write(
' </ul>\n <div class=\"mbppagebreak\"></div>\n</body>\n</html>'
)
def write_toc(self) -> None:
'''write toc.ncx file'''
self.write_toc_head()
with open(os.path.join(self.path, 'toc.ncx'), 'a',
encoding='utf-8') as f:
idx = 1
for cid, title in self.catalog.items():
f.write(
f'<navPoint id=\"{cid}\" playOrder=\"{idx}\"><navLabel><text>{title}</text></navLabel><content src=\"chapter_{cid}.xhtml\"/></navPoint>\n'
)
idx += 1
f.write('</navMap>\n')
f.write('</ncx>')
class EpubChapter():
def __init__(self, title=''):
self.chapter_title = title
self.chapter = ''
self.head = '<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"no\"?>\n' + \
'<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n' + \
'<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"zh-CN\">\n' + \
'<head>\n' + \
f' <title>{title}</title>\n' + \
' <link href=\"stylesheet.css\" type=\"text/css\" rel=\"stylesheet\"/>\n' + \
' <style type=\"text/css\">\n' + \
' @page { margin-bottom: 5.000000pt; margin-top: 5.000000pt; }\n' + \
' </style>\n' + \
'</head>\n' + \
'<body>\n' + \
f' <h2><span style=\"border-bottom:1px solid\">{title}</span></h2>\n'
self.tail = ' <div class=\"mbppagebreak\"></div>\n</body>\n</html>'
def from_html_text(self, html_text: str, full: bool = False) -> str:
if full:
self.chapter = html_text
else:
self.chapter = self.head + html_text + self.tail
return self.chapter
def from_text(self, text: str or List[str]) -> str:
if not isinstance(text, List):
text = text.split('\n')
html_text = ' <p>' + '</p>\n <p>'.join(text) + '</p>\n'
self.chapter = self.head + html_text + self.tail
return self.chapter