-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransPDF.py
45 lines (40 loc) · 1.12 KB
/
transPDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import io
import os
import ssl
import urllib.request as urllib

import certifi
import fitz

from translate import translate
def clear_txt(txt):
    """Collapse multi-line text into a single line.

    A hyphen immediately followed by a newline is removed entirely, which
    re-joins a word that was split across two lines; every remaining
    newline is replaced by a single space.
    """
    # Drop "-\n" first so hyphenated line breaks do not turn into "- ".
    dehyphenated = "".join(txt.split("-\n"))
    # Whatever newlines are left separate ordinary words.
    return " ".join(dehyphenated.split("\n"))
def pdf_to_cn(pdf_file_path):
    """Dump the XHTML text extraction of every page of a PDF to result.html.

    Each page of the document at ``pdf_file_path`` is extracted in PyMuPDF's
    "xhtml" format and appended, line by line, to ``result.html`` in the
    current working directory (the file is overwritten on each call).

    NOTE: no translation is performed here. An earlier variant, since
    disabled, extracted the page's text blocks, flattened each with
    clear_txt() and wrote it out followed by the result of translate().

    Args:
        pdf_file_path: Path of the PDF file to read.
    """
    # Open the PDF before touching the output file so that an unreadable
    # PDF does not truncate a previously generated result.html.
    doc = fitz.open(pdf_file_path)
    try:
        # Explicit encoding keeps the output independent of the locale default.
        with open('result.html', 'w', encoding='utf-8') as result:
            for page in doc:
                # PyMuPDF renamed getText() to get_text(); prefer the new
                # name and fall back to the legacy one for old installs.
                extract = getattr(page, "get_text", None) or page.getText
                result.writelines(
                    line + "\n" for line in extract("xhtml").splitlines()
                )
    finally:
        doc.close()
def main(url="pdf_url.pdf"):
    """Download a PDF to ``tmp.pdf`` and convert it with pdf_to_cn().

    Args:
        url: Location of the PDF to fetch. The default is the placeholder
            value this script has always hard-coded; supply a real URL to
            actually download something.
    """
    pdf_file = "tmp.pdf"

    print("Downloading...")
    # urlopen()'s ``cafile`` argument was deprecated in Python 3.6 and
    # removed in 3.13. Building an SSLContext from certifi's CA bundle and
    # passing it as ``context`` is the supported equivalent.
    tls_context = ssl.create_default_context(cafile=certifi.where())
    with urllib.urlopen(url, context=tls_context) as response:
        data = response.read()
    with open(pdf_file, "wb") as out:
        out.write(data)
    print("Downloaded.")

    # The message below reads "Processing: ".
    print('正在处理: ', pdf_file)
    pdf_to_cn(pdf_file)
# Run the download-and-convert pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()