auto-translater.py
# -*- coding: utf-8 -*-
import os
import sys
import re
import yaml  # type: ignore # pip install PyYAML
import ollama  # type: ignore # pip install ollama

# Maximum chunk length; longer input is split so it stays within the model's input limit
max_length = 1800

# Directory to translate
dir_to_translate = "docs"
excluded_dirs = ["en"]  # Replace with the directory names you want to exclude
translates = ['en']  # Target languages; the current model is too weak to translate es and ar properly

# Files that should not be translated
exclude_list = ["Contact-and-Subscribe.md", "WeChat.md"]
# List of already-processed Markdown files; generated automatically
processed_list = "processed_list.txt"
# Marker that forces re-translation even if the file is already in the processed list
marker_force_translate = "\n[translate]\n"

# Front Matter translation rules
front_matter_translation_rules = {
    "title": lambda value, lang: translate_text(value, lang),
    "description": lambda value, lang: translate_text(value, lang),
    "text": lambda value, lang: translate_text(value, lang),
    "tagline": lambda value, lang: translate_text(value, lang),
    "details": lambda value, lang: translate_text(value, lang),
    # "actions": lambda value, lang: translate_text(value, lang),
    # "features": lambda value, lang: translate_text(value, lang),
    # Use fixed replacement rules instead of the model
    "categories": lambda value, lang: front_matter_replace(value, lang),
    "tags": lambda value, lang: front_matter_replace(value, lang),
    # Fields not listed here are left untranslated by default
}
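
# Illustrative example (hypothetical Front Matter, not taken from the repository): for a post with
#
#   ---
#   title: 文章标题
#   description: 文章描述
#   tags:
#     - 博客
#   date: 2024-01-01
#   ---
#
# the rules above would machine-translate "title" and "description", run "tags" through
# front_matter_replace, and leave the unlisted "date" field unchanged.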

# Fixed replacement rules. Some recurring strings do not need to be re-translated for every post
# (and the model's output for them may be inconsistent), so they are replaced directly.
replace_rules = [
    {
        # Manually translated copyright notice
        "orginal_text": "> 原文地址:<https://wiki-power.com/>",
        "replaced_text": {
            "en": "> Original: <https://wiki-power.com/>",
            "es": "> Dirección original del artículo: <https://wiki-power.com/>",
            "ar": "> عنوان النص: <https://wiki-power.com/>",
        }
    },
    {
        # Manually translated license notice
        "orginal_text": "> 本篇文章受 [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by/4.0/deed.zh) 协议保护,转载请注明出处。",
        "replaced_text": {
            "en": "> This post is protected by [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by/4.0/deed.en) agreement, should be reproduced with attribution.",
            "es": "> Este artículo está protegido por la licencia [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by/4.0/deed.zh). Si desea reproducirlo, por favor indique la fuente.",
            "ar": "> يتم حماية هذا المقال بموجب اتفاقية [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by/4.0/deed.zh)، يُرجى ذكر المصدر عند إعادة النشر.",
        }
    },
    {
        # Internal links in a post are redirected to the page in the same language
        "orginal_text": "](https://wiki-power.com/",
        "replaced_text": {
            "en": "](https://wiki-power.com/en/",
            "es": "](https://wiki-power.com/es/",
            "ar": "](https://wiki-power.com/ar/",
        }
    }
    # {
    #     # Different languages can use different image hosts
    #     "orginal_text": ":
    # },
]

# Fixed replacement rules for Front Matter list fields such as categories and tags.
# Add entries here in the same {"orginal_text": ..., "replaced_text": {lang: ...}} format
# used by replace_rules above.
front_matter_replace_rules = []


# Replace fixed Front Matter list values (e.g. categories, tags) using front_matter_replace_rules
def front_matter_replace(value, lang):
    for index in range(len(value)):
        element = value[index]
        # print(f"element[{index}] = {element}")
        for replacement in front_matter_replace_rules:
            if replacement["orginal_text"] in element:
                # Replace each match with str.replace
                element = element.replace(
                    replacement["orginal_text"], replacement["replaced_text"][lang])
        value[index] = element
        # print(f"element[{index}] = {element}")
    return value

# Translation function: sends the text to a local Ollama model (qwen2)
def translate_text(text, lang):
    target_lang = {
        "en": "English",
        "es": "Spanish",
        "ar": "Arabic"
    }[lang]
    completion = ollama.chat(model="qwen2",
                             messages=[
                                 {"role": "system", "content": "You are a translation expert proficient in various languages, focusing solely on translating text without interpretation. You accurately understand the meanings of proper nouns, idioms, metaphors, allusions, and other obscure words in sentences, translating them appropriately based on the context and language environment. The translation should be natural and fluent. Only return the translated text, without including redundant quotes or additional notes."},
                                 {"role": "user", "content": f"Translate the following Simplified-Chinese text into {target_lang} text: {text}"},
                             ])
    # Extract the translated text from the response
    output_text = completion['message']['content']
    return output_text
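
# Illustrative usage (assumes a local Ollama server is running and the qwen2 model has been pulled):
#
#   translate_text("这是一个测试。", "en")   # -> something like "This is a test."
#
# The exact wording depends on the model; only the call shape is defined by the code above.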

# Translate Front Matter data according to front_matter_translation_rules
def translate_front_matter(front_matter, lang):
    def recursive_translate(value, lang):
        if isinstance(value, dict):
            translated_dict = {}
            for k, v in value.items():
                if k in front_matter_translation_rules:
                    processed_value = front_matter_translation_rules[k](v, lang)
                else:
                    processed_value = recursive_translate(v, lang)
                translated_dict[k] = processed_value
            return translated_dict
        elif isinstance(value, list):  # Handle lists
            translated_list = []
            for item in value:
                translated_list.append(recursive_translate(item, lang))
            return translated_list
        else:
            return value
    return recursive_translate(front_matter, lang)
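
# Note on the recursion above: rules apply wherever a matching key appears, at any depth. For
# example, a nested structure such as {"features": [{"details": "一段说明"}]} has its inner
# "details" value translated even though "features" itself is not listed as a rule, because
# recursive_translate descends into nested dicts and lists. (The example data is hypothetical.)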

# Split an article into paragraph-based chunks no longer than max_length
def split_text(text, max_length):
    # Split the article by paragraph
    paragraphs = text.split("\n\n")
    output_paragraphs = []
    current_paragraph = ""
    for paragraph in paragraphs:
        if len(current_paragraph) + len(paragraph) + 2 <= max_length:
            # If adding the new paragraph stays within max_length, merge it into the current chunk
            if current_paragraph:
                current_paragraph += "\n\n"
            current_paragraph += paragraph
        else:
            # Otherwise flush the current chunk to the output list and start a new one
            output_paragraphs.append(current_paragraph)
            current_paragraph = paragraph
    # Append the last chunk to the output list
    if current_paragraph:
        output_paragraphs.append(current_paragraph)
    # Join the chunks back into a single string
    output_text = "\n\n".join(output_paragraphs)
    return output_text
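
# Note: split_text is a standalone helper; translate_file below does not call it and instead
# repeats the same paragraph-grouping logic inline, translating each chunk as it is flushed.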

# Translate a single Markdown file into the given language
def translate_file(input_file, lang):
    print(f"Translating into {lang}: {input_file}")
    sys.stdout.flush()
    # Build the output path
    output_file = input_file.replace("docs", f"docs/{lang}")
    output_dir = os.path.dirname(output_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Read the input file
    with open(input_file, "r", encoding="utf-8") as f:
        input_text = f.read()
    # Dictionary mapping placeholders to their final replacement text
    placeholder_dict = {}
    # Apply the replace rules: swap each matched string for a placeholder
    for i, rule in enumerate(replace_rules):
        find_text = rule["orginal_text"]
        replace_with = rule["replaced_text"][lang]
        placeholder = f"[to_be_replace[{i + 1}]]"
        input_text = input_text.replace(find_text, placeholder)
        placeholder_dict[placeholder] = replace_with
    # Strip the force-translate marker from the input
    input_text = input_text.replace(marker_force_translate, "")
    # Match the Front Matter with a regular expression
    front_matter_match = re.search(r'---\n(.*?)\n---', input_text, re.DOTALL)
    if front_matter_match:
        front_matter_text = front_matter_match.group(1)
        # Parse the YAML with PyYAML
        front_matter_data = yaml.safe_load(front_matter_text)
        # Translate the Front Matter according to the rules above
        front_matter_data = translate_front_matter(front_matter_data, lang)
        # Dump the processed data back to YAML
        front_matter_text_processed = yaml.dump(
            front_matter_data, allow_unicode=True, default_style=None, sort_keys=False)
        # Temporarily remove the original Front Matter from the body
        input_text = input_text.replace(
            "---\n" + front_matter_text + "\n---\n", "")
    else:
        # print("No Front Matter found; nothing to process.")
        pass
    # print(input_text)  # debug: inspect the input
    # Split the article into chunks
    paragraphs = input_text.split("\n\n")
    input_text = ""
    output_paragraphs = []
    current_paragraph = ""
    for paragraph in paragraphs:
        if len(current_paragraph) + len(paragraph) + 2 <= max_length:
            # If adding the new paragraph stays within max_length, merge it into the current chunk
            if current_paragraph:
                current_paragraph += "\n\n"
            current_paragraph += paragraph
        else:
            # Otherwise translate the current chunk and append the result to the output list
            output_paragraphs.append(translate_text(current_paragraph, lang))
            current_paragraph = paragraph
    # Handle the last chunk
    if current_paragraph:
        if len(current_paragraph) + len(input_text) <= max_length:
            # If the last chunk still fits together with the remaining text, keep it for one final call
            input_text += "\n\n" + current_paragraph
        else:
            # Otherwise translate the chunk and append the result to the output list
            output_paragraphs.append(translate_text(current_paragraph, lang))
    # If there is still untranslated text left, translate and append it
    if input_text:
        output_paragraphs.append(translate_text(input_text, lang))
    # Join the translated chunks back into a single string
    output_text = "\n\n".join(output_paragraphs)
    if front_matter_match:
        # Prepend the translated Front Matter
        output_text = "---\n" + front_matter_text_processed + "---\n\n" + output_text
    # Finally, swap the placeholders back for their replacement text
    for placeholder, replacement in placeholder_dict.items():
        output_text = output_text.replace(placeholder, replacement)
    # Write the output file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(output_text)
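
# Illustrative flow for lang="en" (the file name is hypothetical): "docs/Example.md" is read, fixed
# strings such as the copyright notice are swapped for placeholders like "[to_be_replace[1]]" so the
# model never sees them, the Front Matter and body are translated, and the result is written to
# "docs/en/Example.md" with the placeholders swapped back at the end.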

# Decide whether a Markdown file needs translation, and translate it if so
def process_file(fullPath):
    # Read the Markdown file
    with open(fullPath, "r", encoding="utf-8") as f:
        md_content = f.read()
    # Read the processed list
    with open(processed_list, "r", encoding="utf-8") as f:
        processed_list_content = f.read()
    if marker_force_translate in md_content:  # Force-translate marker present: translate regardless
        for lang in translates:
            translate_file(fullPath, lang)
    elif os.path.basename(fullPath) in exclude_list:  # Skip: the file name is in exclude_list
        print(f"Pass the post in exclude_list: {fullPath}")
        sys.stdout.flush()
    elif fullPath in processed_list_content:  # Skip: already processed
        print(f"Pass the post in processed_list: {fullPath}")
        sys.stdout.flush()
    else:  # Translate into all target languages
        for lang in translates:
            translate_file(fullPath, lang)
    # Record the processed file so it is skipped next time
    if fullPath not in processed_list_content:
        print(f"Added into processed_list: {fullPath}")
        with open(processed_list, "a", encoding="utf-8") as f:
            f.write("\n")
            f.write(fullPath)
    # Flush stdout so progress shows up in real time in the GitHub Actions log
    sys.stdout.flush()

# Recursively process a directory tree, translating every Markdown file found
def recursive_process_dir(dir_to_translate):
    # Sort entries by file name
    file_list = [f for f in os.listdir(dir_to_translate) if f not in excluded_dirs]
    sorted_file_list = sorted(file_list)
    for filename in sorted_file_list:
        full_path = os.path.join(dir_to_translate, filename)
        if os.path.isdir(full_path):
            # Recurse into subdirectories
            recursive_process_dir(full_path)
        elif filename.endswith(".md"):
            try:
                process_file(full_path)
            except Exception as e:
                # Catch the exception and report the error
                print(f"An error has occurred while processing file {full_path}: {e}")
                sys.stdout.flush()

# Start processing the directory tree
try:
    recursive_process_dir(dir_to_translate)
    # Completion message
    print("Congratulations! All files have been processed.")
    sys.stdout.flush()
except Exception as e:
    print(f"An error has occurred: {e}")
    sys.stdout.flush()
    raise SystemExit(1)  # exit with a non-zero status to mark the run as failed
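
# How this script is typically run (assumptions, not part of the original file): it expects a local
# Ollama server with the qwen2 model available (e.g. `ollama pull qwen2`), the Python packages
# PyYAML and ollama installed, and a processed_list.txt file present next to the script. Running
# `python auto-translater.py` from the repository root then translates every Markdown file under
# docs/ into docs/en/.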