forked from GDXN/MangaCMS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharchCleaner.py
301 lines (215 loc) · 8.55 KB
/
archCleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
import os
import os.path
import hashlib
import settings
import logging
import magic
import UniversalArchiveInterface
import rarfile
import traceback
try:
import pyximport
pyximport.install()
import czipfile.czipfile as zipfile
except ImportError:
print("Unzipping performance can be increased MASSIVELY by")
print("installing cython, which will result in the use of a ")
print("cythonized unzipping package, rather then the (default)")
print("pure-python zip decyption. ")
print("")
print("The speedup achieved via cython can reach ~100x faster then ")
print("the pure-python implementation!")
traceback.print_exc()
print("Falling back to the pure-python implementation due to the lack of cython.")
import zipfile
class NotAnArchive(Exception):
pass
class DamagedArchive(Exception):
pass
# Utility class for common archive cleaning and manipulation tasks.
# `cleanZip()` scans the contents of a archive for any of a number of "bad" files (loaded from settings.badImageDir).
# If it finds any of the bad files in the archive, it re-creates the archive with with the bad file deleted.
# MD5 is used for haching, because cryptographic security is not important here.
# If `cleanZip` is passed a rar file, and it finds files it wants to delete in said rar, it will
# convert the rar to a zip
# `unprotectZip()` removes password-based encryption from a zip file
# `processNewArchive()` takes an archive, and passes it through `cleanZip()` and `unprotectZip()`, and then
# checks it to see if it's duplicated by other files aready in the database.
# It should eventually add it to the database if it's new, or delete it if it's redundant.
# (WIP)
import processDownload
class ArchCleaner(object):
loggerPath = "Main.ZipClean"
def __init__(self):
self.proc = [processDownload.MangaProcessor(), processDownload.HentaiProcessor()]
self.log = logging.getLogger(self.loggerPath)
badIms = os.listdir(settings.badImageDir)
self.badHashes = []
for im in badIms:
with open(os.path.join(settings.badImageDir, im), "rb") as fp:
md5 = hashlib.md5()
md5.update(fp.read())
self.badHashes.append(md5.hexdigest())
self.log.info("Bad Image = '%s', Hash = '%s'", im, md5.hexdigest())
# So starkana, in an impressive feat of douchecopterness, inserts an annoying self-promotion image
# in EVERY manga archive the serve. Furthermore, they insert it in the MIDDLE of the manga.
# Therefore, this function edits the zip and removes this stupid annoying file.
def cleanZip(self, archPath):
origPath = archPath
if not os.path.exists(archPath):
raise ValueError("Trying to clean non-existant file?")
fType = magic.from_file(archPath, mime=True)
if not fType == 'application/zip' and \
not fType == 'application/x-rar' and \
not fType == 'application/x-7z-compressed':
raise NotAnArchive("Trying to clean a file that is not a zip/rar/7z archive! File=%s" % archPath)
file_count = 0
self.log.info("Scanning arch '%s'", archPath)
try:
old_zfp = UniversalArchiveInterface.ArchiveReader(archPath)
files = []
hadBadFile = False
if fType == 'application/x-7z-compressed':
# Cause fuck 7z files. They're slowwwww
hadBadFile = True
if fType == 'application/x-rar':
# Fukkit, convert ALL THE FILES
hadBadFile = True
for fileN, fileCtnt in old_zfp:
file_count += 1
if fileN.endswith("Thumbs.db"):
hadBadFile = True
self.log.info("Had windows 'Thumbs.db' file. Removing")
continue
if "/__MACOSX/" in fileN or fileN.startswith("__MACOSX/"):
hadBadFile = True
self.log.info("Have apple bullshit files. Removing")
continue
if ".DS_Store" in fileN:
hadBadFile = True
self.log.info("Have apple bullshit '.DS_Store' files. Removing")
continue
fctnt = fileCtnt.read()
md5 = hashlib.md5()
md5.update(fctnt)
# Replace bad image with a text-file with the same name, and an explanation in it.
if md5.hexdigest() in self.badHashes:
self.log.info("File %s was the advert. Removing!", fileN)
fileN = fileN + ".deleted.txt"
fctnt = "This was an advertisement. It has been automatically removed.\n"
fctnt += "Don't worry, there are no missing files, despite the gap in the numbering."
hadBadFile = True
files.append((fileN, fctnt))
old_zfp.close()
# only replace the file if we need to
if hadBadFile:
# Now, recreate the zip file without the ad
if not archPath.endswith(".zip"):
archPath = os.path.splitext(archPath)[0]
archPath += ".zip"
self.log.info("Had advert. Rebuilding zip as '%s'.", archPath)
new_zfp = zipfile.ZipFile(archPath, "w")
for fileInfo, contents in files:
new_zfp.writestr(fileInfo, contents)
new_zfp.close()
if origPath != archPath:
os.remove(origPath)
for proc in self.proc:
proc.updatePath(origPath, archPath)
else:
self.log.info("No offending contents. No changes made to file.")
except UniversalArchiveInterface.ArchiveError:
self.log.error("Bad archive file!")
for line in traceback.format_exc().split("\n"):
self.log.error(line)
raise DamagedArchive()
return archPath, file_count
# Rebuild zipfile `zipPath` that has a password as a non-password protected zip
# Pre-emptively checks if the zip is really password-protected, and does not
# rebuild zips that are not password protected.
def unprotectZip(self, zipPath, password):
password = password.encode("ascii")
try:
old_zfp = zipfile.ZipFile(zipPath, "r")
files = old_zfp.infolist()
for fileN in files:
old_zfp.open(fileN).read()
self.log.info("Do not need to decrypt zip")
return
except RuntimeError:
self.log.error("Error in archive checker?")
for line in traceback.format_exc().split("\n"):
self.log.error(line)
return
except zipfile.BadZipFile:
self.log.error("Archive is corrupt/damaged?")
for line in traceback.format_exc().split("\n"):
self.log.error(line)
return
except:
self.log.error("Unknown error??")
for line in traceback.format_exc().split("\n"):
self.log.error(line)
return
self.log.info("Removing password from zip '%s'", zipPath)
old_zfp = zipfile.ZipFile(zipPath, "r")
old_zfp.setpassword(password)
fileNs = old_zfp.namelist()
files = []
# The zip decryption is STUPID slow. It's really insane how shitty
# the python integrated library is.
# See czipfile.pyd for some work on making it faster.
for fileInfo in fileNs:
fctnt = old_zfp.open(fileInfo).read()
files.append((fileInfo, fctnt))
old_zfp.close()
os.remove(zipPath)
# only replace the file if we need to
# Now, recreate the zip file without the ad
self.log.info("Rebuilding zip without password.")
new_zfp = zipfile.ZipFile(zipPath, "w")
for fileInfo, contents in files:
new_zfp.writestr(fileInfo, contents)
new_zfp.close()
# Process a newly downloaded archive. If deleteDups is true, and the archive is duplicated, it is deleted.
# If includePHash is true as well, the duplicate search is done using phashes of the images, in addition
# to just raw file-hashing.
def processNewArchive(self, archPath, passwd=""):
if magic.from_file(archPath, mime=True) == 'application/zip':
self.unprotectZip(archPath, passwd)
elif magic.from_file(archPath, mime=True) == 'application/x-rar':
pass
else:
self.log.error("ArchCleaner called on file that isn't a rar or zip!")
self.log.error("Called on file %s", archPath)
self.log.error("Specified password '%s'", passwd)
self.log.error("Inferred file type %s", magic.from_file(archPath, mime=True))
raise NotAnArchive("ArchCleaner called on file that isn't a rar or zip!")
# ArchPath will convert from rar to zip if needed, and returns the name of the resulting
# file in either case
try:
archPath, filecount = self.cleanZip(archPath)
if filecount <= 2:
return "fewfiles", archPath
return "", archPath
except (zipfile.BadZipFile, rarfile.BadRarFile, DamagedArchive, NotAnArchive):
self.log.error("Ignoring archive because it appears damaged.")
return "damaged", archPath
except:
self.log.error("Unknown error??")
for line in traceback.format_exc().split("\n"):
self.log.error(line)
return "damaged", archPath
if __name__ == "__main__":
import logSetup
logSetup.initLogging()
run = ArchCleaner()
basePath = '/media/Storage/Manga/'
for root, dirs, files in os.walk(basePath):
for name in files:
fileP = os.path.join(root, name)
if not os.path.exists(fileP):
raise ValueError
fType = magic.from_file(fileP, mime=True)
if fType == 'application/zip' or fType == 'application/x-rar':
run.processNewArchive(fileP)