This repository has been archived by the owner on Aug 27, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
14 changed files
with
388 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -162,3 +162,4 @@ cython_debug/ | |
# SQLite database | ||
*.db | ||
*.bin | ||
output/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
"""Book downloader""" | ||
import os | ||
from shutil import rmtree | ||
from requests import Session | ||
from .fetcher import Fetcher | ||
from .util import Mode | ||
|
||
|
||
class Downloader(Fetcher):
    """Fetch a list of image URLs into numbered ``.jpg`` files under ``path``."""

    def __init__(self, path: str = "output/tmp/") -> None:
        """Set the output directory and the fetcher settings used for downloads.

        Args:
            path: Directory that receives the downloaded images.
        """
        super().__init__()
        self.path = path
        self.ignored_errors = []
        self.urls = []
        # Settings consumed by the Fetcher base class (defined elsewhere).
        self.num_threads = 25
        self.workload.job_size = 10

    def clean_up(self) -> None:
        """Remove the output directory if the run was terminated, then defer
        to the base-class clean-up."""
        if self.terminated:
            self.rmdir()
        return super().clean_up()

    def job_worker(self, job: tuple[int, int], thread_id: int) -> None:
        """Download the images for URL indices ``job[0]`` up to ``job[1]``.

        Args:
            job: Half-open index range ``(start, stop)`` into ``self.urls``.
            thread_id: Slot in ``self.progress`` updated with the current index.
        """
        with Session() as session:
            for index in range(job[0], job[1]):
                if self.terminated:
                    return
                self.progress[thread_id] = index
                response = self.get(self.urls[index], session)

                # Files are numbered from 1 and zero-padded to four digits.
                target = f"{self.path}/{index + 1:04d}.jpg"
                with open(target, "wb") as image_file:
                    image_file.writelines(response.iter_content(chunk_size=8192))

    def rmdir(self):
        """Delete the output directory tree; a missing directory is ignored."""
        try:
            rmtree(self.path)
        except FileNotFoundError:
            # Nothing on disk to remove.
            return

    def mkdir(self):
        """Create the output directory (and parents) if it does not exist."""
        os.makedirs(self.path, exist_ok=True)

    def download(self, urls: list):
        """Recreate an empty output directory and fetch every URL in ``urls``.

        Args:
            urls: Image URLs; the i-th URL is saved as file number ``i + 1``.
        """
        self.rmdir()
        self.mkdir()
        self.urls = urls
        self.start(Mode.FIXED, len(urls))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
"""The main module""" | ||
from . import DB, RawPages, PageCrawler, BookCrawler, Downloader | ||
from .util import Converter, gen_pdf | ||
|
||
|
||
class Main:
    """Tie the database, crawlers, downloader and converter together.

    The public entry point is :meth:`download`, which looks a book up,
    downloads its page images, converts them and generates a PDF.
    """

    def __init__(
        self,
        database: str = "data/pubu.db",
        raw_pages: str = "data/",
        output: str = "output/",
        verbose: bool = False,
        change_decode: bool = False,
    ) -> None:
        """Create the sub-modules.

        Args:
            database: Path passed to ``DB``.
            raw_pages: Path passed to ``RawPages``.
            output: Output directory; a trailing slash is optional.
            verbose: When true, progress messages are printed.
            change_decode: Forwarded unchanged to ``Converter``.
        """
        self.database = DB(database)
        self.raw_pages = RawPages(raw_pages)
        self.output = output
        self.verbose = verbose
        # Computed once so the downloader, converter and PDF generator all
        # agree on the temp directory, with or without a trailing slash
        # on ``output``.
        self.tmp_dir = self._tmp_dir(output)

        # sub-modules
        self.page_crawler = PageCrawler(self.database, self.raw_pages)
        self.book_crawler = BookCrawler(self.database)
        self.downloader = Downloader(self.tmp_dir)
        self.converter = Converter(self.tmp_dir, change_decode)

    @staticmethod
    def _tmp_dir(output: str) -> str:
        """Return ``<output>/tmp/`` (with trailing separator) for any ``output``."""
        import os  # local import: this module has no top-level ``os`` import

        # Joining with a final empty component appends the trailing separator.
        return os.path.join(output, "tmp", "")

    def download(self, book_id: int) -> None:
        """Download a book and write it out as a PDF.

        Args:
            book_id: Identifier used for the database lookup and, when the
                lookup fails or the record is flagged as erroneous, for the
                online fetch.

        Raises:
            NotImplementedError: If the database holds fewer pages than the
                book record declares (search mode is not implemented).
        """
        if self.verbose:
            print(f"[*] Downloading book with book_id {book_id}")
        book = self.database.search_book(book_id)
        if book is None or book.error > 0:
            if self.verbose:
                cause = "not found in database" if book is None else "invalid"
                print(f"[!] Book is {cause}. Fetching online information...")

            # Run the crawler for the single-id range [book_id, book_id + 1).
            book = self.book_crawler.job_worker([book_id, book_id + 1], 0)[0]
            if book.error > 0 or book.doc_id == 0 or book.pages == 0:
                # Printed regardless of ``verbose``: the download is abandoned.
                print(f"[!] Online info is invalid - book info: {book.to_tuple()}")
                return

        if self.verbose:
            print(f"[*] Found book: {book.to_tuple()}")
            print("[*] Getting pages...")

        pages = self.database.get_pages(book.doc_id)
        if len(pages) < book.pages:
            if self.verbose:
                lack = book.pages - len(pages)
                print(f"[!] Missing {lack} pages, continue in search mode")
            raise NotImplementedError("Search mode is not implemented")
        if len(pages) > book.pages:
            if self.verbose:
                print(f"[!] Extra {len(pages) - book.pages} pages in local files.")
            # Keep only as many pages as the book record declares.
            pages = pages[: book.pages]

        # download pages
        if self.verbose:
            print(f"[*] Downloading {len(pages)} pages...")
        self.downloader.download([page.to_url(self.database) for page in pages])

        # convert images
        if self.verbose:
            print("[*] Converting images...")
        self.converter.convert(book.doc_id)

        # generate PDF
        if self.verbose:
            print("[*] Generating PDF...")
        # Read from the same temp dir the downloader and converter wrote to.
        gen_pdf(self.tmp_dir, self.output, book.title)
        self.downloader.rmdir()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
"""Utilities""" | ||
from .counter import Counter | ||
from .workload import Workload, Mode | ||
from .converter import Converter | ||
from .pdf import gen_pdf |
Oops, something went wrong.