
Commit 8e4f00e

Tech: cache remote PDFs locally
davidbgk committed Dec 6, 2020
1 parent 67cfd21 commit 8e4f00e
Showing 3 changed files with 80 additions and 12 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@ cert.pem
dist/
node_modules/
src/index.html
src/pdfs/*
.cache
.nyc_output
__pycache__
73 changes: 69 additions & 4 deletions build.py
@@ -1,9 +1,12 @@
#!/usr/bin/env python3
import fnmatch
import os
from html.parser import HTMLParser
from http import HTTPStatus
from pathlib import Path
from time import perf_counter

import httpx
import mistune
from jinja2 import Environment as JinjaEnv
from jinja2 import FileSystemLoader, StrictUndefined
@@ -55,11 +58,74 @@ def build_responses(source_dir):
    return responses


class PDFLinkExtractor(HTMLParser):
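    # HTMLParser.__init__() calls reset() implicitly, so overriding reset()
    # (rather than __init__) is enough to guarantee pdf_links exists on
    # every fresh parser instance.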
    def reset(self):
        HTMLParser.reset(self)
        self.pdf_links = set()

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            attrs = dict(attrs)
            url = attrs["href"]
            if url.startswith("http") and url.endswith(".pdf"):
                self.pdf_links.update([url])


def url_to_file_name(url: str) -> str:
    file_name = (
        url.replace("http://", "")
        .replace("https://", "")
        .replace(".", "-")
        .replace("/", "-")
        # Tempting to do `.replace("-pdf", ".pdf")` here but there are some
        # use-cases where it fails if the URL contains `/pdf/` for instance.
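        # e.g. a hypothetical https://example.org/pdf/guide.pdf maps to
        # `example-org-pdf-guide-pdf`; replacing `-pdf` would wrongly give
        # `example-org.pdf-guide.pdf`.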
    )
    return file_name


def save_binary_response(file_path: Path, response: "httpx.Response"):
    with open(file_path, "wb") as download_file:
        for chunk in response.iter_bytes():
            download_file.write(chunk)


def put_pdfs_in_local_cache(content: str, timeout: int = 10) -> str:
    pdfs_file_path = HERE / "src" / "pdfs"
    if not pdfs_file_path.exists():
        pdfs_file_path.mkdir(parents=True)
    parser = PDFLinkExtractor()
    parser.feed(content)
    for pdf_link in sorted(parser.pdf_links):
        file_name = url_to_file_name(pdf_link)
        target = f"pdfs/{file_name}.pdf"
        if (SRC_DIR / target).exists():
            print(f"SKIP: {pdf_link} exists in {SRC_DIR / target}")
        else:
            print(f"FETCH: {pdf_link} to {SRC_DIR / target}")

            with httpx.stream(
                "GET",
                pdf_link,
                timeout=timeout,
                verify=False,  # ignore SSL certificate validation errors
            ) as response:
                if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
                    print("Warning: we’re being throttled, skipping link (429)")
                    continue
                if response.status_code != HTTPStatus.OK:
                    raise Exception(f"{pdf_link} is broken! ({response.status_code})")
                save_binary_response(pdfs_file_path / f"{file_name}.pdf", response)
        content = content.replace(pdf_link, target)
    return content


@cli
def index():
"""Build the index with contents from markdown dedicated folder."""
responses = build_responses(CONTENUS_DIR)
render_template("template.html", SRC_DIR / "index.html", **responses)
content = render_template("template.html", **responses)
content = put_pdfs_in_local_cache(content)
(SRC_DIR / "index.html").write_text(content)


def me_or_them(value):
@@ -70,11 +136,10 @@ def me_or_them(value):
    return value


-def render_template(src, output, **context):
+def render_template(src, **context):
    jinja_env.filters["me_or_them"] = me_or_them
    template = jinja_env.get_template(src)
-    content = template.render(**context,)
-    output.open("w").write(content)
+    return template.render(**context)


@cli
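For orientation, a minimal sketch of what the new build step does to a remote PDF link; the URL below is hypothetical, and only url_to_file_name and the `pdfs/` target convention come from the build.py diff above.

from build import url_to_file_name

pdf_link = "https://example.org/pdf/guide.pdf"  # hypothetical remote PDF
file_name = url_to_file_name(pdf_link)  # "example-org-pdf-guide-pdf"
target = f"pdfs/{file_name}.pdf"  # "pdfs/example-org-pdf-guide-pdf.pdf"
# index() streams the file into src/pdfs/ (skipped when already cached,
# skipped with a warning on HTTP 429) and replaces the remote URL with
# this relative path before writing src/index.html.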
18 changes: 10 additions & 8 deletions check.py
@@ -11,6 +11,8 @@
from build import each_folder_from, each_file_from

HERE = Path(__file__).parent
SRC_DIR = HERE / "src"
CONTENUS_DIR = HERE / "contenus"


class LinkExtractor(HTMLParser):
@@ -37,7 +39,7 @@ def handle_starttag(self, tag, attrs):
@cli
def links(timeout: int = 10, delay: float = 0.1):
    parser = LinkExtractor()
-    content = open(HERE / "src" / "index.html").read()
+    content = (SRC_DIR / "index.html").read_text()
    parser.feed(content)
    for link in sorted(parser.links):
        print(link)
@@ -61,7 +63,7 @@ def versions():
    data = json.loads(content)
    version = data["version"]
    line_prefix = "const CACHE_NAME = "
-    for line in open(HERE / "src" / "service-worker.js"):
+    for line in open(SRC_DIR / "service-worker.js"):
        if line.startswith(line_prefix):
            break
    if version not in line:
@@ -90,7 +92,7 @@ def service_worker():
    # Retrieving the list from CACHE_FILES.
    sw_filenames = set()
    start = False
-    for line in open(HERE / "src" / "service-worker.js"):
+    for line in open(SRC_DIR / "service-worker.js"):
        # Parsing a JS file in Python, what could potentially go wrong?
        if line.startswith("const CACHE_FILES = ["):
            start = True
Expand Down Expand Up @@ -125,7 +127,7 @@ def service_worker():
    fonts_file_names = {
        f"fonts/{filename}"
        for file_path, filename in each_file_from(
-            HERE / "src" / "fonts", file_name="*.woff2", exclude=[".DS_Store"]
+            SRC_DIR / "fonts", file_name="*.woff2", exclude=[".DS_Store"]
        )
    }
    if not fonts_file_names.issubset(sw_filenames):
@@ -137,7 +139,7 @@ def service_worker():
    illustrations_file_names = {
        f"illustrations/{filename}"
        for file_path, filename in each_file_from(
-            HERE / "src" / "illustrations",
+            SRC_DIR / "illustrations",
            exclude=[".DS_Store"],
        )
    }
@@ -150,7 +152,7 @@ def service_worker():
    src_file_names = {
        filename
        for file_path, filename in each_file_from(
-            HERE / "src",
+            SRC_DIR,
            file_name="*.*",
            exclude=[".DS_Store"],
        )
@@ -163,8 +165,8 @@

@cli
def orphelins():
-    template = (HERE / "src" / "template.html").read_text()
-    for folder in each_folder_from(HERE / "contenus", exclude=["nouveaux_contenus"]):
+    template = (SRC_DIR / "template.html").read_text()
+    for folder in each_folder_from(CONTENUS_DIR, exclude=["nouveaux_contenus"]):
        for file_path, filename in each_file_from(
            folder, file_name="*.md", exclude=["README.md"]
        ):
