
Commit 8e4f00e

Tech: cache remote PDFs locally
davidbgk committed Dec 6, 2020
1 parent 67cfd21 commit 8e4f00e
Showing 3 changed files with 80 additions and 12 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@ cert.pem
dist/
node_modules/
src/index.html
src/pdfs/*
.cache
.nyc_output
__pycache__
73 changes: 69 additions & 4 deletions build.py
@@ -1,9 +1,12 @@
#!/usr/bin/env python3
import fnmatch
import os
from html.parser import HTMLParser
from http import HTTPStatus
from pathlib import Path
from time import perf_counter

import httpx
import mistune
from jinja2 import Environment as JinjaEnv
from jinja2 import FileSystemLoader, StrictUndefined
@@ -55,11 +58,74 @@ def build_responses(source_dir):
    return responses


class PDFLinkExtractor(HTMLParser):
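    # HTMLParser.__init__() calls reset() implicitly, so overriding reset()
    # (rather than __init__) is enough to guarantee pdf_links exists on
    # every fresh parser instance.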
    def reset(self):
        HTMLParser.reset(self)
        self.pdf_links = set()

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            attrs = dict(attrs)
            url = attrs["href"]
            if url.startswith("http") and url.endswith(".pdf"):
                self.pdf_links.update([url])


def url_to_file_name(url: str) -> str:
    file_name = (
        url.replace("http://", "")
        .replace("https://", "")
        .replace(".", "-")
        .replace("/", "-")
        # Tempting to do `.replace("-pdf", ".pdf")` here but there are some
        # use-cases where it fails if the URL contains `/pdf/` for instance.
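        # e.g. a hypothetical https://example.org/pdf/guide.pdf maps to
        # `example-org-pdf-guide-pdf`; replacing `-pdf` would wrongly give
        # `example-org.pdf-guide.pdf`.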
    )
    return file_name


def save_binary_response(file_path: Path, response: "httpx.Response"):
    with open(file_path, "wb") as download_file:
        for chunk in response.iter_bytes():
            download_file.write(chunk)


def put_pdfs_in_local_cache(content: str, timeout: int = 10) -> str:
    pdfs_file_path = HERE / "src" / "pdfs"
    if not pdfs_file_path.exists():
        pdfs_file_path.mkdir(parents=True)
    parser = PDFLinkExtractor()
    parser.feed(content)
    for pdf_link in sorted(parser.pdf_links):
        file_name = url_to_file_name(pdf_link)
        target = f"pdfs/{file_name}.pdf"
        if (SRC_DIR / target).exists():
            print(f"SKIP: {pdf_link} exists in {SRC_DIR / target}")
        else:
            print(f"FETCH: {pdf_link} to {SRC_DIR / target}")

            with httpx.stream(
                "GET",
                pdf_link,
                timeout=timeout,
                verify=False,  # ignore SSL certificate validation errors
            ) as response:
                if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
                    print("Warning: we’re being throttled, skipping link (429)")
                    continue
                if response.status_code != HTTPStatus.OK:
                    raise Exception(f"{pdf_link} is broken! ({response.status_code})")
                save_binary_response(pdfs_file_path / f"{file_name}.pdf", response)
        content = content.replace(pdf_link, target)
    return content


@cli
def index():
"""Build the index with contents from markdown dedicated folder."""
responses = build_responses(CONTENUS_DIR)
render_template("template.html", SRC_DIR / "index.html", **responses)
content = render_template("template.html", **responses)
content = put_pdfs_in_local_cache(content)
(SRC_DIR / "index.html").write_text(content)


def me_or_them(value):
@@ -70,11 +136,10 @@ def me_or_them(value):
    return value


-def render_template(src, output, **context):
+def render_template(src, **context):
    jinja_env.filters["me_or_them"] = me_or_them
    template = jinja_env.get_template(src)
-    content = template.render(**context,)
-    output.open("w").write(content)
+    return template.render(**context)


@cli
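For orientation, a minimal sketch of what the new build step does to a remote PDF link; the URL below is hypothetical, and only url_to_file_name and the `pdfs/` target convention come from the build.py diff above.

from build import url_to_file_name

pdf_link = "https://example.org/pdf/guide.pdf"  # hypothetical remote PDF
file_name = url_to_file_name(pdf_link)  # "example-org-pdf-guide-pdf"
target = f"pdfs/{file_name}.pdf"  # "pdfs/example-org-pdf-guide-pdf.pdf"
# index() streams the file into src/pdfs/ (skipped when already cached,
# skipped with a warning on HTTP 429) and replaces the remote URL with
# this relative path before writing src/index.html.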
18 changes: 10 additions & 8 deletions check.py
@@ -11,6 +11,8 @@
from build import each_folder_from, each_file_from

HERE = Path(__file__).parent
SRC_DIR = HERE / "src"
CONTENUS_DIR = HERE / "contenus"


class LinkExtractor(HTMLParser):
@@ -37,7 +39,7 @@ def handle_starttag(self, tag, attrs):
@cli
def links(timeout: int = 10, delay: float = 0.1):
    parser = LinkExtractor()
-    content = open(HERE / "src" / "index.html").read()
+    content = (SRC_DIR / "index.html").read_text()
    parser.feed(content)
    for link in sorted(parser.links):
        print(link)
@@ -61,7 +63,7 @@ def versions():
    data = json.loads(content)
    version = data["version"]
    line_prefix = "const CACHE_NAME = "
-    for line in open(HERE / "src" / "service-worker.js"):
+    for line in open(SRC_DIR / "service-worker.js"):
        if line.startswith(line_prefix):
            break
    if version not in line:
@@ -90,7 +92,7 @@ def service_worker():
    # Retrieving the list from CACHE_FILES.
    sw_filenames = set()
    start = False
-    for line in open(HERE / "src" / "service-worker.js"):
+    for line in open(SRC_DIR / "service-worker.js"):
        # Parsing a JS file in Python, what could potentially go wrong?
        if line.startswith("const CACHE_FILES = ["):
            start = True
Expand Down Expand Up @@ -125,7 +127,7 @@ def service_worker():
    fonts_file_names = {
        f"fonts/{filename}"
        for file_path, filename in each_file_from(
-            HERE / "src" / "fonts", file_name="*.woff2", exclude=[".DS_Store"]
+            SRC_DIR / "fonts", file_name="*.woff2", exclude=[".DS_Store"]
        )
    }
    if not fonts_file_names.issubset(sw_filenames):
@@ -137,7 +139,7 @@ def service_worker():
    illustrations_file_names = {
        f"illustrations/{filename}"
        for file_path, filename in each_file_from(
-            HERE / "src" / "illustrations",
+            SRC_DIR / "illustrations",
            exclude=[".DS_Store"],
        )
    }
@@ -150,7 +152,7 @@ def service_worker():
    src_file_names = {
        filename
        for file_path, filename in each_file_from(
-            HERE / "src",
+            SRC_DIR,
            file_name="*.*",
            exclude=[".DS_Store"],
        )
@@ -163,8 +165,8 @@

@cli
def orphelins():
-    template = (HERE / "src" / "template.html").read_text()
-    for folder in each_folder_from(HERE / "contenus", exclude=["nouveaux_contenus"]):
+    template = (SRC_DIR / "template.html").read_text()
+    for folder in each_folder_from(CONTENUS_DIR, exclude=["nouveaux_contenus"]):
        for file_path, filename in each_file_from(
            folder, file_name="*.md", exclude=["README.md"]
        ):
