diff --git a/.dockerignore b/.dockerignore
index 93f76c2..d19710b 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,3 +1,5 @@
 public/
 .github/
-.git/
\ No newline at end of file
+.git/
+fireside-scraper
+scraped-data
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 941d8e7..33ad939 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,7 @@ $RECYCLE.BIN/
 
 # Editor
 .idea
+.vscode
+
+# Ignore all the scraped data
+scraped-data
diff --git a/Makefile b/Makefile
index fca8af9..b28191c 100644
--- a/Makefile
+++ b/Makefile
@@ -5,4 +5,19 @@ build:
 	hugo -D
 
 run:
-	docker-compose up -d --build
\ No newline at end of file
+	docker-compose up -d --build jbsite
+
+# Clean the scraped data
+scrape-clean:
+	rm -r scraped-data && mkdir scraped-data
+
+# Scrape all the data from Fireside into the scraped-data dir
+scrape: scrape-clean
+	docker-compose up -d --build fireside-scraper && \
+	docker-compose logs --no-log-prefix -f fireside-scraper
+
+# Copy the contents of scraped-data into the project
+scrape-copy:
+	./scrape-copy.sh && ./generate-guests-symlinks.sh
+
+scrape-full: scrape scrape-copy
diff --git a/README.md b/README.md
index 8f6f7e7..46b40c2 100644
--- a/README.md
+++ b/README.md
@@ -69,9 +69,50 @@ Deployment is done with Github Actions, see workflow file in `.github/workflows/
 
 At the moment it is only triggered when something in the `main` branch is changing, but it can also be set up to run at certain times. This would also enable scheduled publishing, since Hugo per default only build pages which have set `date` in frontmatter to <= `now`
 
+
+## Fireside Scraper
+
+The [fireside-scraper](./fireside-scraper/) is based on [JB Show Notes](https://github.com/selfhostedshow/show-notes), which was written by [ironicbadger](https://github.com/ironicbadger).
+
+It goes over all the JB Fireside shows and scrapes each episode into the format expected by Hugo (using [this template](./fireside-scraper/src/templates/episode.md.j2)).
+
+Besides the episodes, it also scrapes and creates the JSON files for:
+
+- sponsors
+- hosts
+- guests (every host is symlinked into the [guests dir](./data/guests/), since a host of one show could be a guest on an episode of a different show)
+
+There are Makefile targets that should be used to run it.
+
+### Run the scraper
+
+The command below builds and starts the container, which saves all the scraped data into the `scraped-data` dir:
+
+```
+make scrape
+```
+
+The files are organised in the same way as the files in the root project. This makes it trivial to copy the contents of `scraped-data` over to the root dir of the repo to include all the scraped content:
+
+```
+make scrape-copy
+```
+
+or run the following to scrape and copy into the root dir all at once:
+
+```
+make scrape-full
+```
+
+### Configuring the scraper
+
+Configure the scraper by modifying this [config.yml file](./fireside-scraper/src/config.yml).
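+
+For reference, every show entry in that file has the same shape; this is the `selfhosted` entry added in this change:
+
+```yml
+shows:
+  selfhosted:
+    fireside_url: https://selfhosted.show
+    header_image: /images/shows/selfhosted.png
+    acronym: SSH
+    name: Self-Hosted
+```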
+
 ## Credits
 
-I took parts of the functionality from the Castanet Theme: https://github.com/mattstratton/castanet
+- I took parts of the functionality from the Castanet Theme: https://github.com/mattstratton/castanet
 Mainly the RSS feed generation and managing of hosts / guests.
+- [ironicbadger](https://github.com/ironicbadger) and the [JB Show Notes](https://github.com/selfhostedshow/show-notes) project, which was used as the base for the `fireside-scraper`
+
 Time spend so far: 13h
diff --git a/config.toml b/config.toml
index 3931d31..ea41a5e 100644
--- a/config.toml
+++ b/config.toml
@@ -1,4 +1,4 @@
-baseURL = 'https://jb.codefighters.net/'
+baseURL = 'http://localhost:1111/'
 languageCode = 'en-us'
 title = 'Jupiter Broadcasting'
 
diff --git a/data/guests/alex.json b/data/guests/alex.json
deleted file mode 100644
index ee7ce5e..0000000
--- a/data/guests/alex.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-    "username": "alex",
-    "name": "Alex Kretzschmar",
-    "bio": "Red Hatter. Drone Racer. Photographer. Dog lover.",
-    "avatar": "/images/guests/alex_kretzschmar.jpeg",
-    "twitter": "https://twitter.com/ironicbadger",
-    "homepage":"https://www.linuxserver.io/",
-    "linkedin":"https://www.linkedin.com/in/alex-kretzschmar/"
-}
diff --git a/docker-compose.yml b/docker-compose.yml
index 1a3731f..5c89148 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -8,3 +8,12 @@ services:
       context: .
     ports:
       - 1111:80
+  fireside-scraper:
+    user: 1000:1000
+    image: fireside-scraper
+    container_name: fireside-scraper
+    build:
+      context: ./fireside-scraper
+    volumes:
+      - ./scraped-data:/data
+      - ./data:/hugo-data:ro
diff --git a/fireside-scraper/Dockerfile b/fireside-scraper/Dockerfile
new file mode 100644
index 0000000..e6f7498
--- /dev/null
+++ b/fireside-scraper/Dockerfile
@@ -0,0 +1,10 @@
+FROM python:3.10-alpine
+
+RUN mkdir /data && chown -R 1000:1000 /data
+
+COPY ./src/ /
+RUN chown 1000:1000 /scraper.py
+RUN pip install -U -r requirements.txt
+
+USER 1000
+CMD [ "python3", "scraper.py" ]
\ No newline at end of file
diff --git a/fireside-scraper/src/config.yml b/fireside-scraper/src/config.yml
new file mode 100644
index 0000000..ebb5aac
--- /dev/null
+++ b/fireside-scraper/src/config.yml
@@ -0,0 +1,31 @@
+shows:
+  selfhosted:
+    fireside_url: https://selfhosted.show
+    header_image: /images/shows/selfhosted.png
+    acronym: SSH
+    name: Self-Hosted
+  coderradio:
+    fireside_url: https://coder.show
+    header_image: /images/shows/coderradio.png
+    acronym: CR
+    name: Coder Radio
+  linux-action-news:
+    fireside_url: https://linuxactionnews.com
+    header_image: /images/shows/linux-action-news.png
+    acronym: LAN
+    name: Linux Action News
+  linuxun:
+    fireside_url: https://linuxunplugged.com
+    header_image: /images/shows/linuxun.png
+    acronym: LUP
+    name: LINUX Unplugged
+  extras:
+    fireside_url: https://extras.show
+    header_image: /images/shows/extras.png
+    acronym: JE
+    name: Jupiter EXTRAS
+  officehours:
+    fireside_url: https://www.officehours.hair
+    header_image: /images/shows/officehours.png
+    acronym: JE
+    name: Office Hours
diff --git a/fireside-scraper/src/requirements.txt b/fireside-scraper/src/requirements.txt
new file mode 100644
index 0000000..71eb0eb
--- /dev/null
+++ b/fireside-scraper/src/requirements.txt
@@ -0,0 +1,7 @@
+beautifulsoup4==4.9.3
+requests==2.25.1
+jinja2==3.0.1
+pymdown-extensions==8.2
+html2text==2020.1.16
+pyyaml==5.4.1
+python-dateutil==2.8.2
\ No newline at end of file
diff --git a/fireside-scraper/src/scraper.py b/fireside-scraper/src/scraper.py
new file mode 100644
index 0000000..057de53
--- /dev/null
+++ b/fireside-scraper/src/scraper.py
@@ -0,0 +1,439 @@
+import concurrent.futures
+import json
+import operator
+import os
+from urllib.parse import urlparse
+
+import html2text
+import requests
+import yaml
+from bs4 import BeautifulSoup
+from dateutil.parser import parse as date_parse
+from jinja2 import Template
+
+DATA_ROOT_DIR = "/data"
+
+# Missing data found in a show. Used to scrape and/or create these files after
+# the episode files have been created.
+MISSING_SPONSORS = {}
+MISSING_HOSTS = set()
+MISSING_GUESTS = set()
+
+
+with open("templates/episode.md.j2") as f:
+    TEMPLATE = Template(f.read())
+
+
+def log_warn(show, ep, msg):
+    print(f"WARN | {show} {ep} | {msg}")
+
+
+def mkdir_safe(directory):
+    try:
+        os.makedirs(directory)
+    except FileExistsError:
+        pass
+
+
+def get_list(soup, pre_title):
+    """
+    Blocks of links are preceded by a `p` saying what it is.
+    """
+    pre_element = soup.find("p", string=pre_title)
+    if pre_element is None:
+        return None
+    return pre_element.find_next_sibling("ul")
+
+
+def get_duration(seconds):
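+    # Converts a number of seconds into an "HH:MM:SS" string.
+    # Illustration (not part of the original source): get_duration(3725) -> "01:02:05",
+    # i.e. 1 h 2 min 5 s, with each field zero-padded to two digits.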
+    minutes, seconds = divmod(seconds, 60)
+    hours, minutes = divmod(minutes, 60)
+    return f"{hours:02}:{minutes:02}:{seconds:02}"
+
+
+def get_plain_title(title: str):
+    """
+    Get just the show title, without any numbering etc.
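+
+    Illustration with a made-up title (not taken from a real feed):
+    "Self-Hosted 42: Great Title | Jupiter Broadcasting" -> "Great Title"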
+ "links": links + } + ) + + with open(output_file, "w") as f: + print("Saving", api_episode["url"]) + f.write(output) + + except Exception as e: + print(f"ERROR | Failed to create an episode from url `{api_episode.get('url')}`. Exception: {e}") + +def parse_hosts(hugo_data, page_soup: BeautifulSoup, show_config, ep): + show = show_config["acronym"] + base_url = show_config["fireside_url"] + + hosts = [] + + # assumes the hosts are ALWAYS the first