fix: add docs, move util functions into useful places.
jdangerx committed Feb 6, 2025
1 parent d526ff5 commit 18f3c44
Showing 5 changed files with 61 additions and 50 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
inputs:
datasets:
description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
required: true
type: string
create_github_issue:
@@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
# Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
fail-fast: false
runs-on: ubuntu-latest
permissions:
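For intuition, the matrix expression `fromJSON(format('[{0}]', inputs.datasets || ...))` above simply wraps the already-quoted, comma-separated dataset string in square brackets and parses the result as a JSON array. A rough Python equivalent of what the workflow does (the three-dataset input string is illustrative, not the workflow default):

import json

# Stand-in for `inputs.datasets`: each name is already double-quoted and comma-separated.
datasets = '"eia860","eiarecs","ferc1"'

# format('[{0}]', datasets) followed by fromJSON(...) amounts to:
matrix = json.loads(f"[{datasets}]")
print(matrix)  # ['eia860', 'eiarecs', 'ferc1']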
8 changes: 8 additions & 0 deletions src/pudl_archiver/archivers/classes.py
@@ -14,6 +14,7 @@
from pathlib import Path

import aiohttp
import bs4
import pandas as pd

from pudl_archiver.archivers import validate
@@ -129,6 +130,13 @@ def __init__(
self.logger = logging.getLogger(f"catalystcoop.{__name__}")
self.logger.info(f"Archiving {self.name}")

async def _get_soup(self, url: str) -> bs4.BeautifulSoup:
"""Get a BeautifulSoup instance for a URL using our existing session."""
response = await retry_async(self.session.get, args=[url])
# TODO 2025-02-03: for some reason, lxml fails to grab the closing div
# tag for tab content - so we use html.parser, which is slower.
return bs4.BeautifulSoup(await response.text(), "html.parser")

@abstractmethod
def get_resources(self) -> ArchiveAwaitable:
"""Abstract method that each data source must implement to download all resources.
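Outside the archiver class, the pattern this helper wraps looks roughly like the standalone sketch below. It omits the `retry_async` wrapper and the shared session, and the example URL is just the EIA RECS landing page used elsewhere in this changeset.

import asyncio

import aiohttp
import bs4


async def fetch_soup(url: str) -> bs4.BeautifulSoup:
    """Fetch a page and parse it with html.parser (lxml drops a closing div on these pages)."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            text = await response.text()
    return bs4.BeautifulSoup(text, "html.parser")


if __name__ == "__main__":
    soup = asyncio.run(fetch_soup("https://www.eia.gov/consumption/residential/data/2020/"))
    print(soup.title.text if soup.title else "no <title> found")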
83 changes: 37 additions & 46 deletions src/pudl_archiver/archivers/eia/eiarecs.py
@@ -1,24 +1,18 @@
"""Archive EIA Residential Energy Consumption Survey (RECS)."""

import logging
import re
from collections import defaultdict
from dataclasses import dataclass
from io import BytesIO
from pathlib import Path
from urllib.parse import urljoin, urlparse

import bs4

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)
from pudl_archiver.frictionless import ZipLayout
from pudl_archiver.utils import retry_async

logger = logging.getLogger(f"catalystcoop.{__name__}")
from pudl_archiver.utils import is_html_file

BASE_URL = "https://www.eia.gov/consumption/residential/data/"

@@ -38,13 +32,6 @@ class EiaRECSArchiver(AbstractDatasetArchiver):
name = "eiarecs"
base_url = "https://www.eia.gov/consumption/residential/data/2020/"

async def __get_soup(self, url: str) -> bs4.BeautifulSoup:
"""Get a BeautifulSoup instance for a URL using our existing session."""
response = await retry_async(self.session.get, args=[url])
# TODO 2025-02-03: for some reason, lxml fails to grab the closing div
# tag for tab content - so we use html.parser, which is slower.
return bs4.BeautifulSoup(await response.text(), "html.parser")

async def get_resources(self) -> ArchiveAwaitable:
"""Download EIA-RECS resources.
@@ -86,25 +73,15 @@ async def __get_year_resources(self, url: str, year: int) -> ResourceInfo:

tab_infos = await self.__select_tabs(url)

# most tabs for most years can be handled the same way
tab_handlers = {
"housing-characteristics": defaultdict(lambda: self.__get_tab_links),
"consumption-expenditures": defaultdict(lambda: self.__get_tab_links),
"microdata": defaultdict(lambda: self.__get_tab_html_and_links),
"methodology": defaultdict(lambda: self.__get_tab_html_and_links),
"state-data": defaultdict(lambda: self.__get_tab_links),
}

# Add the exceptions - skip the 2009 and 2015 methodology sections for now
tab_handlers["methodology"][2015] = self.__skip
tab_handlers["methodology"][2009] = self.__skip
tab_handlers_overrides = {"methodology": {2009: self.__skip, 2015: self.__skip}}

zip_path = self.download_directory / f"eia-recs-{year}.zip"
zip_path = self.download_directory / f"eiarecs-{year}.zip"
paths_within_archive = []
for tab in tab_infos:
paths_within_archive += await tab_handlers[tab.name][tab.year](
tab_info=tab, zip_path=zip_path
tab_handler = tab_handlers_overrides.get(tab.name, {}).get(
tab.year, self.__get_tab_html_and_links
)
paths_within_archive += await tab_handler(tab_info=tab, zip_path=zip_path)

self.logger.info(f"Looking for original forms for {year}")
original_forms_within_archive = await self.__get_original_forms(year, zip_path)
@@ -137,27 +114,36 @@ async def __add_links_to_archive(
data_paths_in_archive = []
for link, output_filename in url_paths.items():
download_path = self.download_directory / output_filename
logger.debug(f"Fetching {link} to {download_path}")
self.logger.debug(f"Fetching {link} to {download_path}")
await self.download_file(link, download_path, timeout=120)
with download_path.open("rb") as f:
# TODO 2025-02-04: check html-ness against the suffix... if we
# have a php/html/cfm/etc. we probably actually *do* want the
# html file.
if self.__is_html_file(f):
logger.info(f"{link} was HTML file - skipping.")
if is_html_file(f):
self.logger.info(f"{link} was HTML file - skipping.")
continue
self.add_to_archive(
zip_path=zip_path,
filename=output_filename,
blob=f,
)
logger.debug(f"Added {link} to {zip_path} as {output_filename}")
self.logger.debug(f"Added {link} to {zip_path} as {output_filename}")
data_paths_in_archive.append(output_filename)
download_path.unlink()
return data_paths_in_archive

async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
"""Get the data files for a single tab."""
"""Get the data files for a single tab.

First, collect all of the <a> tags within the tab contents that have an href attribute.
These Tag objects expose their HTML attributes (href, src, etc.) via dictionary-style access,
and they also have Python attributes of their own that you can read: text, contents, children, etc.
See https://beautiful-soup-4.readthedocs.io/en/latest/#tag for details.
"""
soup = await self._get_soup(tab_info.url)
links_in_tab = soup.select("div.tab-contentbox a[href]")
log_scope = f"{tab_info.year}:{tab_info.name}"
@@ -177,7 +163,7 @@ async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
urljoin(tab_info.url, link["href"]) for link in links_filtered
]
links_with_filenames = {
link: f"eia-recs-{tab_info.year}-{tab_info.name}-{self.__get_filename_from_link(link)}"
link: f"eiarecs-{tab_info.year}-{tab_info.name}-{self.__get_filename_from_link(link)}"
for link in resolved_links
}

@@ -194,11 +180,23 @@ async def __get_tab_html_and_links(
async def __get_tab_html_and_links(
self, tab_info: TabInfo, zip_path: Path
) -> list[str]:
"""Get the data files in the tab, *and* get the tab content itself."""
"""Get the data files in the tab, *and* get the tab content itself.

First, collect all the links within the tab that are neither HTML files nor mailtos.

Then, grab the entire HTML contents of div.tab-contentbox, which holds the tab contents.

Finally, build a new HTML document with an html and a body tag, and place the old tab
contents inside it.

The result is an HTML file that can be opened in a browser and shows the tab's
contents - but any links or images within it will not work.
"""
log_scope = f"{tab_info.year}:{tab_info.name}"
self.logger.info(f"{log_scope}: Getting links in tab")
links = await self.__get_tab_links(tab_info=tab_info, zip_path=zip_path)
self.logger.info(f"{log_scope}: Got {len(links)} links")

soup = await self._get_soup(tab_info.url)
tab_content = soup.select_one("div.tab-contentbox")
@@ -210,7 +208,7 @@ async def __get_tab_html_and_links(
# TODO 2025-02-03: consider using some sort of html-to-pdf converter here.
# use html-sanitizer or something before feeding it into pdf.

filename = f"eia-recs-{tab_info.year}-{tab_info.name}-tab-contents.html"
filename = f"eiarecs-{tab_info.year}-{tab_info.name}-tab-contents.html"
self.add_to_archive(
zip_path=zip_path,
filename=filename,
@@ -235,7 +233,7 @@ async def __get_original_forms(self, year: int, zip_path: Path) -> list[str]:
resolved_links = [urljoin(forms_url, link["href"]) for link in links_filtered]

links_with_filenames = {
link: f"eia-recs-{year}-form-{self.__get_filename_from_link(link)}"
link: f"eiarecs-{year}-form-{self.__get_filename_from_link(link)}"
for link in resolved_links
}

@@ -248,13 +246,6 @@ def __get_filename_from_link(self, url: str) -> str:
stem = re.sub(r"\W+", "-", filepath.stem)
return f"{stem}{filepath.suffix}".lower()

def __is_html_file(self, fileobj: BytesIO) -> bool:
"""Check the first 30 bytes of a file to see if there's an HTML header hiding in there."""
fileobj.seek(0)
header = fileobj.read(30).lower().strip()
fileobj.seek(0)
return b"<!doctype html" in header

async def __select_tabs(self, url: str) -> set[TabInfo]:
"""Get the clickable tab links from the EIA RECS page layout."""

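The defaultdict-based handler table above is replaced by a plain nested lookup with a fallback. A self-contained illustration of that dispatch pattern follows; the stub handlers and tab dictionaries are hypothetical stand-ins for the archiver's bound methods and TabInfo objects.

def skip(tab_info):
    # Stand-in for the archiver's __skip handler: archive nothing for this tab.
    return []


def get_tab_html_and_links(tab_info):
    # Stand-in for the default handler: pretend we archived the tab's HTML.
    return [f"eiarecs-{tab_info['year']}-{tab_info['name']}-tab-contents.html"]


# Only the exceptions are spelled out; every other (tab, year) pair falls
# through to the default handler.
tab_handlers_overrides = {"methodology": {2009: skip, 2015: skip}}

for tab in ({"name": "methodology", "year": 2015}, {"name": "state-data", "year": 2020}):
    handler = tab_handlers_overrides.get(tab["name"], {}).get(tab["year"], get_tab_html_and_links)
    print(handler(tab))  # [] for 2015 methodology; one filename for 2020 state-data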
7 changes: 5 additions & 2 deletions src/pudl_archiver/archivers/validate.py
@@ -12,7 +12,7 @@
from pydantic import BaseModel

from pudl_archiver.frictionless import DataPackage, Resource, ZipLayout
from pudl_archiver.utils import Url
from pudl_archiver.utils import Url, is_html_file

logger = logging.getLogger(f"catalystcoop.{__name__}")

@@ -277,7 +277,7 @@ def _process_resource_diffs(
return [*changed_resources, *created_resources, *deleted_resources]


def _validate_file_type(path: Path, buffer: BytesIO) -> bool:
def _validate_file_type(path: Path, buffer: BytesIO) -> bool: # noqa:C901
"""Check that file appears valid based on extension."""
extension = path.suffix

@@ -310,6 +310,9 @@ def _validate_file_type(path: Path, buffer: BytesIO) -> bool:
# magic bytes for old-school xls file
return header.hex() == "d0cf11e0a1b11ae1"

if extension == ".html":
return is_html_file(buffer)

if extension == ".txt":
return _validate_text(buffer)

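The function these lines extend routes on file extension and sniffs leading bytes; a condensed sketch of that shape with the new `.html` branch is below. It is simplified to the branches visible in this diff, and the eight-byte read for `.xls` is inferred from the hex string shown, not from the hidden part of the function.

from io import BytesIO
from pathlib import Path

from pudl_archiver.utils import is_html_file


def validate_file_type_sketch(path: Path, buffer: BytesIO) -> bool:
    """Simplified mirror of _validate_file_type: dispatch on suffix, sniff bytes."""
    extension = path.suffix
    if extension == ".xls":
        buffer.seek(0)
        header = buffer.read(8)
        buffer.seek(0)
        # magic bytes for old-school xls file
        return header.hex() == "d0cf11e0a1b11ae1"
    if extension == ".html":
        return is_html_file(buffer)
    # Remaining extensions (.zip, .xlsx, .txt, ...) are handled by the real function.
    return True


print(validate_file_type_sketch(Path("page.html"), BytesIO(b"<!doctype html><html></html>")))  # True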
9 changes: 9 additions & 0 deletions src/pudl_archiver/utils.py
@@ -6,6 +6,7 @@
import zipfile
from collections.abc import Awaitable, Callable
from hashlib import md5
from io import BytesIO
from pathlib import Path

import aiohttp
@@ -145,3 +146,11 @@ def compute_md5(file_path: UPath) -> str:
hash_md5.update(chunk)

return hash_md5.hexdigest()


def is_html_file(fileobj: BytesIO) -> bool:
"""Check the first 30 bytes of a file to see if there's an HTML header hiding in there."""
fileobj.seek(0)
header = fileobj.read(30).lower().strip()
fileobj.seek(0)
return b"<!doctype html" in header
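
In the archivers, this relocated helper is used to skip downloads that turn out to be HTML landing pages rather than data files. A small standalone example of calling it (the sample bytes are illustrative):

from io import BytesIO

from pudl_archiver.utils import is_html_file

# An HTML landing page masquerading as a data download.
landing_page = BytesIO(b"<!DOCTYPE html>\n<html><body>Please use our data browser.</body></html>")
print(is_html_file(landing_page))  # True

# A zip archive starts with "PK", so it is correctly reported as not HTML.
zip_blob = BytesIO(b"PK\x03\x04" + b"\x00" * 26)
print(is_html_file(zip_blob))  # False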
