From 18f3c440d95630be0c75d8c4ac17ac44bd492c4d Mon Sep 17 00:00:00 2001
From: Dazhong Xia
Date: Thu, 6 Feb 2025 14:50:32 -0500
Subject: [PATCH] fix: add docs, move util functions into useful places.

---
 .github/workflows/run-archiver.yml         |  4 +-
 src/pudl_archiver/archivers/classes.py     |  8 +++
 src/pudl_archiver/archivers/eia/eiarecs.py | 83 ++++++++++------------
 src/pudl_archiver/archivers/validate.py    |  7 +-
 src/pudl_archiver/utils.py                 |  9 +++
 5 files changed, 61 insertions(+), 50 deletions(-)

diff --git a/.github/workflows/run-archiver.yml b/.github/workflows/run-archiver.yml
index a5e2e621..a643a9b7 100644
--- a/.github/workflows/run-archiver.yml
+++ b/.github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
     inputs:
       datasets:
         description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
+        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
         required: true
         type: string
       create_github_issue:
@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
-        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
+        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
       fail-fast: false
     runs-on: ubuntu-latest
     permissions:
diff --git a/src/pudl_archiver/archivers/classes.py b/src/pudl_archiver/archivers/classes.py
index 512a0230..2598b655 100644
--- a/src/pudl_archiver/archivers/classes.py
+++ b/src/pudl_archiver/archivers/classes.py
@@ -14,6 +14,7 @@
 from pathlib import Path

 import aiohttp
+import bs4
 import pandas as pd

 from pudl_archiver.archivers import validate
@@ -129,6 +130,13 @@ def __init__(
         self.logger = logging.getLogger(f"catalystcoop.{__name__}")
         self.logger.info(f"Archiving {self.name}")

+    async def __get_soup(self, url: str) -> bs4.BeautifulSoup:
+        """Get a BeautifulSoup instance for a URL using our existing session."""
+        response = await retry_async(self.session.get, args=[url])
+        # TODO 2025-02-03: for some reason, lxml fails to grab the closing div
+        # tag for tab content - so we use html.parser, which is slower.
+        return bs4.BeautifulSoup(await response.text(), "html.parser")
+
     @abstractmethod
     def get_resources(self) -> ArchiveAwaitable:
         """Abstract method that each data source must implement to download all resources.
diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py
index 345022b3..c67964b0 100644
--- a/src/pudl_archiver/archivers/eia/eiarecs.py
+++ b/src/pudl_archiver/archivers/eia/eiarecs.py
@@ -1,24 +1,18 @@
 """Archive EIA Residential Energy Consumption Survey (RECS)."""

-import logging
 import re
-from collections import defaultdict
 from dataclasses import dataclass
 from io import BytesIO
 from pathlib import Path
 from urllib.parse import urljoin, urlparse

-import bs4
-
 from pudl_archiver.archivers.classes import (
     AbstractDatasetArchiver,
     ArchiveAwaitable,
     ResourceInfo,
 )
 from pudl_archiver.frictionless import ZipLayout
-from pudl_archiver.utils import retry_async
-
-logger = logging.getLogger(f"catalystcoop.{__name__}")
+from pudl_archiver.utils import is_html_file

 BASE_URL = "https://www.eia.gov/consumption/residential/data/"

@@ -38,13 +32,6 @@ class EiaRECSArchiver(AbstractDatasetArchiver):
     name = "eiarecs"
     base_url = "https://www.eia.gov/consumption/residential/data/2020/"

-    async def __get_soup(self, url: str) -> bs4.BeautifulSoup:
-        """Get a BeautifulSoup instance for a URL using our existing session."""
-        response = await retry_async(self.session.get, args=[url])
-        # TODO 2025-02-03: for some reason, lxml fails to grab the closing div
-        # tag for tab content - so we use html.parser, which is slower.
-        return bs4.BeautifulSoup(await response.text(), "html.parser")
-
     async def get_resources(self) -> ArchiveAwaitable:
         """Download EIA-RECS resources.

@@ -86,25 +73,15 @@ async def __get_year_resources(self, url: str, year: int) -> ResourceInfo:

         tab_infos = await self.__select_tabs(url)

-        # most tabs for most years can be handled the same way
-        tab_handlers = {
-            "housing-characteristics": defaultdict(lambda: self.__get_tab_links),
-            "consumption-expenditures": defaultdict(lambda: self.__get_tab_links),
-            "microdata": defaultdict(lambda: self.__get_tab_html_and_links),
-            "methodology": defaultdict(lambda: self.__get_tab_html_and_links),
-            "state-data": defaultdict(lambda: self.__get_tab_links),
-        }
-
-        # Add the exceptions - skip the 2009 and 2015 methodology sections for now
-        tab_handlers["methodology"][2015] = self.__skip
-        tab_handlers["methodology"][2009] = self.__skip
+        tab_handlers_overrides = {"methodology": {2009: self.__skip, 2015: self.__skip}}

-        zip_path = self.download_directory / f"eia-recs-{year}.zip"
+        zip_path = self.download_directory / f"eiarecs-{year}.zip"
         paths_within_archive = []
         for tab in tab_infos:
-            paths_within_archive += await tab_handlers[tab.name][tab.year](
-                tab_info=tab, zip_path=zip_path
+            tab_handler = tab_handlers_overrides.get(tab.name, {}).get(
+                tab.year, self.__get_tab_html_and_links
             )
+            paths_within_archive += await tab_handler(tab_info=tab, zip_path=zip_path)

         self.logger.info(f"Looking for original forms for {year}")
         original_forms_within_archive = await self.__get_original_forms(year, zip_path)
@@ -137,27 +114,36 @@ async def __add_links_to_archive(
         data_paths_in_archive = []
         for link, output_filename in url_paths.items():
             download_path = self.download_directory / output_filename
-            logger.debug(f"Fetching {link} to {download_path}")
+            self.logger.debug(f"Fetching {link} to {download_path}")
             await self.download_file(link, download_path, timeout=120)
             with download_path.open("rb") as f:
                 # TODO 2025-02-04: check html-ness against the suffix... if we
                 # have a php/html/cfm/etc. we probably actually *do* want the
                 # html file.
-                if self.__is_html_file(f):
-                    logger.info(f"{link} was HTML file - skipping.")
+                if is_html_file(f):
+                    self.logger.info(f"{link} was HTML file - skipping.")
                     continue
                 self.add_to_archive(
                     zip_path=zip_path,
                     filename=output_filename,
                     blob=f,
                 )
-            logger.debug(f"Added {link} to {zip_path} as {output_filename}")
+            self.logger.debug(f"Added {link} to {zip_path} as {output_filename}")
             data_paths_in_archive.append(output_filename)
             download_path.unlink()
         return data_paths_in_archive

     async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
-        """Get the data files for a single tab."""
+        """Get the data files for a single tab.
+
+        First, collect all of the <a> tags within the tab contents that have an href attribute.
+
+        These Tag objects expose their HTML attributes as if they were dictionaries - href, src, etc.
+
+        They also have Python attributes of their own that you can read: text, contents, children, etc.
+
+        See https://beautiful-soup-4.readthedocs.io/en/latest/#tag for details.
+        """
         soup = await self.__get_soup(tab_info.url)
         links_in_tab = soup.select("div.tab-contentbox a[href]")
         log_scope = f"{tab_info.year}:{tab_info.name}"
@@ -177,7 +163,7 @@ async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
             urljoin(tab_info.url, link["href"]) for link in links_filtered
         ]
         links_with_filenames = {
-            link: f"eia-recs-{tab_info.year}-{tab_info.name}-{self.__get_filename_from_link(link)}"
+            link: f"eiarecs-{tab_info.year}-{tab_info.name}-{self.__get_filename_from_link(link)}"
             for link in resolved_links
         }

@@ -194,11 +180,23 @@ async def __get_tab_html_and_links(
         self, tab_info: TabInfo, zip_path: Path
     ) -> list[str]:
-        """Get the data files in the tab, *and* get the tab content itself."""
+        """Get the data files in the tab, *and* get the tab content itself.
+
+        First, get all the links within the tab that aren't HTML files and
+        aren't mailtos.
+
+        Then, get the entire HTML contents of div.tab-contentbox, which
+        holds the tab contents.
+
+        Then, build a new HTML document with an html and a body tag, and place
+        the old tab contents inside it.
+
+        The result is an HTML file that can be opened in a browser and shows
+        the tab's contents - but any links/images will not work.
+        """
         log_scope = f"{tab_info.year}:{tab_info.name}"
         self.logger.info(f"{log_scope}: Getting links in tab")
         links = await self.__get_tab_links(tab_info=tab_info, zip_path=zip_path)
-        self.logger.info(f"{log_scope}: Got {len(links)} links")
         soup = await self.__get_soup(tab_info.url)
         tab_content = soup.select_one("div.tab-contentbox")

@@ -210,7 +208,7 @@ async def __get_tab_html_and_links(
         # TODO 2025-02-03: consider using some sort of html-to-pdf converter here.
         # use html-sanitizer or something before feeding it into pdf.
-        filename = f"eia-recs-{tab_info.year}-{tab_info.name}-tab-contents.html"
+        filename = f"eiarecs-{tab_info.year}-{tab_info.name}-tab-contents.html"
         self.add_to_archive(
             zip_path=zip_path,
             filename=filename,
@@ -235,7 +233,7 @@ async def __get_original_forms(self, year: int, zip_path: Path) -> list[str]:
         resolved_links = [urljoin(forms_url, link["href"]) for link in links_filtered]

         links_with_filenames = {
-            link: f"eia-recs-{year}-form-{self.__get_filename_from_link(link)}"
+            link: f"eiarecs-{year}-form-{self.__get_filename_from_link(link)}"
             for link in resolved_links
         }

@@ -248,13 +246,6 @@ def __get_filename_from_link(self, url: str) -> str:
         stem = re.sub(r"\W+", "-", filepath.stem)
         return f"{stem}{filepath.suffix}".lower()

-    def __is_html_file(self, fileobj: BytesIO) -> bool:
-        """Check the first 30 bytes of a file to see if there's an HTML header hiding in there."""
-        fileobj.seek(0)
-        header = fileobj.read(30).lower().strip()
-        fileobj.seek(0)
-        return b"<html" in header
-
     async def __select_tabs(self, url: str) -> set[TabInfo]:
         """Get the clickable tab links from the EIA RECS page layout."""
diff --git a/src/pudl_archiver/archivers/validate.py b/src/pudl_archiver/archivers/validate.py
index 9a8c1828..98491a82 100644
--- a/src/pudl_archiver/archivers/validate.py
+++ b/src/pudl_archiver/archivers/validate.py
@@ -12,7 +12,7 @@
 from pydantic import BaseModel

 from pudl_archiver.frictionless import DataPackage, Resource, ZipLayout
-from pudl_archiver.utils import Url
+from pudl_archiver.utils import Url, is_html_file

 logger = logging.getLogger(f"catalystcoop.{__name__}")

@@ -277,7 +277,7 @@ def _process_resource_diffs(
     return [*changed_resources, *created_resources, *deleted_resources]


-def _validate_file_type(path: Path, buffer: BytesIO) -> bool:
+def _validate_file_type(path: Path, buffer: BytesIO) -> bool:  # noqa:C901
     """Check that file appears valid based on extension."""
     extension = path.suffix

@@ -310,6 +310,9 @@ def _validate_file_type(path: Path, buffer: BytesIO) -> bool:
             # magic bytes for old-school xls file
             return header.hex() == "d0cf11e0a1b11ae1"

+    if extension == ".html":
+        return is_html_file(buffer)
+
     if extension == ".txt":
         return _validate_text(buffer)

diff --git a/src/pudl_archiver/utils.py b/src/pudl_archiver/utils.py
index 17c30915..41fa5591 100644
--- a/src/pudl_archiver/utils.py
+++ b/src/pudl_archiver/utils.py
@@ -6,6 +6,7 @@
 import zipfile
 from collections.abc import Awaitable, Callable
 from hashlib import md5
+from io import BytesIO
 from pathlib import Path

 import aiohttp
@@ -145,3 +146,11 @@ def compute_md5(file_path: UPath) -> str:
             hash_md5.update(chunk)

     return hash_md5.hexdigest()
+
+
+def is_html_file(fileobj: BytesIO) -> bool:
+    """Check the first 30 bytes of a file to see if there's an HTML header hiding in there."""
+    fileobj.seek(0)
+    header = fileobj.read(30).lower().strip()
+    fileobj.seek(0)
+    return b"<html" in header
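
The new __get_tab_links docstring leans on how BeautifulSoup Tag objects behave. A minimal, standalone sketch of the attribute access it describes, using a made-up snippet of tab HTML (not real EIA markup):

    import bs4

    html = '<div class="tab-contentbox"><a href="hc1.1.xlsx">Table HC1.1</a></div>'
    soup = bs4.BeautifulSoup(html, "html.parser")

    # The same selector the archiver uses: anchor tags inside the tab contents.
    links = soup.select("div.tab-contentbox a[href]")

    assert links[0]["href"] == "hc1.1.xlsx"  # HTML attrs read like dict entries
    assert links[0].text == "Table HC1.1"    # Python attributes such as .text also work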
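The tab_handlers_overrides lookup in __get_year_resources resolves a handler per (tab name, year) and falls back to __get_tab_html_and_links for anything not listed. A rough sketch of the same pattern with stand-in functions (handle_tab and skip_tab are placeholders, not names from the archiver):

    def handle_tab(tab_info, zip_path):  # stands in for __get_tab_html_and_links
        return [f"{tab_info}-tab-contents.html"]

    def skip_tab(tab_info, zip_path):  # stands in for __skip
        return []

    # Only the (name, year) pairs listed here get special treatment; everything
    # else falls through to the default handler.
    overrides = {"methodology": {2009: skip_tab, 2015: skip_tab}}

    def resolve(name, year):
        return overrides.get(name, {}).get(year, handle_tab)

    assert resolve("methodology", 2015) is skip_tab
    assert resolve("methodology", 2020) is handle_tab  # unlisted year -> default
    assert resolve("state-data", 2009) is handle_tab   # unlisted tab -> default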
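A usage sketch for the relocated is_html_file helper, assuming it lands in pudl_archiver.utils with the single-argument signature used at the call sites above; the byte strings are illustrative only:

    from io import BytesIO

    from pudl_archiver.utils import is_html_file

    # A disguised error page: the "spreadsheet" the server returned is really HTML.
    error_page = BytesIO(b"<!DOCTYPE html>\n<html lang='en'><head><title>Oops</title></head>")

    # A zip-based payload (e.g. an .xlsx file) starts with the PK magic bytes instead.
    xlsx_like = BytesIO(b"PK\x03\x04" + b"\x00" * 26)

    assert is_html_file(error_page)      # "<html" appears within the first 30 bytes
    assert not is_html_file(xlsx_like)   # binary header, so the download is kept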