Make EIA CBECS Archive #554

Merged
merged 13 commits on Jan 31, 2025
111 changes: 111 additions & 0 deletions src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -0,0 +1,111 @@
"""Archive EIA Commercial Buildings Energy Consumption Survey (CBECS)."""

import logging
import re
from pathlib import Path
from urllib.parse import urljoin

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)
from pudl_archiver.frictionless import ZipLayout

BASE_URL = "https://www.eia.gov/consumption/commercial/data/"
logger = logging.getLogger(f"catalystcoop.{__name__}")


class EiaCbecsArchiver(AbstractDatasetArchiver):
"""EIA CBECS archiver."""

name = "eiacbecs"

async def get_resources(self) -> ArchiveAwaitable:
"""Download EIA-CBECS resources."""
link_pattern = re.compile(r"commercial/data/(\d{4})/$", re.IGNORECASE)
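        # e.g. "https://www.eia.gov/consumption/commercial/data/2018/" -> year "2018"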

for link in await self.get_hyperlinks(BASE_URL, link_pattern):
match = link_pattern.search(link)
year = match.group(1)
yield self.get_year_resources(year)

    async def get_year_resources(self, year: str) -> ResourceInfo:
"""Download all files from all views for a year."""
data_paths_in_archive = set()
zip_path = self.download_directory / f"eiacbecs-{year}.zip"
        char_and_cons_pattern = (
            rf"(?:{year}|archive)(?:.*)/([a-z,\d]{{1,8}})(\.xls|\.xlsx|\.pdf)$"
        )
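        # e.g. for 2012, a link like "2012/xls/b1.xlsx" (illustrative) would
        # yield ("b1", ".xlsx"); the same pattern serves the characteristics
        # and consumption views.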
data_view_patterns = {
"characteristics": re.compile(char_and_cons_pattern),
"consumption": re.compile(char_and_cons_pattern),
            # Some of the microdata links look like csv/file01.csv, without the
            # year or "archive" in the path. Instead of adding a null option for
            # that first group, we use a whole new pattern for those two years;
            # otherwise we'd pick up some of the 2018 pdf files that sit on the
            # right-hand side of these pages.
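            # e.g. for 2003, a relative link "csv/file01.csv" -> ("file01", ".csv")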
"microdata": re.compile(
rf"(?:{year}/|archive/)(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv|.exe|.zip)$"
if year not in ["2003", "1999"]
else r"^(?:csv|pdf)/(.*)(.csv|.pdf)$"
),
            # The most recent CBECS doesn't include a year or "archive" in its
            # methodology links, BUT there are almost always pdf files from 2018
            # that get caught up in these scrapers if we don't require the year
            # or "archive", so we use a special 2018 pattern.
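            # e.g. a 2018-style link "/consumption/commercial/data/2018/pdf/methodology.pdf"
            # (illustrative filename) -> ("methodology", ".pdf")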
"methodology": re.compile(
rf"(?:{year}|archive/pubs)(?:/pdf|)/(.*)(.pdf$)"
if year != "2018"
else r"/consumption/commercial(?:/data/2018|)/pdf/(.*)(.pdf)$"
),
}

for view, table_link_pattern in data_view_patterns.items():
year_url = f"{BASE_URL}{year}/index.php?view={view}"
for link in await self.get_hyperlinks(year_url, table_link_pattern):
match = table_link_pattern.search(link)
unique_id = (
match.group(1).replace("_", "-").replace(" ", "-").lower().strip()
)
file_extension = match.group(2)
filename = f"eiacbecs-{year}-{view}-{unique_id}{file_extension}"
file_url = urljoin(year_url, link)
download_path = self.download_directory / filename
await self.download_file(file_url, download_path)
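                # Some links resolve to an HTML redirect page rather than the
                # file itself, so sniff the first bytes before archiving.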
                with download_path.open("rb") as f:
                    first_bytes = f.read(20)
if b"html" in first_bytpes.lower().strip():
logger.warning(
f"Skipping {file_url} because it appears to be a redirect/html page."
)
pass
else:
self.add_to_archive(
zip_path=zip_path,
filename=filename,
blob=download_path.open("rb"),
)
data_paths_in_archive.add(filename)
# Don't want to leave multiple files on disk, so delete
# immediately after they're safely stored in the ZIP
download_path.unlink()
        # Check whether every view found at least one file for this year
year_has_all_views: dict[str, bool] = {
view: any(fn for fn in data_paths_in_archive if view in fn)
for view in data_view_patterns
}
views_without_files = [
view for (view, has_files) in year_has_all_views.items() if not has_files
]
if views_without_files:
raise AssertionError(
"We expect all years of EIA CBECS to have some data from all four "
f"views, but we found these views without files for {year}: {views_without_files}"
)

return ResourceInfo(
local_path=zip_path,
partitions={"year": year},
layout=ZipLayout(file_paths=data_paths_in_archive),
)
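
A quick way to sanity-check the view patterns without running the whole archiver is a standalone regex smoke test. The snippet below is a minimal sketch: the sample hrefs are illustrative stand-ins for the links the scraper encounters, not URLs confirmed to exist on eia.gov.

import re

year = "2012"
char_and_cons = re.compile(
    rf"(?:{year}|archive)(?:.*)/([a-z,\d]{{1,8}})(\.xls|\.xlsx|\.pdf)$"
)
microdata = re.compile(
    rf"(?:{year}/|archive/)(?:xls|pdf|csv)/(.*)(\.xls|\.xlsx|\.pdf|\.csv|\.exe|\.zip)$"
)

for pattern, href in [
    (char_and_cons, "/consumption/commercial/data/2012/xls/b1.xlsx"),
    (microdata, "2012/csv/file01.csv"),
]:
    match = pattern.search(href)
    # Prints the (unique_id, extension) groups the archiver uses to name files.
    print(match.groups() if match else f"no match: {href}")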