Make EIA CBECS Archive #554

Merged · 13 commits · Jan 31, 2025
4 changes: 2 additions & 2 deletions .github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
     inputs:
       datasets:
         description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
+        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
         required: true
         type: string
       create_github_issue:
@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
-        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
+        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
       fail-fast: false
     runs-on: ubuntu-latest
     permissions:
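For reference, the fromJSON(format('[{0}]', ...)) expression above simply wraps the quoted, comma-separated dataset string in square brackets so it parses as a JSON array, producing one matrix job per dataset. A rough Python equivalent of that expansion (illustrative only; the abbreviated input string is a stand-in for the real default):

import json

# Abbreviated stand-in for the workflow's default "datasets" input string.
datasets_input = '"eia176","eiacbecs","ferc1"'
# format('[{0}]', datasets_input) produces '["eia176","eiacbecs","ferc1"]'
matrix = json.loads(f"[{datasets_input}]")
print(matrix)  # ['eia176', 'eiacbecs', 'ferc1'] -> one archiver job per dataset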
120 changes: 120 additions & 0 deletions src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -0,0 +1,120 @@
"""Archive EIA Commercial Buildings Energy Consumption Survey (CBECS)."""

import re
from pathlib import Path
from urllib.parse import urljoin

from pudl_archiver.archivers.classes import (
    AbstractDatasetArchiver,
    ArchiveAwaitable,
    ResourceInfo,
)
from pudl_archiver.frictionless import ZipLayout

BASE_URL = "https://www.eia.gov/consumption/commercial/data/"


class EiaCbecsArchiver(AbstractDatasetArchiver):
    """EIA CBECS archiver."""

    name = "eiacbecs"

    async def get_resources(self) -> ArchiveAwaitable:
        """Download EIA-CBECS resources."""
        # We use this link pattern to determine which years of CBECS data exist,
        # but these base year links are only a portion of the view links, so we
        # construct the full links within get_year_resources.
        link_pattern = re.compile(r"commercial/data/(\d{4})/$", re.IGNORECASE)
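        # Illustrative match (hypothetical link): a hyperlink like
        # "https://www.eia.gov/consumption/commercial/data/2012/" matches and
        # captures "2012" as the survey year.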
        for link in await self.get_hyperlinks(BASE_URL, link_pattern):
            match = link_pattern.search(link)
            year = match.group(1)
            if int(year) > 2018:
                self.logger.warning(
                    f"There is a new year of data: {year}! This will almost certainly "
                    "require some updating of this archive."
                )
            yield self.get_year_resources(year)

    async def get_year_resources(self, year: str) -> ResourceInfo:
        """Download all files from all views for a year."""
        data_paths_in_archive = set()
        zip_path = self.download_directory / f"eiacbecs-{year}.zip"
        char_and_cons_pattern = (
            rf"(?:{year}|archive)(?:.*)/([a-z,\d]{{1,8}})(.xls|.xlsx|.pdf)$"
        )
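        # Illustrative matches for the pattern above (hypothetical paths): with
        # year="2012", a link like "2012/xls/b1.xlsx" yields ("b1", ".xlsx"),
        # and "archive/pdf/c3.pdf" yields ("c3", ".pdf").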
        data_view_patterns = {
            "characteristics": re.compile(char_and_cons_pattern),
            "consumption": re.compile(char_and_cons_pattern),
            # Some of the microdata links look like csv/file01.csv, which doesn't
            # include the year or archive. Instead of adding a null option for that
            # first group, we add a whole new pattern for these two years, because
            # otherwise we'd pick up some of the 2018 pdf files that sit on the
            # right-hand side of these pages.
            "microdata": re.compile(
                rf"(?:{year}/|archive/)(?:xls|pdf|csv|sas)/(.*)(.xls|.xlsx|.pdf|.csv|.exe|.zip)$"
                if year not in ["2003", "1999"]
                else r"^(?:csv|pdf)/(.*)(.csv|.pdf)$"
            ),
            # The most recent CBECS doesn't include a year or archive in the
            # methodology links, BUT there are almost always pdf files from 2018
            # that get caught up in these scrapers if we don't include year or
            # archive, so we have a special 2018 pattern.
            "methodology": re.compile(
                rf"(?:{year}|archive/pubs)(?:/pdf|)/(.*)(.pdf$)"
                if year != "2018"
                else r"/consumption/commercial(?:/data/2018|)/pdf/(.*)(.pdf)$"
            ),
        }
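        # Illustrative matches for the view patterns above (hypothetical paths):
        # with year="2012", a microdata link like "2012/csv/file01.csv" yields
        # ("file01", ".csv"), while for 1999/2003 the anchored fallback matches
        # "csv/file01.csv" directly; a 2012 methodology link like
        # "2012/pdf/methodology.pdf" yields ("methodology", ".pdf").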

        for view, table_link_pattern in data_view_patterns.items():
            year_url = f"{BASE_URL}{year}/index.php?view={view}"
            for link in await self.get_hyperlinks(year_url, table_link_pattern):
                match = table_link_pattern.search(link)
                unique_id = (
                    match.group(1).replace("_", "-").replace(" ", "-").lower().strip()
                )
                file_extension = match.group(2)
                filename = f"eiacbecs-{year}-{view}-{unique_id}{file_extension}"
                file_url = urljoin(year_url, link)
                download_path = self.download_directory / filename
                await self.download_file(file_url, download_path)
                # There is a small-ish handful of files whose links redirect to the
                # main CBECS page, presumably because the link is broken. We want to
                # skip those files, so we check whether the first bytes of the
                # downloaded file look like HTML; if so we move on, otherwise we add
                # the file to the archive.
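                # e.g. a redirected download typically starts with b"<!doctype html>"
                # or b"<html", both of which contain b"html" in the first 20 bytes.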
                with Path.open(download_path, "rb") as f:
                    first_bytes = f.read(20)
                if b"html" in first_bytes.lower().strip():
                    self.logger.warning(
                        f"Skipping {file_url} because it appears to be a redirect/html page."
                    )
                else:
                    self.add_to_archive(
                        zip_path=zip_path,
                        filename=filename,
                        blob=download_path.open("rb"),
                    )
                    data_paths_in_archive.add(filename)
                # Don't want to leave multiple files on disk, so delete
                # immediately after they're safely stored in the ZIP.
                download_path.unlink()
        # Check that every view yielded at least one file.
        year_has_all_views: dict[str, bool] = {
            view: any(fn for fn in data_paths_in_archive if view in fn)
            for view in data_view_patterns
        }
        views_without_files = [
            view for (view, has_files) in year_has_all_views.items() if not has_files
        ]
        if views_without_files:
            raise AssertionError(
                "We expect all years of EIA CBECS to have some data from all four "
                f"views, but we found these views without files for {year}: {views_without_files}"
            )

        return ResourceInfo(
            local_path=zip_path,
            partitions={"year": year},
            layout=ZipLayout(file_paths=data_paths_in_archive),
        )
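For context, a minimal sketch of how an awaitable-yielding get_resources() like the one above is typically consumed (hypothetical driver, not part of this PR; the real runner lives elsewhere in pudl_archiver):

async def collect_resources(archiver: EiaCbecsArchiver) -> list[ResourceInfo]:
    # get_resources() is an async generator that yields one coroutine per year
    # of CBECS data; awaiting each coroutine runs get_year_resources for it.
    infos = []
    async for year_coroutine in archiver.get_resources():
        infos.append(await year_coroutine)
    return infos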
3 changes: 3 additions & 0 deletions src/pudl_archiver/package_data/zenodo_doi.yaml
@@ -37,6 +37,9 @@ eia930:
 eiaaeo:
   production_doi: 10.5281/zenodo.10838488
   sandbox_doi: 10.5072/zenodo.37746
+eiacbecs:
+  production_doi: 10.5281/zenodo.14782474
+  sandbox_doi: 10.5072/zenodo.161000
 eia_bulk_elec:
   production_doi: 10.5281/zenodo.7067366
   sandbox_doi: 10.5072/zenodo.2356