Add new archiver for EPA eGRID #549

Merged: 20 commits from epaegrid into main, Jan 29, 2025
Commits (20)
883bb97  draft version of archiving egrid (cmgosnell, Jan 27, 2025)
098001b  fix download link and make recent year work (cmgosnell, Jan 27, 2025)
1527f13  add the pm emissions files (cmgosnell, Jan 27, 2025)
dba6515  remove is_recent bool arg (cmgosnell, Jan 27, 2025)
30e26bb  Merge branch 'main' into epaegrid (cmgosnell, Jan 29, 2025)
3945df7  Merge branch 'main' into epaegrid (cmgosnell, Jan 29, 2025)
c1098af  Update src/pudl_archiver/archivers/epaegrid.py (cmgosnell, Jan 29, 2025)
35c39cd  Update src/pudl_archiver/archivers/epaegrid.py (cmgosnell, Jan 29, 2025)
d9e6151  Merge branch 'epaegrid' of github.com:catalyst-cooperative/pudl-archi… (cmgosnell, Jan 29, 2025)
80bca99  add bespoke pm methodologies and add a bb helper function (cmgosnell, Jan 29, 2025)
be0a4a6  add valid year and to script docs (cmgosnell, Jan 29, 2025)
13cb3d8  move the underscore replace to the table name and add TODOs into the … (cmgosnell, Jan 29, 2025)
5b391c0  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jan 29, 2025)
f686bd4  docs updates (cmgosnell, Jan 29, 2025)
7a36e5d  Merge branch 'epaegrid' of github.com:catalyst-cooperative/pudl-archi… (cmgosnell, Jan 29, 2025)
e34342e  Merge branch 'main' into epaegrid (cmgosnell, Jan 29, 2025)
6daf46e  manually grab methodology and generalize link pattern to include man… (cmgosnell, Jan 29, 2025)
bcd0c31  migrate little helper method into ABC (cmgosnell, Jan 29, 2025)
6655161  add dois, add to GHA, move to new epa dir (cmgosnell, Jan 29, 2025)
ebae22e  Merge branch 'main' into epaegrid (cmgosnell, Jan 29, 2025)
4 changes: 2 additions & 2 deletions .github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
  inputs:
    datasets:
      description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-     default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
+     default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
      required: true
      type: string
    create_github_issue:
Expand All @@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
# Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
fail-fast: false
runs-on: ubuntu-latest
permissions:
Expand Down
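
With "epaegrid" added to the matrix, a one-off archive run for just the new dataset can be dispatched through the workflow_dispatch input. As a sketch, using the GitHub CLI from a checkout of the repo (the quoting mirrors how the workflow wraps the input in a JSON array):

gh workflow run run-archiver.yml -f datasets='"epaegrid"'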
21 changes: 21 additions & 0 deletions src/pudl_archiver/archivers/classes.py
@@ -208,6 +208,27 @@ def add_to_archive(self, zip_path: Path, filename: str, blob: typing.BinaryIO):
             archive=archive, filename=filename, data=blob.read()
         )

+    async def download_add_to_archive_and_unlink(
+        self, url: str, filename: str, zip_path: Path
+    ):
+        """Download a file, add it to a zip archive, and unlink the download.
+
+        A little helper that combines three steps that are often repeated together:
+
+        * :meth:`download_file`
+        * :meth:`add_to_archive`
+        * :meth:`Path.unlink`
+        """
+        download_path = self.download_directory / filename
+        await self.download_file(url, download_path)
+        self.add_to_archive(
+            zip_path=zip_path,
+            filename=filename,
+            blob=download_path.open("rb"),
+        )
+        # Don't leave stray files on disk: delete each download immediately
+        # after it is safely stored in the ZIP.
+        download_path.unlink()
+
     async def get_json(self, url: str, **kwargs) -> dict[str, str]:
         """Get a JSON and return it as a dictionary."""
         response = await retry_async(self.session.get, args=[url], kwargs=kwargs)
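
For context, a subclass would call the new helper roughly as follows. This is a hypothetical call site; the URL and file names are placeholders, not real endpoints:

# Hypothetical usage inside an AbstractDatasetArchiver subclass method.
zip_path = self.download_directory / "example-2021.zip"
await self.download_add_to_archive_and_unlink(
    url="https://example.com/example-2021-data.xlsx",
    filename="example-2021-data.xlsx",
    zip_path=zip_path,
)
# At this point the file exists only inside example-2021.zip.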
87 changes: 87 additions & 0 deletions src/pudl_archiver/archivers/epa/epaegrid.py
@@ -0,0 +1,87 @@
"""Download EPA eGRID data."""

import re

from pudl_archiver.archivers.classes import (
    AbstractDatasetArchiver,
    ArchiveAwaitable,
    ResourceInfo,
)
from pudl_archiver.frictionless import ZipLayout

BASE_URL = "https://www.epa.gov/egrid/historical-egrid-data"


class EpaEgridArchiver(AbstractDatasetArchiver):
    """EPA eGRID archiver."""

    name = "epaegrid"

    async def get_resources(self) -> ArchiveAwaitable:
        """Download EPA eGRID resources."""
        # All of the "historical" data is stored on one page, while the most
        # recent data is stored on the main dataset page. So we grab all of
        # the old data first and then get the newest data.
        link_pattern = re.compile(r"egrid(\d{4})_data(_v(\d{1})|).xlsx", re.IGNORECASE)
        years = []
        for link in await self.get_hyperlinks(BASE_URL, link_pattern):
            match = link_pattern.search(link)
            year = int(match.group(1))
            years += [year]
            if self.valid_year(year):
                yield self.get_year_resource(
                    year, [BASE_URL, "https://www.epa.gov/egrid/egrid-pm25"]
                )

        recent_year = max(years) + 1
        recent_urls = [
            "https://www.epa.gov/egrid/detailed-data",
            "https://www.epa.gov/egrid/summary-data",
            "https://www.epa.gov/egrid/egrid-technical-guide",
            "https://www.epa.gov/egrid/egrid-pm25",
        ]
        if self.valid_year(recent_year):
            yield self.get_year_resource(recent_year, recent_urls)

    async def get_year_resource(self, year: int, base_urls: list[str]) -> ResourceInfo:
        """Download all files pertaining to an eGRID year."""
        zip_path = self.download_directory / f"epaegrid-{year}.zip"
        table_link_pattern = re.compile(
            rf"egrid{year}(?:_|-)([a-z,_\d,-]*)(.xlsx|.pdf|.txt)$", re.IGNORECASE
        )
        data_paths_in_archive = set()
        for base_url in base_urls:
            for url in await self.get_hyperlinks(base_url, table_link_pattern):
                match = table_link_pattern.search(url)
                # TODO: this setup leaves all of the _rev#, _r#, _r#_#, and
                # _{date} suffixes in the table name. It would be ideal to
                # remove them altogether.
                table = match.group(1).replace("_", "-").lower().strip()
                file_extension = match.group(2)
                filename = f"epaegrid-{year}-{table}{file_extension}"
                await self.download_add_to_archive_and_unlink(url, filename, zip_path)
                data_paths_in_archive.add(filename)
        # There is one file of PM 2.5 data which says it covers 2018-2022.
        # Add that file to each of those yearly zips.
        pm_combo_years = [2018, 2019, 2020, 2021]
        if year in pm_combo_years:
            url = "https://www.epa.gov/system/files/documents/2024-06/egrid-draft-pm-emissions.xlsx"
            filename = f"epaegrid-{year}-pm-emissions.xlsx"
            await self.download_add_to_archive_and_unlink(url, filename, zip_path)
            data_paths_in_archive.add(filename)
        # A few special-case methodology links on the PM 2.5 page don't adhere
        # to a clear pattern, so we hardcode how to grab them.
        pm_special_year_links = {
            2020: "https://www.epa.gov/system/files/documents/2022-12/eGRID2020%20DRAFT%20PM%20Memo.pdf",
            2019: "https://www.epa.gov/system/files/documents/2023-01/DRAFT%202019%20PM%20Memo.pdf",
            2018: "https://www.epa.gov/sites/default/files/2020-07/documents/draft_egrid_pm_white_paper_7-20-20.pdf",
        }
        if year in pm_special_year_links:
            url = pm_special_year_links[year]
            filename = f"epaegrid-{year}-pm-emissions-methodology.pdf"
            await self.download_add_to_archive_and_unlink(url, filename, zip_path)
            data_paths_in_archive.add(filename)
        return ResourceInfo(
            local_path=zip_path,
            partitions={"year": year},
            layout=ZipLayout(file_paths=data_paths_in_archive),
        )
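
As a sanity check on the two regular expressions above, here is a small standalone sketch of what they capture. The file names are illustrative stand-ins for the links EPA publishes, not an exhaustive list:

import re

link_pattern = re.compile(r"egrid(\d{4})_data(_v(\d{1})|).xlsx", re.IGNORECASE)
table_link_pattern = re.compile(
    r"egrid2021(?:_|-)([a-z,_\d,-]*)(.xlsx|.pdf|.txt)$", re.IGNORECASE
)

# Historical page: group 1 is the year, with or without a revision suffix.
assert link_pattern.search("egrid2021_data_v2.xlsx").group(1) == "2021"
assert link_pattern.search("eGRID2005_data.xlsx").group(1) == "2005"

# Per-year pages: group 1 becomes the table name, group 2 the file extension.
m = table_link_pattern.search("egrid2021_summary_tables.xlsx")
assert (m.group(1), m.group(2)) == ("summary_tables", ".xlsx")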
4 changes: 2 additions & 2 deletions src/pudl_archiver/cli.py
@@ -34,8 +34,8 @@ def parse_main(args=None):
         nargs="*",
         help="Years to download data for. Supported datasets: censusdp1tract, censuspep, "
         "eia176, eia191, eia757a, eia860, eia860m, eia861, eia923, eia930, eia_bulk_elec, "
-        "eiaaeo, eiamecs, eiawater, epacamd_eia, epacems, ferc1, ferc2, ferc6, ferc60, ferc714, "
-        "mshamines, nrelatb, phmsagas",
+        "eiaaeo, eiamecs, eiawater, epacamd_eia, epacems, epaegrid, ferc1, ferc2, ferc6, "
+        "ferc60, ferc714, mshamines, nrelatb, phmsagas",
         type=int,
     )
     parser.add_argument(
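
With the help text updated, archiving a single eGRID year from the command line would presumably look something like the following; the --datasets and --years flag names are inferred from the archiver's usual interface, not shown in this diff:

pudl_archiver --datasets epaegrid --years 2021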
3 changes: 3 additions & 0 deletions src/pudl_archiver/package_data/zenodo_doi.yaml
@@ -55,6 +55,9 @@ epacamd_eia:
 epacems:
   production_doi: 10.5281/zenodo.10233185
   sandbox_doi: 10.5072/zenodo.12943
+epaegrid:
+  production_doi: 10.5281/zenodo.14767235
+  sandbox_doi: 10.5072/zenodo.159996
 epapcap:
   production_doi: 10.5281/zenodo.14757598
   #sandbox_doi: # Update!!
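
A minimal sketch of reading the new DOI entry back out of this file, assuming PyYAML and the repo-relative path shown above:

import yaml

with open("src/pudl_archiver/package_data/zenodo_doi.yaml") as f:
    dois = yaml.safe_load(f)

# Prints 10.5281/zenodo.14767235, the production archive for eGRID.
print(dois["epaegrid"]["production_doi"])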