520 Write an archiver for DOE Low Income Energy Affordability Data (LEAD) #536

Merged
27 commits
9504deb
[wip] feat: permit get_hyperlinks to accept a 'headers' argument that…
Jan 22, 2025
7fe9ce4
[wip] feat: add new archiver for DOE LEAD
Jan 22, 2025
c5ba88e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 22, 2025
fe0d44e
the rest of the owl
Jan 22, 2025
89937dd
resolve conflicts
Jan 22, 2025
1170476
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 22, 2025
51667b2
[fix] Add missing import
Jan 23, 2025
c6e1245
Merge branch '520-write-an-archiver-for-doe-low-income-energy-afforda…
Jan 23, 2025
020b3cd
[docs] Add more detail to doelead docstring
krivard Jan 23, 2025
3583b4e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 23, 2025
cb77f65
Merge branch 'main' into 520-write-an-archiver-for-doe-low-income-ene…
krivard Jan 23, 2025
de0ba16
[fix] switch to hard-coded DOIs for known releases, check LEAD Tool p…
Jan 24, 2025
96b064d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 24, 2025
e46f6d9
[fix] missing refactor in fstring
Jan 24, 2025
7fd864b
Merge branch '520-write-an-archiver-for-doe-low-income-energy-afforda…
Jan 24, 2025
15cee78
Merge branch 'main' into 520-write-an-archiver-for-doe-low-income-ene…
krivard Jan 24, 2025
2a3ef26
Merge branch 'main' into 520-write-an-archiver-for-doe-low-income-ene…
e-belfer Jan 28, 2025
ac73e65
Drop site that no longer exists, fix class
e-belfer Jan 28, 2025
9a2a149
Download methodology PDFs
e-belfer Jan 28, 2025
99ca4f4
Add PDF metadata to archive, add placeholder DOI
e-belfer Jan 28, 2025
283db91
Restore entire archiving workflow
e-belfer Jan 28, 2025
1f73afb
Update production DOI
e-belfer Jan 28, 2025
9a98f7d
Merge branch 'main' into 520-write-an-archiver-for-doe-low-income-ene…
e-belfer Jan 28, 2025
aded9cf
Merge branch 'main' into 520-write-an-archiver-for-doe-low-income-ene…
e-belfer Jan 29, 2025
f87e9f7
Add to GHA
e-belfer Jan 29, 2025
285479d
Oops also MECS
e-belfer Jan 29, 2025
b9cbbe5
Fix bad merge resolution
e-belfer Jan 29, 2025
4 changes: 2 additions & 2 deletions .github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
inputs:
datasets:
description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
default: '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
Inline review comment (Member):
Sneaking in eiamecs which didn't land in #516.

required: true
type: string
create_github_issue:
@@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
# Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
fail-fast: false
runs-on: ubuntu-latest
permissions:
13 changes: 10 additions & 3 deletions src/pudl_archiver/archivers/classes.py
@@ -223,6 +223,7 @@ async def get_hyperlinks(
url: str,
filter_pattern: typing.Pattern | None = None,
verify: bool = True,
headers: dict | None = None,
) -> list[str]:
"""Return all hyperlinks from a specific web page.

@@ -235,12 +236,18 @@
url: URL of web page.
filter_pattern: If present, only return links that contain pattern.
verify: Verify ssl certificate (EPACEMS https source has bad certificate).
headers: Additional headers to send in the GET request.
"""
# Parse web page to get all hyperlinks
parser = _HyperlinkExtractor()

response = await retry_async(
self.session.get, args=[url], kwargs={"ssl": verify}
self.session.get,
args=[url],
kwargs={
"ssl": verify,
**({"headers": headers} if headers is not None else {}),
},
)
text = await retry_async(response.text)
parser.feed(text)
@@ -253,8 +260,8 @@
# Warn if no links are found
if not hyperlinks:
self.logger.warning(
f"The archiver couldn't find any hyperlinks that match {filter_pattern}."
f"Make sure your filter_pattern is correct or if the structure of the {url} page changed."
f"The archiver couldn't find any hyperlinks{('that match: ' + filter_pattern.pattern) if filter_pattern else ''}."
f"Make sure your filter_pattern is correct, check if the structure of the {url} page changed, or if you are missing HTTP headers."
)

return hyperlinks
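
With the new `headers` argument, callers can supply extra HTTP headers when a source rejects the default client User-Agent. A minimal usage sketch, assuming an archiver subclass and an illustrative URL and filter pattern (neither is part of this diff; the User-Agent string mirrors the one used in doelead.py below):

    # Hypothetical call site; URL and pattern are assumptions for illustration.
    links = await self.get_hyperlinks(
        "https://example.gov/data-page",
        re.compile(r"\.zip$"),
        headers={"User-Agent": "Mozilla/5.0 Catalyst/2025 Cooperative/2025"},
    )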
138 changes: 138 additions & 0 deletions src/pudl_archiver/archivers/doelead.py
@@ -0,0 +1,138 @@
"""Download DOE LEAD data.

Each partition includes:
- Data Dictionary
- Census Tracts List
- Cities List
- Counties List
- States List
- Tribal Areas List
- Cities Census Tract Overlaps
- Tribal Areas Tract Overlaps
- One .zip file per state, each of which includes:
- AMI Census Tracts
- SMI Census Tracts
- LLSI Census Tracts
- FPL Census Tracts
- LLSI Counties
- SMI Counties
- FPL Counties
- AMI Counties
"""

import re

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)
from pudl_archiver.frictionless import ZipLayout

# This site is no longer online as of 01/28/2025.
# TOOL_URL = "https://www.energy.gov/scep/low-income-energy-affordability-data-lead-tool"

YEARS_DOIS = {
2022: "https://doi.org/10.25984/2504170",
2018: "https://doi.org/10.25984/1784729",
}

# verified working 2025-01-22 via
# $ wget "https://www.energy.gov/scep/low-income-energy-affordability-data-lead-tool" -O foo.html -U "Mozilla/5.0 Catalyst/2025 Cooperative/2025"
HEADERS = {"User-Agent": "Mozilla/5.0 Catalyst/2025 Cooperative/2025"}


class DoeLeadArchiver(AbstractDatasetArchiver):
"""DOE LEAD archiver."""

name = "doelead"

async def get_resources(self) -> ArchiveAwaitable:
"""Download DOE LEAD resources.

The DOE LEAD Tool is down as of 01/28/2025. It didn't provide direct access
to the raw data, but instead linked to the current raw data release hosted on
OEDI. It did not provide links to past data releases. So, we hard-code the
DOIs for all known releases and archive those. Based on the removal of the main
page, it's safe to assume this won't be updated any time soon. If it is, we'll
need to manually update the DOIs.
"""
# e.g.: https://data.openei.org/files/6219/DC-2022-LEAD-data.zip
# https://data.openei.org/files/6219/Data%20Dictionary%202022.xlsx
# https://data.openei.org/files/6219/LEAD%20Tool%20States%20List%202022.xlsx
data_link_pattern = re.compile(r"([^/]+(\d{4})(?:-LEAD-data\.zip|\.xlsx))")
"""Regex for matching the data files in a release on the OEDI page. Captures the year, and supports both .zip and .xlsx file names."""

for year, doi in YEARS_DOIS.items():
self.logger.info(f"Processing DOE LEAD raw data release for {year}: {doi}")
filenames_links = {}
for data_link in await self.get_hyperlinks(doi, data_link_pattern):
matches = data_link_pattern.search(data_link)
if not matches:
continue
link_year = int(matches.group(2))
if link_year != year:
raise AssertionError(
f"We expect all files at {doi} to be for {year}, but we found: {link_year} from {data_link}"
)
filenames_links[matches.group(1)] = data_link
if filenames_links:
self.logger.info(f"Downloading: {year}, {len(filenames_links)} items")
yield self.get_year_resource(filenames_links, year)

# Download LEAD methodology PDF and other metadata separately
metadata_links = {
"lead-methodology-122024.pdf": "https://www.energy.gov/sites/default/files/2024-12/lead-methodology_122024.pdf",
"lead-tool-factsheet-072624.pdf": "https://www.energy.gov/sites/default/files/2024-07/lead-tool-factsheet_072624.pdf",
}
for filename, link in metadata_links.items():
yield self.get_metadata_resource(filename=filename, link=link)

async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceInfo:
"""Download all available data for a year.

Resulting resource contains one zip file of CSVs per state/territory, plus a handful of .xlsx dictionary and geocoding files.

Args:
links: filename->URL mapping for files to download
year: the year we're downloading data for
"""
host = "https://data.openei.org"
zip_path = self.download_directory / f"doelead-{year}.zip"
data_paths_in_archive = set()
for filename, link in sorted(links.items()):
self.logger.info(f"Downloading {link}")
download_path = self.download_directory / filename
await self.download_file(f"{host}{link}", download_path)
self.add_to_archive(
zip_path=zip_path,
filename=filename,
blob=download_path.open("rb"),
)
data_paths_in_archive.add(filename)
# Don't want to leave multiple giant files on disk, so delete
# immediately after they're safely stored in the ZIP
download_path.unlink()
return ResourceInfo(
local_path=zip_path,
partitions={"year": year},
layout=ZipLayout(file_paths=data_paths_in_archive),
)

async def get_metadata_resource(self, filename: str, link: str) -> ResourceInfo:
"""Download metadata resource.

Resulting resource contains one PDF file with metadata about the LEAD dataset.

Args:
filename: name to save the downloaded file under
link: URL of the metadata file to download
"""
self.logger.info(f"Downloading {link}")
download_path = self.download_directory / filename
await self.download_file(url=link, file_path=download_path, headers=HEADERS)

return ResourceInfo(
local_path=download_path,
partitions={},
)
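
A quick standalone sanity check of the filename/year extraction used above; this is a sketch assuming sample links that mirror the examples in the code comments, not a test shipped with this PR:

    # Hypothetical smoke test for data_link_pattern; not part of this diff.
    import re

    data_link_pattern = re.compile(r"([^/]+(\d{4})(?:-LEAD-data\.zip|\.xlsx))")

    samples = {
        "/files/6219/DC-2022-LEAD-data.zip": ("DC-2022-LEAD-data.zip", 2022),
        "/files/6219/Data%20Dictionary%202022.xlsx": ("Data%20Dictionary%202022.xlsx", 2022),
    }
    for link, (filename, year) in samples.items():
        match = data_link_pattern.search(link)
        assert match is not None, link
        assert match.group(1) == filename
        assert int(match.group(2)) == year
    print("all samples matched")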
3 changes: 3 additions & 0 deletions src/pudl_archiver/package_data/zenodo_doi.yaml
@@ -7,6 +7,9 @@ censuspep:
doeiraec:
production_doi: 10.5281/zenodo.14757121
sandbox_doi: 10.5072/zenodo.157934
doelead:
production_doi: 10.5281/zenodo.14758684
# sandbox_doi: TODO once server 413 error resolves
eia176:
production_doi: 10.5281/zenodo.7682357
sandbox_doi: 10.5072/zenodo.3158