-
-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Archiver for EIA RECS #534
base: main
Are you sure you want to change the base?
Changes from 4 commits
693714f
de35d0e
274b7dd
abd3566
67cbc8c
b08b0e1
1b87417
f06f7e3
ed70331
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
"""Archive EIA Residential Energy Consumption Survey (RECS).""" | ||
|
||
import logging | ||
import re | ||
|
||
from pudl_archiver.archivers.classes import ( | ||
AbstractDatasetArchiver, | ||
ArchiveAwaitable, | ||
ResourceInfo, | ||
) | ||
from pudl_archiver.frictionless import ZipLayout | ||
|
||
# Every tab of the RECS results page is served from the same base URL; the
# individual tabs are selected by a PHP query-string suffix. Each entry below
# describes one tab: where to find it, a short prefix used in the archived
# filenames, and a regex matching the Excel tables linked from that tab.
LINK_PATTERNS = [
    {
        # housing characteristics
        "base_url": "https://www.eia.gov/consumption/residential/data",
        "php_extension": "index.php?view=characteristics",
        "prefix": "hc",
        "pattern": re.compile(r"HC (\d{1,2})\.(\d{1,2})\.xlsx"),
    },
    {
        # consumption & expenditures
        "base_url": "https://www.eia.gov/consumption/residential/data",
        "php_extension": "index.php?view=consumption",
        "prefix": "ce",
        "pattern": re.compile(r"ce(\d)\.(\d{1,2})([a-z]?)\.xlsx"),
    },
    {
        # state data (housing characteristics)
        "base_url": "https://www.eia.gov/consumption/residential/data",
        "php_extension": "index.php?view=state",
        "prefix": "state",
        "pattern": re.compile(r"State (.*)\.xlsx"),
    },
    {
        # state data (consumption & expenditures)
        "base_url": "https://www.eia.gov/consumption/residential/data",
        "php_extension": "index.php?view=state",
        "prefix": "state-ce",
        "pattern": re.compile(r"ce(\d)\.(\d{1,2})\.(.*)\.xlsx"),
    },
    # Microdata is deliberately skipped for now; supporting it would require
    # major changes and cleanup to the archiver logic below.
    # {
    #     "base_url": "https://www.eia.gov/consumption/residential/data",
    #     "php_extension": "index.php?view=microdata",
    #     "prefix": "udata",
    #     "pattern": re.compile(r"(recs.*\d{4}.*public.*)\.(?:zip|csv|xlsx)", re.IGNORECASE),
    # }
]
logger = logging.getLogger(f"catalystcoop.{__name__}")
|
||
|
||
class EiaRECSArchiver(AbstractDatasetArchiver):
    """EIA RECS archiver.

    Scrapes the Excel tables linked from the EIA Residential Energy
    Consumption Survey (RECS) pages (one tab per data category, see
    ``LINK_PATTERNS``) and bundles them into a single zip archive per
    survey year.
    """

    name = "eiarecs"

    async def get_resources(self) -> ArchiveAwaitable:
        """Download EIA-RECS resources."""
        # Only the 2020 survey is archived for now; add years here as needed.
        for year in [2020]:
            yield self.get_year_resources(year)

    async def get_year_resources(self, year: int) -> list[ResourceInfo]:
        """Download all Excel tables for a single survey year.

        Args:
            year: the RECS survey year to archive (e.g. 2020).

        Returns:
            A single-element list with the ResourceInfo describing the zip
            archive containing every table downloaded for ``year``.
        """
        tables = []
        zip_path = self.download_directory / f"eia-recs-{year}.zip"
        data_paths_in_archive = set()
        # Loop through the categories of data (all .xlsx). Each category lives
        # on its own tab of the RECS page, reached via a distinct PHP
        # query-string suffix rather than a plain sub-URL.
        for pattern_dict in LINK_PATTERNS:
            year_url = f"{pattern_dict['base_url']}/{year}"
            url = f"{year_url}/{pattern_dict['php_extension']}"
            table_link_pattern = pattern_dict["pattern"]
            for table_link in await self.get_hyperlinks(url, table_link_pattern):
                table_link = f"{year_url}/{table_link}"
                logger.info(f"Fetching {table_link}")
                # Re-match so we can pull the table major/minor numbers (or
                # state name) out of the link for the output filename.
                match = table_link_pattern.search(table_link)
                output_filename = self._make_output_filename(
                    year, pattern_dict["prefix"], match
                )

                # Download the file, add it to the zip, then remove the loose
                # copy so only the archive remains.
                download_path = self.download_directory / output_filename
                await self.download_file(table_link, download_path)
                # Open via a context manager so the handle is always closed,
                # even if add_to_archive raises (the original leaked it).
                with download_path.open("rb") as blob:
                    self.add_to_archive(
                        zip_path=zip_path,
                        filename=output_filename,
                        blob=blob,
                    )
                data_paths_in_archive.add(output_filename)
                download_path.unlink()

        tables.append(
            ResourceInfo(
                local_path=zip_path,
                partitions={"year": year},
                layout=ZipLayout(file_paths=data_paths_in_archive),
            )
        )
        return tables

    @staticmethod
    def _make_output_filename(year: int, prefix: str, match: re.Match) -> str:
        """Build an archive filename that mirrors the source link.

        The regexes in ``LINK_PATTERNS`` capture different things depending on
        the category: a single name (state tables), a major/minor table number
        pair, or major/minor plus an optional suffix letter / state name.

        Args:
            year: the RECS survey year.
            prefix: the category prefix from ``LINK_PATTERNS``.
            match: a match of the category's pattern against the table link.

        Returns:
            A filename of the form ``eia-recs-{year}-{prefix}-....xlsx``.
        """
        groups = match.groups()
        output_filename = f"eia-recs-{year}-{prefix}"
        if len(groups) == 1:
            # Single capture (e.g. a state name): normalize to lowercase with
            # underscores so the filename is shell-friendly.
            output_filename += "-" + match.group(1).lower().replace(" ", "_")
        else:
            major_num, minor_num = match.group(1), match.group(2)
            output_filename += f"-{major_num}-{minor_num}"
            # Optional third capture: a suffix letter or a state name.
            if len(groups) == 3 and match.group(3) != "":
                output_filename += "-" + match.group(3)
        return output_filename + ".xlsx"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The RECS webpage is a little bit tricky. It has tabs containing different sequences of datafiles (and PDFs, etc.) that can't be reached directly from the base_url. Instead we need to tack on these strings to the end of the url, which are different for different tabs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Might be worth defining the shape of this dictionary as a dataclass, but definitely not a blocking concern.