Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make a multi-year EIA MECS archive #542

Merged
merged 9 commits into from
Jan 29, 2025
90 changes: 71 additions & 19 deletions src/pudl_archiver/archivers/eia/eiamecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
ArchiveAwaitable,
ResourceInfo,
)
from pudl_archiver.frictionless import ZipLayout

BASE_URL = "https://www.eia.gov/consumption/manufacturing/data"
logger = logging.getLogger(f"catalystcoop.{__name__}")
Expand All @@ -20,34 +21,85 @@ class EiaMECSArchiver(AbstractDatasetArchiver):

async def get_resources(self) -> ArchiveAwaitable:
"""Download EIA-MECS resources."""
for year in [2018]:
years_url = "https://www.eia.gov/consumption/data.php#mfg"
year_link_pattern = re.compile(r"(manufacturing/data/)(\d{4})/$")
for link in await self.get_hyperlinks(years_url, year_link_pattern):
match = year_link_pattern.search(link)
year = match.groups()[1]
yield self.get_year_resources(year)

async def get_year_resources(self, year: int) -> list[ResourceInfo]:
"""Download all excel tables for a year."""
table_link_pattern = re.compile(r"[Tt]able(\d{1,2})_(\d{1,2}).xlsx")
logger.info(f"Attempting to find resources for: {year}")
data_paths_in_archive = set()
year_url = f"{BASE_URL}/{year}"
zip_path = self.download_directory / f"eiamecs-{year}.zip"
if int(year) >= 2006:
table_link_pattern = re.compile(
r"(RSE|)[Tt]able(\d{1,2}|\d{1.1})_(\d{1,2})(.xlsx|.xls)"
)
elif int(year) == 2002:
table_link_pattern = re.compile(
r"(RSE|)[Tt]able(\d{1,2}).(\d{1,2})_\d{1,2}(.xlsx|.xls)"
)
elif int(year) == 1998:
table_link_pattern = re.compile(
r"(d|e)\d{2}([a-z]\d{1,2})_(\d{1,2})(.xlsx|.xls)"
)
elif int(year) == 1994:
# For these earlier years the file-name pattern is consistent but not very informative,
# so we just keep the original name by making the whole pattern a capture group.
table_link_pattern = re.compile(
r"((rse|)m\d{2}_(\d{2})([a-d]|)(.xlsx|.xls))"
)
elif int(year) == 1991:
table_link_pattern = re.compile(r"((rse|)mecs(\d{2})([a-z]|)(.xlsx|.xls))")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't love this whole situation, but I didn't know what else to do. I thought about making a dict with year as the key and pattern as the value, but then it wouldn't naturally pick up the next year if the newer pattern still holds.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could make the latest year's pattern the default and override it with a dict lookup if it's one of the other years? I agree it's a bit verbose.


# Loop through all download links for tables
tables = []
year_url = f"{BASE_URL}/{year}"
for table_link in await self.get_hyperlinks(year_url, table_link_pattern):
table_link = f"{year_url}/{table_link}"
logger.info(f"Fetching {table_link}")
# Get table major/minor number from links
# We are going to rename the files in a standard format by extracting
# patterns from the table_link_pattern
# From 1998 and before there are a bunch of letters in the file names
# in patterns that are probably parsable somehow, but for now we are
# just going to keep the original file names
match = table_link_pattern.search(table_link)
major_num, minor_num = match.group(1), match.group(2)

# Download file
download_path = (
self.download_directory
/ f"eia-mecs-{year}-table-{major_num}-{minor_num}.xlsx"
)
await self.download_zipfile(table_link, download_path)

tables.append(
ResourceInfo(
local_path=download_path,
partitions={"year": year, "table": f"{major_num}_{minor_num}"},
filename = match.group(1)
cmgosnell marked this conversation as resolved.
Show resolved Hide resolved
if int(year) > 1998:
is_rse = match.group(1)
# There are several ways that they indicate whether a file is
# "data" vs "rse". We add this to the end of the file name,
# but only for rse, because for many years the data and the rse are together.
rse_map = {
"": "",
"d": "",
"RSE": "-rse",
"e": "-rse",
}
rse = rse_map[is_rse]
major_num = match.group(2)
minor_num = match.group(3)
extension = match.group(4)
# Download filename
filename = (
f"eia-mecs-{year}-table-{major_num}-{minor_num}{rse}{extension}"
)
download_path = self.download_directory / filename
await self.download_file(table_link, download_path)
self.add_to_archive(
zip_path=zip_path,
filename=filename,
blob=download_path.open("rb"),
)
return tables
data_paths_in_archive.add(filename)
# Don't want to leave the downloaded spreadsheets on disk, so delete
# each one immediately after it's safely stored in the ZIP
download_path.unlink()

resource_info = ResourceInfo(
local_path=zip_path,
partitions={"year": year},
layout=ZipLayout(file_paths=data_paths_in_archive),
)
return resource_info
2 changes: 2 additions & 0 deletions src/pudl_archiver/package_data/zenodo_doi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ eiaaeo:
eia_bulk_elec:
production_doi: 10.5281/zenodo.7067366
sandbox_doi: 10.5072/zenodo.2356
eiamecs:
sandbox_doi: 10.5072/zenodo.149504
eiawater:
production_doi: 10.5281/zenodo.7683135
sandbox_doi: 10.5072/zenodo.3160
Expand Down
Loading