From f35c87bdbfb5785af5c9883578dc9ca1d6df3aa9 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Tue, 28 Jan 2025 17:23:44 -0500
Subject: [PATCH 01/10] make eia cbecs archive

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 72 +++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 src/pudl_archiver/archivers/eia/eiacbecs.py

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
new file mode 100644
index 00000000..bf489913
--- /dev/null
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -0,0 +1,72 @@
+"""Archive EIA Commercial Buildings Energy Consumption Survey (CBECS)."""
+
+import logging
+import re
+from urllib.parse import urljoin
+
+from pudl_archiver.archivers.classes import (
+    AbstractDatasetArchiver,
+    ArchiveAwaitable,
+    ResourceInfo,
+)
+from pudl_archiver.frictionless import ZipLayout
+
+BASE_URL = "https://www.eia.gov/consumption/commercial/data/"
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
+
+class EiaCbecsArchiver(AbstractDatasetArchiver):
+    """EIA CBECS archiver."""
+
+    name = "eiacbecs"
+
+    async def get_resources(self) -> ArchiveAwaitable:
+        """Download EIA-CBECS resources."""
+        link_pattern = re.compile(r"commercial/data/(\d{4})/$", re.IGNORECASE)
+
+        for link in await self.get_hyperlinks(BASE_URL, link_pattern):
+            match = link_pattern.search(link)
+            year = match.group(1)
+            yield self.get_year_resources(year)
+
+    async def get_year_resources(self, year: int) -> list[ResourceInfo]:
+        """Download all excel tables for a year."""
+        data_paths_in_archive = set()
+        zip_path = self.download_directory / f"eiacbecs-{year}.zip"
+        data_views = ["characteristics", "consumption"]
+        for view in data_views:
+            year_url = f"{BASE_URL}{year}/index.php?view={view}"
+            table_link_pattern = re.compile(
+                rf"{year}(?:.*)/([a-z,\d]{{1,5}})(.xls|.xlsx|.pdf)$"
+            )
+            for link in await self.get_hyperlinks(year_url, table_link_pattern):
+                match = table_link_pattern.search(link)
+                unique_id = match.group(1)
+                file_extension = match.group(2)
+                filename = f"eiacbecs-{year}-{view}-{unique_id}{file_extension}"
+                file_url = urljoin(year_url, link)
+                download_path = self.download_directory / filename
+                await self.download_file(file_url, download_path)
+                with open(download_path, "rb") as f:
+                    first_bytpes = f.read(20)
+                    if b"html" in first_bytpes.lower().strip():
+                        logger.warning(
+                            f"Skipping {file_url} because it appears to be a redirect/html page."
+                        )
+                        pass
+                    else:
+                        self.add_to_archive(
+                            zip_path=zip_path,
+                            filename=filename,
+                            blob=download_path.open("rb"),
+                        )
+                        data_paths_in_archive.add(filename)
+                # Don't want to leave multiple giant CSVs on disk, so delete
+                # immediately after they're safely stored in the ZIP
+                download_path.unlink()
+
+        return ResourceInfo(
+            local_path=zip_path,
+            partitions={"year": year},
+            layout=ZipLayout(file_paths=data_paths_in_archive),
+        )
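
The first-bytes sniff in this patch is worth reading on its own: some CBECS links redirect to an HTML landing page instead of the expected file, and a real .xls/.pdf never contains "html" in its first 20 bytes. A minimal standalone sketch of the same idea, assuming only the standard library (the helper name and sample bytes are illustrative, not part of the archiver API):

    from pathlib import Path

    def looks_like_html(path: Path, n_bytes: int = 20) -> bool:
        """Return True if a downloaded file starts like an HTML page."""
        with path.open("rb") as f:
            first_bytes = f.read(n_bytes)
        # Real payloads start with magic numbers (b"%PDF-" for PDFs,
        # b"\xd0\xcf\x11\xe0" for legacy XLS); redirect pages start with
        # something like b"<!DOCTYPE html>" or b"<html>".
        return b"html" in first_bytes.lower()
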
From c4095ef551999183c88832823d95b893d603d4b9 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Tue, 28 Jan 2025 17:29:57 -0500
Subject: [PATCH 02/10] add in pathlib path for opening

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index bf489913..e5b8300f 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -2,6 +2,7 @@
 
 import logging
 import re
+from pathlib import Path
 from urllib.parse import urljoin
 
 from pudl_archiver.archivers.classes import (
@@ -47,7 +48,7 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                 file_url = urljoin(year_url, link)
                 download_path = self.download_directory / filename
                 await self.download_file(file_url, download_path)
-                with open(download_path, "rb") as f:
+                with Path.open(download_path, "rb") as f:
                     first_bytpes = f.read(20)
                     if b"html" in first_bytpes.lower().strip():
                         logger.warning(

From c645000851a7f1eb15bd373404f84974c091a028 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Wed, 29 Jan 2025 14:51:09 -0500
Subject: [PATCH 03/10] wip adding the microdata

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index e5b8300f..4530f0fc 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -34,12 +34,16 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         """Download all excel tables for a year."""
         data_paths_in_archive = set()
         zip_path = self.download_directory / f"eiacbecs-{year}.zip"
-        data_views = ["characteristics", "consumption"]
-        for view in data_views:
+        pattern = rf"{year}(?:.*)/([a-z,\d]{{1,5}})(.xls|.xlsx|.pdf)$"
+        data_view_patterns = {
+            "characteristics": re.compile(pattern),
+            "consumption": re.compile(pattern),
+            "mircodata": re.compile(
+                rf"{year}/(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv)$"
+            ),
+        }
+        for view, table_link_pattern in data_view_patterns.items():
             year_url = f"{BASE_URL}{year}/index.php?view={view}"
-            table_link_pattern = re.compile(
-                rf"{year}(?:.*)/([a-z,\d]{{1,5}})(.xls|.xlsx|.pdf)$"
-            )
             for link in await self.get_hyperlinks(year_url, table_link_pattern):
                 match = table_link_pattern.search(link)
                 unique_id = match.group(1)
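
Since the per-view patterns are the heart of this patch, here is how one of them pulls its two capture groups out of a link. The URL below is made up for illustration; the pattern text is copied from the diff with a year filled in:

    import re

    year = "2012"
    table_link_pattern = re.compile(rf"{year}(?:.*)/([a-z,\d]{{1,5}})(.xls|.xlsx|.pdf)$")

    link = "/consumption/commercial/data/2012/xls/b1.xlsx"  # hypothetical href
    match = table_link_pattern.search(link)
    print(match.group(1), match.group(2))  # -> b1 .xlsx
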
From 8b2e898b82a8c5b7e76fe70f5c60a92fce0df465 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Wed, 29 Jan 2025 15:50:05 -0500
Subject: [PATCH 04/10] attempt to add microdata w/ a very failing assertion

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index 4530f0fc..70efc2d2 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -69,6 +69,18 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                 # Don't want to leave multiple giant CSVs on disk, so delete
                 # immediately after they're safely stored in the ZIP
                 download_path.unlink()
+        # Check if all of the views found any links
+        year_has_all_views: dict[str, bool] = {
+            view: any(fn for fn in data_paths_in_archive if view in fn)
+            for view in data_view_patterns
+        }
+        views_without_files = [
+            view for (view, has_files) in year_has_all_views.items() if not has_files
+        ]
+        if views_without_files:
+            raise AssertionError(
+                f"We expect all years of EIA CBECS to have some data from all four views, but we found these views without files: {views_without_files}"
+            )
 
         return ResourceInfo(
             local_path=zip_path,
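
The failing assertion in this patch is easy to dry-run outside the archiver. A small sketch with made-up filenames (the view names, including the "mircodata" spelling, mirror the code at this point in the series):

    data_view_patterns = ["characteristics", "consumption", "mircodata"]
    data_paths_in_archive = {
        "eiacbecs-2012-characteristics-b1.xlsx",
        "eiacbecs-2012-consumption-c1.xlsx",
    }

    year_has_all_views = {
        view: any(view in fn for fn in data_paths_in_archive)
        for view in data_view_patterns
    }
    views_without_files = [
        view for (view, has_files) in year_has_all_views.items() if not has_files
    ]
    print(views_without_files)  # -> ['mircodata'], which trips the AssertionError
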
From 2b0b615fa2f334ba01b082a7c61f3ef0f2f2d02c Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Wed, 29 Jan 2025 18:06:17 -0500
Subject: [PATCH 05/10] make the microdata work and clean up docs a bit

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index 70efc2d2..36d62ba1 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -31,22 +31,34 @@ async def get_resources(self) -> ArchiveAwaitable:
             yield self.get_year_resources(year)
 
     async def get_year_resources(self, year: int) -> list[ResourceInfo]:
-        """Download all excel tables for a year."""
+        """Download all files from all views for a year."""
         data_paths_in_archive = set()
         zip_path = self.download_directory / f"eiacbecs-{year}.zip"
-        pattern = rf"{year}(?:.*)/([a-z,\d]{{1,5}})(.xls|.xlsx|.pdf)$"
+        pattern = rf"(?:{year}|archive)(?:.*)/([a-z,\d]{{1,8}})(.xls|.xlsx|.pdf)$"
         data_view_patterns = {
             "characteristics": re.compile(pattern),
             "consumption": re.compile(pattern),
             "mircodata": re.compile(
-                rf"{year}/(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv)$"
+                rf"(?:{year}/|archive/|)(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv)$"
+            ),
+            # the most recent cbecs doesn't include a year or archive in the methodology links
+            # BUT there are almost always pdf files from 2018 that get caught up in
+            # these scrapers if we don't include year or archive. so we have a special
+            # 2018 pattern
+            "methodology": re.compile(
+                rf"(?:{year}|archive/pubs)(?:/pdf|)/(.*)(.pdf$)"
+                if year != "2018"
+                else r"/consumption/commercial(?:/data/2018|)/pdf/(.*)(.pdf)$"
             ),
         }
+
         for view, table_link_pattern in data_view_patterns.items():
             year_url = f"{BASE_URL}{year}/index.php?view={view}"
             for link in await self.get_hyperlinks(year_url, table_link_pattern):
                 match = table_link_pattern.search(link)
-                unique_id = match.group(1)
+                unique_id = (
+                    match.group(1).replace("_", "-").replace(" ", "-").lower().strip()
+                )
                 file_extension = match.group(2)
                 filename = f"eiacbecs-{year}-{view}-{unique_id}{file_extension}"
                 file_url = urljoin(year_url, link)
@@ -66,7 +78,7 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                             blob=download_path.open("rb"),
                         )
                         data_paths_in_archive.add(filename)
-                # Don't want to leave multiple giant CSVs on disk, so delete
+                # Don't want to leave multiple files on disk, so delete
                 # immediately after they're safely stored in the ZIP
                 download_path.unlink()
         # Check if all of the views found any links
@@ -79,7 +91,8 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         ]
         if views_without_files:
             raise AssertionError(
-                f"We expect all years of EIA CBECS to have some data from all four views, but we found these views without files: {views_without_files}"
+                "We expect all years of EIA CBECS to have some data from all four "
+                f"views, but we found these views without files for {year}: {views_without_files}"
             )
 
         return ResourceInfo(
             local_path=zip_path,
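
One detail of the unique-ID cleanup above is worth spelling out: spaces are replaced with hyphens before .strip() runs, and .strip() only trims whitespace, so padded IDs keep their hyphens. A tiny sketch with a made-up match group:

    raw_id = " Table_B1 "  # hypothetical match.group(1)
    unique_id = raw_id.replace("_", "-").replace(" ", "-").lower().strip()
    print(unique_id)  # -> -table-b1- (the padding survives as hyphens)
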
From be00ca1814f56d0efd8ec36e5ea2e0c2d3e4e4fd Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Fri, 31 Jan 2025 12:55:05 -0500
Subject: [PATCH 06/10] fix the microdata view patterns

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index 36d62ba1..717ca118 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -34,12 +34,21 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         """Download all files from all views for a year."""
         data_paths_in_archive = set()
         zip_path = self.download_directory / f"eiacbecs-{year}.zip"
-        pattern = rf"(?:{year}|archive)(?:.*)/([a-z,\d]{{1,8}})(.xls|.xlsx|.pdf)$"
+        char_and_cons_pattern = (
+            rf"(?:{year}|archive)(?:.*)/([a-z,\d]{{1,8}})(.xls|.xlsx|.pdf)$"
+        )
         data_view_patterns = {
-            "characteristics": re.compile(pattern),
-            "consumption": re.compile(pattern),
-            "mircodata": re.compile(
-                rf"(?:{year}/|archive/|)(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv)$"
+            "characteristics": re.compile(char_and_cons_pattern),
+            "consumption": re.compile(char_and_cons_pattern),
+            # some of the microdata links are like csv/file01.csv which doesn't include
+            # the year or archive. instead of adding a null option for that first group
+            # we add a whole new pattern for these two years because if we don't
+            # we'd pick up some of the 2018 pdf files that are on the right hand side
+            # of these pages
+            "microdata": re.compile(
+                rf"(?:{year}/|archive/)(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv|.exe|.zip)$"
+                if year not in ["2003", "1999"]
+                else r"^(?:csv|pdf)/(.*)(.csv|.pdf)$"
             ),
             # the most recent cbecs doesn't include a year or archive in the methodology links

From 1ae779a4ebec7e6452cc1b1cde0a11947d25cef0 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Fri, 31 Jan 2025 13:49:31 -0500
Subject: [PATCH 07/10] add sas files

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index 717ca118..f1c53e51 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -46,7 +46,7 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
             # we'd pick up some of the 2018 pdf files that are on the right hand side
             # of these pages
             "microdata": re.compile(
-                rf"(?:{year}/|archive/)(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv|.exe|.zip)$"
+                rf"(?:{year}/|archive/)(?:xls|pdf|csv|sas)/(.*)(.xls|.xlsx|.pdf|.csv|.exe|.zip)$"
                 if year not in ["2003", "1999"]
                 else r"^(?:csv|pdf)/(.*)(.csv|.pdf)$"
             ),

From 66d36d877a7fa121a99f3b9609b510bd194489a6 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Fri, 31 Jan 2025 11:57:16 -0700
Subject: [PATCH 08/10] Update src/pudl_archiver/archivers/eia/eiacbecs.py

Co-authored-by: E. Belfer <37471869+e-belfer@users.noreply.github.com>
---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index f1c53e51..48016e1d 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -74,7 +74,7 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                 download_path = self.download_directory / filename
                 await self.download_file(file_url, download_path)
                 with Path.open(download_path, "rb") as f:
-                    first_bytpes = f.read(20)
+                    first_bytes = f.read(20)
                     if b"html" in first_bytpes.lower().strip():
                         logger.warning(
                             f"Skipping {file_url} because it appears to be a redirect/html page."
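
Because the microdata pattern now switches on year, a quick check of both branches helps. The relative links below are hypothetical stand-ins for the two link shapes the comments describe; the pattern text is copied from patch 07:

    import re

    def microdata_pattern(year: str) -> re.Pattern:
        return re.compile(
            rf"(?:{year}/|archive/)(?:xls|pdf|csv|sas)/(.*)(.xls|.xlsx|.pdf|.csv|.exe|.zip)$"
            if year not in ["2003", "1999"]
            else r"^(?:csv|pdf)/(.*)(.csv|.pdf)$"
        )

    assert microdata_pattern("2012").search("2012/csv/file01.csv")      # year-prefixed link
    assert microdata_pattern("1999").search("csv/file01.csv")           # bare relative link
    assert not microdata_pattern("1999").search("2018/pdf/report.pdf")  # stray 2018 pdf ignored
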
From 519528c7b5878bce8d4ed03b1cdcf6be3098f936 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Fri, 31 Jan 2025 14:05:54 -0500
Subject: [PATCH 09/10] add more docs responding to Ella's PR comments

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index 48016e1d..8c916cf0 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -1,6 +1,5 @@
 """Archive EIA Commercial Buildings Energy Consumption Survey (CBECS)."""
 
-import logging
 import re
 from pathlib import Path
 from urllib.parse import urljoin
@@ -13,7 +12,6 @@
 from pudl_archiver.frictionless import ZipLayout
 
 BASE_URL = "https://www.eia.gov/consumption/commercial/data/"
-logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
 class EiaCbecsArchiver(AbstractDatasetArchiver):
@@ -23,11 +21,18 @@ class EiaCbecsArchiver(AbstractDatasetArchiver):
 
     async def get_resources(self) -> ArchiveAwaitable:
         """Download EIA-CBECS resources."""
+        # we use this link and pattern to determine which years of CBECS data exist,
+        # but these base year links are only a portion of the view links so we
+        # construct the full links within get_year_resources
         link_pattern = re.compile(r"commercial/data/(\d{4})/$", re.IGNORECASE)
-
        for link in await self.get_hyperlinks(BASE_URL, link_pattern):
             match = link_pattern.search(link)
             year = match.group(1)
+            if int(year) > 2018:
+                raise AssertionError(
+                    f"There is a new year of data: {year}! This will almost certainly "
+                    "require some updating of this archive."
+                )
             yield self.get_year_resources(year)
 
     async def get_year_resources(self, year: int) -> list[ResourceInfo]:
@@ -73,10 +78,14 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                 file_url = urljoin(year_url, link)
                 download_path = self.download_directory / filename
                 await self.download_file(file_url, download_path)
+                # there are a small-ish handful of files whose links redirect to the
+                # main cbecs page. presumably it's a broken link. we want to skip those
+                # files, so we are going to check whether the first bytes of the file
+                # look like html. if so we move on, otherwise add to the archive
                 with Path.open(download_path, "rb") as f:
                     first_bytes = f.read(20)
-                    if b"html" in first_bytpes.lower().strip():
-                        logger.warning(
+                    if b"html" in first_bytes.lower().strip():
+                        self.logger.warning(
                             f"Skipping {file_url} because it appears to be a redirect/html page."
                         )
                         pass
From 9b241fd35a126763d91ebe686d11b49bc2bbb3e9 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Fri, 31 Jan 2025 14:13:14 -0500
Subject: [PATCH 10/10] add dois and dataset into gha

---
 .github/workflows/run-archiver.yml             | 4 ++--
 src/pudl_archiver/package_data/zenodo_doi.yaml | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/run-archiver.yml b/.github/workflows/run-archiver.yml
index f57a262f..56c2c9d9 100644
--- a/.github/workflows/run-archiver.yml
+++ b/.github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
     inputs:
       datasets:
         description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
+        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
         required: true
         type: string
       create_github_issue:
@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
-        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
+        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
       fail-fast: false
     runs-on: ubuntu-latest
     permissions:

diff --git a/src/pudl_archiver/package_data/zenodo_doi.yaml b/src/pudl_archiver/package_data/zenodo_doi.yaml
index 66457184..3c4683a6 100644
--- a/src/pudl_archiver/package_data/zenodo_doi.yaml
+++ b/src/pudl_archiver/package_data/zenodo_doi.yaml
@@ -37,6 +37,9 @@ eia930:
 eiaaeo:
   production_doi: 10.5281/zenodo.10838488
   sandbox_doi: 10.5072/zenodo.37746
+eiacbecs:
+  production_doi: 10.5281/zenodo.14782474
+  sandbox_doi: 10.5072/zenodo.161000
 eia_bulk_elec:
   production_doi: 10.5281/zenodo.7067366
   sandbox_doi: 10.5072/zenodo.2356
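
With the DOI entries in place, a quick sanity check is possible from the repo root. A sketch assuming PyYAML is installed (this is illustrative, not part of the archiver's test suite):

    import yaml

    with open("src/pudl_archiver/package_data/zenodo_doi.yaml") as f:
        dois = yaml.safe_load(f)

    assert "eiacbecs" in dois
    assert dois["eiacbecs"]["production_doi"].startswith("10.5281/zenodo.")
    assert dois["eiacbecs"]["sandbox_doi"].startswith("10.5072/zenodo.")
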