From f35c87bdbfb5785af5c9883578dc9ca1d6df3aa9 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Tue, 28 Jan 2025 17:23:44 -0500
Subject: [PATCH 01/10] make eia cbecs archive

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 72 +++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 src/pudl_archiver/archivers/eia/eiacbecs.py

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
new file mode 100644
index 00000000..bf489913
--- /dev/null
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -0,0 +1,72 @@
+"""Archive EIA Commercial Buildings Energy Consumption Survey (CBECS)."""
+
+import logging
+import re
+from urllib.parse import urljoin
+
+from pudl_archiver.archivers.classes import (
+    AbstractDatasetArchiver,
+    ArchiveAwaitable,
+    ResourceInfo,
+)
+from pudl_archiver.frictionless import ZipLayout
+
+BASE_URL = "https://www.eia.gov/consumption/commercial/data/"
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
+
+class EiaCbecsArchiver(AbstractDatasetArchiver):
+    """EIA CBECS archiver."""
+
+    name = "eiacbecs"
+
+    async def get_resources(self) -> ArchiveAwaitable:
+        """Download EIA-CBECS resources."""
+        link_pattern = re.compile(r"commercial/data/(\d{4})/$", re.IGNORECASE)
+
+        for link in await self.get_hyperlinks(BASE_URL, link_pattern):
+            match = link_pattern.search(link)
+            year = match.group(1)
+            yield self.get_year_resources(year)
+
+    async def get_year_resources(self, year: int) -> list[ResourceInfo]:
+        """Download all excel tables for a year."""
+        data_paths_in_archive = set()
+        zip_path = self.download_directory / f"eiacbecs-{year}.zip"
+        data_views = ["characteristics", "consumption"]
+        for view in data_views:
+            year_url = f"{BASE_URL}{year}/index.php?view={view}"
+            table_link_pattern = re.compile(
+                rf"{year}(?:.*)/([a-z,\d]{{1,5}})(.xls|.xlsx|.pdf)$"
+            )
+            for link in await self.get_hyperlinks(year_url, table_link_pattern):
+                match = table_link_pattern.search(link)
+                unique_id = match.group(1)
+                file_extension = match.group(2)
+                filename = f"eiacbecs-{year}-{view}-{unique_id}{file_extension}"
+                file_url = urljoin(year_url, link)
+                download_path = self.download_directory / filename
+                await self.download_file(file_url, download_path)
+                with open(download_path, "rb") as f:
+                    first_bytpes = f.read(20)
+                    if b"html" in first_bytpes.lower().strip():
+                        logger.warning(
+                            f"Skipping {file_url} because it appears to be a redirect/html page."
+                        )
+                        pass
+                    else:
+                        self.add_to_archive(
+                            zip_path=zip_path,
+                            filename=filename,
+                            blob=download_path.open("rb"),
+                        )
+                        data_paths_in_archive.add(filename)
+                # Don't want to leave multiple giant CSVs on disk, so delete
+                # immediately after they're safely stored in the ZIP
+                download_path.unlink()
+
+        return ResourceInfo(
+            local_path=zip_path,
+            partitions={"year": year},
+            layout=ZipLayout(file_paths=data_paths_in_archive),
+        )
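
The first-bytes sniff in this patch is worth reading on its own: some CBECS links redirect to an HTML landing page instead of the expected file, and a real .xls/.pdf never contains "html" in its first 20 bytes. A minimal standalone sketch of the same idea, assuming only the standard library (the helper name and sample bytes are illustrative, not part of the archiver API):

    from pathlib import Path

    def looks_like_html(path: Path, n_bytes: int = 20) -> bool:
        """Return True if a downloaded file starts like an HTML page."""
        with path.open("rb") as f:
            first_bytes = f.read(n_bytes)
        # Real payloads start with magic numbers (b"%PDF-" for PDFs,
        # b"\xd0\xcf\x11\xe0" for legacy XLS); redirect pages start with
        # something like b"<!DOCTYPE html>" or b"<html>".
        return b"html" in first_bytes.lower()
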
From c4095ef551999183c88832823d95b893d603d4b9 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Tue, 28 Jan 2025 17:29:57 -0500
Subject: [PATCH 02/10] add in pathlib path for opening

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index bf489913..e5b8300f 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -2,6 +2,7 @@
 
 import logging
 import re
+from pathlib import Path
 from urllib.parse import urljoin
 
 from pudl_archiver.archivers.classes import (
@@ -47,7 +48,7 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                 file_url = urljoin(year_url, link)
                 download_path = self.download_directory / filename
                 await self.download_file(file_url, download_path)
-                with open(download_path, "rb") as f:
+                with Path.open(download_path, "rb") as f:
                     first_bytpes = f.read(20)
                     if b"html" in first_bytpes.lower().strip():
                         logger.warning(

From c645000851a7f1eb15bd373404f84974c091a028 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Wed, 29 Jan 2025 14:51:09 -0500
Subject: [PATCH 03/10] wip adding the microdata

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index e5b8300f..4530f0fc 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -34,12 +34,16 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         """Download all excel tables for a year."""
         data_paths_in_archive = set()
         zip_path = self.download_directory / f"eiacbecs-{year}.zip"
-        data_views = ["characteristics", "consumption"]
-        for view in data_views:
+        pattern = rf"{year}(?:.*)/([a-z,\d]{{1,5}})(.xls|.xlsx|.pdf)$"
+        data_view_patterns = {
+            "characteristics": re.compile(pattern),
+            "consumption": re.compile(pattern),
+            "mircodata": re.compile(
+                rf"{year}/(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv)$"
+            ),
+        }
+        for view, table_link_pattern in data_view_patterns.items():
             year_url = f"{BASE_URL}{year}/index.php?view={view}"
-            table_link_pattern = re.compile(
-                rf"{year}(?:.*)/([a-z,\d]{{1,5}})(.xls|.xlsx|.pdf)$"
-            )
             for link in await self.get_hyperlinks(year_url, table_link_pattern):
                 match = table_link_pattern.search(link)
                 unique_id = match.group(1)
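
Since the per-view patterns are the heart of this patch, here is how one of them pulls its two capture groups out of a link. The URL below is made up for illustration; the pattern text is copied from the diff with a year filled in:

    import re

    year = "2012"
    table_link_pattern = re.compile(rf"{year}(?:.*)/([a-z,\d]{{1,5}})(.xls|.xlsx|.pdf)$")

    link = "/consumption/commercial/data/2012/xls/b1.xlsx"  # hypothetical href
    match = table_link_pattern.search(link)
    print(match.group(1), match.group(2))  # -> b1 .xlsx
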
From 8b2e898b82a8c5b7e76fe70f5c60a92fce0df465 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Wed, 29 Jan 2025 15:50:05 -0500
Subject: [PATCH 04/10] attempt to add microdata w/ a very failing assertion

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index 4530f0fc..70efc2d2 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -69,6 +69,18 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                 # Don't want to leave multiple giant CSVs on disk, so delete
                 # immediately after they're safely stored in the ZIP
                 download_path.unlink()
+        # Check if all of the views found any links
+        year_has_all_views: dict[str, bool] = {
+            view: any(fn for fn in data_paths_in_archive if view in fn)
+            for view in data_view_patterns
+        }
+        views_without_files = [
+            view for (view, has_files) in year_has_all_views.items() if not has_files
+        ]
+        if views_without_files:
+            raise AssertionError(
+                f"We expect all years of EIA CBECS to have some data from all four views, but we found these views without files: {views_without_files}"
+            )
 
         return ResourceInfo(
             local_path=zip_path,
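
The failing assertion in this patch is easy to dry-run outside the archiver. A small sketch with made-up filenames (the view names, including the "mircodata" spelling, mirror the code at this point in the series):

    data_view_patterns = ["characteristics", "consumption", "mircodata"]
    data_paths_in_archive = {
        "eiacbecs-2012-characteristics-b1.xlsx",
        "eiacbecs-2012-consumption-c1.xlsx",
    }

    year_has_all_views = {
        view: any(view in fn for fn in data_paths_in_archive)
        for view in data_view_patterns
    }
    views_without_files = [
        view for (view, has_files) in year_has_all_views.items() if not has_files
    ]
    print(views_without_files)  # -> ['mircodata'], which trips the AssertionError
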
From 2b0b615fa2f334ba01b082a7c61f3ef0f2f2d02c Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Wed, 29 Jan 2025 18:06:17 -0500
Subject: [PATCH 05/10] make the microdata work and clean up docs a bit

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index 70efc2d2..36d62ba1 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -31,22 +31,34 @@ async def get_resources(self) -> ArchiveAwaitable:
             yield self.get_year_resources(year)
 
     async def get_year_resources(self, year: int) -> list[ResourceInfo]:
-        """Download all excel tables for a year."""
+        """Download all files from all views for a year."""
         data_paths_in_archive = set()
         zip_path = self.download_directory / f"eiacbecs-{year}.zip"
-        pattern = rf"{year}(?:.*)/([a-z,\d]{{1,5}})(.xls|.xlsx|.pdf)$"
+        pattern = rf"(?:{year}|archive)(?:.*)/([a-z,\d]{{1,8}})(.xls|.xlsx|.pdf)$"
         data_view_patterns = {
             "characteristics": re.compile(pattern),
             "consumption": re.compile(pattern),
             "mircodata": re.compile(
-                rf"{year}/(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv)$"
+                rf"(?:{year}/|archive/|)(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv)$"
+            ),
+            # the most recent cbecs doesn't include a year or archive in the methodology links
+            # BUT there are almost always pdf files from 2018 that get caught up in
+            # these scrapers if we don't include year or archive. so we have a special
+            # 2018 pattern
+            "methodology": re.compile(
+                rf"(?:{year}|archive/pubs)(?:/pdf|)/(.*)(.pdf$)"
+                if year != "2018"
+                else r"/consumption/commercial(?:/data/2018|)/pdf/(.*)(.pdf)$"
             ),
         }
+
         for view, table_link_pattern in data_view_patterns.items():
             year_url = f"{BASE_URL}{year}/index.php?view={view}"
             for link in await self.get_hyperlinks(year_url, table_link_pattern):
                 match = table_link_pattern.search(link)
-                unique_id = match.group(1)
+                unique_id = (
+                    match.group(1).replace("_", "-").replace(" ", "-").lower().strip()
+                )
                 file_extension = match.group(2)
                 filename = f"eiacbecs-{year}-{view}-{unique_id}{file_extension}"
                 file_url = urljoin(year_url, link)
@@ -66,7 +78,7 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                             blob=download_path.open("rb"),
                         )
                         data_paths_in_archive.add(filename)
-                # Don't want to leave multiple giant CSVs on disk, so delete
+                # Don't want to leave multiple files on disk, so delete
                 # immediately after they're safely stored in the ZIP
                 download_path.unlink()
         # Check if all of the views found any links
@@ -79,7 +91,8 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         ]
         if views_without_files:
             raise AssertionError(
-                f"We expect all years of EIA CBECS to have some data from all four views, but we found these views without files: {views_without_files}"
+                "We expect all years of EIA CBECS to have some data from all four "
+                f"views, but we found these views without files for {year}: {views_without_files}"
             )
 
         return ResourceInfo(
             local_path=zip_path,
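
One detail of the unique-ID cleanup above is worth spelling out: spaces are replaced with hyphens before .strip() runs, and .strip() only trims whitespace, so padded IDs keep their hyphens. A tiny sketch with a made-up match group:

    raw_id = " Table_B1 "  # hypothetical match.group(1)
    unique_id = raw_id.replace("_", "-").replace(" ", "-").lower().strip()
    print(unique_id)  # -> -table-b1- (the padding survives as hyphens)
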
From be00ca1814f56d0efd8ec36e5ea2e0c2d3e4e4fd Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Fri, 31 Jan 2025 12:55:05 -0500
Subject: [PATCH 06/10] fix the microdata view patterns

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index 36d62ba1..717ca118 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -34,12 +34,21 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         """Download all files from all views for a year."""
         data_paths_in_archive = set()
         zip_path = self.download_directory / f"eiacbecs-{year}.zip"
-        pattern = rf"(?:{year}|archive)(?:.*)/([a-z,\d]{{1,8}})(.xls|.xlsx|.pdf)$"
+        char_and_cons_pattern = (
+            rf"(?:{year}|archive)(?:.*)/([a-z,\d]{{1,8}})(.xls|.xlsx|.pdf)$"
+        )
         data_view_patterns = {
-            "characteristics": re.compile(pattern),
-            "consumption": re.compile(pattern),
-            "mircodata": re.compile(
-                rf"(?:{year}/|archive/|)(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv)$"
+            "characteristics": re.compile(char_and_cons_pattern),
+            "consumption": re.compile(char_and_cons_pattern),
+            # some of the microdata links are like csv/file01.csv which doesn't include
+            # the year or archive. instead of adding a null option for that first group
+            # we add a whole new pattern for these two years because if we don't
+            # we'd pick up some of the 2018 pdf files that are on the right hand side
+            # of these pages
+            "microdata": re.compile(
+                rf"(?:{year}/|archive/)(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv|.exe|.zip)$"
+                if year not in ["2003", "1999"]
+                else r"^(?:csv|pdf)/(.*)(.csv|.pdf)$"
             ),
             # the most recent cbecs doesn't include a year or archive in the methodology links

From 1ae779a4ebec7e6452cc1b1cde0a11947d25cef0 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Fri, 31 Jan 2025 13:49:31 -0500
Subject: [PATCH 07/10] add sas files

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index 717ca118..f1c53e51 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -46,7 +46,7 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
             # we'd pick up some of the 2018 pdf files that are on the right hand side
             # of these pages
             "microdata": re.compile(
-                rf"(?:{year}/|archive/)(?:xls|pdf|csv)/(.*)(.xls|.xlsx|.pdf|.csv|.exe|.zip)$"
+                rf"(?:{year}/|archive/)(?:xls|pdf|csv|sas)/(.*)(.xls|.xlsx|.pdf|.csv|.exe|.zip)$"
                 if year not in ["2003", "1999"]
                 else r"^(?:csv|pdf)/(.*)(.csv|.pdf)$"
             ),

From 66d36d877a7fa121a99f3b9609b510bd194489a6 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Fri, 31 Jan 2025 11:57:16 -0700
Subject: [PATCH 08/10] Update src/pudl_archiver/archivers/eia/eiacbecs.py

Co-authored-by: E. Belfer <37471869+e-belfer@users.noreply.github.com>
---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index f1c53e51..48016e1d 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -74,7 +74,7 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                 download_path = self.download_directory / filename
                 await self.download_file(file_url, download_path)
                 with Path.open(download_path, "rb") as f:
-                    first_bytpes = f.read(20)
+                    first_bytes = f.read(20)
                     if b"html" in first_bytpes.lower().strip():
                         logger.warning(
                             f"Skipping {file_url} because it appears to be a redirect/html page."
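
Because the microdata pattern now switches on year, a quick check of both branches helps. The relative links below are hypothetical stand-ins for the two link shapes the comments describe; the pattern text is copied from patch 07:

    import re

    def microdata_pattern(year: str) -> re.Pattern:
        return re.compile(
            rf"(?:{year}/|archive/)(?:xls|pdf|csv|sas)/(.*)(.xls|.xlsx|.pdf|.csv|.exe|.zip)$"
            if year not in ["2003", "1999"]
            else r"^(?:csv|pdf)/(.*)(.csv|.pdf)$"
        )

    assert microdata_pattern("2012").search("2012/csv/file01.csv")      # year-prefixed link
    assert microdata_pattern("1999").search("csv/file01.csv")           # bare relative link
    assert not microdata_pattern("1999").search("2018/pdf/report.pdf")  # stray 2018 pdf ignored
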
From 519528c7b5878bce8d4ed03b1cdcf6be3098f936 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Fri, 31 Jan 2025 14:05:54 -0500
Subject: [PATCH 09/10] add more docs responding to Ella's PR comments

---
 src/pudl_archiver/archivers/eia/eiacbecs.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiacbecs.py b/src/pudl_archiver/archivers/eia/eiacbecs.py
index 48016e1d..8c916cf0 100644
--- a/src/pudl_archiver/archivers/eia/eiacbecs.py
+++ b/src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -1,6 +1,5 @@
 """Archive EIA Commercial Buildings Energy Consumption Survey (CBECS)."""
 
-import logging
 import re
 from pathlib import Path
 from urllib.parse import urljoin
@@ -13,7 +12,6 @@
 from pudl_archiver.frictionless import ZipLayout
 
 BASE_URL = "https://www.eia.gov/consumption/commercial/data/"
-logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
 class EiaCbecsArchiver(AbstractDatasetArchiver):
@@ -23,11 +21,18 @@ class EiaCbecsArchiver(AbstractDatasetArchiver):
 
     async def get_resources(self) -> ArchiveAwaitable:
         """Download EIA-CBECS resources."""
+        # we use this link and pattern to determine which years of CBECS data exist,
+        # but these base year links are only a portion of the view links so we
+        # construct the full links within get_year_resources
         link_pattern = re.compile(r"commercial/data/(\d{4})/$", re.IGNORECASE)
-
        for link in await self.get_hyperlinks(BASE_URL, link_pattern):
             match = link_pattern.search(link)
             year = match.group(1)
+            if int(year) > 2018:
+                raise AssertionError(
+                    f"There is a new year of data: {year}! This will almost certainly "
+                    "require some updating of this archive."
+                )
             yield self.get_year_resources(year)
 
     async def get_year_resources(self, year: int) -> list[ResourceInfo]:
@@ -73,10 +78,14 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                 file_url = urljoin(year_url, link)
                 download_path = self.download_directory / filename
                 await self.download_file(file_url, download_path)
+                # there are a small-ish handful of files whose links redirect to the
+                # main cbecs page. presumably it's a broken link. we want to skip those
+                # files, so we are going to check whether the first bytes of the file
+                # look like html. if so we move on, otherwise add to the archive
                 with Path.open(download_path, "rb") as f:
                     first_bytes = f.read(20)
-                    if b"html" in first_bytpes.lower().strip():
-                        logger.warning(
+                    if b"html" in first_bytes.lower().strip():
+                        self.logger.warning(
                             f"Skipping {file_url} because it appears to be a redirect/html page."
                         )
                         pass
From 9b241fd35a126763d91ebe686d11b49bc2bbb3e9 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Fri, 31 Jan 2025 14:13:14 -0500
Subject: [PATCH 10/10] add dois and dataset into gha

---
 .github/workflows/run-archiver.yml             | 4 ++--
 src/pudl_archiver/package_data/zenodo_doi.yaml | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/run-archiver.yml b/.github/workflows/run-archiver.yml
index f57a262f..56c2c9d9 100644
--- a/.github/workflows/run-archiver.yml
+++ b/.github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
     inputs:
       datasets:
         description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
+        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
         required: true
         type: string
       create_github_issue:
@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
-        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
+        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
       fail-fast: false
     runs-on: ubuntu-latest
     permissions:

diff --git a/src/pudl_archiver/package_data/zenodo_doi.yaml b/src/pudl_archiver/package_data/zenodo_doi.yaml
index 66457184..3c4683a6 100644
--- a/src/pudl_archiver/package_data/zenodo_doi.yaml
+++ b/src/pudl_archiver/package_data/zenodo_doi.yaml
@@ -37,6 +37,9 @@ eia930:
 eiaaeo:
   production_doi: 10.5281/zenodo.10838488
   sandbox_doi: 10.5072/zenodo.37746
+eiacbecs:
+  production_doi: 10.5281/zenodo.14782474
+  sandbox_doi: 10.5072/zenodo.161000
 eia_bulk_elec:
   production_doi: 10.5281/zenodo.7067366
   sandbox_doi: 10.5072/zenodo.2356
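
With the DOI entries in place, a quick sanity check is possible from the repo root. A sketch assuming PyYAML is installed (this is illustrative, not part of the archiver's test suite):

    import yaml

    with open("src/pudl_archiver/package_data/zenodo_doi.yaml") as f:
        dois = yaml.safe_load(f)

    assert "eiacbecs" in dois
    assert dois["eiacbecs"]["production_doi"].startswith("10.5281/zenodo.")
    assert dois["eiacbecs"]["sandbox_doi"].startswith("10.5072/zenodo.")
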