From 1a9739b8db298c3c8dc106be70d46af0b8d4fd89 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Mon, 22 Apr 2024 07:43:26 -0500 Subject: [PATCH 01/11] Updated .gitignore and create .dockerignore. --- .dockerignore | 22 ++++++++++++++++++++++ .gitignore | 4 +++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..1dd62fc --- /dev/null +++ b/.dockerignore @@ -0,0 +1,22 @@ +.env +service-account.json +gdrive-service-account.json +.git +.gitignore +environment.yaml +Makefile +README.md +LICENSE +deps/requirements.in +deps/dev-requirements.in +deps/dev-requirements.txt +notebooks/ +scripts/ +pipeline/__pycache__/ +pipeline/scripts/__pycache__/ +pipeline/data/* +pipeline/bfro_mini_warehouse/logs/ +pipeline/bfro_mini_warehouse/target/ +pipeline/bfro_mini_warehouse/README.md +pipeline/scraper/__pycache__/ +pipeline/scraper/bfro_scrape/__pycache__/ \ No newline at end of file diff --git a/.gitignore b/.gitignore index ccca9c5..f04d624 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,6 @@ logs data_old test.csv data_backup - +service-account.json +gdrive-service-account.json +run-docker.sh From 81b1ecc00c6cccdb94bdd2fe42b934188477b634 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Mon, 22 Apr 2024 07:46:07 -0500 Subject: [PATCH 02/11] Switch from black to ruff. --- Makefile | 27 ++------------------------- deps/dev-requirements.in | 1 - pyproject.toml | 14 -------------- ruff.toml | 1 + 4 files changed, 3 insertions(+), 40 deletions(-) delete mode 100644 pyproject.toml create mode 100644 ruff.toml diff --git a/Makefile b/Makefile index b150a6e..605ef16 100644 --- a/Makefile +++ b/Makefile @@ -18,17 +18,11 @@ dev-env: deps/dev-requirements.txt ## Lint project with ruff. lint: - python -m ruff . + python -m ruff check . ## Format imports and code. format: - python -m ruff . --fix - python -m black . - -## Check linting and formatting. -check: - python -m ruff check . - python -m black --check . + python -m ruff format . .PHONY: build-docker ## Build docker with local registry tag @@ -40,23 +34,6 @@ build-docker: push-docker: docker push localhost:5000/bfro_pipeline:latest -.PHONY: build-deployment -## Builds the Prefect deployment yaml file. -build-deployment: - cd pipeline && \ - prefect deployment build \ - bfro_pipeline_docker:main \ - --name bfro-pipeline \ - --pool bfro-agent-pool \ - --work-queue default \ - --infra-block process/bfro-local \ - --storage-block gcs/bfro-pipeline-storage - -.PHONY: apply-deployment -## Sends the Prefect deployment file to the server. 
-apply-deployment: - prefect deployment apply pipeline/main-deployment.yaml - .PHONY: pull-data ## Downloads the data locally for testing pull-data: diff --git a/deps/dev-requirements.in b/deps/dev-requirements.in index f392554..40cd4be 100644 --- a/deps/dev-requirements.in +++ b/deps/dev-requirements.in @@ -1,3 +1,2 @@ -r requirements.txt -black ruff \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 1378b95..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,14 +0,0 @@ -[tool.black] -line-length = 79 -extend-exclude = ''' -/( - | .env - | environment.yaml - | notebooks - | data - | dvc.lock - | dvc.yaml - | LICENSE - | README.md -)/ -''' \ No newline at end of file diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..723e27e --- /dev/null +++ b/ruff.toml @@ -0,0 +1 @@ +line-length = 79 \ No newline at end of file From 1b568b2b63f94f1eb4ea912c876e4d3bdd190b96 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Mon, 22 Apr 2024 08:02:48 -0500 Subject: [PATCH 03/11] Upgrade to Python 3.11, switch to uv for dependency management. --- Makefile | 8 +- deps/dev-requirements.txt | 586 +++++++++----------------------------- environment.yaml | 4 +- 3 files changed, 137 insertions(+), 461 deletions(-) diff --git a/Makefile b/Makefile index 605ef16..252e05a 100644 --- a/Makefile +++ b/Makefile @@ -2,19 +2,19 @@ ## Compile requirements.in into requirements.txt deps/requirements.txt: deps/requirements.in - pip-compile deps/requirements.in --output-file deps/requirements.txt + uv pip compile deps/requirements.in --output-file deps/requirements.txt ## Compile dev-requirements.in into dev-requirements.txt deps/dev-requirements.txt: deps/dev-requirements.in deps/requirements.txt - pip-compile deps/dev-requirements.in --output-file deps/dev-requirements.txt + uv pip compile deps/dev-requirements.in --output-file deps/dev-requirements.txt ## Install non-dev dependencies. env: deps/requirements.txt - pip-sync deps/requirements.txt + uv pip sync deps/requirements.txt ## Install dev and non-dev dependencies. dev-env: deps/dev-requirements.txt - pip-sync deps/dev-requirements.txt + uv pip sync deps/dev-requirements.txt ## Lint project with ruff. 
lint: diff --git a/deps/dev-requirements.txt b/deps/dev-requirements.txt index b0ce1d1..550948b 100644 --- a/deps/dev-requirements.txt +++ b/deps/dev-requirements.txt @@ -1,54 +1,30 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile --output-file=deps/dev-requirements.txt deps/dev-requirements.in -# +# This file was autogenerated by uv via the following command: +# uv pip compile deps/dev-requirements.in --output-file deps/dev-requirements.txt agate==1.7.0 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core aiohttp==3.8.4 - # via - # -r deps/requirements.txt - # gcsfs + # via gcsfs aiosignal==1.3.1 - # via - # -r deps/requirements.txt - # aiohttp + # via aiohttp aiosqlite==0.18.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect alembic==1.10.2 - # via - # -r deps/requirements.txt - # prefect + # via prefect anyio==3.7.1 # via - # -r deps/requirements.txt # httpcore # prefect # starlette apprise==1.3.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect asgi-lifespan==2.0.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect async-timeout==4.0.2 - # via - # -r deps/requirements.txt - # aiohttp + # via aiohttp asyncpg==0.27.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect attrs==22.2.0 # via - # -r deps/requirements.txt # aiohttp # automat # jsonschema @@ -56,26 +32,15 @@ attrs==22.2.0 # twisted # visions automat==22.10.0 - # via - # -r deps/requirements.txt - # twisted + # via twisted babel==2.12.1 - # via - # -r deps/requirements.txt - # agate + # via agate betterproto==1.2.5 - # via - # -r deps/requirements.txt - # dbt-core -black==23.9.1 - # via -r deps/dev-requirements.in + # via dbt-core cachetools==5.3.0 - # via - # -r deps/requirements.txt - # google-auth + # via google-auth certifi==2022.12.7 # via - # -r deps/requirements.txt # apprise # httpcore # httpx @@ -83,140 +48,89 @@ certifi==2022.12.7 # requests cffi==1.15.1 # via - # -r deps/requirements.txt # cryptography # dbt-core charset-normalizer==3.1.0 # via - # -r deps/requirements.txt # aiohttp # requests click==8.1.3 # via - # -r deps/requirements.txt # apprise - # black # dbt-core # prefect # typer # uvicorn cloudpickle==2.2.1 - # via - # -r deps/requirements.txt - # prefect + # via prefect colorama==0.4.6 # via - # -r deps/requirements.txt # dbt-core # griffe constantly==15.1.0 - # via - # -r deps/requirements.txt - # twisted + # via twisted contourpy==1.0.7 - # via - # -r deps/requirements.txt - # matplotlib + # via matplotlib coolname==2.2.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect croniter==1.3.8 - # via - # -r deps/requirements.txt - # prefect + # via prefect cryptography==40.0.1 # via - # -r deps/requirements.txt # prefect # pyopenssl # scrapy # service-identity cssselect==1.2.0 # via - # -r deps/requirements.txt # parsel # scrapy cycler==0.11.0 - # via - # -r deps/requirements.txt - # matplotlib + # via matplotlib dateparser==1.1.8 - # via - # -r deps/requirements.txt - # prefect + # via prefect dbt-core==1.4.5 # via - # -r deps/requirements.txt # dbt-duckdb # prefect-dbt dbt-duckdb==1.4.1 - # via -r deps/requirements.txt dbt-extractor==0.4.1 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core decorator==5.1.1 - # via - # -r deps/requirements.txt - # gcsfs + # via gcsfs dnspython==2.4.2 - # via - # -r deps/requirements.txt - # email-validator + # via email-validator docker==6.0.1 - # via - # -r deps/requirements.txt - # prefect + # via prefect 
duckdb==0.7.1 - # via - # -r deps/requirements.txt - # dbt-duckdb + # via dbt-duckdb email-validator==2.1.0.post1 - # via - # -r deps/requirements.txt - # pydantic + # via pydantic exceptiongroup==1.1.3 - # via - # -r deps/requirements.txt - # anyio filelock==3.10.7 - # via - # -r deps/requirements.txt - # tldextract + # via tldextract fonttools==4.39.2 - # via - # -r deps/requirements.txt - # matplotlib + # via matplotlib frozenlist==1.3.3 # via - # -r deps/requirements.txt # aiohttp # aiosignal fsspec==2023.1.0 # via - # -r deps/requirements.txt # gcsfs # prefect future==0.18.3 - # via - # -r deps/requirements.txt - # parsedatetime + # via parsedatetime gcsfs==2023.1.0 - # via -r deps/requirements.txt google-api-core==2.11.0 # via - # -r deps/requirements.txt # google-api-python-client # google-cloud-core # google-cloud-storage google-api-python-client==2.83.0 - # via - # -r deps/requirements.txt - # prefect-gcp + # via prefect-gcp google-auth==2.16.3 # via - # -r deps/requirements.txt # gcsfs # google-api-core # google-api-python-client @@ -226,103 +140,62 @@ google-auth==2.16.3 # google-cloud-storage # kubernetes google-auth-httplib2==0.1.0 - # via - # -r deps/requirements.txt - # google-api-python-client + # via google-api-python-client google-auth-oauthlib==1.0.0 - # via - # -r deps/requirements.txt - # gcsfs + # via gcsfs google-cloud-core==2.3.2 - # via - # -r deps/requirements.txt - # google-cloud-storage + # via google-cloud-storage google-cloud-storage==2.8.0 # via - # -r deps/requirements.txt # gcsfs # prefect-gcp google-crc32c==1.5.0 - # via - # -r deps/requirements.txt - # google-resumable-media + # via google-resumable-media google-resumable-media==2.4.1 - # via - # -r deps/requirements.txt - # google-cloud-storage + # via google-cloud-storage googleapis-common-protos==1.59.0 - # via - # -r deps/requirements.txt - # google-api-core + # via google-api-core graphql-core==3.2.3 - # via - # -r deps/requirements.txt - # sgqlc + # via sgqlc graphviz==0.20.1 - # via - # -r deps/requirements.txt - # prefect + # via prefect greenlet==2.0.2 - # via - # -r deps/requirements.txt - # sqlalchemy + # via sqlalchemy griffe==0.25.5 - # via - # -r deps/requirements.txt - # prefect + # via prefect grpclib==0.4.3 - # via - # -r deps/requirements.txt - # betterproto + # via betterproto h11==0.14.0 # via - # -r deps/requirements.txt # httpcore # uvicorn h2==4.1.0 # via - # -r deps/requirements.txt # grpclib # httpx h3==3.7.6 - # via -r deps/requirements.txt hologram==0.0.15 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core hpack==4.0.0 - # via - # -r deps/requirements.txt - # h2 + # via h2 htmlmin==0.1.12 - # via - # -r deps/requirements.txt - # ydata-profiling + # via ydata-profiling httpcore==0.16.3 # via - # -r deps/requirements.txt # httpx # prefect httplib2==0.22.0 # via - # -r deps/requirements.txt # google-api-python-client # google-auth-httplib2 -httpx[http2]==0.23.3 - # via - # -r deps/requirements.txt - # prefect +httpx==0.23.3 + # via prefect hyperframe==6.0.1 - # via - # -r deps/requirements.txt - # h2 + # via h2 hyperlink==21.0.0 - # via - # -r deps/requirements.txt - # twisted + # via twisted idna==3.4 # via - # -r deps/requirements.txt # anyio # dbt-core # email-validator @@ -333,143 +206,90 @@ idna==3.4 # yarl imagehash==4.3.1 # via - # -r deps/requirements.txt # visions # ydata-profiling importlib-metadata==6.1.0 - # via - # -r deps/requirements.txt - # markdown - # prefect incremental==22.10.0 - # via - # -r deps/requirements.txt - # twisted + # via twisted 
isodate==0.6.1 # via - # -r deps/requirements.txt # agate # dbt-core itemadapter==0.7.0 # via - # -r deps/requirements.txt # itemloaders # scrapy itemloaders==1.0.6 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy jinja2==3.1.2 # via - # -r deps/requirements.txt # dbt-core # prefect # ydata-profiling jmespath==1.0.1 - # via - # -r deps/requirements.txt - # itemloaders + # via itemloaders joblib==1.2.0 - # via - # -r deps/requirements.txt - # phik + # via phik jsonpatch==1.32 - # via - # -r deps/requirements.txt - # prefect + # via prefect jsonpointer==2.3 - # via - # -r deps/requirements.txt - # jsonpatch + # via jsonpatch jsonschema==3.2.0 # via - # -r deps/requirements.txt # hologram # prefect kiwisolver==1.4.4 - # via - # -r deps/requirements.txt - # matplotlib + # via matplotlib kubernetes==26.1.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect leather==0.3.4 - # via - # -r deps/requirements.txt - # agate + # via agate logbook==1.5.3 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core loguru==0.6.0 - # via -r deps/requirements.txt lxml==4.9.2 # via - # -r deps/requirements.txt # parsel # scrapy mako==1.2.4 - # via - # -r deps/requirements.txt - # alembic + # via alembic markdown==3.4.3 - # via - # -r deps/requirements.txt - # apprise + # via apprise markdown-it-py==2.2.0 - # via - # -r deps/requirements.txt - # rich + # via rich markupsafe==2.1.2 # via - # -r deps/requirements.txt # jinja2 # mako # werkzeug -mashumaro[msgpack]==3.3.1 - # via - # -r deps/requirements.txt - # dbt-core +mashumaro==3.3.1 + # via dbt-core matplotlib==3.6.3 # via - # -r deps/requirements.txt # phik # seaborn # ydata-profiling mdurl==0.1.2 - # via - # -r deps/requirements.txt - # markdown-it-py + # via markdown-it-py minimal-snowplow-tracker==0.0.2 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core msgpack==1.0.5 - # via - # -r deps/requirements.txt - # mashumaro + # via mashumaro multidict==6.0.4 # via - # -r deps/requirements.txt # aiohttp # grpclib # yarl multimethod==1.9.1 # via - # -r deps/requirements.txt # visions # ydata-profiling -mypy-extensions==1.0.0 - # via black networkx==2.8.8 # via - # -r deps/requirements.txt # dbt-core # visions numpy==1.23.5 # via - # -r deps/requirements.txt # contourpy # imagehash # matplotlib @@ -484,17 +304,11 @@ numpy==1.23.5 # visions # ydata-profiling oauthlib==3.2.2 - # via - # -r deps/requirements.txt - # requests-oauthlib + # via requests-oauthlib orjson==3.8.8 - # via - # -r deps/requirements.txt - # prefect + # via prefect packaging==23.0 # via - # -r deps/requirements.txt - # black # dbt-core # docker # matplotlib @@ -504,122 +318,80 @@ packaging==23.0 # statsmodels pandas==1.5.3 # via - # -r deps/requirements.txt # phik # seaborn # statsmodels # visions # ydata-profiling parsedatetime==2.4 - # via - # -r deps/requirements.txt - # agate + # via agate parsel==1.7.0 # via - # -r deps/requirements.txt # itemloaders # scrapy pathspec==0.10.3 # via - # -r deps/requirements.txt - # black # dbt-core # prefect patsy==0.5.3 - # via - # -r deps/requirements.txt - # statsmodels + # via statsmodels pendulum==2.1.2 - # via - # -r deps/requirements.txt - # prefect + # via prefect phik==0.12.3 - # via - # -r deps/requirements.txt - # ydata-profiling + # via ydata-profiling pillow==9.4.0 # via - # -r deps/requirements.txt # imagehash # matplotlib # visions -platformdirs==3.11.0 - # via black -polars[pyarrow]==0.16.16 - # via -r deps/requirements.txt +polars==0.16.16 prefect==2.14.2 # via - # -r deps/requirements.txt 
# prefect-dbt # prefect-gcp # prefect-shell prefect-dbt==0.3.1 - # via -r deps/requirements.txt -prefect-gcp[cloud_storage]==0.3.0 - # via -r deps/requirements.txt +prefect-gcp==0.3.0 prefect-shell==0.1.5 - # via - # -r deps/requirements.txt - # prefect-dbt + # via prefect-dbt protego==0.2.1 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy protobuf==4.22.1 # via - # -r deps/requirements.txt # google-api-core # googleapis-common-protos pyarrow==11.0.0 - # via - # -r deps/requirements.txt - # polars + # via polars pyasn1==0.4.8 # via - # -r deps/requirements.txt # pyasn1-modules # rsa # service-identity pyasn1-modules==0.2.8 # via - # -r deps/requirements.txt # google-auth # service-identity pycparser==2.21 + # via cffi +pydantic==1.10.7 # via - # -r deps/requirements.txt - # cffi -pydantic[email]==1.10.7 - # via - # -r deps/requirements.txt # prefect # ydata-profiling pydispatcher==2.0.7 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy pygeohash==1.2.0 - # via -r deps/requirements.txt pygments==2.14.0 - # via - # -r deps/requirements.txt - # rich + # via rich pyopenssl==23.1.1 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy pyparsing==3.0.9 # via - # -r deps/requirements.txt # httplib2 # matplotlib pyrsistent==0.19.3 - # via - # -r deps/requirements.txt - # jsonschema + # via jsonschema python-dateutil==2.8.2 # via - # -r deps/requirements.txt # croniter # dateparser # hologram @@ -629,58 +401,39 @@ python-dateutil==2.8.2 # pendulum # prefect python-dotenv==1.0.0 - # via -r deps/requirements.txt python-slugify==8.0.1 # via - # -r deps/requirements.txt # agate # prefect pytimeparse==1.1.8 - # via - # -r deps/requirements.txt - # agate + # via agate pytz==2023.2 # via - # -r deps/requirements.txt # dateparser # dbt-core # pandas # prefect pytz-deprecation-shim==0.1.0.post0 - # via - # -r deps/requirements.txt - # tzlocal + # via tzlocal pytzdata==2020.1 - # via - # -r deps/requirements.txt - # pendulum + # via pendulum pywavelets==1.4.1 - # via - # -r deps/requirements.txt - # imagehash + # via imagehash pyyaml==6.0 # via - # -r deps/requirements.txt # apprise # dbt-core # kubernetes # prefect # ydata-profiling queuelib==1.6.2 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy readchar==4.0.5 - # via - # -r deps/requirements.txt - # prefect + # via prefect regex==2023.3.23 - # via - # -r deps/requirements.txt - # dateparser + # via dateparser requests==2.28.2 # via - # -r deps/requirements.txt # apprise # dbt-core # docker @@ -694,61 +447,45 @@ requests==2.28.2 # tldextract # ydata-profiling requests-file==1.5.1 - # via - # -r deps/requirements.txt - # tldextract + # via tldextract requests-oauthlib==1.3.1 # via - # -r deps/requirements.txt # apprise # google-auth-oauthlib # kubernetes -rfc3986[idna2008]==1.5.0 - # via - # -r deps/requirements.txt - # httpx +rfc3986==1.5.0 + # via httpx rich==13.3.3 - # via - # -r deps/requirements.txt - # prefect + # via prefect rsa==4.9 - # via - # -r deps/requirements.txt - # google-auth + # via google-auth ruamel-yaml==0.17.35 - # via - # -r deps/requirements.txt - # prefect + # via prefect ruamel-yaml-clib==0.2.8 - # via - # -r deps/requirements.txt - # ruamel-yaml + # via ruamel-yaml ruff==0.0.292 - # via -r deps/dev-requirements.in scipy==1.9.3 # via - # -r deps/requirements.txt # imagehash # phik # statsmodels # ydata-profiling scrapy==2.8.0 - # via -r deps/requirements.txt seaborn==0.12.2 - # via - # -r deps/requirements.txt - # ydata-profiling + # via ydata-profiling service-identity==21.1.0 + # 
via scrapy +setuptools==69.5.1 # via - # -r deps/requirements.txt + # jsonschema + # kubernetes + # readchar # scrapy + # zope-interface sgqlc==16.1 - # via - # -r deps/requirements.txt - # prefect-dbt + # via prefect-dbt six==1.16.0 # via - # -r deps/requirements.txt # automat # google-auth # google-auth-httplib2 @@ -764,145 +501,84 @@ six==1.16.0 # service-identity sniffio==1.3.0 # via - # -r deps/requirements.txt # anyio # asgi-lifespan # httpcore # httpx # prefect -sqlalchemy[asyncio]==1.4.47 +sqlalchemy==1.4.47 # via - # -r deps/requirements.txt # alembic # prefect sqlparse==0.4.3 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core starlette==0.27.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect statsmodels==0.13.5 - # via - # -r deps/requirements.txt - # ydata-profiling + # via ydata-profiling stringcase==1.2.0 - # via - # -r deps/requirements.txt - # betterproto + # via betterproto tangled-up-in-unicode==0.2.0 - # via - # -r deps/requirements.txt - # visions + # via visions text-unidecode==1.3 - # via - # -r deps/requirements.txt - # python-slugify + # via python-slugify tldextract==3.4.0 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy toml==0.10.2 - # via - # -r deps/requirements.txt - # prefect -tomli==2.0.1 - # via black + # via prefect toolz==0.12.0 - # via -r deps/requirements.txt tqdm==4.64.1 - # via - # -r deps/requirements.txt - # ydata-profiling + # via ydata-profiling twisted==22.10.0 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy typeguard==2.13.3 - # via - # -r deps/requirements.txt - # ydata-profiling + # via ydata-profiling typer==0.7.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect typing-extensions==4.5.0 # via - # -r deps/requirements.txt # alembic - # black # dbt-core # mashumaro - # polars # prefect # pydantic - # starlette # twisted tzdata==2023.2 - # via - # -r deps/requirements.txt - # pytz-deprecation-shim + # via pytz-deprecation-shim tzlocal==4.3 - # via - # -r deps/requirements.txt - # dateparser + # via dateparser ujson==5.8.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect uritemplate==4.1.1 - # via - # -r deps/requirements.txt - # google-api-python-client + # via google-api-python-client urllib3==1.26.15 # via - # -r deps/requirements.txt # docker # kubernetes # requests uvicorn==0.21.1 - # via - # -r deps/requirements.txt - # prefect -visions[type_image_path]==0.7.5 - # via - # -r deps/requirements.txt - # ydata-profiling + # via prefect +visions==0.7.5 + # via ydata-profiling w3lib==2.1.1 # via - # -r deps/requirements.txt # itemloaders # parsel # scrapy websocket-client==1.5.1 # via - # -r deps/requirements.txt # docker # kubernetes websockets==10.4 - # via - # -r deps/requirements.txt - # prefect + # via prefect werkzeug==2.2.3 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core yarl==1.8.2 - # via - # -r deps/requirements.txt - # aiohttp + # via aiohttp ydata-profiling==4.1.2 - # via -r deps/requirements.txt zipp==3.15.0 - # via - # -r deps/requirements.txt - # importlib-metadata + # via importlib-metadata zope-interface==6.0 # via - # -r deps/requirements.txt # scrapy # twisted - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/environment.yaml b/environment.yaml index d9dce72..b0c8947 100644 --- a/environment.yaml +++ b/environment.yaml @@ -3,6 +3,6 @@ channels: - defaults dependencies: - pip - - python=3.9 + - python=3.11 - pip: - - pip-tools + - uv From 
7617be24bef131e96a673dd0fbc3e3dd2285e8b3 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 08:19:01 -0500 Subject: [PATCH 04/11] Bye, prefect. --- .gitignore | 1 + README.md | 2 +- deps/dev-requirements.txt | 256 +---------------- deps/requirements.in | 11 - deps/requirements.txt | 313 ++------------------- pipeline/bfro_pipeline.py | 123 -------- pipeline/bfro_pipeline_docker.py | 76 ----- pipeline/main-deployment.yaml | 65 ----- pipeline/scripts/combine_raw_reports.py | 60 ++++ pipeline/scripts/pull_weather.py | 3 + pipeline/{ => scripts}/upload_to_gdrive.py | 15 +- pipeline/update_geocoder.py | 82 ------ pipeline/update_reports.py | 118 -------- pipeline/update_weather.py | 102 ------- run-pipeline-local.sh | 62 ++++ 15 files changed, 172 insertions(+), 1117 deletions(-) delete mode 100644 pipeline/bfro_pipeline.py delete mode 100644 pipeline/bfro_pipeline_docker.py delete mode 100644 pipeline/main-deployment.yaml create mode 100644 pipeline/scripts/combine_raw_reports.py rename pipeline/{ => scripts}/upload_to_gdrive.py (93%) delete mode 100644 pipeline/update_geocoder.py delete mode 100644 pipeline/update_reports.py delete mode 100644 pipeline/update_weather.py create mode 100644 run-pipeline-local.sh diff --git a/.gitignore b/.gitignore index f04d624..43175b5 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ data_backup service-account.json gdrive-service-account.json run-docker.sh +run-pipeline-docker.sh \ No newline at end of file diff --git a/README.md b/README.md index 8dd88fb..9c6759d 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ To get started, run the following commands to set up the environment. ```shell conda env create -f environment.yaml -conda activate bfro +conda activate bfro-sightings-data make dev-env ``` diff --git a/deps/dev-requirements.txt b/deps/dev-requirements.txt index 550948b..3682e52 100644 --- a/deps/dev-requirements.txt +++ b/deps/dev-requirements.txt @@ -2,30 +2,8 @@ # uv pip compile deps/dev-requirements.in --output-file deps/dev-requirements.txt agate==1.7.0 # via dbt-core -aiohttp==3.8.4 - # via gcsfs -aiosignal==1.3.1 - # via aiohttp -aiosqlite==0.18.0 - # via prefect -alembic==1.10.2 - # via prefect -anyio==3.7.1 - # via - # httpcore - # prefect - # starlette -apprise==1.3.0 - # via prefect -asgi-lifespan==2.0.0 - # via prefect -async-timeout==4.0.2 - # via aiohttp -asyncpg==0.27.0 - # via prefect attrs==22.2.0 # via - # aiohttp # automat # jsonschema # service-identity @@ -40,44 +18,25 @@ betterproto==1.2.5 cachetools==5.3.0 # via google-auth certifi==2022.12.7 - # via - # apprise - # httpcore - # httpx - # kubernetes - # requests + # via requests cffi==1.15.1 # via # cryptography # dbt-core charset-normalizer==3.1.0 - # via - # aiohttp - # requests + # via requests click==8.1.3 # via - # apprise # dbt-core - # prefect # typer - # uvicorn -cloudpickle==2.2.1 - # via prefect colorama==0.4.6 - # via - # dbt-core - # griffe + # via dbt-core constantly==15.1.0 # via twisted contourpy==1.0.7 # via matplotlib -coolname==2.2.0 - # via prefect -croniter==1.3.8 - # via prefect cryptography==40.0.1 # via - # prefect # pyopenssl # scrapy # service-identity @@ -87,92 +46,37 @@ cssselect==1.2.0 # scrapy cycler==0.11.0 # via matplotlib -dateparser==1.1.8 - # via prefect dbt-core==1.4.5 - # via - # dbt-duckdb - # prefect-dbt + # via dbt-duckdb dbt-duckdb==1.4.1 dbt-extractor==0.4.1 # via dbt-core -decorator==5.1.1 - # via gcsfs -dnspython==2.4.2 - # via email-validator -docker==6.0.1 - # via prefect duckdb==0.7.1 # via dbt-duckdb 
-email-validator==2.1.0.post1 - # via pydantic -exceptiongroup==1.1.3 filelock==3.10.7 # via tldextract fonttools==4.39.2 # via matplotlib -frozenlist==1.3.3 - # via - # aiohttp - # aiosignal -fsspec==2023.1.0 - # via - # gcsfs - # prefect future==0.18.3 # via parsedatetime -gcsfs==2023.1.0 google-api-core==2.11.0 - # via - # google-api-python-client - # google-cloud-core - # google-cloud-storage + # via google-api-python-client google-api-python-client==2.83.0 - # via prefect-gcp google-auth==2.16.3 # via - # gcsfs # google-api-core # google-api-python-client # google-auth-httplib2 # google-auth-oauthlib - # google-cloud-core - # google-cloud-storage - # kubernetes google-auth-httplib2==0.1.0 # via google-api-python-client google-auth-oauthlib==1.0.0 - # via gcsfs -google-cloud-core==2.3.2 - # via google-cloud-storage -google-cloud-storage==2.8.0 - # via - # gcsfs - # prefect-gcp -google-crc32c==1.5.0 - # via google-resumable-media -google-resumable-media==2.4.1 - # via google-cloud-storage googleapis-common-protos==1.59.0 # via google-api-core -graphql-core==3.2.3 - # via sgqlc -graphviz==0.20.1 - # via prefect -greenlet==2.0.2 - # via sqlalchemy -griffe==0.25.5 - # via prefect grpclib==0.4.3 # via betterproto -h11==0.14.0 - # via - # httpcore - # uvicorn h2==4.1.0 - # via - # grpclib - # httpx + # via grpclib h3==3.7.6 hologram==0.0.15 # via dbt-core @@ -180,35 +84,24 @@ hpack==4.0.0 # via h2 htmlmin==0.1.12 # via ydata-profiling -httpcore==0.16.3 - # via - # httpx - # prefect httplib2==0.22.0 # via # google-api-python-client # google-auth-httplib2 -httpx==0.23.3 - # via prefect hyperframe==6.0.1 # via h2 hyperlink==21.0.0 # via twisted idna==3.4 # via - # anyio # dbt-core - # email-validator # hyperlink # requests - # rfc3986 # tldextract - # yarl imagehash==4.3.1 # via # visions # ydata-profiling -importlib-metadata==6.1.0 incremental==22.10.0 # via twisted isodate==0.6.1 @@ -224,24 +117,15 @@ itemloaders==1.0.6 jinja2==3.1.2 # via # dbt-core - # prefect # ydata-profiling jmespath==1.0.1 # via itemloaders joblib==1.2.0 # via phik -jsonpatch==1.32 - # via prefect -jsonpointer==2.3 - # via jsonpatch jsonschema==3.2.0 - # via - # hologram - # prefect + # via hologram kiwisolver==1.4.4 # via matplotlib -kubernetes==26.1.0 - # via prefect leather==0.3.4 # via agate logbook==1.5.3 @@ -251,16 +135,9 @@ lxml==4.9.2 # via # parsel # scrapy -mako==1.2.4 - # via alembic -markdown==3.4.3 - # via apprise -markdown-it-py==2.2.0 - # via rich markupsafe==2.1.2 # via # jinja2 - # mako # werkzeug mashumaro==3.3.1 # via dbt-core @@ -269,17 +146,12 @@ matplotlib==3.6.3 # phik # seaborn # ydata-profiling -mdurl==0.1.2 - # via markdown-it-py minimal-snowplow-tracker==0.0.2 # via dbt-core msgpack==1.0.5 # via mashumaro multidict==6.0.4 - # via - # aiohttp - # grpclib - # yarl + # via grpclib multimethod==1.9.1 # via # visions @@ -305,15 +177,11 @@ numpy==1.23.5 # ydata-profiling oauthlib==3.2.2 # via requests-oauthlib -orjson==3.8.8 - # via prefect packaging==23.0 # via # dbt-core - # docker # matplotlib # parsel - # prefect # scrapy # statsmodels pandas==1.5.3 @@ -330,13 +198,9 @@ parsel==1.7.0 # itemloaders # scrapy pathspec==0.10.3 - # via - # dbt-core - # prefect + # via dbt-core patsy==0.5.3 # via statsmodels -pendulum==2.1.2 - # via prefect phik==0.12.3 # via ydata-profiling pillow==9.4.0 @@ -345,15 +209,6 @@ pillow==9.4.0 # matplotlib # visions polars==0.16.16 -prefect==2.14.2 - # via - # prefect-dbt - # prefect-gcp - # prefect-shell -prefect-dbt==0.3.1 -prefect-gcp==0.3.0 -prefect-shell==0.1.5 - # 
via prefect-dbt protego==0.2.1 # via scrapy protobuf==4.22.1 @@ -361,7 +216,6 @@ protobuf==4.22.1 # google-api-core # googleapis-common-protos pyarrow==11.0.0 - # via polars pyasn1==0.4.8 # via # pyasn1-modules @@ -374,14 +228,10 @@ pyasn1-modules==0.2.8 pycparser==2.21 # via cffi pydantic==1.10.7 - # via - # prefect - # ydata-profiling + # via ydata-profiling pydispatcher==2.0.7 # via scrapy pygeohash==1.2.0 -pygments==2.14.0 - # via rich pyopenssl==23.1.1 # via scrapy pyparsing==3.0.9 @@ -392,55 +242,30 @@ pyrsistent==0.19.3 # via jsonschema python-dateutil==2.8.2 # via - # croniter - # dateparser # hologram - # kubernetes # matplotlib # pandas - # pendulum - # prefect python-dotenv==1.0.0 python-slugify==8.0.1 - # via - # agate - # prefect + # via agate pytimeparse==1.1.8 # via agate pytz==2023.2 # via - # dateparser # dbt-core # pandas - # prefect -pytz-deprecation-shim==0.1.0.post0 - # via tzlocal -pytzdata==2020.1 - # via pendulum pywavelets==1.4.1 # via imagehash pyyaml==6.0 # via - # apprise # dbt-core - # kubernetes - # prefect # ydata-profiling queuelib==1.6.2 # via scrapy -readchar==4.0.5 - # via prefect -regex==2023.3.23 - # via dateparser requests==2.28.2 # via - # apprise # dbt-core - # docker - # gcsfs # google-api-core - # google-cloud-storage - # kubernetes # minimal-snowplow-tracker # requests-file # requests-oauthlib @@ -449,20 +274,9 @@ requests==2.28.2 requests-file==1.5.1 # via tldextract requests-oauthlib==1.3.1 - # via - # apprise - # google-auth-oauthlib - # kubernetes -rfc3986==1.5.0 - # via httpx -rich==13.3.3 - # via prefect + # via google-auth-oauthlib rsa==4.9 # via google-auth -ruamel-yaml==0.17.35 - # via prefect -ruamel-yaml-clib==0.2.8 - # via ruamel-yaml ruff==0.0.292 scipy==1.9.3 # via @@ -478,12 +292,8 @@ service-identity==21.1.0 setuptools==69.5.1 # via # jsonschema - # kubernetes - # readchar # scrapy # zope-interface -sgqlc==16.1 - # via prefect-dbt six==1.16.0 # via # automat @@ -491,7 +301,6 @@ six==1.16.0 # google-auth-httplib2 # isodate # jsonschema - # kubernetes # leather # minimal-snowplow-tracker # patsy @@ -499,21 +308,8 @@ six==1.16.0 # python-dateutil # requests-file # service-identity -sniffio==1.3.0 - # via - # anyio - # asgi-lifespan - # httpcore - # httpx - # prefect -sqlalchemy==1.4.47 - # via - # alembic - # prefect sqlparse==0.4.3 # via dbt-core -starlette==0.27.0 - # via prefect statsmodels==0.13.5 # via ydata-profiling stringcase==1.2.0 @@ -524,8 +320,6 @@ text-unidecode==1.3 # via python-slugify tldextract==3.4.0 # via scrapy -toml==0.10.2 - # via prefect toolz==0.12.0 tqdm==4.64.1 # via ydata-profiling @@ -534,30 +328,16 @@ twisted==22.10.0 typeguard==2.13.3 # via ydata-profiling typer==0.7.0 - # via prefect typing-extensions==4.5.0 # via - # alembic # dbt-core # mashumaro - # prefect # pydantic # twisted -tzdata==2023.2 - # via pytz-deprecation-shim -tzlocal==4.3 - # via dateparser -ujson==5.8.0 - # via prefect uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.15 - # via - # docker - # kubernetes - # requests -uvicorn==0.21.1 - # via prefect + # via requests visions==0.7.5 # via ydata-profiling w3lib==2.1.1 @@ -565,19 +345,9 @@ w3lib==2.1.1 # itemloaders # parsel # scrapy -websocket-client==1.5.1 - # via - # docker - # kubernetes -websockets==10.4 - # via prefect werkzeug==2.2.3 # via dbt-core -yarl==1.8.2 - # via aiohttp ydata-profiling==4.1.2 -zipp==3.15.0 - # via importlib-metadata zope-interface==6.0 # via # scrapy diff --git a/deps/requirements.in b/deps/requirements.in index a17ae38..c069c1c 100644 --- 
a/deps/requirements.in +++ b/deps/requirements.in @@ -6,21 +6,10 @@ scrapy>=2,<3 toolz loguru ydata-profiling - -# Leaving dvc out for now. The issue is a dependency of dvc[gs] (specifically -# fsspec/gcsfs requires fsspec==2023.3.0. Prefect explicitly ignores this -# version of fsspec for ... reasons. Idk. It's a PITA. -# dvc[gs] - duckdb polars[pyarrow] -prefect~=2.14 -prefect-shell -prefect-dbt -prefect-gcp[cloud_storage] h3<4.0 dbt-duckdb -gcsfs!=2023.3.0 google-api-python-client google-auth-httplib2 google-auth-oauthlib \ No newline at end of file diff --git a/deps/requirements.txt b/deps/requirements.txt index 406ab41..80ebce9 100644 --- a/deps/requirements.txt +++ b/deps/requirements.txt @@ -1,35 +1,9 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile --output-file=deps/requirements.txt deps/requirements.in -# +# This file was autogenerated by uv via the following command: +# uv pip compile deps/requirements.in --output-file deps/requirements.txt agate==1.7.0 # via dbt-core -aiohttp==3.8.4 - # via gcsfs -aiosignal==1.3.1 - # via aiohttp -aiosqlite==0.18.0 - # via prefect -alembic==1.10.2 - # via prefect -anyio==3.7.1 - # via - # httpcore - # prefect - # starlette -apprise==1.3.0 - # via prefect -asgi-lifespan==2.0.0 - # via prefect -async-timeout==4.0.2 - # via aiohttp -asyncpg==0.27.0 - # via prefect attrs==22.2.0 # via - # aiohttp # automat # jsonschema # service-identity @@ -44,44 +18,25 @@ betterproto==1.2.5 cachetools==5.3.0 # via google-auth certifi==2022.12.7 - # via - # apprise - # httpcore - # httpx - # kubernetes - # requests + # via requests cffi==1.15.1 # via # cryptography # dbt-core charset-normalizer==3.1.0 - # via - # aiohttp - # requests + # via requests click==8.1.3 # via - # apprise # dbt-core - # prefect # typer - # uvicorn -cloudpickle==2.2.1 - # via prefect colorama==0.4.6 - # via - # dbt-core - # griffe + # via dbt-core constantly==15.1.0 # via twisted contourpy==1.0.7 # via matplotlib -coolname==2.2.0 - # via prefect -croniter==1.3.8 - # via prefect cryptography==40.0.1 # via - # prefect # pyopenssl # scrapy # service-identity @@ -91,143 +46,62 @@ cssselect==1.2.0 # scrapy cycler==0.11.0 # via matplotlib -dateparser==1.1.8 - # via prefect dbt-core==1.4.5 - # via - # dbt-duckdb - # prefect-dbt + # via dbt-duckdb dbt-duckdb==1.4.1 - # via -r deps/requirements.in dbt-extractor==0.4.1 # via dbt-core -decorator==5.1.1 - # via gcsfs -dnspython==2.4.2 - # via email-validator -docker==6.0.1 - # via prefect duckdb==0.7.1 - # via - # -r deps/requirements.in - # dbt-duckdb -email-validator==2.1.0.post1 - # via pydantic -exceptiongroup==1.1.3 - # via anyio + # via dbt-duckdb filelock==3.10.7 # via tldextract fonttools==4.39.2 # via matplotlib -frozenlist==1.3.3 - # via - # aiohttp - # aiosignal -fsspec==2023.1.0 - # via - # gcsfs - # prefect future==0.18.3 # via parsedatetime -gcsfs==2023.1.0 - # via -r deps/requirements.in google-api-core==2.11.0 - # via - # google-api-python-client - # google-cloud-core - # google-cloud-storage + # via google-api-python-client google-api-python-client==2.83.0 - # via - # -r deps/requirements.in - # prefect-gcp google-auth==2.16.3 # via - # gcsfs # google-api-core # google-api-python-client # google-auth-httplib2 # google-auth-oauthlib - # google-cloud-core - # google-cloud-storage - # kubernetes google-auth-httplib2==0.1.0 - # via - # -r deps/requirements.in - # google-api-python-client + # via google-api-python-client google-auth-oauthlib==1.0.0 - # via - # -r 
deps/requirements.in - # gcsfs -google-cloud-core==2.3.2 - # via google-cloud-storage -google-cloud-storage==2.8.0 - # via - # gcsfs - # prefect-gcp -google-crc32c==1.5.0 - # via google-resumable-media -google-resumable-media==2.4.1 - # via google-cloud-storage googleapis-common-protos==1.59.0 # via google-api-core -graphql-core==3.2.3 - # via sgqlc -graphviz==0.20.1 - # via prefect -greenlet==2.0.2 - # via sqlalchemy -griffe==0.25.5 - # via prefect grpclib==0.4.3 # via betterproto -h11==0.14.0 - # via - # httpcore - # uvicorn h2==4.1.0 - # via - # grpclib - # httpx + # via grpclib h3==3.7.6 - # via -r deps/requirements.in hologram==0.0.15 # via dbt-core hpack==4.0.0 # via h2 htmlmin==0.1.12 # via ydata-profiling -httpcore==0.16.3 - # via - # httpx - # prefect httplib2==0.22.0 # via # google-api-python-client # google-auth-httplib2 -httpx[http2]==0.23.3 - # via prefect hyperframe==6.0.1 # via h2 hyperlink==21.0.0 # via twisted idna==3.4 # via - # anyio # dbt-core - # email-validator # hyperlink # requests - # rfc3986 # tldextract - # yarl imagehash==4.3.1 # via # visions # ydata-profiling -importlib-metadata==6.1.0 - # via - # markdown - # prefect incremental==22.10.0 # via twisted isodate==0.6.1 @@ -243,64 +117,41 @@ itemloaders==1.0.6 jinja2==3.1.2 # via # dbt-core - # prefect # ydata-profiling jmespath==1.0.1 # via itemloaders joblib==1.2.0 # via phik -jsonpatch==1.32 - # via prefect -jsonpointer==2.3 - # via jsonpatch jsonschema==3.2.0 - # via - # hologram - # prefect + # via hologram kiwisolver==1.4.4 # via matplotlib -kubernetes==26.1.0 - # via prefect leather==0.3.4 # via agate logbook==1.5.3 # via dbt-core loguru==0.6.0 - # via -r deps/requirements.in lxml==4.9.2 # via - # -r deps/requirements.in # parsel # scrapy -mako==1.2.4 - # via alembic -markdown==3.4.3 - # via apprise -markdown-it-py==2.2.0 - # via rich markupsafe==2.1.2 # via # jinja2 - # mako # werkzeug -mashumaro[msgpack]==3.3.1 +mashumaro==3.3.1 # via dbt-core matplotlib==3.6.3 # via # phik # seaborn # ydata-profiling -mdurl==0.1.2 - # via markdown-it-py minimal-snowplow-tracker==0.0.2 # via dbt-core msgpack==1.0.5 # via mashumaro multidict==6.0.4 - # via - # aiohttp - # grpclib - # yarl + # via grpclib multimethod==1.9.1 # via # visions @@ -326,15 +177,11 @@ numpy==1.23.5 # ydata-profiling oauthlib==3.2.2 # via requests-oauthlib -orjson==3.8.8 - # via prefect packaging==23.0 # via # dbt-core - # docker # matplotlib # parsel - # prefect # scrapy # statsmodels pandas==1.5.3 @@ -351,13 +198,9 @@ parsel==1.7.0 # itemloaders # scrapy pathspec==0.10.3 - # via - # dbt-core - # prefect + # via dbt-core patsy==0.5.3 # via statsmodels -pendulum==2.1.2 - # via prefect phik==0.12.3 # via ydata-profiling pillow==9.4.0 @@ -365,22 +208,7 @@ pillow==9.4.0 # imagehash # matplotlib # visions -polars[pyarrow]==0.16.16 - # via -r deps/requirements.in -prefect==2.14.2 - # via - # -r deps/requirements.in - # prefect-dbt - # prefect-gcp - # prefect-shell -prefect-dbt==0.3.1 - # via -r deps/requirements.in -prefect-gcp[cloud_storage]==0.3.0 - # via -r deps/requirements.in -prefect-shell==0.1.5 - # via - # -r deps/requirements.in - # prefect-dbt +polars==0.16.16 protego==0.2.1 # via scrapy protobuf==4.22.1 @@ -400,16 +228,11 @@ pyasn1-modules==0.2.8 # service-identity pycparser==2.21 # via cffi -pydantic[email]==1.10.7 - # via - # prefect - # ydata-profiling +pydantic==1.10.7 + # via ydata-profiling pydispatcher==2.0.7 # via scrapy pygeohash==1.2.0 - # via -r deps/requirements.in -pygments==2.14.0 - # via rich pyopenssl==23.1.1 # via scrapy 
pyparsing==3.0.9 @@ -420,56 +243,30 @@ pyrsistent==0.19.3 # via jsonschema python-dateutil==2.8.2 # via - # croniter - # dateparser # hologram - # kubernetes # matplotlib # pandas - # pendulum - # prefect python-dotenv==1.0.0 - # via -r deps/requirements.in python-slugify==8.0.1 - # via - # agate - # prefect + # via agate pytimeparse==1.1.8 # via agate pytz==2023.2 # via - # dateparser # dbt-core # pandas - # prefect -pytz-deprecation-shim==0.1.0.post0 - # via tzlocal -pytzdata==2020.1 - # via pendulum pywavelets==1.4.1 # via imagehash pyyaml==6.0 # via - # apprise # dbt-core - # kubernetes - # prefect # ydata-profiling queuelib==1.6.2 # via scrapy -readchar==4.0.5 - # via prefect -regex==2023.3.23 - # via dateparser requests==2.28.2 # via - # apprise # dbt-core - # docker - # gcsfs # google-api-core - # google-cloud-storage - # kubernetes # minimal-snowplow-tracker # requests-file # requests-oauthlib @@ -478,20 +275,9 @@ requests==2.28.2 requests-file==1.5.1 # via tldextract requests-oauthlib==1.3.1 - # via - # apprise - # google-auth-oauthlib - # kubernetes -rfc3986[idna2008]==1.5.0 - # via httpx -rich==13.3.3 - # via prefect + # via google-auth-oauthlib rsa==4.9 # via google-auth -ruamel-yaml==0.17.35 - # via prefect -ruamel-yaml-clib==0.2.8 - # via ruamel-yaml scipy==1.9.3 # via # imagehash @@ -499,13 +285,15 @@ scipy==1.9.3 # statsmodels # ydata-profiling scrapy==2.8.0 - # via -r deps/requirements.in seaborn==0.12.2 # via ydata-profiling service-identity==21.1.0 # via scrapy -sgqlc==16.1 - # via prefect-dbt +setuptools==69.5.1 + # via + # jsonschema + # scrapy + # zope-interface six==1.16.0 # via # automat @@ -513,7 +301,6 @@ six==1.16.0 # google-auth-httplib2 # isodate # jsonschema - # kubernetes # leather # minimal-snowplow-tracker # patsy @@ -521,21 +308,8 @@ six==1.16.0 # python-dateutil # requests-file # service-identity -sniffio==1.3.0 - # via - # anyio - # asgi-lifespan - # httpcore - # httpx - # prefect -sqlalchemy[asyncio]==1.4.47 - # via - # alembic - # prefect sqlparse==0.4.3 # via dbt-core -starlette==0.27.0 - # via prefect statsmodels==0.13.5 # via ydata-profiling stringcase==1.2.0 @@ -546,10 +320,7 @@ text-unidecode==1.3 # via python-slugify tldextract==3.4.0 # via scrapy -toml==0.10.2 - # via prefect toolz==0.12.0 - # via -r deps/requirements.in tqdm==4.64.1 # via ydata-profiling twisted==22.10.0 @@ -557,59 +328,27 @@ twisted==22.10.0 typeguard==2.13.3 # via ydata-profiling typer==0.7.0 - # via - # -r deps/requirements.in - # prefect typing-extensions==4.5.0 # via - # alembic # dbt-core # mashumaro - # polars - # prefect # pydantic - # starlette # twisted -tzdata==2023.2 - # via pytz-deprecation-shim -tzlocal==4.3 - # via dateparser -ujson==5.8.0 - # via prefect uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.15 - # via - # docker - # kubernetes - # requests -uvicorn==0.21.1 - # via prefect -visions[type_image_path]==0.7.5 + # via requests +visions==0.7.5 # via ydata-profiling w3lib==2.1.1 # via # itemloaders # parsel # scrapy -websocket-client==1.5.1 - # via - # docker - # kubernetes -websockets==10.4 - # via prefect werkzeug==2.2.3 # via dbt-core -yarl==1.8.2 - # via aiohttp ydata-profiling==4.1.2 - # via -r deps/requirements.in -zipp==3.15.0 - # via importlib-metadata zope-interface==6.0 # via # scrapy # twisted - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/pipeline/bfro_pipeline.py b/pipeline/bfro_pipeline.py deleted file mode 100644 index de7edac..0000000 --- a/pipeline/bfro_pipeline.py 
+++ /dev/null @@ -1,123 +0,0 @@ -import typer -from prefect import flow, task, get_run_logger -from prefect_dbt.cli.commands import DbtCoreOperation -from update_reports import update_reports -from update_geocoder import pull_and_update_geocoded_reports -from update_weather import update_weather -from pathlib import Path -from typing import Optional - - -@flow(name="Update sources") -def set_up_sources( - data_dir: Path = Path("data"), - weather_limit: int = 900, - test_run: bool = False, - visual_crossing_key: Optional[str] = None, -) -> bool: - reports_updated = update_reports(data_dir=data_dir, test_run=test_run) - geocoder_updated = pull_and_update_geocoded_reports(data_dir=data_dir) - weather_updated = False - if geocoder_updated: - weather_updated = update_weather( - limit=weather_limit, - data_dir=data_dir, - visual_crossing_key=visual_crossing_key, - ) - - return reports_updated and geocoder_updated and weather_updated - - -@task(name="DBT test sources") -def dbt_test_sources(data_dir: Path = Path("data")) -> bool: - logger = get_run_logger() - logger.info("Testing sources.") - DbtCoreOperation( - commands=[ - "dbt test --select source:local_files " - f'--vars \'{{"data_dir":"{data_dir.absolute()}"}}\'' - ], - project_dir="bfro_mini_warehouse", - profiles_dir="bfro_mini_warehouse", - ).run() - logger.info("Testing completed.") - return True - - -@task(name="DBT run") -def dbt_run(data_dir: Path = Path("data")) -> bool: - logger = get_run_logger() - logger.info("Building csv files with DBT.") - DbtCoreOperation( - commands=[ - f'dbt run --vars \'{{"data_dir":"{data_dir.absolute()}"}}\'' - ], - project_dir="bfro_mini_warehouse", - profiles_dir="bfro_mini_warehouse", - ).run() - logger.info("DBT run completed.") - - return True - - -@task(name="DBT test") -def dbt_test(data_dir: Path = Path("data")) -> bool: - logger = get_run_logger() - logger.info("Testing DBT models.") - DbtCoreOperation( - commands=[ - "dbt test --exclude source:* " - # Yikes that double escaping 😬 - f'--vars \'{{"data_dir":"{data_dir.absolute()}"}}\'' - ], - project_dir="bfro_mini_warehouse", - profiles_dir="bfro_mini_warehouse", - ).run() - logger.info("Testing completed.") - return True - - -@flow(name="DBT") -def dbt(data_dir: Path = Path("data")) -> bool: - sources_pass = dbt_test_sources(data_dir) - if not sources_pass: - return False - run_completed = dbt_run(data_dir) - if not run_completed: - return False - tests_pass = dbt_test(data_dir) - return tests_pass - - -@flow(name="BFRO Pipeline") -def main( - test_run: bool = False, - dbt_only: bool = False, - data_dir: Path = Path("data"), - visual_crossing_key: Optional[str] = None, -) -> bool: - logger = get_run_logger() - weather_limit = 900 - if test_run: - logger.info("Test run selected.") - weather_limit = 25 - - sources_updated = False - if not dbt_only: - sources_updated = set_up_sources( - data_dir=data_dir, - weather_limit=weather_limit, - test_run=test_run, - visual_crossing_key=visual_crossing_key, - ) - - if not sources_updated and not dbt_only: - logger.info("Source update incomplete. 
Terminating flow.") - return False - dbt(data_dir=data_dir) - - return True - - -if __name__ == "__main__": - typer.run(main) diff --git a/pipeline/bfro_pipeline_docker.py b/pipeline/bfro_pipeline_docker.py deleted file mode 100644 index ef407fb..0000000 --- a/pipeline/bfro_pipeline_docker.py +++ /dev/null @@ -1,76 +0,0 @@ -import typer -from prefect import flow, get_run_logger -from prefect.blocks.system import Secret -from prefect_gcp import GcpCredentials, GcsBucket -from bfro_pipeline import main as bfro_pipeline -from upload_to_gdrive import upload_to_gdrive -from pathlib import Path - - -@flow(name="BFRO Pipeline (Docker)") -def main(data_dir: Path = Path("data"), test_run: bool = False): - logger = get_run_logger() - - logger.info("Fetching credentials.") - gcp_credentials = GcpCredentials.load("prefect-gcs-rw") - - logger.info("Fetching visual crossing key.") - visual_crossing_block = Secret.load("visual-crossing-key") - visual_crossing_key = visual_crossing_block.get() - - logger.info("Fetching service account location for GDrive upload.") - sa_credentials_location = Secret.load( - "bfro-gdrive-service-account-location" - ).get() - - logger.info("Fetching GDrive folder ID for test.") - bfro_test_gdrive_folder_id = Secret.load( - "bfro-test-gdrive-folder-id" - ).get() - - logger.info("Fetching editor email for test.") - bfro_test_editor_email = Secret.load("bfro-test-editor-email").get() - - logger.info("Fetching GDrive folder ID for prod.") - bfro_prod_gdrive_folder_id = Secret.load( - "bfro-prod-gdrive-folder-id" - ).get() - - logger.info("Fetching GDrive editor email for prod.") - bfro_prod_editor_email = Secret.load("bfro-prod-editor-email").get() - - logger.info(f"Downloading data to {data_dir}.") - bigfoot_bucket = GcsBucket( - bucket="trenner-datasets", - gcp_credentials=gcp_credentials, - ) - bigfoot_bucket.download_folder_to_path("bigfoot", data_dir) - - logger.info("Executing pipeline.") - bfro_success = bfro_pipeline( - test_run=test_run, visual_crossing_key=visual_crossing_key - ) - - logger.info("Uploading to gdrive (test).") - upload_to_gdrive( - data_dir / "processed" / "bfro_reports_geocoded.csv", - sa_credentials_location=sa_credentials_location, - gdrive_folder_id=bfro_test_gdrive_folder_id, - owner_email=bfro_test_editor_email, - ) - - logger.info("Uploading to gdrive (prod).") - upload_to_gdrive( - data_dir / "processed" / "bfro_reports_geocoded.csv", - sa_credentials_location=sa_credentials_location, - gdrive_folder_id=bfro_prod_gdrive_folder_id, - owner_email=bfro_prod_editor_email, - ) - - logger.info(f"Uploading updated contents of {data_dir} to GCS.") - if bfro_success: - bigfoot_bucket.upload_from_folder(data_dir, "bigfoot") - - -if __name__ == "__main__": - typer.run(main) diff --git a/pipeline/main-deployment.yaml b/pipeline/main-deployment.yaml deleted file mode 100644 index 9aab527..0000000 --- a/pipeline/main-deployment.yaml +++ /dev/null @@ -1,65 +0,0 @@ -### -### A complete description of a Prefect Deployment for flow 'BFRO Pipeline (Docker)' -### -name: bfro-pipeline -description: null -version: 42854cfbb777416a51819d5b21e78974 -# The work queue that will handle this deployment's runs -work_queue_name: default -work_pool_name: bfro-agent-pool -tags: [] -parameters: {} -schedule: - interval: 1209600.0 - anchor_date: '2023-04-03T12:54:15.962000+00:00' - timezone: America/Chicago -is_schedule_active: true -infra_overrides: {} - -### -### DO NOT EDIT BELOW THIS LINE -### -flow_name: BFRO Pipeline (Docker) -manifest_path: null -infrastructure: - type: 
process - env: {} - labels: {} - name: null - command: null - stream_output: true - working_dir: null - _block_document_id: c311d1a6-7dd2-4c82-b395-079f915f1ff3 - _block_document_name: bfro-local - _is_anonymous: false - block_type_slug: process - _block_type_slug: process -storage: - bucket_path: trenner-datasets/bfro-pipeline - service_account_info: '**********' - project: null - _block_document_id: 246e2c05-6aa6-42ba-9b1c-608904e06fd2 - _block_document_name: bfro-pipeline-storage - _is_anonymous: false - block_type_slug: gcs - _block_type_slug: gcs -path: '' -entrypoint: bfro_pipeline_docker:main -parameter_openapi_schema: - title: Parameters - type: object - properties: - data_dir: - title: data_dir - default: data - position: 0 - type: string - format: path - test_run: - title: test_run - default: false - position: 1 - type: boolean - required: null - definitions: null -timestamp: '2023-06-13T12:36:35.704834+00:00' diff --git a/pipeline/scripts/combine_raw_reports.py b/pipeline/scripts/combine_raw_reports.py new file mode 100644 index 0000000..7469b90 --- /dev/null +++ b/pipeline/scripts/combine_raw_reports.py @@ -0,0 +1,60 @@ +import typer +from pathlib import Path +from loguru import logger +import duckdb +import polars as pl + + +def main( + reports_orig_file: Path, reports_new_file: Path, reports_out_file: Path +): + logger.info(f"reports_orig_file: {reports_orig_file.name}") + logger.info(f"reports_new_file: {reports_new_file.name}") + logger.info(f"reports_out_file: {reports_out_file.name}") + + if reports_orig_file.exists(): + combined_reports_frame = duckdb.sql( + f""" + WITH all_rows AS ( + SELECT * FROM '{reports_orig_file}' + UNION ALL + SELECT * FROM READ_NDJSON( + '{reports_new_file}', + columns={{ + year: 'VARCHAR', + season: 'VARCHAR', + month: 'VARCHAR', + date: 'VARCHAR', + state: 'VARCHAR', + county: 'VARCHAR', + location_details: 'VARCHAR', + nearest_town: 'VARCHAR', + nearest_road: 'VARCHAR', + observed: 'VARCHAR', + also_noticed: 'VARCHAR', + other_witnesses: 'VARCHAR', + other_stories: 'VARCHAR', + time_and_conditions: 'VARCHAR', + environment: 'VARCHAR', + report_number: 'BIGINT', + report_class: 'VARCHAR', + "a_&_g_references": 'VARCHAR', + pulled_datetime: 'VARCHAR' + }} + ) + ) + SELECT * FROM all_rows + QUALIFY ROW_NUMBER() OVER( + PARTITION BY report_number ORDER BY pulled_datetime DESC + ) = 1 + """ + ).pl() + else: + combined_reports_frame = pl.read_ndjson(reports_new_file) + + combined_reports_frame.write_csv(reports_out_file) + logger.info("Done!") + + +if __name__ == "__main__": + typer.run(main) diff --git a/pipeline/scripts/pull_weather.py b/pipeline/scripts/pull_weather.py index 9b461dc..c941ae9 100644 --- a/pipeline/scripts/pull_weather.py +++ b/pipeline/scripts/pull_weather.py @@ -130,6 +130,9 @@ def merge_new_records_with_weather_cache( def main( weather_cache_file: Path, geocoded_reports_file: Path, limit: int = 900 ): + logger.info(f"weather_cache_file: {weather_cache_file.name}") + logger.info(f"geocoded_reports_file: {geocoded_reports_file.name}") + logger.info(f"limit: {limit}") visual_crossing_key = get_visual_crossing_key_from_env() logger.info(f"Getting missing weather keys from {geocoded_reports_file}.") missing_weather_keys = get_missing_weather_keys( diff --git a/pipeline/upload_to_gdrive.py b/pipeline/scripts/upload_to_gdrive.py similarity index 93% rename from pipeline/upload_to_gdrive.py rename to pipeline/scripts/upload_to_gdrive.py index b4e224f..596ac59 100644 --- a/pipeline/upload_to_gdrive.py +++ 
b/pipeline/scripts/upload_to_gdrive.py @@ -1,5 +1,4 @@ import typer -from prefect import flow, task, get_run_logger from pathlib import Path from typing import Optional, List from googleapiclient import discovery @@ -7,16 +6,15 @@ from google.oauth2 import service_account import os from dotenv import load_dotenv, find_dotenv +from loguru import logger -@task(name="Upload") -def upload_to_gdrive_task( +def upload_to_gdrive( google_drive_service, file: Path, destination_folder_id: str, owner_emails: List[str] = [], ): - logger = get_run_logger() logger.info("Determining if file exists.") file_id: Optional[str] = None file_search_response = ( @@ -66,14 +64,12 @@ def upload_to_gdrive_task( logger.exception("Encountered error with sharing.") -@flow(name="Save to GDrive") -def upload_to_gdrive( +def main( file: Path, sa_credentials_location: Optional[str] = None, gdrive_folder_id: Optional[str] = None, owner_email: Optional[str] = None, ): - logger = get_run_logger() if sa_credentials_location is None: logger.info( "Service account credentials location not passed, " @@ -110,10 +106,11 @@ def upload_to_gdrive( ) logger.info("Performing upload task.") - upload_to_gdrive_task( + upload_to_gdrive( google_drive_service, file, gdrive_folder_id, [owner_email] ) + logger.info("Done!") if __name__ == "__main__": - typer.run(upload_to_gdrive) + typer.run(main) diff --git a/pipeline/update_geocoder.py b/pipeline/update_geocoder.py deleted file mode 100644 index 99f6af5..0000000 --- a/pipeline/update_geocoder.py +++ /dev/null @@ -1,82 +0,0 @@ -from prefect import flow, task, get_run_logger -from prefect_shell import ShellOperation -import typer -from pathlib import Path -import polars as pl -from lxml import etree -from scripts.extract_locations_from_kml import extract_geocoded_reports -from scripts.combine_geocoded_reports import combine_geocoded_reports - - -@task(name="Unzip aspx file") -def download_and_unzip_aspx_file(aspx_file: Path) -> Path: - ShellOperation( - commands=[ - "wget http://www.bfro.net/app/AllReportsKMZ.aspx", - f"mv {aspx_file.name} {aspx_file.parent}", - f"unzip -o {aspx_file} -d {aspx_file.parent}", - ] - ).run() - return aspx_file.parent / "doc.kml" - - -@task(name="Extract geocoded reports") -def extract_geocoded_reports_task(kml_file: Path) -> pl.DataFrame: - logger = get_run_logger() - logger.info(f"Reading and parsing {kml_file.name}") - report_xml = etree.fromstring(kml_file.read_bytes()) - logger.info(f"Extracting geocoded reports from {kml_file.name}") - return extract_geocoded_reports(report_xml) - - -@task(name="Combine geocoded reports") -def combine_geocoded_reports_task( - orig_report_file: Path, new_reports: pl.DataFrame -) -> pl.DataFrame: - logger = get_run_logger() - logger.info(f"Reading original reports from {orig_report_file.name}") - orig_reports = pl.read_csv(orig_report_file) - logger.info("Combining original and new reports.") - combined_reports = combine_geocoded_reports(orig_reports, new_reports) - return combined_reports - - -@task(name="Save combined reports back to raw") -def save_combined_reports( - combined_reports: pl.DataFrame, orig_report_file: Path -): - logger = get_run_logger() - logger.info(f"Saving combined reports back into {orig_report_file.name}") - combined_reports.write_csv(orig_report_file) - - -@flow(name="Update geocoded reports") -def pull_and_update_geocoded_reports( - data_dir: Path = Path("data"), -) -> bool: - logger = get_run_logger() - aspx_file = data_dir / Path("raw/geocoder/AllReportsKMZ.aspx") - logger.info(f"aspx_file: 
{aspx_file}") - - orig_report_file = data_dir / Path("raw/geocoder/geocoded_reports.csv") - logger.info(f"orig_report_file: {orig_report_file}") - - source_report_file = data_dir / Path("sources/geocoded_reports.csv") - logger.info(f"source_report_file: {source_report_file}") - - kml_file = download_and_unzip_aspx_file(aspx_file) - new_geocoded_reports = extract_geocoded_reports_task(kml_file) - if orig_report_file.exists(): - combined_geocoded_reports = combine_geocoded_reports_task( - orig_report_file, new_geocoded_reports - ) - else: - combined_geocoded_reports = new_geocoded_reports - save_combined_reports(combined_geocoded_reports, orig_report_file) - save_combined_reports(combined_geocoded_reports, source_report_file) - # Signals to downstream flows that the source is ready. - return True - - -if __name__ == "__main__": - typer.run(pull_and_update_geocoded_reports) diff --git a/pipeline/update_reports.py b/pipeline/update_reports.py deleted file mode 100644 index ffcc403..0000000 --- a/pipeline/update_reports.py +++ /dev/null @@ -1,118 +0,0 @@ -from prefect import flow, task, get_run_logger -import typer -from pathlib import Path -import polars as pl -import duckdb -from prefect_shell import ShellOperation - - -@task(name="Run scraper") -def run_scraper_task(reports_new_file: Path, test_run: bool = False) -> Path: - logger = get_run_logger() - logger.info("Firing up the scraper.") - scraper_working_dir = "scraper/bfro_scrape" - scraper = ShellOperation( - commands=[ - "scrapy crawl bfro_reports " - f"-a test_run={test_run} " - f"--overwrite-output new_reports.json:jsonlines" - ], - working_dir=scraper_working_dir, - ).trigger() - - scraper.wait_for_completion() - logger.info( - "Scraper completed. " - f"Saved to {scraper_working_dir}/new_reports.json" - ) - - ShellOperation( - commands=[ - f"cp {scraper_working_dir}/new_reports.json {reports_new_file}" - ] - ).run() - return reports_new_file - - -@task(name="Combine reports") -def combine_reports_task( - reports_orig_file: Path, reports_new_file: Path -) -> pl.DataFrame: - logger = get_run_logger() - logger.info( - f"Combining reports in {reports_orig_file} with {reports_new_file}." 
- ) - return duckdb.sql( - f""" - WITH all_rows AS ( - SELECT * FROM '{reports_orig_file}' - UNION ALL - SELECT * FROM READ_NDJSON( - '{reports_new_file}', - columns={{ - year: 'VARCHAR', - season: 'VARCHAR', - month: 'VARCHAR', - date: 'VARCHAR', - state: 'VARCHAR', - county: 'VARCHAR', - location_details: 'VARCHAR', - nearest_town: 'VARCHAR', - nearest_road: 'VARCHAR', - observed: 'VARCHAR', - also_noticed: 'VARCHAR', - other_witnesses: 'VARCHAR', - other_stories: 'VARCHAR', - time_and_conditions: 'VARCHAR', - environment: 'VARCHAR', - report_number: 'BIGINT', - report_class: 'VARCHAR', - "a_&_g_references": 'VARCHAR', - pulled_datetime: 'VARCHAR' - }} - ) - ) - SELECT * FROM all_rows - QUALIFY ROW_NUMBER() OVER( - PARTITION BY report_number ORDER BY pulled_datetime DESC - ) = 1 - """ - ).pl() - - -@flow(name="Update reports") -def update_reports( - data_dir: Path = Path("data"), - test_run: bool = False, -) -> bool: - logger = get_run_logger() - reports_orig_file = data_dir / Path("raw/reports/bfro_reports.csv") - logger.info(f"reports_orig_file: {reports_orig_file}") - - reports_new_file = data_dir / Path("raw/reports/bfro_reports_new.json") - logger.info(f"reports_new_file: {reports_new_file}") - - reports_source_file = data_dir / Path("sources/bfro_reports.csv") - logger.info(f"reports_source_file: {reports_source_file}") - - # Mostly the output is a signal that the task completed, and establishes - # the data dependency between this and the combining reports task. - reports_new_file = run_scraper_task(reports_new_file, test_run=test_run) - - if reports_orig_file.exists(): - logger.info(f"{reports_orig_file} exists, combining with new data.") - combined_reports = combine_reports_task( - reports_orig_file, reports_new_file - ) - else: - combined_reports = pl.read_csv(reports_new_file) - - logger.info(f"Writing combined reports back to {reports_orig_file}.") - combined_reports.write_csv(reports_orig_file) - logger.info(f"Writing combined reports to {reports_source_file}.") - combined_reports.write_csv(reports_source_file) - return True - - -if __name__ == "__main__": - typer.run(update_reports) diff --git a/pipeline/update_weather.py b/pipeline/update_weather.py deleted file mode 100644 index 277e4f3..0000000 --- a/pipeline/update_weather.py +++ /dev/null @@ -1,102 +0,0 @@ -from prefect import flow, task, get_run_logger -import typer -from pathlib import Path -import polars as pl -from scripts.pull_weather import ( - get_missing_weather_keys, - pull_missing_weather, - merge_new_records_with_weather_cache, - get_visual_crossing_key_from_env, -) -from typing import Tuple, Optional - - -@task(name="Get missing weather keys") -def get_missing_weather_keys_task( - geocoded_reports_file: Path, weather_cache_file: Path -) -> pl.DataFrame: - return get_missing_weather_keys(geocoded_reports_file, weather_cache_file) - - -@task(name="Pull missing weather data") -def pull_weather_data( - missing_weather_keys: pl.DataFrame, - visual_crossing_key: str, - limit: int = 900, -) -> Tuple[pl.DataFrame, bool]: - return pull_missing_weather( - missing_weather_keys, - visual_crossing_key, - limit=limit, - logger=get_run_logger(), - ) - - -@task(name="Merge weather data with cache.") -def merge_weather_data( - weather_cache_file: Path, new_weather_data: pl.DataFrame -) -> pl.DataFrame: - return merge_new_records_with_weather_cache( - weather_cache_file, new_weather_data - ) - - -@flow(name="Update weather") -def update_weather( - data_dir: Path = Path("data"), - limit: int = 900, - visual_crossing_key: 
Optional[str] = None, -) -> bool: - logger = get_run_logger() - - if visual_crossing_key is None: - logger.info("Visual crossing key not passed, retrieving from env.") - visual_crossing_key = get_visual_crossing_key_from_env() - - weather_cache_file = data_dir / Path("raw/weather/weather_cache.csv") - logger.info(f"weather_cache_file: {weather_cache_file}") - - geocoded_reports_file = data_dir / Path( - "raw/geocoder/geocoded_reports.csv" - ) - logger.info(f"geocoded_reports_file: {geocoded_reports_file}") - - source_weather_file = data_dir / Path("sources/weather_cache.csv") - logger.info(f"source_weather_file: {source_weather_file}") - - logger.info("Getting missing weather keys.") - missing_weather_keys = get_missing_weather_keys_task( - geocoded_reports_file, weather_cache_file - ) - if missing_weather_keys.is_empty(): - logger.info("Nothing new to pull. All done.") - # Return false because the weather limit was not reached. - # Signals to other flows we can proceed. - return True - - logger.info(f"Total missing weather keys: {missing_weather_keys.shape[0]}") - logger.info(f"Pulling {min(missing_weather_keys.shape[0], limit)}.") - new_weather_data, weather_limit_reached = pull_weather_data( - missing_weather_keys, visual_crossing_key, limit=limit - ) - logger.info(f"Pulled {new_weather_data.shape[0]} new weather records.") - if not weather_cache_file.exists(): - logger.info(f"Weather cache file {weather_cache_file} does not exist.") - logger.info("Saving what was just pulled.") - weather_data_to_write = new_weather_data - else: - logger.info("Combining new weather data with existing weather data.") - weather_data_to_write = merge_weather_data( - weather_cache_file, new_weather_data - ) - logger.info(f"Writing updated cache back to {weather_cache_file}.") - weather_data_to_write.write_csv(weather_cache_file) - logger.info(f"Writing updated cache to {source_weather_file}.") - weather_data_to_write.write_csv(source_weather_file) - logger.info("🌩️👣 All done with weather update 🌩️👣") - # If we reach the weather limit, we cannot proceed downstream. - return not weather_limit_reached - - -if __name__ == "__main__": - typer.run(update_weather) diff --git a/run-pipeline-local.sh b/run-pipeline-local.sh new file mode 100644 index 0000000..321fd92 --- /dev/null +++ b/run-pipeline-local.sh @@ -0,0 +1,62 @@ +# Runs the pipeline end to end for development and ad-hoc runs. +# Not designed for deployments. +set -e +cd pipeline # working dir for the whole script. + +############# PULL NEW REPORTS ############### +echo "Pulling new reports, test_run=$1 ." +cd scraper/bfro_scrape # working dir for the scraper +scrapy crawl bfro_reports \ + -a test_run=$1 \ + --overwrite-output new_reports.json:jsonlines +cd ../.. # should be back in pipeline/ +# Combines new reports with existing reports, as some reports will drop off +# the BFRO website. +# Move from the scraper into the raw data directory. +cp scraper/bfro_scrape/new_reports.json data/raw/bfro_reports_new.json +# Combine the reports. +python scripts/combine_raw_reports.py \ + data/raw/bfro_reports.csv \ + data/raw/bfro_reports_new.json \ + data/raw/bfro_reports_combined.csv +# Set the combined reports as the new reports csv +cp data/raw/bfro_reports_combined.csv data/raw/bfro_reports.csv +# Copy the reports to the DBT source directory. +cp data/raw/bfro_reports.csv data/sources/bfro_reports.csv + +############# PULL KML REPORTS ############### +echo "Pulling kml file and extracting geocoded reports." 
+wget http://www.bfro.net/app/AllReportsKMZ.aspx +mv AllReportsKMZ.aspx data/raw/geocoder/ +unzip -o data/geocoder/AllReportsKMZ.aspx -d data/raw/geocoder/ +# Extract the lat / lon / report id / etc from the kml file. +python scripts/extract_locations_from_kml.py \ + data/raw/geocoder/doc.kml \ + data/raw/geocoded_reports_new.csv +# Combine the newly extracted reports with any existing KML sourced reports, +# in case any are removed from the KML file. +python scripts/combine_geocoded_reports.py \ + data/raw/geocoded_reports.csv \ + data/raw/geocoded_reports_new.csv \ + data/raw/geocoded_reports_combined.csv +# The combined reports are now current. +cp data/raw/geocoded_reports_combined.csv data/raw/geocoded_reports.csv +# Copy to the source file for dbt. +cp data/raw/geocoded_reports.csv data/sources/geocoded_reports.csv + +############# PULL WEATHER ############### +echo "Pulling weather." +python scripts/pull_weather.py \ + data/raw/weather/weather_cache.csv \ + data/raw/geocoder/geocoded_reports.csv +# Copy to the source file for dbt. +cp data/raw/weather_cache.csv data/sources/weather_cache.csv + +############# RUN DBT ############### +echo "Building mini warehouse." +cd bfro_mini_warehouse +dbt build --vars '{"data_dir":"../data"}' +# Hop back into pipeline. +cd .. +# Hop back onto project root, we are done. +cd .. \ No newline at end of file From f69652f876b804b9b362477a254a6a778a8791bf Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 09:15:09 -0500 Subject: [PATCH 05/11] Bump dbt-core and dbt-duckdb. --- deps/dev-requirements.txt | 80 +++++++++++++++++++++------------------ deps/requirements.in | 5 ++- deps/requirements.txt | 80 +++++++++++++++++++++------------------ 3 files changed, 89 insertions(+), 76 deletions(-) diff --git a/deps/dev-requirements.txt b/deps/dev-requirements.txt index 3682e52..c44b356 100644 --- a/deps/dev-requirements.txt +++ b/deps/dev-requirements.txt @@ -6,6 +6,7 @@ attrs==22.2.0 # via # automat # jsonschema + # referencing # service-identity # twisted # visions @@ -13,8 +14,6 @@ automat==22.10.0 # via twisted babel==2.12.1 # via agate -betterproto==1.2.5 - # via dbt-core cachetools==5.3.0 # via google-auth certifi==2022.12.7 @@ -28,6 +27,7 @@ charset-normalizer==3.1.0 click==8.1.3 # via # dbt-core + # dbt-semantic-interfaces # typer colorama==0.4.6 # via dbt-core @@ -46,12 +46,14 @@ cssselect==1.2.0 # scrapy cycler==0.11.0 # via matplotlib -dbt-core==1.4.5 +dbt-core==1.7.13 # via dbt-duckdb -dbt-duckdb==1.4.1 -dbt-extractor==0.4.1 +dbt-duckdb==1.7.4 +dbt-extractor==0.5.1 + # via dbt-core +dbt-semantic-interfaces==0.4.4 # via dbt-core -duckdb==0.7.1 +duckdb==0.10.2 # via dbt-duckdb filelock==3.10.7 # via tldextract @@ -73,23 +75,13 @@ google-auth-httplib2==0.1.0 google-auth-oauthlib==1.0.0 googleapis-common-protos==1.59.0 # via google-api-core -grpclib==0.4.3 - # via betterproto -h2==4.1.0 - # via grpclib h3==3.7.6 -hologram==0.0.15 - # via dbt-core -hpack==4.0.0 - # via h2 htmlmin==0.1.12 # via ydata-profiling httplib2==0.22.0 # via # google-api-python-client # google-auth-httplib2 -hyperframe==6.0.1 - # via h2 hyperlink==21.0.0 # via twisted idna==3.4 @@ -102,6 +94,8 @@ imagehash==4.3.1 # via # visions # ydata-profiling +importlib-metadata==6.11.0 + # via dbt-semantic-interfaces incremental==22.10.0 # via twisted isodate==0.6.1 @@ -114,16 +108,21 @@ itemadapter==0.7.0 # scrapy itemloaders==1.0.6 # via scrapy -jinja2==3.1.2 +jinja2==3.1.3 # via # dbt-core + # dbt-semantic-interfaces # ydata-profiling jmespath==1.0.1 # via itemloaders 
joblib==1.2.0 # via phik -jsonschema==3.2.0 - # via hologram +jsonschema==4.21.1 + # via + # dbt-core + # dbt-semantic-interfaces +jsonschema-specifications==2023.12.1 + # via jsonschema kiwisolver==1.4.4 # via matplotlib leather==0.3.4 @@ -136,10 +135,8 @@ lxml==4.9.2 # parsel # scrapy markupsafe==2.1.2 - # via - # jinja2 - # werkzeug -mashumaro==3.3.1 + # via jinja2 +mashumaro==3.12 # via dbt-core matplotlib==3.6.3 # via @@ -148,10 +145,10 @@ matplotlib==3.6.3 # ydata-profiling minimal-snowplow-tracker==0.0.2 # via dbt-core +more-itertools==10.2.0 + # via dbt-semantic-interfaces msgpack==1.0.5 # via mashumaro -multidict==6.0.4 - # via grpclib multimethod==1.9.1 # via # visions @@ -213,6 +210,7 @@ protego==0.2.1 # via scrapy protobuf==4.22.1 # via + # dbt-core # google-api-core # googleapis-common-protos pyarrow==11.0.0 @@ -228,7 +226,9 @@ pyasn1-modules==0.2.8 pycparser==2.21 # via cffi pydantic==1.10.7 - # via ydata-profiling + # via + # dbt-semantic-interfaces + # ydata-profiling pydispatcher==2.0.7 # via scrapy pygeohash==1.2.0 @@ -238,11 +238,9 @@ pyparsing==3.0.9 # via # httplib2 # matplotlib -pyrsistent==0.19.3 - # via jsonschema python-dateutil==2.8.2 # via - # hologram + # dbt-semantic-interfaces # matplotlib # pandas python-dotenv==1.0.0 @@ -259,9 +257,14 @@ pywavelets==1.4.1 pyyaml==6.0 # via # dbt-core + # dbt-semantic-interfaces # ydata-profiling queuelib==1.6.2 # via scrapy +referencing==0.34.0 + # via + # jsonschema + # jsonschema-specifications requests==2.28.2 # via # dbt-core @@ -275,6 +278,10 @@ requests-file==1.5.1 # via tldextract requests-oauthlib==1.3.1 # via google-auth-oauthlib +rpds-py==0.18.0 + # via + # jsonschema + # referencing rsa==4.9 # via google-auth ruff==0.0.292 @@ -291,7 +298,6 @@ service-identity==21.1.0 # via scrapy setuptools==69.5.1 # via - # jsonschema # scrapy # zope-interface six==1.16.0 @@ -300,7 +306,6 @@ six==1.16.0 # google-auth # google-auth-httplib2 # isodate - # jsonschema # leather # minimal-snowplow-tracker # patsy @@ -308,12 +313,10 @@ six==1.16.0 # python-dateutil # requests-file # service-identity -sqlparse==0.4.3 +sqlparse==0.5.0 # via dbt-core statsmodels==0.13.5 # via ydata-profiling -stringcase==1.2.0 - # via betterproto tangled-up-in-unicode==0.2.0 # via visions text-unidecode==1.3 @@ -331,13 +334,16 @@ typer==0.7.0 typing-extensions==4.5.0 # via # dbt-core + # dbt-semantic-interfaces # mashumaro # pydantic # twisted uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.15 - # via requests + # via + # dbt-core + # requests visions==0.7.5 # via ydata-profiling w3lib==2.1.1 @@ -345,9 +351,9 @@ w3lib==2.1.1 # itemloaders # parsel # scrapy -werkzeug==2.2.3 - # via dbt-core ydata-profiling==4.1.2 +zipp==3.18.1 + # via importlib-metadata zope-interface==6.0 # via # scrapy diff --git a/deps/requirements.in b/deps/requirements.in index c069c1c..601e79c 100644 --- a/deps/requirements.in +++ b/deps/requirements.in @@ -6,10 +6,11 @@ scrapy>=2,<3 toolz loguru ydata-profiling -duckdb +duckdb~=0.10 polars[pyarrow] h3<4.0 -dbt-duckdb +dbt-core~=1.7 +dbt-duckdb~=1.7 google-api-python-client google-auth-httplib2 google-auth-oauthlib \ No newline at end of file diff --git a/deps/requirements.txt b/deps/requirements.txt index 80ebce9..32d173c 100644 --- a/deps/requirements.txt +++ b/deps/requirements.txt @@ -6,6 +6,7 @@ attrs==22.2.0 # via # automat # jsonschema + # referencing # service-identity # twisted # visions @@ -13,8 +14,6 @@ automat==22.10.0 # via twisted babel==2.12.1 # via agate -betterproto==1.2.5 - # via dbt-core 
cachetools==5.3.0 # via google-auth certifi==2022.12.7 @@ -28,6 +27,7 @@ charset-normalizer==3.1.0 click==8.1.3 # via # dbt-core + # dbt-semantic-interfaces # typer colorama==0.4.6 # via dbt-core @@ -46,12 +46,14 @@ cssselect==1.2.0 # scrapy cycler==0.11.0 # via matplotlib -dbt-core==1.4.5 +dbt-core==1.7.13 # via dbt-duckdb -dbt-duckdb==1.4.1 -dbt-extractor==0.4.1 +dbt-duckdb==1.7.4 +dbt-extractor==0.5.1 + # via dbt-core +dbt-semantic-interfaces==0.4.4 # via dbt-core -duckdb==0.7.1 +duckdb==0.10.2 # via dbt-duckdb filelock==3.10.7 # via tldextract @@ -73,23 +75,13 @@ google-auth-httplib2==0.1.0 google-auth-oauthlib==1.0.0 googleapis-common-protos==1.59.0 # via google-api-core -grpclib==0.4.3 - # via betterproto -h2==4.1.0 - # via grpclib h3==3.7.6 -hologram==0.0.15 - # via dbt-core -hpack==4.0.0 - # via h2 htmlmin==0.1.12 # via ydata-profiling httplib2==0.22.0 # via # google-api-python-client # google-auth-httplib2 -hyperframe==6.0.1 - # via h2 hyperlink==21.0.0 # via twisted idna==3.4 @@ -102,6 +94,8 @@ imagehash==4.3.1 # via # visions # ydata-profiling +importlib-metadata==6.11.0 + # via dbt-semantic-interfaces incremental==22.10.0 # via twisted isodate==0.6.1 @@ -114,16 +108,21 @@ itemadapter==0.7.0 # scrapy itemloaders==1.0.6 # via scrapy -jinja2==3.1.2 +jinja2==3.1.3 # via # dbt-core + # dbt-semantic-interfaces # ydata-profiling jmespath==1.0.1 # via itemloaders joblib==1.2.0 # via phik -jsonschema==3.2.0 - # via hologram +jsonschema==4.21.1 + # via + # dbt-core + # dbt-semantic-interfaces +jsonschema-specifications==2023.12.1 + # via jsonschema kiwisolver==1.4.4 # via matplotlib leather==0.3.4 @@ -136,10 +135,8 @@ lxml==4.9.2 # parsel # scrapy markupsafe==2.1.2 - # via - # jinja2 - # werkzeug -mashumaro==3.3.1 + # via jinja2 +mashumaro==3.12 # via dbt-core matplotlib==3.6.3 # via @@ -148,10 +145,10 @@ matplotlib==3.6.3 # ydata-profiling minimal-snowplow-tracker==0.0.2 # via dbt-core +more-itertools==10.2.0 + # via dbt-semantic-interfaces msgpack==1.0.5 # via mashumaro -multidict==6.0.4 - # via grpclib multimethod==1.9.1 # via # visions @@ -213,6 +210,7 @@ protego==0.2.1 # via scrapy protobuf==4.22.1 # via + # dbt-core # google-api-core # googleapis-common-protos pyarrow==11.0.0 @@ -229,7 +227,9 @@ pyasn1-modules==0.2.8 pycparser==2.21 # via cffi pydantic==1.10.7 - # via ydata-profiling + # via + # dbt-semantic-interfaces + # ydata-profiling pydispatcher==2.0.7 # via scrapy pygeohash==1.2.0 @@ -239,11 +239,9 @@ pyparsing==3.0.9 # via # httplib2 # matplotlib -pyrsistent==0.19.3 - # via jsonschema python-dateutil==2.8.2 # via - # hologram + # dbt-semantic-interfaces # matplotlib # pandas python-dotenv==1.0.0 @@ -260,9 +258,14 @@ pywavelets==1.4.1 pyyaml==6.0 # via # dbt-core + # dbt-semantic-interfaces # ydata-profiling queuelib==1.6.2 # via scrapy +referencing==0.34.0 + # via + # jsonschema + # jsonschema-specifications requests==2.28.2 # via # dbt-core @@ -276,6 +279,10 @@ requests-file==1.5.1 # via tldextract requests-oauthlib==1.3.1 # via google-auth-oauthlib +rpds-py==0.18.0 + # via + # jsonschema + # referencing rsa==4.9 # via google-auth scipy==1.9.3 @@ -291,7 +298,6 @@ service-identity==21.1.0 # via scrapy setuptools==69.5.1 # via - # jsonschema # scrapy # zope-interface six==1.16.0 @@ -300,7 +306,6 @@ six==1.16.0 # google-auth # google-auth-httplib2 # isodate - # jsonschema # leather # minimal-snowplow-tracker # patsy @@ -308,12 +313,10 @@ six==1.16.0 # python-dateutil # requests-file # service-identity -sqlparse==0.4.3 +sqlparse==0.5.0 # via dbt-core statsmodels==0.13.5 # via 
ydata-profiling -stringcase==1.2.0 - # via betterproto tangled-up-in-unicode==0.2.0 # via visions text-unidecode==1.3 @@ -331,13 +334,16 @@ typer==0.7.0 typing-extensions==4.5.0 # via # dbt-core + # dbt-semantic-interfaces # mashumaro # pydantic # twisted uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.15 - # via requests + # via + # dbt-core + # requests visions==0.7.5 # via ydata-profiling w3lib==2.1.1 @@ -345,9 +351,9 @@ w3lib==2.1.1 # itemloaders # parsel # scrapy -werkzeug==2.2.3 - # via dbt-core ydata-profiling==4.1.2 +zipp==3.18.1 + # via importlib-metadata zope-interface==6.0 # via # scrapy From 8300d14ebda4da9a4c0e97db085135a069d748cb Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 09:15:32 -0500 Subject: [PATCH 06/11] Got script running end to end on local without docker. --- pipeline/scripts/combine_raw_reports.py | 59 ++++++++++++++++++++++--- run-pipeline-local.sh | 29 ++++++------ 2 files changed, 69 insertions(+), 19 deletions(-) mode change 100644 => 100755 run-pipeline-local.sh diff --git a/pipeline/scripts/combine_raw_reports.py b/pipeline/scripts/combine_raw_reports.py index 7469b90..a59ae6e 100644 --- a/pipeline/scripts/combine_raw_reports.py +++ b/pipeline/scripts/combine_raw_reports.py @@ -12,13 +12,35 @@ def main( logger.info(f"reports_new_file: {reports_new_file.name}") logger.info(f"reports_out_file: {reports_out_file.name}") + columns = [ + "year", + "season", + "month", + "date", + "state", + "county", + "location_details", + "nearest_town", + "nearest_road", + "observed", + "also_noticed", + "other_witnesses", + "other_stories", + "time_and_conditions", + "environment", + "report_number", + "report_class", + '"a_&_g_references"', + "pulled_datetime", + ] + if reports_orig_file.exists(): combined_reports_frame = duckdb.sql( f""" WITH all_rows AS ( - SELECT * FROM '{reports_orig_file}' + SELECT {', '.join(columns)} FROM '{reports_orig_file}' UNION ALL - SELECT * FROM READ_NDJSON( + SELECT {', '.join(columns)} FROM READ_NDJSON( '{reports_new_file}', columns={{ year: 'VARCHAR', @@ -39,18 +61,45 @@ def main( report_number: 'BIGINT', report_class: 'VARCHAR', "a_&_g_references": 'VARCHAR', - pulled_datetime: 'VARCHAR' + pulled_datetime: 'TIMESTAMP' }} ) ) - SELECT * FROM all_rows + SELECT {', '.join(columns)} FROM all_rows QUALIFY ROW_NUMBER() OVER( PARTITION BY report_number ORDER BY pulled_datetime DESC ) = 1 """ ).pl() else: - combined_reports_frame = pl.read_ndjson(reports_new_file) + combined_reports_frame = duckdb.sql( + f""" + SELECT {', '.join(columns)} FROM READ_NDJSON( + '{reports_new_file}', + columns={{ + year: 'VARCHAR', + season: 'VARCHAR', + month: 'VARCHAR', + date: 'VARCHAR', + state: 'VARCHAR', + county: 'VARCHAR', + location_details: 'VARCHAR', + nearest_town: 'VARCHAR', + nearest_road: 'VARCHAR', + observed: 'VARCHAR', + also_noticed: 'VARCHAR', + other_witnesses: 'VARCHAR', + other_stories: 'VARCHAR', + time_and_conditions: 'VARCHAR', + environment: 'VARCHAR', + report_number: 'BIGINT', + report_class: 'VARCHAR', + "a_&_g_references": 'VARCHAR', + pulled_datetime: 'TIMESTAMP' + }} + ) + """ + ).pl() combined_reports_frame.write_csv(reports_out_file) logger.info("Done!") diff --git a/run-pipeline-local.sh b/run-pipeline-local.sh old mode 100644 new mode 100755 index 321fd92..a4bc3ea --- a/run-pipeline-local.sh +++ b/run-pipeline-local.sh @@ -13,36 +13,37 @@ cd ../.. # should be back in pipeline/ # Combines new reports with existing reports, as some reports will drop off # the BFRO website. 
# Move from the scraper into the raw data directory. -cp scraper/bfro_scrape/new_reports.json data/raw/bfro_reports_new.json +cp scraper/bfro_scrape/new_reports.json data/raw/reports/bfro_reports_new.json # Combine the reports. python scripts/combine_raw_reports.py \ - data/raw/bfro_reports.csv \ - data/raw/bfro_reports_new.json \ - data/raw/bfro_reports_combined.csv + data/raw/reports/bfro_reports.csv \ + data/raw/reports/bfro_reports_new.json \ + data/raw/reports/bfro_reports_combined.csv # Set the combined reports as the new reports csv -cp data/raw/bfro_reports_combined.csv data/raw/bfro_reports.csv +cp data/raw/reports/bfro_reports_combined.csv data/raw/reports/bfro_reports.csv # Copy the reports to the DBT source directory. -cp data/raw/bfro_reports.csv data/sources/bfro_reports.csv +cp data/raw/reports/bfro_reports.csv data/sources/bfro_reports.csv ############# PULL KML REPORTS ############### echo "Pulling kml file and extracting geocoded reports." wget http://www.bfro.net/app/AllReportsKMZ.aspx mv AllReportsKMZ.aspx data/raw/geocoder/ -unzip -o data/geocoder/AllReportsKMZ.aspx -d data/raw/geocoder/ +unzip -o data/raw/geocoder/AllReportsKMZ.aspx -d data/raw/geocoder/ # Extract the lat / lon / report id / etc from the kml file. python scripts/extract_locations_from_kml.py \ data/raw/geocoder/doc.kml \ - data/raw/geocoded_reports_new.csv + data/raw/geocoder/geocoded_reports_new.csv # Combine the newly extracted reports with any existing KML sourced reports, # in case any are removed from the KML file. python scripts/combine_geocoded_reports.py \ - data/raw/geocoded_reports.csv \ - data/raw/geocoded_reports_new.csv \ - data/raw/geocoded_reports_combined.csv + data/raw/geocoder/geocoded_reports.csv \ + data/raw/geocoder/geocoded_reports_new.csv \ + data/raw/geocoder/geocoded_reports_combined.csv # The combined reports are now current. -cp data/raw/geocoded_reports_combined.csv data/raw/geocoded_reports.csv +cp data/raw/geocoder/geocoded_reports_combined.csv \ + data/raw/geocoder/geocoded_reports.csv # Copy to the source file for dbt. -cp data/raw/geocoded_reports.csv data/sources/geocoded_reports.csv +cp data/raw/geocoder/geocoded_reports.csv data/sources/geocoded_reports.csv ############# PULL WEATHER ############### echo "Pulling weather." @@ -50,7 +51,7 @@ python scripts/pull_weather.py \ data/raw/weather/weather_cache.csv \ data/raw/geocoder/geocoded_reports.csv # Copy to the source file for dbt. -cp data/raw/weather_cache.csv data/sources/weather_cache.csv +cp data/raw/weather/weather_cache.csv data/sources/weather_cache.csv ############# RUN DBT ############### echo "Building mini warehouse." From 31b96f398a3cee5804fb4df534b1d27dd7d18cab Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 18:51:43 -0500 Subject: [PATCH 07/11] Updated gdrive uploader to have sane env var names. --- pipeline/scripts/upload_to_gdrive.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipeline/scripts/upload_to_gdrive.py b/pipeline/scripts/upload_to_gdrive.py index 596ac59..1f1ec8e 100644 --- a/pipeline/scripts/upload_to_gdrive.py +++ b/pipeline/scripts/upload_to_gdrive.py @@ -76,9 +76,11 @@ def main( "retrieving from env." ) load_dotenv(find_dotenv()) - sa_credentials_location = os.getenv("SA_CREDENTIALS_LOCATION") + sa_credentials_location = os.getenv("GDRIVE_SERVICE_ACCOUNT_LOCATION") if sa_credentials_location is None: - raise ValueError("SA_CREDENTIALS_LOCATION not in .env or env.") + raise ValueError( + "GDRIVE_SERVICE_ACCOUNT_LOCATION not in .env or env." 
+ ) if gdrive_folder_id is None: logger.info("GDrive folder ID not passed, retrieving from env.") From e5b83deb7e4067a7dc2cf832cdef85469539f25f Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 20:04:41 -0500 Subject: [PATCH 08/11] Got docker running. --- .dockerignore | 20 ++++++++++++------ Dockerfile | 16 +++++++++++--- Makefile | 6 +----- pipeline/.prefectignore | 47 ----------------------------------------- 4 files changed, 27 insertions(+), 62 deletions(-) delete mode 100644 pipeline/.prefectignore diff --git a/.dockerignore b/.dockerignore index 1dd62fc..6be784a 100644 --- a/.dockerignore +++ b/.dockerignore @@ -12,11 +12,17 @@ deps/dev-requirements.in deps/dev-requirements.txt notebooks/ scripts/ -pipeline/__pycache__/ -pipeline/scripts/__pycache__/ -pipeline/data/* -pipeline/bfro_mini_warehouse/logs/ -pipeline/bfro_mini_warehouse/target/ +pipeline/data/**/*.csv +pipeline/data/**/*.json +pipeline/data/**/*.kml +pipeline/data/**/*.aspx +pipeline/logs/* +pipeline/bfro_mini_warehouse/logs/* +pipeline/bfro_mini_warehouse/target/* pipeline/bfro_mini_warehouse/README.md -pipeline/scraper/__pycache__/ -pipeline/scraper/bfro_scrape/__pycache__/ \ No newline at end of file +pipeline/bfro_mini_warehouse/.user.yml +**/__pycache__/ +.dvc +.github +.ruff_cache +.vscode diff --git a/Dockerfile b/Dockerfile index f08cccb..0011d8f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,15 @@ -FROM python:3.9-slim-buster +FROM python:3.11-slim-bullseye -COPY deps/requirements.txt . +ENV VIRTUAL_ENV=/usr/local +# Install basics +RUN apt-get update -y && apt-get install -y zip wget apt-transport-https ca-certificates gnupg curl +# Install mc +RUN curl https://dl.min.io/client/mc/release/linux-amd64/mc -o /usr/bin/mc && chmod +x /usr/bin/mc +# Install gcloud +RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg && apt-get update -y && apt-get install google-cloud-sdk -y -RUN pip install -r requirements.txt \ No newline at end of file +# Install the python stuff. +COPY deps/requirements.txt requirements.txt +RUN pip install uv && uv pip install -r requirements.txt + +COPY . . diff --git a/Makefile b/Makefile index 252e05a..155f9ec 100644 --- a/Makefile +++ b/Makefile @@ -25,13 +25,9 @@ format: python -m ruff format . .PHONY: build-docker -## Build docker with local registry tag +## Build docker with local registry tag and push to local registry build-docker: docker build --tag localhost:5000/bfro_pipeline:latest . 
- -.PHONY: push-docker -## Push docker to local registry -push-docker: docker push localhost:5000/bfro_pipeline:latest .PHONY: pull-data diff --git a/pipeline/.prefectignore b/pipeline/.prefectignore deleted file mode 100644 index eb471be..0000000 --- a/pipeline/.prefectignore +++ /dev/null @@ -1,47 +0,0 @@ -# prefect artifacts -.prefectignore - -# python artifacts -__pycache__/ -*.py[cod] -*$py.class -*.egg-info/ -*.egg - -# Type checking artifacts -.mypy_cache/ -.dmypy.json -dmypy.json -.pyre/ - -# IPython -profile_default/ -ipython_config.py -*.ipynb_checkpoints/* - -# Environments -.python-version -.env -.venv -env/ -venv/ - -# MacOS -.DS_Store - -# Dask -dask-worker-space/ - -# Editors -.idea/ -.vscode/ - -# VCS -.git/ -.hg/ - -data/ - -## dbt -bfro_mini_warehouse/logs -bfro_mini_warehouse/target \ No newline at end of file From aed406d2680f03fe219bd36de93167dcd0a7a844 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 20:09:39 -0500 Subject: [PATCH 09/11] Update readme. --- README.md | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 9c6759d..b92cbd1 100644 --- a/README.md +++ b/README.md @@ -33,38 +33,23 @@ For more information on the weather data, see the [Visual Crossing documentation ## Full Pipeline The pipeline (including scraper, weather, and the DBT project), is in the `pipeline/` directory. -Everything assumes relative paths from that directory, rather than the project root (which is just for setup and deployment operations). +However, there's a shell script that will run it all for you. -``` -cd pipeline/ -python bfro_pipeline.py +```sh +# In project root. +./run-pipeline-local.sh False ``` -For a test run (which runs the scraper on a small set of pages, and pulls a small set of weather data), use `--test-run`. +To run a test run, use +```sh +./run-pipeline-local.sh True ``` -python bfro_pipeline.py --test-run -``` - -Once the sources are in place (however you decide to run them), you can run dbt directly from within the `pipeline/bfro_mini_warehouse` directory, or use the script. - -``` -python bfro_pipeline.py --dbt-only -``` - -The pipeline command runs the source tests first, builds the csv files, then runs the rest of the data tests. ## Deployment and Orchestration -The pipelines are [Prefect](https://www.prefect.io/) flows, and they can be deployed but require some setup. -The `pipeline/bfro_pipeline_docker.py` file has the blocks you need (basically `prefect-gcs-rw` as GCP credentials, and `visual-crossing-key` as a Secret). -I assume if you're messing with deployments you probably know how that stuff works. -It's not _super_ hard to self host Prefect, but it's not super easy either. - -Also worth noting - while the thing says `_docker` in the file name and pipeline name, I don't actually have the dockerized version working yet 😬 . - -It will still deploy and run, as is, with a process infrastructure block on an agent within the conda env provided in this repo though. -When I get docker working, you'll be able to launch it with a docker container infrastructure block and no code change to the flow. +There's a Dockerfile and docker make targets (set to push to a local registry). +If you want to run in a container you should wrap the `run-pipeline-local.sh` shell script into one that will pull down / push updated data before the container tears down, or execute with a bind mount or volume mount on the local project root. 
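+
+As a rough sketch of the bind-mount option (the `/bfro` mount target and the `--env-file` secret handling below are assumptions, not something this repo sets up for you):
+
+```sh
+# Hypothetical one-off containerized run; adjust the image tag, paths,
+# and env handling to your setup.
+docker run --rm \
+    --env-file .env \
+    -v "$(pwd)":/bfro \
+    -w /bfro \
+    localhost:5000/bfro_pipeline:latest \
+    bash run-pipeline-local.sh False
+```
+
+Because the project root is bind-mounted, the files written under `pipeline/data/` land on the host, so nothing has to be copied out before the container tears down.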
## Data Dictionary From f2cc6e40d45bb76594e73ace9bc9294325104b35 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 20:13:01 -0500 Subject: [PATCH 10/11] Bump python version in actions and update make target for linting. --- .github/workflows/test-and-lint.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-and-lint.yml b/.github/workflows/test-and-lint.yml index 124a162..460ca17 100644 --- a/.github/workflows/test-and-lint.yml +++ b/.github/workflows/test-and-lint.yml @@ -13,12 +13,12 @@ jobs: - name: Set up python uses: actions/setup-python@v2 with: - python-version: "3.9" + python-version: "3.11" - name: Install dependencies run: pip install -r deps/dev-requirements.txt - name: Run checks - run: make check + run: make lint # uhhhh no tests for now I guess \ No newline at end of file From 158445eaf79b8f4101cee0e6401e920af5368c52 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 20:16:23 -0500 Subject: [PATCH 11/11] Get everything linted or ignored (looking at you scrapy settings.py) --- .../scraper/bfro_scrape/spiders/bfro_reports.py | 15 ++++++++------- pipeline/scripts/combine_raw_reports.py | 1 - ruff.toml | 5 ++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/pipeline/scraper/bfro_scrape/spiders/bfro_reports.py b/pipeline/scraper/bfro_scrape/spiders/bfro_reports.py index 6b7aed6..6894ee2 100644 --- a/pipeline/scraper/bfro_scrape/spiders/bfro_reports.py +++ b/pipeline/scraper/bfro_scrape/spiders/bfro_reports.py @@ -45,10 +45,10 @@ def parse_report(self, response): raw_keys = response.xpath("//p/span[@class='field']/text()").extract() keys = [k.replace(":", "").replace(" ", "_") for k in raw_keys] - # Now if scrapy had XPath 2.0 this would be pretty simple. Unfortunately - # it doesn't so we need to use a mixture of Python and XPath. This - # XPath query grabs all "p" elements containing a 'span.field' with - # text matching the key. + # Now if scrapy had XPath 2.0 this would be pretty simple. + # Unfortunately it doesn't so we need to use a mixture of Python and + # XPath. This XPath query grabs all "p" elements containing a + # 'span.field' with text matching the key. value_query = ( "//p[span[@class = 'field' and contains(text(), '{}')]]/text()" ) @@ -83,9 +83,10 @@ def parse_report(self, response): for k in empty_keys: data[k] = response.xpath( - "//p[span[@class='field' and contains(text(), '{}')]]/a/text()".format( - k - ) + ( + "//p[span[@class='field' and " + "contains(text(), '{}')]]/a/text()" + ).format(k) ).extract_first() # If everything is None, we don't want to write the values out, diff --git a/pipeline/scripts/combine_raw_reports.py b/pipeline/scripts/combine_raw_reports.py index a59ae6e..ffc2bf9 100644 --- a/pipeline/scripts/combine_raw_reports.py +++ b/pipeline/scripts/combine_raw_reports.py @@ -2,7 +2,6 @@ from pathlib import Path from loguru import logger import duckdb -import polars as pl def main( diff --git a/ruff.toml b/ruff.toml index 723e27e..10b94ee 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1 +1,4 @@ -line-length = 79 \ No newline at end of file +line-length = 79 +exclude=[ + "pipeline/scraper/bfro_scrape/settings.py" +] \ No newline at end of file