From 1a9739b8db298c3c8dc106be70d46af0b8d4fd89 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Mon, 22 Apr 2024 07:43:26 -0500 Subject: [PATCH 01/11] Updated .gitignore and create .dockerignore. --- .dockerignore | 22 ++++++++++++++++++++++ .gitignore | 4 +++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..1dd62fc --- /dev/null +++ b/.dockerignore @@ -0,0 +1,22 @@ +.env +service-account.json +gdrive-service-account.json +.git +.gitignore +environment.yaml +Makefile +README.md +LICENSE +deps/requirements.in +deps/dev-requirements.in +deps/dev-requirements.txt +notebooks/ +scripts/ +pipeline/__pycache__/ +pipeline/scripts/__pycache__/ +pipeline/data/* +pipeline/bfro_mini_warehouse/logs/ +pipeline/bfro_mini_warehouse/target/ +pipeline/bfro_mini_warehouse/README.md +pipeline/scraper/__pycache__/ +pipeline/scraper/bfro_scrape/__pycache__/ \ No newline at end of file diff --git a/.gitignore b/.gitignore index ccca9c5..f04d624 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,6 @@ logs data_old test.csv data_backup - +service-account.json +gdrive-service-account.json +run-docker.sh From 81b1ecc00c6cccdb94bdd2fe42b934188477b634 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Mon, 22 Apr 2024 07:46:07 -0500 Subject: [PATCH 02/11] Switch from black to ruff. --- Makefile | 27 ++------------------------- deps/dev-requirements.in | 1 - pyproject.toml | 14 -------------- ruff.toml | 1 + 4 files changed, 3 insertions(+), 40 deletions(-) delete mode 100644 pyproject.toml create mode 100644 ruff.toml diff --git a/Makefile b/Makefile index b150a6e..605ef16 100644 --- a/Makefile +++ b/Makefile @@ -18,17 +18,11 @@ dev-env: deps/dev-requirements.txt ## Lint project with ruff. lint: - python -m ruff . + python -m ruff check . ## Format imports and code. format: - python -m ruff . --fix - python -m black . - -## Check linting and formatting. -check: - python -m ruff check . - python -m black --check . + python -m ruff format . .PHONY: build-docker ## Build docker with local registry tag @@ -40,23 +34,6 @@ build-docker: push-docker: docker push localhost:5000/bfro_pipeline:latest -.PHONY: build-deployment -## Builds the Prefect deployment yaml file. -build-deployment: - cd pipeline && \ - prefect deployment build \ - bfro_pipeline_docker:main \ - --name bfro-pipeline \ - --pool bfro-agent-pool \ - --work-queue default \ - --infra-block process/bfro-local \ - --storage-block gcs/bfro-pipeline-storage - -.PHONY: apply-deployment -## Sends the Prefect deployment file to the server. 
-apply-deployment: - prefect deployment apply pipeline/main-deployment.yaml - .PHONY: pull-data ## Downloads the data locally for testing pull-data: diff --git a/deps/dev-requirements.in b/deps/dev-requirements.in index f392554..40cd4be 100644 --- a/deps/dev-requirements.in +++ b/deps/dev-requirements.in @@ -1,3 +1,2 @@ -r requirements.txt -black ruff \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 1378b95..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,14 +0,0 @@ -[tool.black] -line-length = 79 -extend-exclude = ''' -/( - | .env - | environment.yaml - | notebooks - | data - | dvc.lock - | dvc.yaml - | LICENSE - | README.md -)/ -''' \ No newline at end of file diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..723e27e --- /dev/null +++ b/ruff.toml @@ -0,0 +1 @@ +line-length = 79 \ No newline at end of file From 1b568b2b63f94f1eb4ea912c876e4d3bdd190b96 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Mon, 22 Apr 2024 08:02:48 -0500 Subject: [PATCH 03/11] Upgrade to Python 3.11, switch to uv for dependency management. --- Makefile | 8 +- deps/dev-requirements.txt | 586 +++++++++----------------------------- environment.yaml | 4 +- 3 files changed, 137 insertions(+), 461 deletions(-) diff --git a/Makefile b/Makefile index 605ef16..252e05a 100644 --- a/Makefile +++ b/Makefile @@ -2,19 +2,19 @@ ## Compile requirements.in into requirements.txt deps/requirements.txt: deps/requirements.in - pip-compile deps/requirements.in --output-file deps/requirements.txt + uv pip compile deps/requirements.in --output-file deps/requirements.txt ## Compile dev-requirements.in into dev-requirements.txt deps/dev-requirements.txt: deps/dev-requirements.in deps/requirements.txt - pip-compile deps/dev-requirements.in --output-file deps/dev-requirements.txt + uv pip compile deps/dev-requirements.in --output-file deps/dev-requirements.txt ## Install non-dev dependencies. env: deps/requirements.txt - pip-sync deps/requirements.txt + uv pip sync deps/requirements.txt ## Install dev and non-dev dependencies. dev-env: deps/dev-requirements.txt - pip-sync deps/dev-requirements.txt + uv pip sync deps/dev-requirements.txt ## Lint project with ruff. 
lint: diff --git a/deps/dev-requirements.txt b/deps/dev-requirements.txt index b0ce1d1..550948b 100644 --- a/deps/dev-requirements.txt +++ b/deps/dev-requirements.txt @@ -1,54 +1,30 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile --output-file=deps/dev-requirements.txt deps/dev-requirements.in -# +# This file was autogenerated by uv via the following command: +# uv pip compile deps/dev-requirements.in --output-file deps/dev-requirements.txt agate==1.7.0 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core aiohttp==3.8.4 - # via - # -r deps/requirements.txt - # gcsfs + # via gcsfs aiosignal==1.3.1 - # via - # -r deps/requirements.txt - # aiohttp + # via aiohttp aiosqlite==0.18.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect alembic==1.10.2 - # via - # -r deps/requirements.txt - # prefect + # via prefect anyio==3.7.1 # via - # -r deps/requirements.txt # httpcore # prefect # starlette apprise==1.3.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect asgi-lifespan==2.0.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect async-timeout==4.0.2 - # via - # -r deps/requirements.txt - # aiohttp + # via aiohttp asyncpg==0.27.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect attrs==22.2.0 # via - # -r deps/requirements.txt # aiohttp # automat # jsonschema @@ -56,26 +32,15 @@ attrs==22.2.0 # twisted # visions automat==22.10.0 - # via - # -r deps/requirements.txt - # twisted + # via twisted babel==2.12.1 - # via - # -r deps/requirements.txt - # agate + # via agate betterproto==1.2.5 - # via - # -r deps/requirements.txt - # dbt-core -black==23.9.1 - # via -r deps/dev-requirements.in + # via dbt-core cachetools==5.3.0 - # via - # -r deps/requirements.txt - # google-auth + # via google-auth certifi==2022.12.7 # via - # -r deps/requirements.txt # apprise # httpcore # httpx @@ -83,140 +48,89 @@ certifi==2022.12.7 # requests cffi==1.15.1 # via - # -r deps/requirements.txt # cryptography # dbt-core charset-normalizer==3.1.0 # via - # -r deps/requirements.txt # aiohttp # requests click==8.1.3 # via - # -r deps/requirements.txt # apprise - # black # dbt-core # prefect # typer # uvicorn cloudpickle==2.2.1 - # via - # -r deps/requirements.txt - # prefect + # via prefect colorama==0.4.6 # via - # -r deps/requirements.txt # dbt-core # griffe constantly==15.1.0 - # via - # -r deps/requirements.txt - # twisted + # via twisted contourpy==1.0.7 - # via - # -r deps/requirements.txt - # matplotlib + # via matplotlib coolname==2.2.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect croniter==1.3.8 - # via - # -r deps/requirements.txt - # prefect + # via prefect cryptography==40.0.1 # via - # -r deps/requirements.txt # prefect # pyopenssl # scrapy # service-identity cssselect==1.2.0 # via - # -r deps/requirements.txt # parsel # scrapy cycler==0.11.0 - # via - # -r deps/requirements.txt - # matplotlib + # via matplotlib dateparser==1.1.8 - # via - # -r deps/requirements.txt - # prefect + # via prefect dbt-core==1.4.5 # via - # -r deps/requirements.txt # dbt-duckdb # prefect-dbt dbt-duckdb==1.4.1 - # via -r deps/requirements.txt dbt-extractor==0.4.1 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core decorator==5.1.1 - # via - # -r deps/requirements.txt - # gcsfs + # via gcsfs dnspython==2.4.2 - # via - # -r deps/requirements.txt - # email-validator + # via email-validator docker==6.0.1 - # via - # -r deps/requirements.txt - # prefect + # via prefect 
duckdb==0.7.1 - # via - # -r deps/requirements.txt - # dbt-duckdb + # via dbt-duckdb email-validator==2.1.0.post1 - # via - # -r deps/requirements.txt - # pydantic + # via pydantic exceptiongroup==1.1.3 - # via - # -r deps/requirements.txt - # anyio filelock==3.10.7 - # via - # -r deps/requirements.txt - # tldextract + # via tldextract fonttools==4.39.2 - # via - # -r deps/requirements.txt - # matplotlib + # via matplotlib frozenlist==1.3.3 # via - # -r deps/requirements.txt # aiohttp # aiosignal fsspec==2023.1.0 # via - # -r deps/requirements.txt # gcsfs # prefect future==0.18.3 - # via - # -r deps/requirements.txt - # parsedatetime + # via parsedatetime gcsfs==2023.1.0 - # via -r deps/requirements.txt google-api-core==2.11.0 # via - # -r deps/requirements.txt # google-api-python-client # google-cloud-core # google-cloud-storage google-api-python-client==2.83.0 - # via - # -r deps/requirements.txt - # prefect-gcp + # via prefect-gcp google-auth==2.16.3 # via - # -r deps/requirements.txt # gcsfs # google-api-core # google-api-python-client @@ -226,103 +140,62 @@ google-auth==2.16.3 # google-cloud-storage # kubernetes google-auth-httplib2==0.1.0 - # via - # -r deps/requirements.txt - # google-api-python-client + # via google-api-python-client google-auth-oauthlib==1.0.0 - # via - # -r deps/requirements.txt - # gcsfs + # via gcsfs google-cloud-core==2.3.2 - # via - # -r deps/requirements.txt - # google-cloud-storage + # via google-cloud-storage google-cloud-storage==2.8.0 # via - # -r deps/requirements.txt # gcsfs # prefect-gcp google-crc32c==1.5.0 - # via - # -r deps/requirements.txt - # google-resumable-media + # via google-resumable-media google-resumable-media==2.4.1 - # via - # -r deps/requirements.txt - # google-cloud-storage + # via google-cloud-storage googleapis-common-protos==1.59.0 - # via - # -r deps/requirements.txt - # google-api-core + # via google-api-core graphql-core==3.2.3 - # via - # -r deps/requirements.txt - # sgqlc + # via sgqlc graphviz==0.20.1 - # via - # -r deps/requirements.txt - # prefect + # via prefect greenlet==2.0.2 - # via - # -r deps/requirements.txt - # sqlalchemy + # via sqlalchemy griffe==0.25.5 - # via - # -r deps/requirements.txt - # prefect + # via prefect grpclib==0.4.3 - # via - # -r deps/requirements.txt - # betterproto + # via betterproto h11==0.14.0 # via - # -r deps/requirements.txt # httpcore # uvicorn h2==4.1.0 # via - # -r deps/requirements.txt # grpclib # httpx h3==3.7.6 - # via -r deps/requirements.txt hologram==0.0.15 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core hpack==4.0.0 - # via - # -r deps/requirements.txt - # h2 + # via h2 htmlmin==0.1.12 - # via - # -r deps/requirements.txt - # ydata-profiling + # via ydata-profiling httpcore==0.16.3 # via - # -r deps/requirements.txt # httpx # prefect httplib2==0.22.0 # via - # -r deps/requirements.txt # google-api-python-client # google-auth-httplib2 -httpx[http2]==0.23.3 - # via - # -r deps/requirements.txt - # prefect +httpx==0.23.3 + # via prefect hyperframe==6.0.1 - # via - # -r deps/requirements.txt - # h2 + # via h2 hyperlink==21.0.0 - # via - # -r deps/requirements.txt - # twisted + # via twisted idna==3.4 # via - # -r deps/requirements.txt # anyio # dbt-core # email-validator @@ -333,143 +206,90 @@ idna==3.4 # yarl imagehash==4.3.1 # via - # -r deps/requirements.txt # visions # ydata-profiling importlib-metadata==6.1.0 - # via - # -r deps/requirements.txt - # markdown - # prefect incremental==22.10.0 - # via - # -r deps/requirements.txt - # twisted + # via twisted 
isodate==0.6.1 # via - # -r deps/requirements.txt # agate # dbt-core itemadapter==0.7.0 # via - # -r deps/requirements.txt # itemloaders # scrapy itemloaders==1.0.6 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy jinja2==3.1.2 # via - # -r deps/requirements.txt # dbt-core # prefect # ydata-profiling jmespath==1.0.1 - # via - # -r deps/requirements.txt - # itemloaders + # via itemloaders joblib==1.2.0 - # via - # -r deps/requirements.txt - # phik + # via phik jsonpatch==1.32 - # via - # -r deps/requirements.txt - # prefect + # via prefect jsonpointer==2.3 - # via - # -r deps/requirements.txt - # jsonpatch + # via jsonpatch jsonschema==3.2.0 # via - # -r deps/requirements.txt # hologram # prefect kiwisolver==1.4.4 - # via - # -r deps/requirements.txt - # matplotlib + # via matplotlib kubernetes==26.1.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect leather==0.3.4 - # via - # -r deps/requirements.txt - # agate + # via agate logbook==1.5.3 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core loguru==0.6.0 - # via -r deps/requirements.txt lxml==4.9.2 # via - # -r deps/requirements.txt # parsel # scrapy mako==1.2.4 - # via - # -r deps/requirements.txt - # alembic + # via alembic markdown==3.4.3 - # via - # -r deps/requirements.txt - # apprise + # via apprise markdown-it-py==2.2.0 - # via - # -r deps/requirements.txt - # rich + # via rich markupsafe==2.1.2 # via - # -r deps/requirements.txt # jinja2 # mako # werkzeug -mashumaro[msgpack]==3.3.1 - # via - # -r deps/requirements.txt - # dbt-core +mashumaro==3.3.1 + # via dbt-core matplotlib==3.6.3 # via - # -r deps/requirements.txt # phik # seaborn # ydata-profiling mdurl==0.1.2 - # via - # -r deps/requirements.txt - # markdown-it-py + # via markdown-it-py minimal-snowplow-tracker==0.0.2 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core msgpack==1.0.5 - # via - # -r deps/requirements.txt - # mashumaro + # via mashumaro multidict==6.0.4 # via - # -r deps/requirements.txt # aiohttp # grpclib # yarl multimethod==1.9.1 # via - # -r deps/requirements.txt # visions # ydata-profiling -mypy-extensions==1.0.0 - # via black networkx==2.8.8 # via - # -r deps/requirements.txt # dbt-core # visions numpy==1.23.5 # via - # -r deps/requirements.txt # contourpy # imagehash # matplotlib @@ -484,17 +304,11 @@ numpy==1.23.5 # visions # ydata-profiling oauthlib==3.2.2 - # via - # -r deps/requirements.txt - # requests-oauthlib + # via requests-oauthlib orjson==3.8.8 - # via - # -r deps/requirements.txt - # prefect + # via prefect packaging==23.0 # via - # -r deps/requirements.txt - # black # dbt-core # docker # matplotlib @@ -504,122 +318,80 @@ packaging==23.0 # statsmodels pandas==1.5.3 # via - # -r deps/requirements.txt # phik # seaborn # statsmodels # visions # ydata-profiling parsedatetime==2.4 - # via - # -r deps/requirements.txt - # agate + # via agate parsel==1.7.0 # via - # -r deps/requirements.txt # itemloaders # scrapy pathspec==0.10.3 # via - # -r deps/requirements.txt - # black # dbt-core # prefect patsy==0.5.3 - # via - # -r deps/requirements.txt - # statsmodels + # via statsmodels pendulum==2.1.2 - # via - # -r deps/requirements.txt - # prefect + # via prefect phik==0.12.3 - # via - # -r deps/requirements.txt - # ydata-profiling + # via ydata-profiling pillow==9.4.0 # via - # -r deps/requirements.txt # imagehash # matplotlib # visions -platformdirs==3.11.0 - # via black -polars[pyarrow]==0.16.16 - # via -r deps/requirements.txt +polars==0.16.16 prefect==2.14.2 # via - # -r deps/requirements.txt 
# prefect-dbt # prefect-gcp # prefect-shell prefect-dbt==0.3.1 - # via -r deps/requirements.txt -prefect-gcp[cloud_storage]==0.3.0 - # via -r deps/requirements.txt +prefect-gcp==0.3.0 prefect-shell==0.1.5 - # via - # -r deps/requirements.txt - # prefect-dbt + # via prefect-dbt protego==0.2.1 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy protobuf==4.22.1 # via - # -r deps/requirements.txt # google-api-core # googleapis-common-protos pyarrow==11.0.0 - # via - # -r deps/requirements.txt - # polars + # via polars pyasn1==0.4.8 # via - # -r deps/requirements.txt # pyasn1-modules # rsa # service-identity pyasn1-modules==0.2.8 # via - # -r deps/requirements.txt # google-auth # service-identity pycparser==2.21 + # via cffi +pydantic==1.10.7 # via - # -r deps/requirements.txt - # cffi -pydantic[email]==1.10.7 - # via - # -r deps/requirements.txt # prefect # ydata-profiling pydispatcher==2.0.7 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy pygeohash==1.2.0 - # via -r deps/requirements.txt pygments==2.14.0 - # via - # -r deps/requirements.txt - # rich + # via rich pyopenssl==23.1.1 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy pyparsing==3.0.9 # via - # -r deps/requirements.txt # httplib2 # matplotlib pyrsistent==0.19.3 - # via - # -r deps/requirements.txt - # jsonschema + # via jsonschema python-dateutil==2.8.2 # via - # -r deps/requirements.txt # croniter # dateparser # hologram @@ -629,58 +401,39 @@ python-dateutil==2.8.2 # pendulum # prefect python-dotenv==1.0.0 - # via -r deps/requirements.txt python-slugify==8.0.1 # via - # -r deps/requirements.txt # agate # prefect pytimeparse==1.1.8 - # via - # -r deps/requirements.txt - # agate + # via agate pytz==2023.2 # via - # -r deps/requirements.txt # dateparser # dbt-core # pandas # prefect pytz-deprecation-shim==0.1.0.post0 - # via - # -r deps/requirements.txt - # tzlocal + # via tzlocal pytzdata==2020.1 - # via - # -r deps/requirements.txt - # pendulum + # via pendulum pywavelets==1.4.1 - # via - # -r deps/requirements.txt - # imagehash + # via imagehash pyyaml==6.0 # via - # -r deps/requirements.txt # apprise # dbt-core # kubernetes # prefect # ydata-profiling queuelib==1.6.2 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy readchar==4.0.5 - # via - # -r deps/requirements.txt - # prefect + # via prefect regex==2023.3.23 - # via - # -r deps/requirements.txt - # dateparser + # via dateparser requests==2.28.2 # via - # -r deps/requirements.txt # apprise # dbt-core # docker @@ -694,61 +447,45 @@ requests==2.28.2 # tldextract # ydata-profiling requests-file==1.5.1 - # via - # -r deps/requirements.txt - # tldextract + # via tldextract requests-oauthlib==1.3.1 # via - # -r deps/requirements.txt # apprise # google-auth-oauthlib # kubernetes -rfc3986[idna2008]==1.5.0 - # via - # -r deps/requirements.txt - # httpx +rfc3986==1.5.0 + # via httpx rich==13.3.3 - # via - # -r deps/requirements.txt - # prefect + # via prefect rsa==4.9 - # via - # -r deps/requirements.txt - # google-auth + # via google-auth ruamel-yaml==0.17.35 - # via - # -r deps/requirements.txt - # prefect + # via prefect ruamel-yaml-clib==0.2.8 - # via - # -r deps/requirements.txt - # ruamel-yaml + # via ruamel-yaml ruff==0.0.292 - # via -r deps/dev-requirements.in scipy==1.9.3 # via - # -r deps/requirements.txt # imagehash # phik # statsmodels # ydata-profiling scrapy==2.8.0 - # via -r deps/requirements.txt seaborn==0.12.2 - # via - # -r deps/requirements.txt - # ydata-profiling + # via ydata-profiling service-identity==21.1.0 + # 
via scrapy +setuptools==69.5.1 # via - # -r deps/requirements.txt + # jsonschema + # kubernetes + # readchar # scrapy + # zope-interface sgqlc==16.1 - # via - # -r deps/requirements.txt - # prefect-dbt + # via prefect-dbt six==1.16.0 # via - # -r deps/requirements.txt # automat # google-auth # google-auth-httplib2 @@ -764,145 +501,84 @@ six==1.16.0 # service-identity sniffio==1.3.0 # via - # -r deps/requirements.txt # anyio # asgi-lifespan # httpcore # httpx # prefect -sqlalchemy[asyncio]==1.4.47 +sqlalchemy==1.4.47 # via - # -r deps/requirements.txt # alembic # prefect sqlparse==0.4.3 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core starlette==0.27.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect statsmodels==0.13.5 - # via - # -r deps/requirements.txt - # ydata-profiling + # via ydata-profiling stringcase==1.2.0 - # via - # -r deps/requirements.txt - # betterproto + # via betterproto tangled-up-in-unicode==0.2.0 - # via - # -r deps/requirements.txt - # visions + # via visions text-unidecode==1.3 - # via - # -r deps/requirements.txt - # python-slugify + # via python-slugify tldextract==3.4.0 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy toml==0.10.2 - # via - # -r deps/requirements.txt - # prefect -tomli==2.0.1 - # via black + # via prefect toolz==0.12.0 - # via -r deps/requirements.txt tqdm==4.64.1 - # via - # -r deps/requirements.txt - # ydata-profiling + # via ydata-profiling twisted==22.10.0 - # via - # -r deps/requirements.txt - # scrapy + # via scrapy typeguard==2.13.3 - # via - # -r deps/requirements.txt - # ydata-profiling + # via ydata-profiling typer==0.7.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect typing-extensions==4.5.0 # via - # -r deps/requirements.txt # alembic - # black # dbt-core # mashumaro - # polars # prefect # pydantic - # starlette # twisted tzdata==2023.2 - # via - # -r deps/requirements.txt - # pytz-deprecation-shim + # via pytz-deprecation-shim tzlocal==4.3 - # via - # -r deps/requirements.txt - # dateparser + # via dateparser ujson==5.8.0 - # via - # -r deps/requirements.txt - # prefect + # via prefect uritemplate==4.1.1 - # via - # -r deps/requirements.txt - # google-api-python-client + # via google-api-python-client urllib3==1.26.15 # via - # -r deps/requirements.txt # docker # kubernetes # requests uvicorn==0.21.1 - # via - # -r deps/requirements.txt - # prefect -visions[type_image_path]==0.7.5 - # via - # -r deps/requirements.txt - # ydata-profiling + # via prefect +visions==0.7.5 + # via ydata-profiling w3lib==2.1.1 # via - # -r deps/requirements.txt # itemloaders # parsel # scrapy websocket-client==1.5.1 # via - # -r deps/requirements.txt # docker # kubernetes websockets==10.4 - # via - # -r deps/requirements.txt - # prefect + # via prefect werkzeug==2.2.3 - # via - # -r deps/requirements.txt - # dbt-core + # via dbt-core yarl==1.8.2 - # via - # -r deps/requirements.txt - # aiohttp + # via aiohttp ydata-profiling==4.1.2 - # via -r deps/requirements.txt zipp==3.15.0 - # via - # -r deps/requirements.txt - # importlib-metadata + # via importlib-metadata zope-interface==6.0 # via - # -r deps/requirements.txt # scrapy # twisted - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/environment.yaml b/environment.yaml index d9dce72..b0c8947 100644 --- a/environment.yaml +++ b/environment.yaml @@ -3,6 +3,6 @@ channels: - defaults dependencies: - pip - - python=3.9 + - python=3.11 - pip: - - pip-tools + - uv From 
7617be24bef131e96a673dd0fbc3e3dd2285e8b3 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 08:19:01 -0500 Subject: [PATCH 04/11] Bye, prefect. --- .gitignore | 1 + README.md | 2 +- deps/dev-requirements.txt | 256 +---------------- deps/requirements.in | 11 - deps/requirements.txt | 313 ++------------------- pipeline/bfro_pipeline.py | 123 -------- pipeline/bfro_pipeline_docker.py | 76 ----- pipeline/main-deployment.yaml | 65 ----- pipeline/scripts/combine_raw_reports.py | 60 ++++ pipeline/scripts/pull_weather.py | 3 + pipeline/{ => scripts}/upload_to_gdrive.py | 15 +- pipeline/update_geocoder.py | 82 ------ pipeline/update_reports.py | 118 -------- pipeline/update_weather.py | 102 ------- run-pipeline-local.sh | 62 ++++ 15 files changed, 172 insertions(+), 1117 deletions(-) delete mode 100644 pipeline/bfro_pipeline.py delete mode 100644 pipeline/bfro_pipeline_docker.py delete mode 100644 pipeline/main-deployment.yaml create mode 100644 pipeline/scripts/combine_raw_reports.py rename pipeline/{ => scripts}/upload_to_gdrive.py (93%) delete mode 100644 pipeline/update_geocoder.py delete mode 100644 pipeline/update_reports.py delete mode 100644 pipeline/update_weather.py create mode 100644 run-pipeline-local.sh diff --git a/.gitignore b/.gitignore index f04d624..43175b5 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ data_backup service-account.json gdrive-service-account.json run-docker.sh +run-pipeline-docker.sh \ No newline at end of file diff --git a/README.md b/README.md index 8dd88fb..9c6759d 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ To get started, run the following commands to set up the environment. ```shell conda env create -f environment.yaml -conda activate bfro +conda activate bfro-sightings-data make dev-env ``` diff --git a/deps/dev-requirements.txt b/deps/dev-requirements.txt index 550948b..3682e52 100644 --- a/deps/dev-requirements.txt +++ b/deps/dev-requirements.txt @@ -2,30 +2,8 @@ # uv pip compile deps/dev-requirements.in --output-file deps/dev-requirements.txt agate==1.7.0 # via dbt-core -aiohttp==3.8.4 - # via gcsfs -aiosignal==1.3.1 - # via aiohttp -aiosqlite==0.18.0 - # via prefect -alembic==1.10.2 - # via prefect -anyio==3.7.1 - # via - # httpcore - # prefect - # starlette -apprise==1.3.0 - # via prefect -asgi-lifespan==2.0.0 - # via prefect -async-timeout==4.0.2 - # via aiohttp -asyncpg==0.27.0 - # via prefect attrs==22.2.0 # via - # aiohttp # automat # jsonschema # service-identity @@ -40,44 +18,25 @@ betterproto==1.2.5 cachetools==5.3.0 # via google-auth certifi==2022.12.7 - # via - # apprise - # httpcore - # httpx - # kubernetes - # requests + # via requests cffi==1.15.1 # via # cryptography # dbt-core charset-normalizer==3.1.0 - # via - # aiohttp - # requests + # via requests click==8.1.3 # via - # apprise # dbt-core - # prefect # typer - # uvicorn -cloudpickle==2.2.1 - # via prefect colorama==0.4.6 - # via - # dbt-core - # griffe + # via dbt-core constantly==15.1.0 # via twisted contourpy==1.0.7 # via matplotlib -coolname==2.2.0 - # via prefect -croniter==1.3.8 - # via prefect cryptography==40.0.1 # via - # prefect # pyopenssl # scrapy # service-identity @@ -87,92 +46,37 @@ cssselect==1.2.0 # scrapy cycler==0.11.0 # via matplotlib -dateparser==1.1.8 - # via prefect dbt-core==1.4.5 - # via - # dbt-duckdb - # prefect-dbt + # via dbt-duckdb dbt-duckdb==1.4.1 dbt-extractor==0.4.1 # via dbt-core -decorator==5.1.1 - # via gcsfs -dnspython==2.4.2 - # via email-validator -docker==6.0.1 - # via prefect duckdb==0.7.1 # via dbt-duckdb 
-email-validator==2.1.0.post1 - # via pydantic -exceptiongroup==1.1.3 filelock==3.10.7 # via tldextract fonttools==4.39.2 # via matplotlib -frozenlist==1.3.3 - # via - # aiohttp - # aiosignal -fsspec==2023.1.0 - # via - # gcsfs - # prefect future==0.18.3 # via parsedatetime -gcsfs==2023.1.0 google-api-core==2.11.0 - # via - # google-api-python-client - # google-cloud-core - # google-cloud-storage + # via google-api-python-client google-api-python-client==2.83.0 - # via prefect-gcp google-auth==2.16.3 # via - # gcsfs # google-api-core # google-api-python-client # google-auth-httplib2 # google-auth-oauthlib - # google-cloud-core - # google-cloud-storage - # kubernetes google-auth-httplib2==0.1.0 # via google-api-python-client google-auth-oauthlib==1.0.0 - # via gcsfs -google-cloud-core==2.3.2 - # via google-cloud-storage -google-cloud-storage==2.8.0 - # via - # gcsfs - # prefect-gcp -google-crc32c==1.5.0 - # via google-resumable-media -google-resumable-media==2.4.1 - # via google-cloud-storage googleapis-common-protos==1.59.0 # via google-api-core -graphql-core==3.2.3 - # via sgqlc -graphviz==0.20.1 - # via prefect -greenlet==2.0.2 - # via sqlalchemy -griffe==0.25.5 - # via prefect grpclib==0.4.3 # via betterproto -h11==0.14.0 - # via - # httpcore - # uvicorn h2==4.1.0 - # via - # grpclib - # httpx + # via grpclib h3==3.7.6 hologram==0.0.15 # via dbt-core @@ -180,35 +84,24 @@ hpack==4.0.0 # via h2 htmlmin==0.1.12 # via ydata-profiling -httpcore==0.16.3 - # via - # httpx - # prefect httplib2==0.22.0 # via # google-api-python-client # google-auth-httplib2 -httpx==0.23.3 - # via prefect hyperframe==6.0.1 # via h2 hyperlink==21.0.0 # via twisted idna==3.4 # via - # anyio # dbt-core - # email-validator # hyperlink # requests - # rfc3986 # tldextract - # yarl imagehash==4.3.1 # via # visions # ydata-profiling -importlib-metadata==6.1.0 incremental==22.10.0 # via twisted isodate==0.6.1 @@ -224,24 +117,15 @@ itemloaders==1.0.6 jinja2==3.1.2 # via # dbt-core - # prefect # ydata-profiling jmespath==1.0.1 # via itemloaders joblib==1.2.0 # via phik -jsonpatch==1.32 - # via prefect -jsonpointer==2.3 - # via jsonpatch jsonschema==3.2.0 - # via - # hologram - # prefect + # via hologram kiwisolver==1.4.4 # via matplotlib -kubernetes==26.1.0 - # via prefect leather==0.3.4 # via agate logbook==1.5.3 @@ -251,16 +135,9 @@ lxml==4.9.2 # via # parsel # scrapy -mako==1.2.4 - # via alembic -markdown==3.4.3 - # via apprise -markdown-it-py==2.2.0 - # via rich markupsafe==2.1.2 # via # jinja2 - # mako # werkzeug mashumaro==3.3.1 # via dbt-core @@ -269,17 +146,12 @@ matplotlib==3.6.3 # phik # seaborn # ydata-profiling -mdurl==0.1.2 - # via markdown-it-py minimal-snowplow-tracker==0.0.2 # via dbt-core msgpack==1.0.5 # via mashumaro multidict==6.0.4 - # via - # aiohttp - # grpclib - # yarl + # via grpclib multimethod==1.9.1 # via # visions @@ -305,15 +177,11 @@ numpy==1.23.5 # ydata-profiling oauthlib==3.2.2 # via requests-oauthlib -orjson==3.8.8 - # via prefect packaging==23.0 # via # dbt-core - # docker # matplotlib # parsel - # prefect # scrapy # statsmodels pandas==1.5.3 @@ -330,13 +198,9 @@ parsel==1.7.0 # itemloaders # scrapy pathspec==0.10.3 - # via - # dbt-core - # prefect + # via dbt-core patsy==0.5.3 # via statsmodels -pendulum==2.1.2 - # via prefect phik==0.12.3 # via ydata-profiling pillow==9.4.0 @@ -345,15 +209,6 @@ pillow==9.4.0 # matplotlib # visions polars==0.16.16 -prefect==2.14.2 - # via - # prefect-dbt - # prefect-gcp - # prefect-shell -prefect-dbt==0.3.1 -prefect-gcp==0.3.0 -prefect-shell==0.1.5 - # 
via prefect-dbt protego==0.2.1 # via scrapy protobuf==4.22.1 @@ -361,7 +216,6 @@ protobuf==4.22.1 # google-api-core # googleapis-common-protos pyarrow==11.0.0 - # via polars pyasn1==0.4.8 # via # pyasn1-modules @@ -374,14 +228,10 @@ pyasn1-modules==0.2.8 pycparser==2.21 # via cffi pydantic==1.10.7 - # via - # prefect - # ydata-profiling + # via ydata-profiling pydispatcher==2.0.7 # via scrapy pygeohash==1.2.0 -pygments==2.14.0 - # via rich pyopenssl==23.1.1 # via scrapy pyparsing==3.0.9 @@ -392,55 +242,30 @@ pyrsistent==0.19.3 # via jsonschema python-dateutil==2.8.2 # via - # croniter - # dateparser # hologram - # kubernetes # matplotlib # pandas - # pendulum - # prefect python-dotenv==1.0.0 python-slugify==8.0.1 - # via - # agate - # prefect + # via agate pytimeparse==1.1.8 # via agate pytz==2023.2 # via - # dateparser # dbt-core # pandas - # prefect -pytz-deprecation-shim==0.1.0.post0 - # via tzlocal -pytzdata==2020.1 - # via pendulum pywavelets==1.4.1 # via imagehash pyyaml==6.0 # via - # apprise # dbt-core - # kubernetes - # prefect # ydata-profiling queuelib==1.6.2 # via scrapy -readchar==4.0.5 - # via prefect -regex==2023.3.23 - # via dateparser requests==2.28.2 # via - # apprise # dbt-core - # docker - # gcsfs # google-api-core - # google-cloud-storage - # kubernetes # minimal-snowplow-tracker # requests-file # requests-oauthlib @@ -449,20 +274,9 @@ requests==2.28.2 requests-file==1.5.1 # via tldextract requests-oauthlib==1.3.1 - # via - # apprise - # google-auth-oauthlib - # kubernetes -rfc3986==1.5.0 - # via httpx -rich==13.3.3 - # via prefect + # via google-auth-oauthlib rsa==4.9 # via google-auth -ruamel-yaml==0.17.35 - # via prefect -ruamel-yaml-clib==0.2.8 - # via ruamel-yaml ruff==0.0.292 scipy==1.9.3 # via @@ -478,12 +292,8 @@ service-identity==21.1.0 setuptools==69.5.1 # via # jsonschema - # kubernetes - # readchar # scrapy # zope-interface -sgqlc==16.1 - # via prefect-dbt six==1.16.0 # via # automat @@ -491,7 +301,6 @@ six==1.16.0 # google-auth-httplib2 # isodate # jsonschema - # kubernetes # leather # minimal-snowplow-tracker # patsy @@ -499,21 +308,8 @@ six==1.16.0 # python-dateutil # requests-file # service-identity -sniffio==1.3.0 - # via - # anyio - # asgi-lifespan - # httpcore - # httpx - # prefect -sqlalchemy==1.4.47 - # via - # alembic - # prefect sqlparse==0.4.3 # via dbt-core -starlette==0.27.0 - # via prefect statsmodels==0.13.5 # via ydata-profiling stringcase==1.2.0 @@ -524,8 +320,6 @@ text-unidecode==1.3 # via python-slugify tldextract==3.4.0 # via scrapy -toml==0.10.2 - # via prefect toolz==0.12.0 tqdm==4.64.1 # via ydata-profiling @@ -534,30 +328,16 @@ twisted==22.10.0 typeguard==2.13.3 # via ydata-profiling typer==0.7.0 - # via prefect typing-extensions==4.5.0 # via - # alembic # dbt-core # mashumaro - # prefect # pydantic # twisted -tzdata==2023.2 - # via pytz-deprecation-shim -tzlocal==4.3 - # via dateparser -ujson==5.8.0 - # via prefect uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.15 - # via - # docker - # kubernetes - # requests -uvicorn==0.21.1 - # via prefect + # via requests visions==0.7.5 # via ydata-profiling w3lib==2.1.1 @@ -565,19 +345,9 @@ w3lib==2.1.1 # itemloaders # parsel # scrapy -websocket-client==1.5.1 - # via - # docker - # kubernetes -websockets==10.4 - # via prefect werkzeug==2.2.3 # via dbt-core -yarl==1.8.2 - # via aiohttp ydata-profiling==4.1.2 -zipp==3.15.0 - # via importlib-metadata zope-interface==6.0 # via # scrapy diff --git a/deps/requirements.in b/deps/requirements.in index a17ae38..c069c1c 100644 --- 
a/deps/requirements.in +++ b/deps/requirements.in @@ -6,21 +6,10 @@ scrapy>=2,<3 toolz loguru ydata-profiling - -# Leaving dvc out for now. The issue is a dependency of dvc[gs] (specifically -# fsspec/gcsfs requires fsspec==2023.3.0. Prefect explicitly ignores this -# version of fsspec for ... reasons. Idk. It's a PITA. -# dvc[gs] - duckdb polars[pyarrow] -prefect~=2.14 -prefect-shell -prefect-dbt -prefect-gcp[cloud_storage] h3<4.0 dbt-duckdb -gcsfs!=2023.3.0 google-api-python-client google-auth-httplib2 google-auth-oauthlib \ No newline at end of file diff --git a/deps/requirements.txt b/deps/requirements.txt index 406ab41..80ebce9 100644 --- a/deps/requirements.txt +++ b/deps/requirements.txt @@ -1,35 +1,9 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile --output-file=deps/requirements.txt deps/requirements.in -# +# This file was autogenerated by uv via the following command: +# uv pip compile deps/requirements.in --output-file deps/requirements.txt agate==1.7.0 # via dbt-core -aiohttp==3.8.4 - # via gcsfs -aiosignal==1.3.1 - # via aiohttp -aiosqlite==0.18.0 - # via prefect -alembic==1.10.2 - # via prefect -anyio==3.7.1 - # via - # httpcore - # prefect - # starlette -apprise==1.3.0 - # via prefect -asgi-lifespan==2.0.0 - # via prefect -async-timeout==4.0.2 - # via aiohttp -asyncpg==0.27.0 - # via prefect attrs==22.2.0 # via - # aiohttp # automat # jsonschema # service-identity @@ -44,44 +18,25 @@ betterproto==1.2.5 cachetools==5.3.0 # via google-auth certifi==2022.12.7 - # via - # apprise - # httpcore - # httpx - # kubernetes - # requests + # via requests cffi==1.15.1 # via # cryptography # dbt-core charset-normalizer==3.1.0 - # via - # aiohttp - # requests + # via requests click==8.1.3 # via - # apprise # dbt-core - # prefect # typer - # uvicorn -cloudpickle==2.2.1 - # via prefect colorama==0.4.6 - # via - # dbt-core - # griffe + # via dbt-core constantly==15.1.0 # via twisted contourpy==1.0.7 # via matplotlib -coolname==2.2.0 - # via prefect -croniter==1.3.8 - # via prefect cryptography==40.0.1 # via - # prefect # pyopenssl # scrapy # service-identity @@ -91,143 +46,62 @@ cssselect==1.2.0 # scrapy cycler==0.11.0 # via matplotlib -dateparser==1.1.8 - # via prefect dbt-core==1.4.5 - # via - # dbt-duckdb - # prefect-dbt + # via dbt-duckdb dbt-duckdb==1.4.1 - # via -r deps/requirements.in dbt-extractor==0.4.1 # via dbt-core -decorator==5.1.1 - # via gcsfs -dnspython==2.4.2 - # via email-validator -docker==6.0.1 - # via prefect duckdb==0.7.1 - # via - # -r deps/requirements.in - # dbt-duckdb -email-validator==2.1.0.post1 - # via pydantic -exceptiongroup==1.1.3 - # via anyio + # via dbt-duckdb filelock==3.10.7 # via tldextract fonttools==4.39.2 # via matplotlib -frozenlist==1.3.3 - # via - # aiohttp - # aiosignal -fsspec==2023.1.0 - # via - # gcsfs - # prefect future==0.18.3 # via parsedatetime -gcsfs==2023.1.0 - # via -r deps/requirements.in google-api-core==2.11.0 - # via - # google-api-python-client - # google-cloud-core - # google-cloud-storage + # via google-api-python-client google-api-python-client==2.83.0 - # via - # -r deps/requirements.in - # prefect-gcp google-auth==2.16.3 # via - # gcsfs # google-api-core # google-api-python-client # google-auth-httplib2 # google-auth-oauthlib - # google-cloud-core - # google-cloud-storage - # kubernetes google-auth-httplib2==0.1.0 - # via - # -r deps/requirements.in - # google-api-python-client + # via google-api-python-client google-auth-oauthlib==1.0.0 - # via - # -r 
deps/requirements.in - # gcsfs -google-cloud-core==2.3.2 - # via google-cloud-storage -google-cloud-storage==2.8.0 - # via - # gcsfs - # prefect-gcp -google-crc32c==1.5.0 - # via google-resumable-media -google-resumable-media==2.4.1 - # via google-cloud-storage googleapis-common-protos==1.59.0 # via google-api-core -graphql-core==3.2.3 - # via sgqlc -graphviz==0.20.1 - # via prefect -greenlet==2.0.2 - # via sqlalchemy -griffe==0.25.5 - # via prefect grpclib==0.4.3 # via betterproto -h11==0.14.0 - # via - # httpcore - # uvicorn h2==4.1.0 - # via - # grpclib - # httpx + # via grpclib h3==3.7.6 - # via -r deps/requirements.in hologram==0.0.15 # via dbt-core hpack==4.0.0 # via h2 htmlmin==0.1.12 # via ydata-profiling -httpcore==0.16.3 - # via - # httpx - # prefect httplib2==0.22.0 # via # google-api-python-client # google-auth-httplib2 -httpx[http2]==0.23.3 - # via prefect hyperframe==6.0.1 # via h2 hyperlink==21.0.0 # via twisted idna==3.4 # via - # anyio # dbt-core - # email-validator # hyperlink # requests - # rfc3986 # tldextract - # yarl imagehash==4.3.1 # via # visions # ydata-profiling -importlib-metadata==6.1.0 - # via - # markdown - # prefect incremental==22.10.0 # via twisted isodate==0.6.1 @@ -243,64 +117,41 @@ itemloaders==1.0.6 jinja2==3.1.2 # via # dbt-core - # prefect # ydata-profiling jmespath==1.0.1 # via itemloaders joblib==1.2.0 # via phik -jsonpatch==1.32 - # via prefect -jsonpointer==2.3 - # via jsonpatch jsonschema==3.2.0 - # via - # hologram - # prefect + # via hologram kiwisolver==1.4.4 # via matplotlib -kubernetes==26.1.0 - # via prefect leather==0.3.4 # via agate logbook==1.5.3 # via dbt-core loguru==0.6.0 - # via -r deps/requirements.in lxml==4.9.2 # via - # -r deps/requirements.in # parsel # scrapy -mako==1.2.4 - # via alembic -markdown==3.4.3 - # via apprise -markdown-it-py==2.2.0 - # via rich markupsafe==2.1.2 # via # jinja2 - # mako # werkzeug -mashumaro[msgpack]==3.3.1 +mashumaro==3.3.1 # via dbt-core matplotlib==3.6.3 # via # phik # seaborn # ydata-profiling -mdurl==0.1.2 - # via markdown-it-py minimal-snowplow-tracker==0.0.2 # via dbt-core msgpack==1.0.5 # via mashumaro multidict==6.0.4 - # via - # aiohttp - # grpclib - # yarl + # via grpclib multimethod==1.9.1 # via # visions @@ -326,15 +177,11 @@ numpy==1.23.5 # ydata-profiling oauthlib==3.2.2 # via requests-oauthlib -orjson==3.8.8 - # via prefect packaging==23.0 # via # dbt-core - # docker # matplotlib # parsel - # prefect # scrapy # statsmodels pandas==1.5.3 @@ -351,13 +198,9 @@ parsel==1.7.0 # itemloaders # scrapy pathspec==0.10.3 - # via - # dbt-core - # prefect + # via dbt-core patsy==0.5.3 # via statsmodels -pendulum==2.1.2 - # via prefect phik==0.12.3 # via ydata-profiling pillow==9.4.0 @@ -365,22 +208,7 @@ pillow==9.4.0 # imagehash # matplotlib # visions -polars[pyarrow]==0.16.16 - # via -r deps/requirements.in -prefect==2.14.2 - # via - # -r deps/requirements.in - # prefect-dbt - # prefect-gcp - # prefect-shell -prefect-dbt==0.3.1 - # via -r deps/requirements.in -prefect-gcp[cloud_storage]==0.3.0 - # via -r deps/requirements.in -prefect-shell==0.1.5 - # via - # -r deps/requirements.in - # prefect-dbt +polars==0.16.16 protego==0.2.1 # via scrapy protobuf==4.22.1 @@ -400,16 +228,11 @@ pyasn1-modules==0.2.8 # service-identity pycparser==2.21 # via cffi -pydantic[email]==1.10.7 - # via - # prefect - # ydata-profiling +pydantic==1.10.7 + # via ydata-profiling pydispatcher==2.0.7 # via scrapy pygeohash==1.2.0 - # via -r deps/requirements.in -pygments==2.14.0 - # via rich pyopenssl==23.1.1 # via scrapy 
pyparsing==3.0.9 @@ -420,56 +243,30 @@ pyrsistent==0.19.3 # via jsonschema python-dateutil==2.8.2 # via - # croniter - # dateparser # hologram - # kubernetes # matplotlib # pandas - # pendulum - # prefect python-dotenv==1.0.0 - # via -r deps/requirements.in python-slugify==8.0.1 - # via - # agate - # prefect + # via agate pytimeparse==1.1.8 # via agate pytz==2023.2 # via - # dateparser # dbt-core # pandas - # prefect -pytz-deprecation-shim==0.1.0.post0 - # via tzlocal -pytzdata==2020.1 - # via pendulum pywavelets==1.4.1 # via imagehash pyyaml==6.0 # via - # apprise # dbt-core - # kubernetes - # prefect # ydata-profiling queuelib==1.6.2 # via scrapy -readchar==4.0.5 - # via prefect -regex==2023.3.23 - # via dateparser requests==2.28.2 # via - # apprise # dbt-core - # docker - # gcsfs # google-api-core - # google-cloud-storage - # kubernetes # minimal-snowplow-tracker # requests-file # requests-oauthlib @@ -478,20 +275,9 @@ requests==2.28.2 requests-file==1.5.1 # via tldextract requests-oauthlib==1.3.1 - # via - # apprise - # google-auth-oauthlib - # kubernetes -rfc3986[idna2008]==1.5.0 - # via httpx -rich==13.3.3 - # via prefect + # via google-auth-oauthlib rsa==4.9 # via google-auth -ruamel-yaml==0.17.35 - # via prefect -ruamel-yaml-clib==0.2.8 - # via ruamel-yaml scipy==1.9.3 # via # imagehash @@ -499,13 +285,15 @@ scipy==1.9.3 # statsmodels # ydata-profiling scrapy==2.8.0 - # via -r deps/requirements.in seaborn==0.12.2 # via ydata-profiling service-identity==21.1.0 # via scrapy -sgqlc==16.1 - # via prefect-dbt +setuptools==69.5.1 + # via + # jsonschema + # scrapy + # zope-interface six==1.16.0 # via # automat @@ -513,7 +301,6 @@ six==1.16.0 # google-auth-httplib2 # isodate # jsonschema - # kubernetes # leather # minimal-snowplow-tracker # patsy @@ -521,21 +308,8 @@ six==1.16.0 # python-dateutil # requests-file # service-identity -sniffio==1.3.0 - # via - # anyio - # asgi-lifespan - # httpcore - # httpx - # prefect -sqlalchemy[asyncio]==1.4.47 - # via - # alembic - # prefect sqlparse==0.4.3 # via dbt-core -starlette==0.27.0 - # via prefect statsmodels==0.13.5 # via ydata-profiling stringcase==1.2.0 @@ -546,10 +320,7 @@ text-unidecode==1.3 # via python-slugify tldextract==3.4.0 # via scrapy -toml==0.10.2 - # via prefect toolz==0.12.0 - # via -r deps/requirements.in tqdm==4.64.1 # via ydata-profiling twisted==22.10.0 @@ -557,59 +328,27 @@ twisted==22.10.0 typeguard==2.13.3 # via ydata-profiling typer==0.7.0 - # via - # -r deps/requirements.in - # prefect typing-extensions==4.5.0 # via - # alembic # dbt-core # mashumaro - # polars - # prefect # pydantic - # starlette # twisted -tzdata==2023.2 - # via pytz-deprecation-shim -tzlocal==4.3 - # via dateparser -ujson==5.8.0 - # via prefect uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.15 - # via - # docker - # kubernetes - # requests -uvicorn==0.21.1 - # via prefect -visions[type_image_path]==0.7.5 + # via requests +visions==0.7.5 # via ydata-profiling w3lib==2.1.1 # via # itemloaders # parsel # scrapy -websocket-client==1.5.1 - # via - # docker - # kubernetes -websockets==10.4 - # via prefect werkzeug==2.2.3 # via dbt-core -yarl==1.8.2 - # via aiohttp ydata-profiling==4.1.2 - # via -r deps/requirements.in -zipp==3.15.0 - # via importlib-metadata zope-interface==6.0 # via # scrapy # twisted - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/pipeline/bfro_pipeline.py b/pipeline/bfro_pipeline.py deleted file mode 100644 index de7edac..0000000 --- a/pipeline/bfro_pipeline.py 
+++ /dev/null @@ -1,123 +0,0 @@ -import typer -from prefect import flow, task, get_run_logger -from prefect_dbt.cli.commands import DbtCoreOperation -from update_reports import update_reports -from update_geocoder import pull_and_update_geocoded_reports -from update_weather import update_weather -from pathlib import Path -from typing import Optional - - -@flow(name="Update sources") -def set_up_sources( - data_dir: Path = Path("data"), - weather_limit: int = 900, - test_run: bool = False, - visual_crossing_key: Optional[str] = None, -) -> bool: - reports_updated = update_reports(data_dir=data_dir, test_run=test_run) - geocoder_updated = pull_and_update_geocoded_reports(data_dir=data_dir) - weather_updated = False - if geocoder_updated: - weather_updated = update_weather( - limit=weather_limit, - data_dir=data_dir, - visual_crossing_key=visual_crossing_key, - ) - - return reports_updated and geocoder_updated and weather_updated - - -@task(name="DBT test sources") -def dbt_test_sources(data_dir: Path = Path("data")) -> bool: - logger = get_run_logger() - logger.info("Testing sources.") - DbtCoreOperation( - commands=[ - "dbt test --select source:local_files " - f'--vars \'{{"data_dir":"{data_dir.absolute()}"}}\'' - ], - project_dir="bfro_mini_warehouse", - profiles_dir="bfro_mini_warehouse", - ).run() - logger.info("Testing completed.") - return True - - -@task(name="DBT run") -def dbt_run(data_dir: Path = Path("data")) -> bool: - logger = get_run_logger() - logger.info("Building csv files with DBT.") - DbtCoreOperation( - commands=[ - f'dbt run --vars \'{{"data_dir":"{data_dir.absolute()}"}}\'' - ], - project_dir="bfro_mini_warehouse", - profiles_dir="bfro_mini_warehouse", - ).run() - logger.info("DBT run completed.") - - return True - - -@task(name="DBT test") -def dbt_test(data_dir: Path = Path("data")) -> bool: - logger = get_run_logger() - logger.info("Testing DBT models.") - DbtCoreOperation( - commands=[ - "dbt test --exclude source:* " - # Yikes that double escaping 😬 - f'--vars \'{{"data_dir":"{data_dir.absolute()}"}}\'' - ], - project_dir="bfro_mini_warehouse", - profiles_dir="bfro_mini_warehouse", - ).run() - logger.info("Testing completed.") - return True - - -@flow(name="DBT") -def dbt(data_dir: Path = Path("data")) -> bool: - sources_pass = dbt_test_sources(data_dir) - if not sources_pass: - return False - run_completed = dbt_run(data_dir) - if not run_completed: - return False - tests_pass = dbt_test(data_dir) - return tests_pass - - -@flow(name="BFRO Pipeline") -def main( - test_run: bool = False, - dbt_only: bool = False, - data_dir: Path = Path("data"), - visual_crossing_key: Optional[str] = None, -) -> bool: - logger = get_run_logger() - weather_limit = 900 - if test_run: - logger.info("Test run selected.") - weather_limit = 25 - - sources_updated = False - if not dbt_only: - sources_updated = set_up_sources( - data_dir=data_dir, - weather_limit=weather_limit, - test_run=test_run, - visual_crossing_key=visual_crossing_key, - ) - - if not sources_updated and not dbt_only: - logger.info("Source update incomplete. 
Terminating flow.") - return False - dbt(data_dir=data_dir) - - return True - - -if __name__ == "__main__": - typer.run(main) diff --git a/pipeline/bfro_pipeline_docker.py b/pipeline/bfro_pipeline_docker.py deleted file mode 100644 index ef407fb..0000000 --- a/pipeline/bfro_pipeline_docker.py +++ /dev/null @@ -1,76 +0,0 @@ -import typer -from prefect import flow, get_run_logger -from prefect.blocks.system import Secret -from prefect_gcp import GcpCredentials, GcsBucket -from bfro_pipeline import main as bfro_pipeline -from upload_to_gdrive import upload_to_gdrive -from pathlib import Path - - -@flow(name="BFRO Pipeline (Docker)") -def main(data_dir: Path = Path("data"), test_run: bool = False): - logger = get_run_logger() - - logger.info("Fetching credentials.") - gcp_credentials = GcpCredentials.load("prefect-gcs-rw") - - logger.info("Fetching visual crossing key.") - visual_crossing_block = Secret.load("visual-crossing-key") - visual_crossing_key = visual_crossing_block.get() - - logger.info("Fetching service account location for GDrive upload.") - sa_credentials_location = Secret.load( - "bfro-gdrive-service-account-location" - ).get() - - logger.info("Fetching GDrive folder ID for test.") - bfro_test_gdrive_folder_id = Secret.load( - "bfro-test-gdrive-folder-id" - ).get() - - logger.info("Fetching editor email for test.") - bfro_test_editor_email = Secret.load("bfro-test-editor-email").get() - - logger.info("Fetching GDrive folder ID for prod.") - bfro_prod_gdrive_folder_id = Secret.load( - "bfro-prod-gdrive-folder-id" - ).get() - - logger.info("Fetching GDrive editor email for prod.") - bfro_prod_editor_email = Secret.load("bfro-prod-editor-email").get() - - logger.info(f"Downloading data to {data_dir}.") - bigfoot_bucket = GcsBucket( - bucket="trenner-datasets", - gcp_credentials=gcp_credentials, - ) - bigfoot_bucket.download_folder_to_path("bigfoot", data_dir) - - logger.info("Executing pipeline.") - bfro_success = bfro_pipeline( - test_run=test_run, visual_crossing_key=visual_crossing_key - ) - - logger.info("Uploading to gdrive (test).") - upload_to_gdrive( - data_dir / "processed" / "bfro_reports_geocoded.csv", - sa_credentials_location=sa_credentials_location, - gdrive_folder_id=bfro_test_gdrive_folder_id, - owner_email=bfro_test_editor_email, - ) - - logger.info("Uploading to gdrive (prod).") - upload_to_gdrive( - data_dir / "processed" / "bfro_reports_geocoded.csv", - sa_credentials_location=sa_credentials_location, - gdrive_folder_id=bfro_prod_gdrive_folder_id, - owner_email=bfro_prod_editor_email, - ) - - logger.info(f"Uploading updated contents of {data_dir} to GCS.") - if bfro_success: - bigfoot_bucket.upload_from_folder(data_dir, "bigfoot") - - -if __name__ == "__main__": - typer.run(main) diff --git a/pipeline/main-deployment.yaml b/pipeline/main-deployment.yaml deleted file mode 100644 index 9aab527..0000000 --- a/pipeline/main-deployment.yaml +++ /dev/null @@ -1,65 +0,0 @@ -### -### A complete description of a Prefect Deployment for flow 'BFRO Pipeline (Docker)' -### -name: bfro-pipeline -description: null -version: 42854cfbb777416a51819d5b21e78974 -# The work queue that will handle this deployment's runs -work_queue_name: default -work_pool_name: bfro-agent-pool -tags: [] -parameters: {} -schedule: - interval: 1209600.0 - anchor_date: '2023-04-03T12:54:15.962000+00:00' - timezone: America/Chicago -is_schedule_active: true -infra_overrides: {} - -### -### DO NOT EDIT BELOW THIS LINE -### -flow_name: BFRO Pipeline (Docker) -manifest_path: null -infrastructure: - type: 
process - env: {} - labels: {} - name: null - command: null - stream_output: true - working_dir: null - _block_document_id: c311d1a6-7dd2-4c82-b395-079f915f1ff3 - _block_document_name: bfro-local - _is_anonymous: false - block_type_slug: process - _block_type_slug: process -storage: - bucket_path: trenner-datasets/bfro-pipeline - service_account_info: '**********' - project: null - _block_document_id: 246e2c05-6aa6-42ba-9b1c-608904e06fd2 - _block_document_name: bfro-pipeline-storage - _is_anonymous: false - block_type_slug: gcs - _block_type_slug: gcs -path: '' -entrypoint: bfro_pipeline_docker:main -parameter_openapi_schema: - title: Parameters - type: object - properties: - data_dir: - title: data_dir - default: data - position: 0 - type: string - format: path - test_run: - title: test_run - default: false - position: 1 - type: boolean - required: null - definitions: null -timestamp: '2023-06-13T12:36:35.704834+00:00' diff --git a/pipeline/scripts/combine_raw_reports.py b/pipeline/scripts/combine_raw_reports.py new file mode 100644 index 0000000..7469b90 --- /dev/null +++ b/pipeline/scripts/combine_raw_reports.py @@ -0,0 +1,60 @@ +import typer +from pathlib import Path +from loguru import logger +import duckdb +import polars as pl + + +def main( + reports_orig_file: Path, reports_new_file: Path, reports_out_file: Path +): + logger.info(f"reports_orig_file: {reports_orig_file.name}") + logger.info(f"reports_new_file: {reports_new_file.name}") + logger.info(f"reports_out_file: {reports_out_file.name}") + + if reports_orig_file.exists(): + combined_reports_frame = duckdb.sql( + f""" + WITH all_rows AS ( + SELECT * FROM '{reports_orig_file}' + UNION ALL + SELECT * FROM READ_NDJSON( + '{reports_new_file}', + columns={{ + year: 'VARCHAR', + season: 'VARCHAR', + month: 'VARCHAR', + date: 'VARCHAR', + state: 'VARCHAR', + county: 'VARCHAR', + location_details: 'VARCHAR', + nearest_town: 'VARCHAR', + nearest_road: 'VARCHAR', + observed: 'VARCHAR', + also_noticed: 'VARCHAR', + other_witnesses: 'VARCHAR', + other_stories: 'VARCHAR', + time_and_conditions: 'VARCHAR', + environment: 'VARCHAR', + report_number: 'BIGINT', + report_class: 'VARCHAR', + "a_&_g_references": 'VARCHAR', + pulled_datetime: 'VARCHAR' + }} + ) + ) + SELECT * FROM all_rows + QUALIFY ROW_NUMBER() OVER( + PARTITION BY report_number ORDER BY pulled_datetime DESC + ) = 1 + """ + ).pl() + else: + combined_reports_frame = pl.read_ndjson(reports_new_file) + + combined_reports_frame.write_csv(reports_out_file) + logger.info("Done!") + + +if __name__ == "__main__": + typer.run(main) diff --git a/pipeline/scripts/pull_weather.py b/pipeline/scripts/pull_weather.py index 9b461dc..c941ae9 100644 --- a/pipeline/scripts/pull_weather.py +++ b/pipeline/scripts/pull_weather.py @@ -130,6 +130,9 @@ def merge_new_records_with_weather_cache( def main( weather_cache_file: Path, geocoded_reports_file: Path, limit: int = 900 ): + logger.info(f"weather_cache_file: {weather_cache_file.name}") + logger.info(f"geocoded_reports_file: {geocoded_reports_file.name}") + logger.info(f"limit: {limit}") visual_crossing_key = get_visual_crossing_key_from_env() logger.info(f"Getting missing weather keys from {geocoded_reports_file}.") missing_weather_keys = get_missing_weather_keys( diff --git a/pipeline/upload_to_gdrive.py b/pipeline/scripts/upload_to_gdrive.py similarity index 93% rename from pipeline/upload_to_gdrive.py rename to pipeline/scripts/upload_to_gdrive.py index b4e224f..596ac59 100644 --- a/pipeline/upload_to_gdrive.py +++ 
b/pipeline/scripts/upload_to_gdrive.py @@ -1,5 +1,4 @@ import typer -from prefect import flow, task, get_run_logger from pathlib import Path from typing import Optional, List from googleapiclient import discovery @@ -7,16 +6,15 @@ from google.oauth2 import service_account import os from dotenv import load_dotenv, find_dotenv +from loguru import logger -@task(name="Upload") -def upload_to_gdrive_task( +def upload_to_gdrive( google_drive_service, file: Path, destination_folder_id: str, owner_emails: List[str] = [], ): - logger = get_run_logger() logger.info("Determining if file exists.") file_id: Optional[str] = None file_search_response = ( @@ -66,14 +64,12 @@ def upload_to_gdrive_task( logger.exception("Encountered error with sharing.") -@flow(name="Save to GDrive") -def upload_to_gdrive( +def main( file: Path, sa_credentials_location: Optional[str] = None, gdrive_folder_id: Optional[str] = None, owner_email: Optional[str] = None, ): - logger = get_run_logger() if sa_credentials_location is None: logger.info( "Service account credentials location not passed, " @@ -110,10 +106,11 @@ def upload_to_gdrive( ) logger.info("Performing upload task.") - upload_to_gdrive_task( + upload_to_gdrive( google_drive_service, file, gdrive_folder_id, [owner_email] ) + logger.info("Done!") if __name__ == "__main__": - typer.run(upload_to_gdrive) + typer.run(main) diff --git a/pipeline/update_geocoder.py b/pipeline/update_geocoder.py deleted file mode 100644 index 99f6af5..0000000 --- a/pipeline/update_geocoder.py +++ /dev/null @@ -1,82 +0,0 @@ -from prefect import flow, task, get_run_logger -from prefect_shell import ShellOperation -import typer -from pathlib import Path -import polars as pl -from lxml import etree -from scripts.extract_locations_from_kml import extract_geocoded_reports -from scripts.combine_geocoded_reports import combine_geocoded_reports - - -@task(name="Unzip aspx file") -def download_and_unzip_aspx_file(aspx_file: Path) -> Path: - ShellOperation( - commands=[ - "wget http://www.bfro.net/app/AllReportsKMZ.aspx", - f"mv {aspx_file.name} {aspx_file.parent}", - f"unzip -o {aspx_file} -d {aspx_file.parent}", - ] - ).run() - return aspx_file.parent / "doc.kml" - - -@task(name="Extract geocoded reports") -def extract_geocoded_reports_task(kml_file: Path) -> pl.DataFrame: - logger = get_run_logger() - logger.info(f"Reading and parsing {kml_file.name}") - report_xml = etree.fromstring(kml_file.read_bytes()) - logger.info(f"Extracting geocoded reports from {kml_file.name}") - return extract_geocoded_reports(report_xml) - - -@task(name="Combine geocoded reports") -def combine_geocoded_reports_task( - orig_report_file: Path, new_reports: pl.DataFrame -) -> pl.DataFrame: - logger = get_run_logger() - logger.info(f"Reading original reports from {orig_report_file.name}") - orig_reports = pl.read_csv(orig_report_file) - logger.info("Combining original and new reports.") - combined_reports = combine_geocoded_reports(orig_reports, new_reports) - return combined_reports - - -@task(name="Save combined reports back to raw") -def save_combined_reports( - combined_reports: pl.DataFrame, orig_report_file: Path -): - logger = get_run_logger() - logger.info(f"Saving combined reports back into {orig_report_file.name}") - combined_reports.write_csv(orig_report_file) - - -@flow(name="Update geocoded reports") -def pull_and_update_geocoded_reports( - data_dir: Path = Path("data"), -) -> bool: - logger = get_run_logger() - aspx_file = data_dir / Path("raw/geocoder/AllReportsKMZ.aspx") - logger.info(f"aspx_file: 
{aspx_file}") - - orig_report_file = data_dir / Path("raw/geocoder/geocoded_reports.csv") - logger.info(f"orig_report_file: {orig_report_file}") - - source_report_file = data_dir / Path("sources/geocoded_reports.csv") - logger.info(f"source_report_file: {source_report_file}") - - kml_file = download_and_unzip_aspx_file(aspx_file) - new_geocoded_reports = extract_geocoded_reports_task(kml_file) - if orig_report_file.exists(): - combined_geocoded_reports = combine_geocoded_reports_task( - orig_report_file, new_geocoded_reports - ) - else: - combined_geocoded_reports = new_geocoded_reports - save_combined_reports(combined_geocoded_reports, orig_report_file) - save_combined_reports(combined_geocoded_reports, source_report_file) - # Signals to downstream flows that the source is ready. - return True - - -if __name__ == "__main__": - typer.run(pull_and_update_geocoded_reports) diff --git a/pipeline/update_reports.py b/pipeline/update_reports.py deleted file mode 100644 index ffcc403..0000000 --- a/pipeline/update_reports.py +++ /dev/null @@ -1,118 +0,0 @@ -from prefect import flow, task, get_run_logger -import typer -from pathlib import Path -import polars as pl -import duckdb -from prefect_shell import ShellOperation - - -@task(name="Run scraper") -def run_scraper_task(reports_new_file: Path, test_run: bool = False) -> Path: - logger = get_run_logger() - logger.info("Firing up the scraper.") - scraper_working_dir = "scraper/bfro_scrape" - scraper = ShellOperation( - commands=[ - "scrapy crawl bfro_reports " - f"-a test_run={test_run} " - f"--overwrite-output new_reports.json:jsonlines" - ], - working_dir=scraper_working_dir, - ).trigger() - - scraper.wait_for_completion() - logger.info( - "Scraper completed. " - f"Saved to {scraper_working_dir}/new_reports.json" - ) - - ShellOperation( - commands=[ - f"cp {scraper_working_dir}/new_reports.json {reports_new_file}" - ] - ).run() - return reports_new_file - - -@task(name="Combine reports") -def combine_reports_task( - reports_orig_file: Path, reports_new_file: Path -) -> pl.DataFrame: - logger = get_run_logger() - logger.info( - f"Combining reports in {reports_orig_file} with {reports_new_file}." 
- ) - return duckdb.sql( - f""" - WITH all_rows AS ( - SELECT * FROM '{reports_orig_file}' - UNION ALL - SELECT * FROM READ_NDJSON( - '{reports_new_file}', - columns={{ - year: 'VARCHAR', - season: 'VARCHAR', - month: 'VARCHAR', - date: 'VARCHAR', - state: 'VARCHAR', - county: 'VARCHAR', - location_details: 'VARCHAR', - nearest_town: 'VARCHAR', - nearest_road: 'VARCHAR', - observed: 'VARCHAR', - also_noticed: 'VARCHAR', - other_witnesses: 'VARCHAR', - other_stories: 'VARCHAR', - time_and_conditions: 'VARCHAR', - environment: 'VARCHAR', - report_number: 'BIGINT', - report_class: 'VARCHAR', - "a_&_g_references": 'VARCHAR', - pulled_datetime: 'VARCHAR' - }} - ) - ) - SELECT * FROM all_rows - QUALIFY ROW_NUMBER() OVER( - PARTITION BY report_number ORDER BY pulled_datetime DESC - ) = 1 - """ - ).pl() - - -@flow(name="Update reports") -def update_reports( - data_dir: Path = Path("data"), - test_run: bool = False, -) -> bool: - logger = get_run_logger() - reports_orig_file = data_dir / Path("raw/reports/bfro_reports.csv") - logger.info(f"reports_orig_file: {reports_orig_file}") - - reports_new_file = data_dir / Path("raw/reports/bfro_reports_new.json") - logger.info(f"reports_new_file: {reports_new_file}") - - reports_source_file = data_dir / Path("sources/bfro_reports.csv") - logger.info(f"reports_source_file: {reports_source_file}") - - # Mostly the output is a signal that the task completed, and establishes - # the data dependency between this and the combining reports task. - reports_new_file = run_scraper_task(reports_new_file, test_run=test_run) - - if reports_orig_file.exists(): - logger.info(f"{reports_orig_file} exists, combining with new data.") - combined_reports = combine_reports_task( - reports_orig_file, reports_new_file - ) - else: - combined_reports = pl.read_csv(reports_new_file) - - logger.info(f"Writing combined reports back to {reports_orig_file}.") - combined_reports.write_csv(reports_orig_file) - logger.info(f"Writing combined reports to {reports_source_file}.") - combined_reports.write_csv(reports_source_file) - return True - - -if __name__ == "__main__": - typer.run(update_reports) diff --git a/pipeline/update_weather.py b/pipeline/update_weather.py deleted file mode 100644 index 277e4f3..0000000 --- a/pipeline/update_weather.py +++ /dev/null @@ -1,102 +0,0 @@ -from prefect import flow, task, get_run_logger -import typer -from pathlib import Path -import polars as pl -from scripts.pull_weather import ( - get_missing_weather_keys, - pull_missing_weather, - merge_new_records_with_weather_cache, - get_visual_crossing_key_from_env, -) -from typing import Tuple, Optional - - -@task(name="Get missing weather keys") -def get_missing_weather_keys_task( - geocoded_reports_file: Path, weather_cache_file: Path -) -> pl.DataFrame: - return get_missing_weather_keys(geocoded_reports_file, weather_cache_file) - - -@task(name="Pull missing weather data") -def pull_weather_data( - missing_weather_keys: pl.DataFrame, - visual_crossing_key: str, - limit: int = 900, -) -> Tuple[pl.DataFrame, bool]: - return pull_missing_weather( - missing_weather_keys, - visual_crossing_key, - limit=limit, - logger=get_run_logger(), - ) - - -@task(name="Merge weather data with cache.") -def merge_weather_data( - weather_cache_file: Path, new_weather_data: pl.DataFrame -) -> pl.DataFrame: - return merge_new_records_with_weather_cache( - weather_cache_file, new_weather_data - ) - - -@flow(name="Update weather") -def update_weather( - data_dir: Path = Path("data"), - limit: int = 900, - visual_crossing_key: 
Optional[str] = None, -) -> bool: - logger = get_run_logger() - - if visual_crossing_key is None: - logger.info("Visual crossing key not passed, retrieving from env.") - visual_crossing_key = get_visual_crossing_key_from_env() - - weather_cache_file = data_dir / Path("raw/weather/weather_cache.csv") - logger.info(f"weather_cache_file: {weather_cache_file}") - - geocoded_reports_file = data_dir / Path( - "raw/geocoder/geocoded_reports.csv" - ) - logger.info(f"geocoded_reports_file: {geocoded_reports_file}") - - source_weather_file = data_dir / Path("sources/weather_cache.csv") - logger.info(f"source_weather_file: {source_weather_file}") - - logger.info("Getting missing weather keys.") - missing_weather_keys = get_missing_weather_keys_task( - geocoded_reports_file, weather_cache_file - ) - if missing_weather_keys.is_empty(): - logger.info("Nothing new to pull. All done.") - # Return false because the weather limit was not reached. - # Signals to other flows we can proceed. - return True - - logger.info(f"Total missing weather keys: {missing_weather_keys.shape[0]}") - logger.info(f"Pulling {min(missing_weather_keys.shape[0], limit)}.") - new_weather_data, weather_limit_reached = pull_weather_data( - missing_weather_keys, visual_crossing_key, limit=limit - ) - logger.info(f"Pulled {new_weather_data.shape[0]} new weather records.") - if not weather_cache_file.exists(): - logger.info(f"Weather cache file {weather_cache_file} does not exist.") - logger.info("Saving what was just pulled.") - weather_data_to_write = new_weather_data - else: - logger.info("Combining new weather data with existing weather data.") - weather_data_to_write = merge_weather_data( - weather_cache_file, new_weather_data - ) - logger.info(f"Writing updated cache back to {weather_cache_file}.") - weather_data_to_write.write_csv(weather_cache_file) - logger.info(f"Writing updated cache to {source_weather_file}.") - weather_data_to_write.write_csv(source_weather_file) - logger.info("🌩️👣 All done with weather update 🌩️👣") - # If we reach the weather limit, we cannot proceed downstream. - return not weather_limit_reached - - -if __name__ == "__main__": - typer.run(update_weather) diff --git a/run-pipeline-local.sh b/run-pipeline-local.sh new file mode 100644 index 0000000..321fd92 --- /dev/null +++ b/run-pipeline-local.sh @@ -0,0 +1,62 @@ +# Runs the pipeline end to end for development and ad-hoc runs. +# Not designed for deployments. +set -e +cd pipeline # working dir for the whole script. + +############# PULL NEW REPORTS ############### +echo "Pulling new reports, test_run=$1 ." +cd scraper/bfro_scrape # working dir for the scraper +scrapy crawl bfro_reports \ + -a test_run=$1 \ + --overwrite-output new_reports.json:jsonlines +cd ../.. # should be back in pipeline/ +# Combines new reports with existing reports, as some reports will drop off +# the BFRO website. +# Move from the scraper into the raw data directory. +cp scraper/bfro_scrape/new_reports.json data/raw/bfro_reports_new.json +# Combine the reports. +python scripts/combine_raw_reports.py \ + data/raw/bfro_reports.csv \ + data/raw/bfro_reports_new.json \ + data/raw/bfro_reports_combined.csv +# Set the combined reports as the new reports csv +cp data/raw/bfro_reports_combined.csv data/raw/bfro_reports.csv +# Copy the reports to the DBT source directory. +cp data/raw/bfro_reports.csv data/sources/bfro_reports.csv + +############# PULL KML REPORTS ############### +echo "Pulling kml file and extracting geocoded reports." 
+wget http://www.bfro.net/app/AllReportsKMZ.aspx +mv AllReportsKMZ.aspx data/raw/geocoder/ +unzip -o data/geocoder/AllReportsKMZ.aspx -d data/raw/geocoder/ +# Extract the lat / lon / report id / etc from the kml file. +python scripts/extract_locations_from_kml.py \ + data/raw/geocoder/doc.kml \ + data/raw/geocoded_reports_new.csv +# Combine the newly extracted reports with any existing KML sourced reports, +# in case any are removed from the KML file. +python scripts/combine_geocoded_reports.py \ + data/raw/geocoded_reports.csv \ + data/raw/geocoded_reports_new.csv \ + data/raw/geocoded_reports_combined.csv +# The combined reports are now current. +cp data/raw/geocoded_reports_combined.csv data/raw/geocoded_reports.csv +# Copy to the source file for dbt. +cp data/raw/geocoded_reports.csv data/sources/geocoded_reports.csv + +############# PULL WEATHER ############### +echo "Pulling weather." +python scripts/pull_weather.py \ + data/raw/weather/weather_cache.csv \ + data/raw/geocoder/geocoded_reports.csv +# Copy to the source file for dbt. +cp data/raw/weather_cache.csv data/sources/weather_cache.csv + +############# RUN DBT ############### +echo "Building mini warehouse." +cd bfro_mini_warehouse +dbt build --vars '{"data_dir":"../data"}' +# Hop back into pipeline. +cd .. +# Hop back onto project root, we are done. +cd .. \ No newline at end of file From f69652f876b804b9b362477a254a6a778a8791bf Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 09:15:09 -0500 Subject: [PATCH 05/11] Bump dbt-core and dbt-duckdb. --- deps/dev-requirements.txt | 80 +++++++++++++++++++++------------------ deps/requirements.in | 5 ++- deps/requirements.txt | 80 +++++++++++++++++++++------------------ 3 files changed, 89 insertions(+), 76 deletions(-) diff --git a/deps/dev-requirements.txt b/deps/dev-requirements.txt index 3682e52..c44b356 100644 --- a/deps/dev-requirements.txt +++ b/deps/dev-requirements.txt @@ -6,6 +6,7 @@ attrs==22.2.0 # via # automat # jsonschema + # referencing # service-identity # twisted # visions @@ -13,8 +14,6 @@ automat==22.10.0 # via twisted babel==2.12.1 # via agate -betterproto==1.2.5 - # via dbt-core cachetools==5.3.0 # via google-auth certifi==2022.12.7 @@ -28,6 +27,7 @@ charset-normalizer==3.1.0 click==8.1.3 # via # dbt-core + # dbt-semantic-interfaces # typer colorama==0.4.6 # via dbt-core @@ -46,12 +46,14 @@ cssselect==1.2.0 # scrapy cycler==0.11.0 # via matplotlib -dbt-core==1.4.5 +dbt-core==1.7.13 # via dbt-duckdb -dbt-duckdb==1.4.1 -dbt-extractor==0.4.1 +dbt-duckdb==1.7.4 +dbt-extractor==0.5.1 + # via dbt-core +dbt-semantic-interfaces==0.4.4 # via dbt-core -duckdb==0.7.1 +duckdb==0.10.2 # via dbt-duckdb filelock==3.10.7 # via tldextract @@ -73,23 +75,13 @@ google-auth-httplib2==0.1.0 google-auth-oauthlib==1.0.0 googleapis-common-protos==1.59.0 # via google-api-core -grpclib==0.4.3 - # via betterproto -h2==4.1.0 - # via grpclib h3==3.7.6 -hologram==0.0.15 - # via dbt-core -hpack==4.0.0 - # via h2 htmlmin==0.1.12 # via ydata-profiling httplib2==0.22.0 # via # google-api-python-client # google-auth-httplib2 -hyperframe==6.0.1 - # via h2 hyperlink==21.0.0 # via twisted idna==3.4 @@ -102,6 +94,8 @@ imagehash==4.3.1 # via # visions # ydata-profiling +importlib-metadata==6.11.0 + # via dbt-semantic-interfaces incremental==22.10.0 # via twisted isodate==0.6.1 @@ -114,16 +108,21 @@ itemadapter==0.7.0 # scrapy itemloaders==1.0.6 # via scrapy -jinja2==3.1.2 +jinja2==3.1.3 # via # dbt-core + # dbt-semantic-interfaces # ydata-profiling jmespath==1.0.1 # via itemloaders 
joblib==1.2.0 # via phik -jsonschema==3.2.0 - # via hologram +jsonschema==4.21.1 + # via + # dbt-core + # dbt-semantic-interfaces +jsonschema-specifications==2023.12.1 + # via jsonschema kiwisolver==1.4.4 # via matplotlib leather==0.3.4 @@ -136,10 +135,8 @@ lxml==4.9.2 # parsel # scrapy markupsafe==2.1.2 - # via - # jinja2 - # werkzeug -mashumaro==3.3.1 + # via jinja2 +mashumaro==3.12 # via dbt-core matplotlib==3.6.3 # via @@ -148,10 +145,10 @@ matplotlib==3.6.3 # ydata-profiling minimal-snowplow-tracker==0.0.2 # via dbt-core +more-itertools==10.2.0 + # via dbt-semantic-interfaces msgpack==1.0.5 # via mashumaro -multidict==6.0.4 - # via grpclib multimethod==1.9.1 # via # visions @@ -213,6 +210,7 @@ protego==0.2.1 # via scrapy protobuf==4.22.1 # via + # dbt-core # google-api-core # googleapis-common-protos pyarrow==11.0.0 @@ -228,7 +226,9 @@ pyasn1-modules==0.2.8 pycparser==2.21 # via cffi pydantic==1.10.7 - # via ydata-profiling + # via + # dbt-semantic-interfaces + # ydata-profiling pydispatcher==2.0.7 # via scrapy pygeohash==1.2.0 @@ -238,11 +238,9 @@ pyparsing==3.0.9 # via # httplib2 # matplotlib -pyrsistent==0.19.3 - # via jsonschema python-dateutil==2.8.2 # via - # hologram + # dbt-semantic-interfaces # matplotlib # pandas python-dotenv==1.0.0 @@ -259,9 +257,14 @@ pywavelets==1.4.1 pyyaml==6.0 # via # dbt-core + # dbt-semantic-interfaces # ydata-profiling queuelib==1.6.2 # via scrapy +referencing==0.34.0 + # via + # jsonschema + # jsonschema-specifications requests==2.28.2 # via # dbt-core @@ -275,6 +278,10 @@ requests-file==1.5.1 # via tldextract requests-oauthlib==1.3.1 # via google-auth-oauthlib +rpds-py==0.18.0 + # via + # jsonschema + # referencing rsa==4.9 # via google-auth ruff==0.0.292 @@ -291,7 +298,6 @@ service-identity==21.1.0 # via scrapy setuptools==69.5.1 # via - # jsonschema # scrapy # zope-interface six==1.16.0 @@ -300,7 +306,6 @@ six==1.16.0 # google-auth # google-auth-httplib2 # isodate - # jsonschema # leather # minimal-snowplow-tracker # patsy @@ -308,12 +313,10 @@ six==1.16.0 # python-dateutil # requests-file # service-identity -sqlparse==0.4.3 +sqlparse==0.5.0 # via dbt-core statsmodels==0.13.5 # via ydata-profiling -stringcase==1.2.0 - # via betterproto tangled-up-in-unicode==0.2.0 # via visions text-unidecode==1.3 @@ -331,13 +334,16 @@ typer==0.7.0 typing-extensions==4.5.0 # via # dbt-core + # dbt-semantic-interfaces # mashumaro # pydantic # twisted uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.15 - # via requests + # via + # dbt-core + # requests visions==0.7.5 # via ydata-profiling w3lib==2.1.1 @@ -345,9 +351,9 @@ w3lib==2.1.1 # itemloaders # parsel # scrapy -werkzeug==2.2.3 - # via dbt-core ydata-profiling==4.1.2 +zipp==3.18.1 + # via importlib-metadata zope-interface==6.0 # via # scrapy diff --git a/deps/requirements.in b/deps/requirements.in index c069c1c..601e79c 100644 --- a/deps/requirements.in +++ b/deps/requirements.in @@ -6,10 +6,11 @@ scrapy>=2,<3 toolz loguru ydata-profiling -duckdb +duckdb~=0.10 polars[pyarrow] h3<4.0 -dbt-duckdb +dbt-core~=1.7 +dbt-duckdb~=1.7 google-api-python-client google-auth-httplib2 google-auth-oauthlib \ No newline at end of file diff --git a/deps/requirements.txt b/deps/requirements.txt index 80ebce9..32d173c 100644 --- a/deps/requirements.txt +++ b/deps/requirements.txt @@ -6,6 +6,7 @@ attrs==22.2.0 # via # automat # jsonschema + # referencing # service-identity # twisted # visions @@ -13,8 +14,6 @@ automat==22.10.0 # via twisted babel==2.12.1 # via agate -betterproto==1.2.5 - # via dbt-core 
cachetools==5.3.0 # via google-auth certifi==2022.12.7 @@ -28,6 +27,7 @@ charset-normalizer==3.1.0 click==8.1.3 # via # dbt-core + # dbt-semantic-interfaces # typer colorama==0.4.6 # via dbt-core @@ -46,12 +46,14 @@ cssselect==1.2.0 # scrapy cycler==0.11.0 # via matplotlib -dbt-core==1.4.5 +dbt-core==1.7.13 # via dbt-duckdb -dbt-duckdb==1.4.1 -dbt-extractor==0.4.1 +dbt-duckdb==1.7.4 +dbt-extractor==0.5.1 + # via dbt-core +dbt-semantic-interfaces==0.4.4 # via dbt-core -duckdb==0.7.1 +duckdb==0.10.2 # via dbt-duckdb filelock==3.10.7 # via tldextract @@ -73,23 +75,13 @@ google-auth-httplib2==0.1.0 google-auth-oauthlib==1.0.0 googleapis-common-protos==1.59.0 # via google-api-core -grpclib==0.4.3 - # via betterproto -h2==4.1.0 - # via grpclib h3==3.7.6 -hologram==0.0.15 - # via dbt-core -hpack==4.0.0 - # via h2 htmlmin==0.1.12 # via ydata-profiling httplib2==0.22.0 # via # google-api-python-client # google-auth-httplib2 -hyperframe==6.0.1 - # via h2 hyperlink==21.0.0 # via twisted idna==3.4 @@ -102,6 +94,8 @@ imagehash==4.3.1 # via # visions # ydata-profiling +importlib-metadata==6.11.0 + # via dbt-semantic-interfaces incremental==22.10.0 # via twisted isodate==0.6.1 @@ -114,16 +108,21 @@ itemadapter==0.7.0 # scrapy itemloaders==1.0.6 # via scrapy -jinja2==3.1.2 +jinja2==3.1.3 # via # dbt-core + # dbt-semantic-interfaces # ydata-profiling jmespath==1.0.1 # via itemloaders joblib==1.2.0 # via phik -jsonschema==3.2.0 - # via hologram +jsonschema==4.21.1 + # via + # dbt-core + # dbt-semantic-interfaces +jsonschema-specifications==2023.12.1 + # via jsonschema kiwisolver==1.4.4 # via matplotlib leather==0.3.4 @@ -136,10 +135,8 @@ lxml==4.9.2 # parsel # scrapy markupsafe==2.1.2 - # via - # jinja2 - # werkzeug -mashumaro==3.3.1 + # via jinja2 +mashumaro==3.12 # via dbt-core matplotlib==3.6.3 # via @@ -148,10 +145,10 @@ matplotlib==3.6.3 # ydata-profiling minimal-snowplow-tracker==0.0.2 # via dbt-core +more-itertools==10.2.0 + # via dbt-semantic-interfaces msgpack==1.0.5 # via mashumaro -multidict==6.0.4 - # via grpclib multimethod==1.9.1 # via # visions @@ -213,6 +210,7 @@ protego==0.2.1 # via scrapy protobuf==4.22.1 # via + # dbt-core # google-api-core # googleapis-common-protos pyarrow==11.0.0 @@ -229,7 +227,9 @@ pyasn1-modules==0.2.8 pycparser==2.21 # via cffi pydantic==1.10.7 - # via ydata-profiling + # via + # dbt-semantic-interfaces + # ydata-profiling pydispatcher==2.0.7 # via scrapy pygeohash==1.2.0 @@ -239,11 +239,9 @@ pyparsing==3.0.9 # via # httplib2 # matplotlib -pyrsistent==0.19.3 - # via jsonschema python-dateutil==2.8.2 # via - # hologram + # dbt-semantic-interfaces # matplotlib # pandas python-dotenv==1.0.0 @@ -260,9 +258,14 @@ pywavelets==1.4.1 pyyaml==6.0 # via # dbt-core + # dbt-semantic-interfaces # ydata-profiling queuelib==1.6.2 # via scrapy +referencing==0.34.0 + # via + # jsonschema + # jsonschema-specifications requests==2.28.2 # via # dbt-core @@ -276,6 +279,10 @@ requests-file==1.5.1 # via tldextract requests-oauthlib==1.3.1 # via google-auth-oauthlib +rpds-py==0.18.0 + # via + # jsonschema + # referencing rsa==4.9 # via google-auth scipy==1.9.3 @@ -291,7 +298,6 @@ service-identity==21.1.0 # via scrapy setuptools==69.5.1 # via - # jsonschema # scrapy # zope-interface six==1.16.0 @@ -300,7 +306,6 @@ six==1.16.0 # google-auth # google-auth-httplib2 # isodate - # jsonschema # leather # minimal-snowplow-tracker # patsy @@ -308,12 +313,10 @@ six==1.16.0 # python-dateutil # requests-file # service-identity -sqlparse==0.4.3 +sqlparse==0.5.0 # via dbt-core statsmodels==0.13.5 # via 
ydata-profiling -stringcase==1.2.0 - # via betterproto tangled-up-in-unicode==0.2.0 # via visions text-unidecode==1.3 @@ -331,13 +334,16 @@ typer==0.7.0 typing-extensions==4.5.0 # via # dbt-core + # dbt-semantic-interfaces # mashumaro # pydantic # twisted uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.15 - # via requests + # via + # dbt-core + # requests visions==0.7.5 # via ydata-profiling w3lib==2.1.1 @@ -345,9 +351,9 @@ w3lib==2.1.1 # itemloaders # parsel # scrapy -werkzeug==2.2.3 - # via dbt-core ydata-profiling==4.1.2 +zipp==3.18.1 + # via importlib-metadata zope-interface==6.0 # via # scrapy From 8300d14ebda4da9a4c0e97db085135a069d748cb Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 09:15:32 -0500 Subject: [PATCH 06/11] Got script running end to end on local without docker. --- pipeline/scripts/combine_raw_reports.py | 59 ++++++++++++++++++++++--- run-pipeline-local.sh | 29 ++++++------ 2 files changed, 69 insertions(+), 19 deletions(-) mode change 100644 => 100755 run-pipeline-local.sh diff --git a/pipeline/scripts/combine_raw_reports.py b/pipeline/scripts/combine_raw_reports.py index 7469b90..a59ae6e 100644 --- a/pipeline/scripts/combine_raw_reports.py +++ b/pipeline/scripts/combine_raw_reports.py @@ -12,13 +12,35 @@ def main( logger.info(f"reports_new_file: {reports_new_file.name}") logger.info(f"reports_out_file: {reports_out_file.name}") + columns = [ + "year", + "season", + "month", + "date", + "state", + "county", + "location_details", + "nearest_town", + "nearest_road", + "observed", + "also_noticed", + "other_witnesses", + "other_stories", + "time_and_conditions", + "environment", + "report_number", + "report_class", + '"a_&_g_references"', + "pulled_datetime", + ] + if reports_orig_file.exists(): combined_reports_frame = duckdb.sql( f""" WITH all_rows AS ( - SELECT * FROM '{reports_orig_file}' + SELECT {', '.join(columns)} FROM '{reports_orig_file}' UNION ALL - SELECT * FROM READ_NDJSON( + SELECT {', '.join(columns)} FROM READ_NDJSON( '{reports_new_file}', columns={{ year: 'VARCHAR', @@ -39,18 +61,45 @@ def main( report_number: 'BIGINT', report_class: 'VARCHAR', "a_&_g_references": 'VARCHAR', - pulled_datetime: 'VARCHAR' + pulled_datetime: 'TIMESTAMP' }} ) ) - SELECT * FROM all_rows + SELECT {', '.join(columns)} FROM all_rows QUALIFY ROW_NUMBER() OVER( PARTITION BY report_number ORDER BY pulled_datetime DESC ) = 1 """ ).pl() else: - combined_reports_frame = pl.read_ndjson(reports_new_file) + combined_reports_frame = duckdb.sql( + f""" + SELECT {', '.join(columns)} FROM READ_NDJSON( + '{reports_new_file}', + columns={{ + year: 'VARCHAR', + season: 'VARCHAR', + month: 'VARCHAR', + date: 'VARCHAR', + state: 'VARCHAR', + county: 'VARCHAR', + location_details: 'VARCHAR', + nearest_town: 'VARCHAR', + nearest_road: 'VARCHAR', + observed: 'VARCHAR', + also_noticed: 'VARCHAR', + other_witnesses: 'VARCHAR', + other_stories: 'VARCHAR', + time_and_conditions: 'VARCHAR', + environment: 'VARCHAR', + report_number: 'BIGINT', + report_class: 'VARCHAR', + "a_&_g_references": 'VARCHAR', + pulled_datetime: 'TIMESTAMP' + }} + ) + """ + ).pl() combined_reports_frame.write_csv(reports_out_file) logger.info("Done!") diff --git a/run-pipeline-local.sh b/run-pipeline-local.sh old mode 100644 new mode 100755 index 321fd92..a4bc3ea --- a/run-pipeline-local.sh +++ b/run-pipeline-local.sh @@ -13,36 +13,37 @@ cd ../.. # should be back in pipeline/ # Combines new reports with existing reports, as some reports will drop off # the BFRO website. 
# Move from the scraper into the raw data directory. -cp scraper/bfro_scrape/new_reports.json data/raw/bfro_reports_new.json +cp scraper/bfro_scrape/new_reports.json data/raw/reports/bfro_reports_new.json # Combine the reports. python scripts/combine_raw_reports.py \ - data/raw/bfro_reports.csv \ - data/raw/bfro_reports_new.json \ - data/raw/bfro_reports_combined.csv + data/raw/reports/bfro_reports.csv \ + data/raw/reports/bfro_reports_new.json \ + data/raw/reports/bfro_reports_combined.csv # Set the combined reports as the new reports csv -cp data/raw/bfro_reports_combined.csv data/raw/bfro_reports.csv +cp data/raw/reports/bfro_reports_combined.csv data/raw/reports/bfro_reports.csv # Copy the reports to the DBT source directory. -cp data/raw/bfro_reports.csv data/sources/bfro_reports.csv +cp data/raw/reports/bfro_reports.csv data/sources/bfro_reports.csv ############# PULL KML REPORTS ############### echo "Pulling kml file and extracting geocoded reports." wget http://www.bfro.net/app/AllReportsKMZ.aspx mv AllReportsKMZ.aspx data/raw/geocoder/ -unzip -o data/geocoder/AllReportsKMZ.aspx -d data/raw/geocoder/ +unzip -o data/raw/geocoder/AllReportsKMZ.aspx -d data/raw/geocoder/ # Extract the lat / lon / report id / etc from the kml file. python scripts/extract_locations_from_kml.py \ data/raw/geocoder/doc.kml \ - data/raw/geocoded_reports_new.csv + data/raw/geocoder/geocoded_reports_new.csv # Combine the newly extracted reports with any existing KML sourced reports, # in case any are removed from the KML file. python scripts/combine_geocoded_reports.py \ - data/raw/geocoded_reports.csv \ - data/raw/geocoded_reports_new.csv \ - data/raw/geocoded_reports_combined.csv + data/raw/geocoder/geocoded_reports.csv \ + data/raw/geocoder/geocoded_reports_new.csv \ + data/raw/geocoder/geocoded_reports_combined.csv # The combined reports are now current. -cp data/raw/geocoded_reports_combined.csv data/raw/geocoded_reports.csv +cp data/raw/geocoder/geocoded_reports_combined.csv \ + data/raw/geocoder/geocoded_reports.csv # Copy to the source file for dbt. -cp data/raw/geocoded_reports.csv data/sources/geocoded_reports.csv +cp data/raw/geocoder/geocoded_reports.csv data/sources/geocoded_reports.csv ############# PULL WEATHER ############### echo "Pulling weather." @@ -50,7 +51,7 @@ python scripts/pull_weather.py \ data/raw/weather/weather_cache.csv \ data/raw/geocoder/geocoded_reports.csv # Copy to the source file for dbt. -cp data/raw/weather_cache.csv data/sources/weather_cache.csv +cp data/raw/weather/weather_cache.csv data/sources/weather_cache.csv ############# RUN DBT ############### echo "Building mini warehouse." From 31b96f398a3cee5804fb4df534b1d27dd7d18cab Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 18:51:43 -0500 Subject: [PATCH 07/11] Updated gdrive uploader to have sane env var names. --- pipeline/scripts/upload_to_gdrive.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipeline/scripts/upload_to_gdrive.py b/pipeline/scripts/upload_to_gdrive.py index 596ac59..1f1ec8e 100644 --- a/pipeline/scripts/upload_to_gdrive.py +++ b/pipeline/scripts/upload_to_gdrive.py @@ -76,9 +76,11 @@ def main( "retrieving from env." ) load_dotenv(find_dotenv()) - sa_credentials_location = os.getenv("SA_CREDENTIALS_LOCATION") + sa_credentials_location = os.getenv("GDRIVE_SERVICE_ACCOUNT_LOCATION") if sa_credentials_location is None: - raise ValueError("SA_CREDENTIALS_LOCATION not in .env or env.") + raise ValueError( + "GDRIVE_SERVICE_ACCOUNT_LOCATION not in .env or env." 
+ ) if gdrive_folder_id is None: logger.info("GDrive folder ID not passed, retrieving from env.") From e5b83deb7e4067a7dc2cf832cdef85469539f25f Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 20:04:41 -0500 Subject: [PATCH 08/11] Got docker running. --- .dockerignore | 20 ++++++++++++------ Dockerfile | 16 +++++++++++--- Makefile | 6 +----- pipeline/.prefectignore | 47 ----------------------------------------- 4 files changed, 27 insertions(+), 62 deletions(-) delete mode 100644 pipeline/.prefectignore diff --git a/.dockerignore b/.dockerignore index 1dd62fc..6be784a 100644 --- a/.dockerignore +++ b/.dockerignore @@ -12,11 +12,17 @@ deps/dev-requirements.in deps/dev-requirements.txt notebooks/ scripts/ -pipeline/__pycache__/ -pipeline/scripts/__pycache__/ -pipeline/data/* -pipeline/bfro_mini_warehouse/logs/ -pipeline/bfro_mini_warehouse/target/ +pipeline/data/**/*.csv +pipeline/data/**/*.json +pipeline/data/**/*.kml +pipeline/data/**/*.aspx +pipeline/logs/* +pipeline/bfro_mini_warehouse/logs/* +pipeline/bfro_mini_warehouse/target/* pipeline/bfro_mini_warehouse/README.md -pipeline/scraper/__pycache__/ -pipeline/scraper/bfro_scrape/__pycache__/ \ No newline at end of file +pipeline/bfro_mini_warehouse/.user.yml +**/__pycache__/ +.dvc +.github +.ruff_cache +.vscode diff --git a/Dockerfile b/Dockerfile index f08cccb..0011d8f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,15 @@ -FROM python:3.9-slim-buster +FROM python:3.11-slim-bullseye -COPY deps/requirements.txt . +ENV VIRTUAL_ENV=/usr/local +# Install basics +RUN apt-get update -y && apt-get install -y zip wget apt-transport-https ca-certificates gnupg curl +# Install mc +RUN curl https://dl.min.io/client/mc/release/linux-amd64/mc -o /usr/bin/mc && chmod +x /usr/bin/mc +# Install gcloud +RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg && apt-get update -y && apt-get install google-cloud-sdk -y -RUN pip install -r requirements.txt \ No newline at end of file +# Install the python stuff. +COPY deps/requirements.txt requirements.txt +RUN pip install uv && uv pip install -r requirements.txt + +COPY . . diff --git a/Makefile b/Makefile index 252e05a..155f9ec 100644 --- a/Makefile +++ b/Makefile @@ -25,13 +25,9 @@ format: python -m ruff format . .PHONY: build-docker -## Build docker with local registry tag +## Build docker with local registry tag and push to local registry build-docker: docker build --tag localhost:5000/bfro_pipeline:latest . 
- -.PHONY: push-docker -## Push docker to local registry -push-docker: docker push localhost:5000/bfro_pipeline:latest .PHONY: pull-data diff --git a/pipeline/.prefectignore b/pipeline/.prefectignore deleted file mode 100644 index eb471be..0000000 --- a/pipeline/.prefectignore +++ /dev/null @@ -1,47 +0,0 @@ -# prefect artifacts -.prefectignore - -# python artifacts -__pycache__/ -*.py[cod] -*$py.class -*.egg-info/ -*.egg - -# Type checking artifacts -.mypy_cache/ -.dmypy.json -dmypy.json -.pyre/ - -# IPython -profile_default/ -ipython_config.py -*.ipynb_checkpoints/* - -# Environments -.python-version -.env -.venv -env/ -venv/ - -# MacOS -.DS_Store - -# Dask -dask-worker-space/ - -# Editors -.idea/ -.vscode/ - -# VCS -.git/ -.hg/ - -data/ - -## dbt -bfro_mini_warehouse/logs -bfro_mini_warehouse/target \ No newline at end of file From aed406d2680f03fe219bd36de93167dcd0a7a844 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 20:09:39 -0500 Subject: [PATCH 09/11] Update readme. --- README.md | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 9c6759d..b92cbd1 100644 --- a/README.md +++ b/README.md @@ -33,38 +33,23 @@ For more information on the weather data, see the [Visual Crossing documentation ## Full Pipeline The pipeline (including scraper, weather, and the DBT project), is in the `pipeline/` directory. -Everything assumes relative paths from that directory, rather than the project root (which is just for setup and deployment operations). +However, there's a shell script that will run it all for you. -``` -cd pipeline/ -python bfro_pipeline.py +```sh +# In project root. +./run-pipeline-local.sh False ``` -For a test run (which runs the scraper on a small set of pages, and pulls a small set of weather data), use `--test-run`. +To run a test run, use +```sh +./run-pipeline-local.sh True ``` -python bfro_pipeline.py --test-run -``` - -Once the sources are in place (however you decide to run them), you can run dbt directly from within the `pipeline/bfro_mini_warehouse` directory, or use the script. - -``` -python bfro_pipeline.py --dbt-only -``` - -The pipeline command runs the source tests first, builds the csv files, then runs the rest of the data tests. ## Deployment and Orchestration -The pipelines are [Prefect](https://www.prefect.io/) flows, and they can be deployed but require some setup. -The `pipeline/bfro_pipeline_docker.py` file has the blocks you need (basically `prefect-gcs-rw` as GCP credentials, and `visual-crossing-key` as a Secret). -I assume if you're messing with deployments you probably know how that stuff works. -It's not _super_ hard to self host Prefect, but it's not super easy either. - -Also worth noting - while the thing says `_docker` in the file name and pipeline name, I don't actually have the dockerized version working yet 😬 . - -It will still deploy and run, as is, with a process infrastructure block on an agent within the conda env provided in this repo though. -When I get docker working, you'll be able to launch it with a docker container infrastructure block and no code change to the flow. +There's a Dockerfile and docker make targets (set to push to a local registry). +If you want to run in a container you should wrap the `run-pipeline-local.sh` shell script into one that will pull down / push updated data before the container tears down, or execute with a bind mount or volume mount on the local project root. 
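+
+As a rough sketch of the bind-mount option (the `/bfro` mount target and the `--env-file` secret handling below are assumptions, not something this repo sets up for you):
+
+```sh
+# Hypothetical one-off containerized run; adjust the image tag, paths,
+# and env handling to your setup.
+docker run --rm \
+    --env-file .env \
+    -v "$(pwd)":/bfro \
+    -w /bfro \
+    localhost:5000/bfro_pipeline:latest \
+    bash run-pipeline-local.sh False
+```
+
+Because the project root is bind-mounted, the files written under `pipeline/data/` land on the host, so nothing has to be copied out before the container tears down.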
## Data Dictionary From f2cc6e40d45bb76594e73ace9bc9294325104b35 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 20:13:01 -0500 Subject: [PATCH 10/11] Bump python version in actions and update make target for linting. --- .github/workflows/test-and-lint.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-and-lint.yml b/.github/workflows/test-and-lint.yml index 124a162..460ca17 100644 --- a/.github/workflows/test-and-lint.yml +++ b/.github/workflows/test-and-lint.yml @@ -13,12 +13,12 @@ jobs: - name: Set up python uses: actions/setup-python@v2 with: - python-version: "3.9" + python-version: "3.11" - name: Install dependencies run: pip install -r deps/dev-requirements.txt - name: Run checks - run: make check + run: make lint # uhhhh no tests for now I guess \ No newline at end of file From 158445eaf79b8f4101cee0e6401e920af5368c52 Mon Sep 17 00:00:00 2001 From: Tim Renner Date: Tue, 23 Apr 2024 20:16:23 -0500 Subject: [PATCH 11/11] Get everything linted or ignored (looking at you scrapy settings.py) --- .../scraper/bfro_scrape/spiders/bfro_reports.py | 15 ++++++++------- pipeline/scripts/combine_raw_reports.py | 1 - ruff.toml | 5 ++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/pipeline/scraper/bfro_scrape/spiders/bfro_reports.py b/pipeline/scraper/bfro_scrape/spiders/bfro_reports.py index 6b7aed6..6894ee2 100644 --- a/pipeline/scraper/bfro_scrape/spiders/bfro_reports.py +++ b/pipeline/scraper/bfro_scrape/spiders/bfro_reports.py @@ -45,10 +45,10 @@ def parse_report(self, response): raw_keys = response.xpath("//p/span[@class='field']/text()").extract() keys = [k.replace(":", "").replace(" ", "_") for k in raw_keys] - # Now if scrapy had XPath 2.0 this would be pretty simple. Unfortunately - # it doesn't so we need to use a mixture of Python and XPath. This - # XPath query grabs all "p" elements containing a 'span.field' with - # text matching the key. + # Now if scrapy had XPath 2.0 this would be pretty simple. + # Unfortunately it doesn't so we need to use a mixture of Python and + # XPath. This XPath query grabs all "p" elements containing a + # 'span.field' with text matching the key. value_query = ( "//p[span[@class = 'field' and contains(text(), '{}')]]/text()" ) @@ -83,9 +83,10 @@ def parse_report(self, response): for k in empty_keys: data[k] = response.xpath( - "//p[span[@class='field' and contains(text(), '{}')]]/a/text()".format( - k - ) + ( + "//p[span[@class='field' and " + "contains(text(), '{}')]]/a/text()" + ).format(k) ).extract_first() # If everything is None, we don't want to write the values out, diff --git a/pipeline/scripts/combine_raw_reports.py b/pipeline/scripts/combine_raw_reports.py index a59ae6e..ffc2bf9 100644 --- a/pipeline/scripts/combine_raw_reports.py +++ b/pipeline/scripts/combine_raw_reports.py @@ -2,7 +2,6 @@ from pathlib import Path from loguru import logger import duckdb -import polars as pl def main( diff --git a/ruff.toml b/ruff.toml index 723e27e..10b94ee 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1 +1,4 @@ -line-length = 79 \ No newline at end of file +line-length = 79 +exclude=[ + "pipeline/scraper/bfro_scrape/settings.py" +] \ No newline at end of file