Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create simple SQL view assets #2445

Merged
merged 13 commits into from
Apr 6, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
287 changes: 287 additions & 0 deletions devtools/output-table-conversion-test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f786051b-2aa0-44e0-bfd7-fe6827b6e1a3",
"metadata": {},
"source": [
"# Purpose\n",
"We are in the process of converting some functions in `pudl.output` into SQL views. This notebook lets us check that the output of an old Python function matches the output of the SQL view that replaces it."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "ec08c060-ba49-4466-81a0-315a45993928",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Fail fast with an actionable message if DAGSTER_HOME is unset or empty.\n",
"# The message is built from adjacent string literals, so every fragment\n",
"# boundary needs its own explicit space.\n",
"assert os.environ.get(\"DAGSTER_HOME\"), (\n",
"    \"The DAGSTER_HOME env var is not set so dagster won't be able to find the assets. \"\n",
"    \"Set the DAGSTER_HOME env var in this notebook or kill the jupyter server and set\"\n",
"    \" the DAGSTER_HOME env var in your terminal and relaunch jupyter.\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5ce7f88e-c7b9-4963-a0e4-72ccbcb1f70e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"pudl_settings is being deprecated in favor of environment variables variables PUDL_OUTPUT and PUDL_INPUT. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n",
"pudl_settings is being deprecated in favor of environment variables PUDL_OUTPUT and PUDL_INPUT. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n",
"sqlite and parquet directories are no longer being used. Make sure there is a single directory named 'output' at the root of your workspace. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n",
"pudl_settings is being deprecated in favor of environment variables variables PUDL_OUTPUT and PUDL_INPUT. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n",
"pudl_settings is being deprecated in favor of environment variables PUDL_OUTPUT and PUDL_INPUT. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n",
"sqlite and parquet directories are no longer being used. Make sure there is a single directory named 'output' at the root of your workspace. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n"
]
}
],
"source": [
"from pudl.workspace.setup import get_defaults\n",
"import sqlalchemy as sa\n",
"import pandas as pd\n",
"\n",
"# TODO: This should be replaced with get_defaults()\n",
"# Build the database path with os.path.join (os is imported in the first\n",
"# cell) so it is correct whether or not PUDL_OUTPUT ends in a path separator.\n",
"# Plain string concatenation silently pointed at a different file when the\n",
"# trailing separator was missing.\n",
"pudl_db_path = os.path.join(os.environ[\"PUDL_OUTPUT\"], \"pudl.sqlite\")\n",
"engine = sa.create_engine(f\"sqlite:///{pudl_db_path}\")"
]
},
{
"cell_type": "markdown",
"id": "d5f6b021-26cf-4804-929f-0e4975751bd1",
"metadata": {},
"source": [
"## Compare output of old python function with SQL view"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5c705a39-0517-4c7b-a1b0-ca8a12daec53",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 34.1 ms, sys: 3.71 ms, total: 37.8 ms\n",
"Wall time: 53.8 ms\n"
]
}
],
"source": [
"%%time\n",
"view_name = \"denorm_plants_utils_ferc1\"\n",
"\n",
"# Read the whole view into a DataFrame over a short-lived connection.\n",
"with engine.connect() as connection:\n",
"    view_df = pd.read_sql_table(table_name=view_name, con=connection)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "633999af-d881-424b-af8c-64fc837982cd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>utility_id_ferc1</th>\n",
" <th>plant_name_ferc1</th>\n",
" <th>plant_id_pudl</th>\n",
" <th>utility_name_ferc1</th>\n",
" <th>utility_id_pudl</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>206</td>\n",
" <td>*dolet hills (3)</td>\n",
" <td>1</td>\n",
" <td>Southwestern Electric Power Company</td>\n",
" <td>301</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>206</td>\n",
" <td>*flint creek (1)</td>\n",
" <td>2</td>\n",
" <td>Southwestern Electric Power Company</td>\n",
" <td>301</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>206</td>\n",
" <td>*pirkey (2)</td>\n",
" <td>3</td>\n",
" <td>Southwestern Electric Power Company</td>\n",
" <td>301</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>159</td>\n",
" <td>59th st gt-1</td>\n",
" <td>4</td>\n",
" <td>Consolidated Edison Company of New York, Inc.</td>\n",
" <td>79</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>159</td>\n",
" <td>74th st gt 1&amp;2</td>\n",
" <td>5</td>\n",
" <td>Consolidated Edison Company of New York, Inc.</td>\n",
" <td>79</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" utility_id_ferc1 plant_name_ferc1 plant_id_pudl utility_name_ferc1 utility_id_pudl\n",
"0 206 *dolet hills (3) 1 Southwestern Electric Power Company 301\n",
"1 206 *flint creek (1) 2 Southwestern Electric Power Company 301\n",
"2 206 *pirkey (2) 3 Southwestern Electric Power Company 301\n",
"3 159 59th st gt-1 4 Consolidated Edison Company of New York, Inc. 79\n",
"4 159 74th st gt 1&2 5 Consolidated Edison Company of New York, Inc. 79"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"view_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "af6253bb-e135-4068-8088-4e08c9914054",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-03-23 13:35:06 [ WARNING] catalystcoop.pudl.output.ferc1:56 pudl.output.ferc1.plants_utils_ferc1() will be deprecated in a future version of PUDL. In the future, call the PudlTabl.pu_ferc1() method or pull the plants_utils_ferc1 tabledirectly from the pudl.sqlite database.\n"
]
}
],
"source": [
"# Import the old python functions\n",
"from pudl.output.ferc1 import plants_utils_ferc1\n",
"\n",
"old_output_func = plants_utils_ferc1\n",
"old_df = old_output_func(engine)\n",
"\n",
"# Sort both frames on every column and reset the index so that the\n",
"# comparison below does not depend on row order.\n",
"sort_cols = old_df.columns.tolist()\n",
"old_df, view_df = (\n",
"    frame.sort_values(by=sort_cols).reset_index(drop=True)\n",
"    for frame in (old_df, view_df)\n",
")\n",
"\n",
"# Raises AssertionError if the two outputs differ.\n",
"pd.testing.assert_frame_equal(old_df, view_df)"
]
},
{
"cell_type": "markdown",
"id": "955efc1e-b21f-473e-b3f2-ab2fad89591d",
"metadata": {},
"source": [
"## Make sure we can load the view using the SQLite IO Manager"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a9d81a2b-9c68-443b-9a56-5563f4cbc920",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 5 entries, 0 to 4\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 utility_id_ferc1 5 non-null Int64 \n",
" 1 plant_name_ferc1 5 non-null string\n",
" 2 plant_id_pudl 5 non-null Int64 \n",
" 3 utility_name_ferc1 5 non-null string\n",
" 4 utility_id_pudl 5 non-null Int64 \n",
"dtypes: Int64(3), string(2)\n",
"memory usage: 343.0 bytes\n"
]
}
],
"source": [
"from dagster import AssetKey\n",
"\n",
"from pudl.etl import defs\n",
"\n",
"# Ask dagster to load the asset named after the view, then summarize the\n",
"# dtypes of its first rows.\n",
"view_asset_key = AssetKey(view_name)\n",
"asset_df = defs.load_asset_value(view_asset_key)\n",
"\n",
"asset_df.head().info()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 2 additions & 0 deletions src/pudl/etl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
Definitions,
define_asset_job,
load_assets_from_modules,
load_assets_from_package_module,
)

import pudl
Expand Down Expand Up @@ -45,6 +46,7 @@
*load_assets_from_modules([pudl.transform.ferc714], group_name="clean_ferc714"),
*load_assets_from_modules([glue_assets], group_name="glue"),
*load_assets_from_modules([static_assets], group_name="static"),
*load_assets_from_package_module(pudl.output, group_name="outputs"),
bendnorman marked this conversation as resolved.
Show resolved Hide resolved
)

default_resources = {
Expand Down
41 changes: 0 additions & 41 deletions src/pudl/etl/denormalized_assets.py

This file was deleted.

Loading