diff --git a/examples/materialization/using_types/notebook.ipynb b/examples/materialization/using_types/notebook.ipynb new file mode 100644 index 000000000..e46ba15fd --- /dev/null +++ b/examples/materialization/using_types/notebook.ipynb @@ -0,0 +1,175 @@ +{ + "cells": [ + { + "cell_type": "code", + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-06-25T00:00:13.662458Z", + "start_time": "2024-06-25T00:00:06.982077Z" + } + }, + "source": "%load_ext hamilton.plugins.jupyter_magic\n", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/stefankrawczyk/.pyenv/versions/knowledge_retrieval-py39/lib/python3.9/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n" + ] + } + ], + "execution_count": 1 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-06-25T00:00:25.003646Z", + "start_time": "2024-06-25T00:00:24.322577Z" + } + }, + "cell_type": "code", + "source": [ + "%%cell_to_module simple_etl --display\n", + "import pandas as pd\n", + "from sklearn import datasets\n", + "\n", + "from hamilton.htypes import DataLoaderMetadata, DataSaverMetadata\n", + "\n", + "\n", + "def raw_data() -> tuple[pd.DataFrame, DataLoaderMetadata]:\n", + " data = datasets.load_digits()\n", + " df = pd.DataFrame(data.data, columns=[f\"feature_{i}\" for i in range(data.data.shape[1])])\n", + " return df, DataLoaderMetadata.from_dataframe(df)\n", + "\n", + "\n", + "def transformed_data(raw_data: pd.DataFrame) -> pd.DataFrame:\n", + " return raw_data\n", + "\n", + "\n", + "def saved_data(transformed_data: pd.DataFrame, filepath: str) -> DataSaverMetadata:\n", + " transformed_data.to_csv(filepath)\n", + " return DataSaverMetadata.from_file_and_dataframe(filepath, transformed_data)\n" + ], + "id": "efd6c1b2417bb9cf", + "outputs": [ + { + "data": { + "image/svg+xml": "\n\n\n\n\n\n\n\ncluster__legend\n\nLegend\n\n\n\nraw_data\n\nraw_data\nDataFrame\n\n\n\ntransformed_data\n\ntransformed_data\nDataFrame\n\n\n\nraw_data->transformed_data\n\n\n\n\n\nsaved_data\n\n\nsaved_data\nsaved_data\n\n\n\ntransformed_data->saved_data\n\n\n\n\n\n_saved_data_inputs\n\nfilepath\nstr\n\n\n\n_saved_data_inputs->saved_data\n\n\n\n\n\ninput\n\ninput\n\n\n\nfunction\n\nfunction\n\n\n\nmaterializer\n\n\nmaterializer\n\n\n\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 2 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-06-25T00:00:37.889540Z", + "start_time": "2024-06-25T00:00:35.994131Z" + } + }, + "cell_type": "code", + "source": [ + "from hamilton_sdk import adapters\n", + "\n", + "from hamilton import driver\n", + "\n", + "tracker = adapters.HamiltonTracker(\n", + " project_id=7, # modify this as needed\n", + " username=\"elijah@dagworks.io\",\n", + " dag_name=\"my_version_of_the_dag\",\n", + " tags={\"environment\": \"DEV\", \"team\": \"MY_TEAM\", \"version\": \"X\"},\n", + ")\n", + "dr = driver.Builder().with_config({}).with_modules(simple_etl).with_adapters(tracker).build()\n", + "dr.display_all_functions()" + ], + "id": "e9252f2a09228330", + "outputs": [ + { + "data": { + "image/svg+xml": "\n\n\n\n\n\n\n\ncluster__legend\n\nLegend\n\n\n\nraw_data\n\nraw_data\nDataFrame\n\n\n\ntransformed_data\n\ntransformed_data\nDataFrame\n\n\n\nraw_data->transformed_data\n\n\n\n\n\nsaved_data\n\n\nsaved_data\nsaved_data\n\n\n\ntransformed_data->saved_data\n\n\n\n\n\n_saved_data_inputs\n\nfilepath\nstr\n\n\n\n_saved_data_inputs->saved_data\n\n\n\n\n\ninput\n\ninput\n\n\n\nfunction\n\nfunction\n\n\n\nmaterializer\n\n\nmaterializer\n\n\n\n", + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 3 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-06-25T00:00:53.746596Z", + "start_time": "2024-06-25T00:00:52.320439Z" + } + }, + "cell_type": "code", + "source": "dr.execute([\"saved_data\"], inputs={\"filepath\": \"data.csv\"})", + "id": "86c0d0f7da9a472b", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Capturing execution run. Results can be found at http://localhost:8241/dashboard/project/7/runs/25\n", + "\n", + "\n", + "Captured execution run. Results can be found at http://localhost:8241/dashboard/project/7/runs/25\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "{'saved_data': }" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 4 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "e108601ca3a88aab" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/materialization/using_types/simple_etl.png b/examples/materialization/using_types/simple_etl.png index 3c84b2f9f..e83cef032 100644 Binary files a/examples/materialization/using_types/simple_etl.png and b/examples/materialization/using_types/simple_etl.png differ diff --git a/examples/materialization/using_types/def simple_etl.py b/examples/materialization/using_types/simple_etl.py similarity index 100% rename from examples/materialization/using_types/def simple_etl.py rename to examples/materialization/using_types/simple_etl.py