diff --git a/examples/polars/notebook.ipynb b/examples/polars/notebook.ipynb index c8cad7e44..c81678590 100644 --- a/examples/polars/notebook.ipynb +++ b/examples/polars/notebook.ipynb @@ -38,8 +38,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/stefankrawczyk/.pyenv/versions/knowledge_retrieval-py39/lib/python3.9/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", - " warnings.warn(\n" + "/Users/jernejfrank/miniconda3/envs/hamilton/lib/python3.10/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n", + "/Users/jernejfrank/miniconda3/envs/hamilton/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], @@ -70,177 +72,177 @@ "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n", "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n", " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n", - "<!-- Generated by graphviz version 12.0.0 (20240704.0754)\n", + "<!-- Generated by graphviz version 12.1.2 (20240928.0832)\n", " -->\n", "<!-- Pages: 1 -->\n", - "<svg width=\"985pt\" height=\"413pt\"\n", - " viewBox=\"0.00 0.00 985.10 412.80\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n", - "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 408.8)\">\n", - "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-408.8 981.1,-408.8 981.1,4 -4,4\"/>\n", + "<svg width=\"985pt\" height=\"435pt\"\n", + " viewBox=\"0.00 0.00 985.10 434.80\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n", + "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 430.8)\">\n", + "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-430.8 981.1,-430.8 981.1,4 -4,4\"/>\n", "<g id=\"clust1\" class=\"cluster\">\n", "<title>cluster__legend</title>\n", - "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"33.88,-266.8 33.88,-396.8 118.72,-396.8 118.72,-266.8 33.88,-266.8\"/>\n", - "<text text-anchor=\"middle\" x=\"76.3\" y=\"-379.5\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">Legend</text>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"33.88,-288.8 33.88,-418.8 118.72,-418.8 118.72,-288.8 33.88,-288.8\"/>\n", + "<text text-anchor=\"middle\" x=\"76.3\" y=\"-401.5\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">Legend</text>\n", "</g>\n", - "<!-- spend_per_signup -->\n", + "<!-- spend_zero_mean -->\n", "<g id=\"node1\" class=\"node\">\n", + "<title>spend_zero_mean</title>\n", + "<path 
fill=\"#b4d8e4\" stroke=\"black\" d=\"M701.5,-227.6C701.5,-227.6 585.4,-227.6 585.4,-227.6 579.4,-227.6 573.4,-221.6 573.4,-215.6 573.4,-215.6 573.4,-176 573.4,-176 573.4,-170 579.4,-164 585.4,-164 585.4,-164 701.5,-164 701.5,-164 707.5,-164 713.5,-170 713.5,-176 713.5,-176 713.5,-215.6 713.5,-215.6 713.5,-221.6 707.5,-227.6 701.5,-227.6\"/>\n", + "<text text-anchor=\"start\" x=\"584.2\" y=\"-204.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">spend_zero_mean</text>\n", + "<text text-anchor=\"start\" x=\"624.33\" y=\"-176.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- spend_zero_mean_unit_variance -->\n", + "<g id=\"node4\" class=\"node\">\n", + "<title>spend_zero_mean_unit_variance</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M965.1,-145.6C965.1,-145.6 754.5,-145.6 754.5,-145.6 748.5,-145.6 742.5,-139.6 742.5,-133.6 742.5,-133.6 742.5,-94 742.5,-94 742.5,-88 748.5,-82 754.5,-82 754.5,-82 965.1,-82 965.1,-82 971.1,-82 977.1,-88 977.1,-94 977.1,-94 977.1,-133.6 977.1,-133.6 977.1,-139.6 971.1,-145.6 965.1,-145.6\"/>\n", + "<text text-anchor=\"start\" x=\"753.3\" y=\"-122.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">spend_zero_mean_unit_variance</text>\n", + "<text text-anchor=\"start\" x=\"840.67\" y=\"-94.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- spend_zero_mean->spend_zero_mean_unit_variance -->\n", + "<g id=\"edge6\" class=\"edge\">\n", + "<title>spend_zero_mean->spend_zero_mean_unit_variance</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M713.75,-169.29C729.75,-163.17 747.05,-156.55 763.93,-150.09\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"764.87,-153.48 772.96,-146.64 762.37,-146.94 764.87,-153.48\"/>\n", + "</g>\n", + "<!-- spend_mean -->\n", + "<g id=\"node2\" class=\"node\">\n", + 
"<title>spend_mean</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M514.02,-227.6C514.02,-227.6 433.18,-227.6 433.18,-227.6 427.18,-227.6 421.18,-221.6 421.18,-215.6 421.18,-215.6 421.18,-176 421.18,-176 421.18,-170 427.18,-164 433.18,-164 433.18,-164 514.02,-164 514.02,-164 520.02,-164 526.02,-170 526.02,-176 526.02,-176 526.02,-215.6 526.02,-215.6 526.02,-221.6 520.02,-227.6 514.02,-227.6\"/>\n", + "<text text-anchor=\"start\" x=\"431.98\" y=\"-204.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">spend_mean</text>\n", + "<text text-anchor=\"start\" x=\"460.85\" y=\"-176.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">float</text>\n", + "</g>\n", + "<!-- spend_mean->spend_zero_mean -->\n", + "<g id=\"edge2\" class=\"edge\">\n", + "<title>spend_mean->spend_zero_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M526.33,-195.8C537.5,-195.8 549.56,-195.8 561.55,-195.8\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"561.46,-199.3 571.46,-195.8 561.46,-192.3 561.46,-199.3\"/>\n", + "</g>\n", + "<!-- spend_per_signup -->\n", + "<g id=\"node3\" class=\"node\">\n", "<title>spend_per_signup</title>\n", "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M532.4,-347.6C532.4,-347.6 414.8,-347.6 414.8,-347.6 408.8,-347.6 402.8,-341.6 402.8,-335.6 402.8,-335.6 402.8,-296 402.8,-296 402.8,-290 408.8,-284 414.8,-284 414.8,-284 532.4,-284 532.4,-284 538.4,-284 544.4,-290 544.4,-296 544.4,-296 544.4,-335.6 544.4,-335.6 544.4,-341.6 538.4,-347.6 532.4,-347.6\"/>\n", "<text text-anchor=\"start\" x=\"413.6\" y=\"-324.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">spend_per_signup</text>\n", "<text text-anchor=\"start\" x=\"454.48\" y=\"-296.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", "</g>\n", "<!-- avg_3wk_spend -->\n", - "<g id=\"node2\" class=\"node\">\n", + "<g id=\"node5\" class=\"node\">\n", 
"<title>avg_3wk_spend</title>\n", - "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M524.52,-265.6C524.52,-265.6 422.68,-265.6 422.68,-265.6 416.68,-265.6 410.68,-259.6 410.68,-253.6 410.68,-253.6 410.68,-214 410.68,-214 410.68,-208 416.68,-202 422.68,-202 422.68,-202 524.52,-202 524.52,-202 530.52,-202 536.52,-208 536.52,-214 536.52,-214 536.52,-253.6 536.52,-253.6 536.52,-259.6 530.52,-265.6 524.52,-265.6\"/>\n", - "<text text-anchor=\"start\" x=\"421.48\" y=\"-242.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">avg_3wk_spend</text>\n", - "<text text-anchor=\"start\" x=\"454.48\" y=\"-214.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M524.52,-145.6C524.52,-145.6 422.68,-145.6 422.68,-145.6 416.68,-145.6 410.68,-139.6 410.68,-133.6 410.68,-133.6 410.68,-94 410.68,-94 410.68,-88 416.68,-82 422.68,-82 422.68,-82 524.52,-82 524.52,-82 530.52,-82 536.52,-88 536.52,-94 536.52,-94 536.52,-133.6 536.52,-133.6 536.52,-139.6 530.52,-145.6 524.52,-145.6\"/>\n", + "<text text-anchor=\"start\" x=\"421.48\" y=\"-122.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">avg_3wk_spend</text>\n", + "<text text-anchor=\"start\" x=\"454.48\" y=\"-94.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", "</g>\n", - "<!-- spend -->\n", - "<g id=\"node3\" class=\"node\">\n", - "<title>spend</title>\n", - "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M355.8,-224.6C355.8,-224.6 318.45,-224.6 318.45,-224.6 312.45,-224.6 306.45,-218.6 306.45,-212.6 306.45,-212.6 306.45,-173 306.45,-173 306.45,-167 312.45,-161 318.45,-161 318.45,-161 355.8,-161 355.8,-161 361.8,-161 367.8,-167 367.8,-173 367.8,-173 367.8,-212.6 367.8,-212.6 367.8,-218.6 361.8,-224.6 355.8,-224.6\"/>\n", - "<text text-anchor=\"start\" x=\"317.25\" y=\"-201.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" 
font-size=\"14.00\">spend</text>\n", - "<text text-anchor=\"start\" x=\"318\" y=\"-173.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "<!-- base_df -->\n", + "<g id=\"node6\" class=\"node\">\n", + "<title>base_df</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M259.45,-287.6C259.45,-287.6 193.6,-287.6 193.6,-287.6 187.6,-287.6 181.6,-281.6 181.6,-275.6 181.6,-275.6 181.6,-236 181.6,-236 181.6,-230 187.6,-224 193.6,-224 193.6,-224 259.45,-224 259.45,-224 265.45,-224 271.45,-230 271.45,-236 271.45,-236 271.45,-275.6 271.45,-275.6 271.45,-281.6 265.45,-287.6 259.45,-287.6\"/>\n", + "<text text-anchor=\"start\" x=\"201.02\" y=\"-264.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">base_df</text>\n", + "<text text-anchor=\"start\" x=\"192.4\" y=\"-236.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">DataFrame</text>\n", "</g>\n", - "<!-- spend->spend_per_signup -->\n", - "<g id=\"edge1\" class=\"edge\">\n", - "<title>spend->spend_per_signup</title>\n", - "<path fill=\"none\" stroke=\"black\" d=\"M359.03,-224.73C370.66,-240.96 386.13,-260.26 402.8,-274.8 403.56,-275.47 404.34,-276.13 405.13,-276.78\"/>\n", - "<polygon fill=\"black\" stroke=\"black\" points=\"402.69,-279.32 412.74,-282.68 406.98,-273.79 402.69,-279.32\"/>\n", + "<!-- signups -->\n", + "<g id=\"node8\" class=\"node\">\n", + "<title>signups</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M361.8,-337.6C361.8,-337.6 312.45,-337.6 312.45,-337.6 306.45,-337.6 300.45,-331.6 300.45,-325.6 300.45,-325.6 300.45,-286 300.45,-286 300.45,-280 306.45,-274 312.45,-274 312.45,-274 361.8,-274 361.8,-274 367.8,-274 373.8,-280 373.8,-286 373.8,-286 373.8,-325.6 373.8,-325.6 373.8,-331.6 367.8,-337.6 361.8,-337.6\"/>\n", + "<text text-anchor=\"start\" x=\"311.25\" y=\"-314.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">signups</text>\n", + "<text 
text-anchor=\"start\" x=\"318\" y=\"-286.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", "</g>\n", - "<!-- spend->avg_3wk_spend -->\n", - "<g id=\"edge3\" class=\"edge\">\n", - "<title>spend->avg_3wk_spend</title>\n", - "<path fill=\"none\" stroke=\"black\" d=\"M367.97,-201.9C377.47,-204.8 388.43,-208.14 399.6,-211.54\"/>\n", - "<polygon fill=\"black\" stroke=\"black\" points=\"398.3,-214.81 408.89,-214.38 400.35,-208.11 398.3,-214.81\"/>\n", + "<!-- base_df->signups -->\n", + "<g id=\"edge11\" class=\"edge\">\n", + "<title>base_df->signups</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M271.7,-276.14C277.57,-278.84 283.61,-281.62 289.52,-284.34\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"288.04,-287.51 298.59,-288.52 290.97,-281.16 288.04,-287.51\"/>\n", "</g>\n", - "<!-- spend_mean -->\n", - "<g id=\"node4\" class=\"node\">\n", - "<title>spend_mean</title>\n", - "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M514.02,-183.6C514.02,-183.6 433.18,-183.6 433.18,-183.6 427.18,-183.6 421.18,-177.6 421.18,-171.6 421.18,-171.6 421.18,-132 421.18,-132 421.18,-126 427.18,-120 433.18,-120 433.18,-120 514.02,-120 514.02,-120 520.02,-120 526.02,-126 526.02,-132 526.02,-132 526.02,-171.6 526.02,-171.6 526.02,-177.6 520.02,-183.6 514.02,-183.6\"/>\n", - "<text text-anchor=\"start\" x=\"431.98\" y=\"-160.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">spend_mean</text>\n", - "<text text-anchor=\"start\" x=\"460.85\" y=\"-132.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">float</text>\n", + "<!-- spend -->\n", + "<g id=\"node9\" class=\"node\">\n", + "<title>spend</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M355.8,-246.6C355.8,-246.6 318.45,-246.6 318.45,-246.6 312.45,-246.6 306.45,-240.6 306.45,-234.6 306.45,-234.6 306.45,-195 306.45,-195 306.45,-189 312.45,-183 318.45,-183 318.45,-183 355.8,-183 355.8,-183 361.8,-183 
367.8,-189 367.8,-195 367.8,-195 367.8,-234.6 367.8,-234.6 367.8,-240.6 361.8,-246.6 355.8,-246.6\"/>\n", + "<text text-anchor=\"start\" x=\"317.25\" y=\"-223.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">spend</text>\n", + "<text text-anchor=\"start\" x=\"318\" y=\"-195.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", "</g>\n", - "<!-- spend->spend_mean -->\n", - "<g id=\"edge5\" class=\"edge\">\n", - "<title>spend->spend_mean</title>\n", - "<path fill=\"none\" stroke=\"black\" d=\"M367.97,-183.7C380.48,-179.89 395.53,-175.3 410.27,-170.8\"/>\n", - "<polygon fill=\"black\" stroke=\"black\" points=\"410.91,-174.27 419.45,-168 408.87,-167.57 410.91,-174.27\"/>\n", + "<!-- base_df->spend -->\n", + "<g id=\"edge12\" class=\"edge\">\n", + "<title>base_df->spend</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M271.7,-239.12C279.6,-236.14 287.8,-233.04 295.59,-230.1\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"296.73,-233.41 304.85,-226.61 294.26,-226.86 296.73,-233.41\"/>\n", "</g>\n", "<!-- spend_std_dev -->\n", - "<g id=\"node6\" class=\"node\">\n", + "<g id=\"node7\" class=\"node\">\n", "<title>spend_std_dev</title>\n", "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M521.52,-63.6C521.52,-63.6 425.68,-63.6 425.68,-63.6 419.68,-63.6 413.68,-57.6 413.68,-51.6 413.68,-51.6 413.68,-12 413.68,-12 413.68,-6 419.68,0 425.68,0 425.68,0 521.52,0 521.52,0 527.52,0 533.52,-6 533.52,-12 533.52,-12 533.52,-51.6 533.52,-51.6 533.52,-57.6 527.52,-63.6 521.52,-63.6\"/>\n", "<text text-anchor=\"start\" x=\"424.48\" y=\"-40.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">spend_std_dev</text>\n", "<text text-anchor=\"start\" x=\"460.85\" y=\"-12.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">float</text>\n", "</g>\n", - "<!-- spend->spend_std_dev -->\n", + "<!-- spend_std_dev->spend_zero_mean_unit_variance 
-->\n", "<g id=\"edge7\" class=\"edge\">\n", - "<title>spend->spend_std_dev</title>\n", - "<path fill=\"none\" stroke=\"black\" d=\"M351.17,-160.65C362.29,-136 380.08,-102.18 402.8,-77.8 404.93,-75.51 407.2,-73.28 409.57,-71.12\"/>\n", - "<polygon fill=\"black\" stroke=\"black\" points=\"411.42,-74.13 416.79,-65 406.89,-68.8 411.42,-74.13\"/>\n", + "<title>spend_std_dev->spend_zero_mean_unit_variance</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M533.79,-44.43C586.23,-55.63 664.59,-72.35 731.22,-86.57\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"730.25,-89.94 740.76,-88.61 731.71,-83.1 730.25,-89.94\"/>\n", "</g>\n", - "<!-- spend_zero_mean -->\n", - "<g id=\"node8\" class=\"node\">\n", - "<title>spend_zero_mean</title>\n", - "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M701.5,-123.6C701.5,-123.6 585.4,-123.6 585.4,-123.6 579.4,-123.6 573.4,-117.6 573.4,-111.6 573.4,-111.6 573.4,-72 573.4,-72 573.4,-66 579.4,-60 585.4,-60 585.4,-60 701.5,-60 701.5,-60 707.5,-60 713.5,-66 713.5,-72 713.5,-72 713.5,-111.6 713.5,-111.6 713.5,-117.6 707.5,-123.6 701.5,-123.6\"/>\n", - "<text text-anchor=\"start\" x=\"584.2\" y=\"-100.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">spend_zero_mean</text>\n", - "<text text-anchor=\"start\" x=\"624.33\" y=\"-72.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "<!-- signups->spend_per_signup -->\n", + "<g id=\"edge5\" class=\"edge\">\n", + "<title>signups->spend_per_signup</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M374.16,-308.48C379.52,-308.88 385.22,-309.3 391.06,-309.74\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"390.71,-313.22 400.94,-310.47 391.22,-306.24 390.71,-313.22\"/>\n", "</g>\n", "<!-- spend->spend_zero_mean -->\n", - "<g id=\"edge9\" class=\"edge\">\n", + "<g id=\"edge1\" class=\"edge\">\n", "<title>spend->spend_zero_mean</title>\n", - "<path fill=\"none\" stroke=\"black\" 
d=\"M354.6,-160.84C365.66,-142.89 382.01,-121.91 402.8,-110.8 451.1,-84.97 512.85,-80.63 561.58,-82.54\"/>\n", - "<polygon fill=\"black\" stroke=\"black\" points=\"561.36,-86.03 571.52,-83.03 561.71,-79.04 561.36,-86.03\"/>\n", - "</g>\n", - "<!-- spend_mean->spend_zero_mean -->\n", - "<g id=\"edge10\" class=\"edge\">\n", - "<title>spend_mean->spend_zero_mean</title>\n", - "<path fill=\"none\" stroke=\"black\" d=\"M526.33,-133.31C537.73,-129.23 550.06,-124.83 562.29,-120.46\"/>\n", - "<polygon fill=\"black\" stroke=\"black\" points=\"563.31,-123.81 571.55,-117.14 560.96,-117.21 563.31,-123.81\"/>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M368.19,-227.31C378.92,-231.16 391.22,-234.88 402.8,-236.8 464.89,-247.07 482.7,-249.18 544.4,-236.8 551.05,-235.47 557.81,-233.67 564.51,-231.57\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"565.18,-235.04 573.54,-228.53 562.94,-228.41 565.18,-235.04\"/>\n", "</g>\n", - "<!-- base_df -->\n", - "<g id=\"node5\" class=\"node\">\n", - "<title>base_df</title>\n", - "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M259.45,-265.6C259.45,-265.6 193.6,-265.6 193.6,-265.6 187.6,-265.6 181.6,-259.6 181.6,-253.6 181.6,-253.6 181.6,-214 181.6,-214 181.6,-208 187.6,-202 193.6,-202 193.6,-202 259.45,-202 259.45,-202 265.45,-202 271.45,-208 271.45,-214 271.45,-214 271.45,-253.6 271.45,-253.6 271.45,-259.6 265.45,-265.6 259.45,-265.6\"/>\n", - "<text text-anchor=\"start\" x=\"201.02\" y=\"-242.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">base_df</text>\n", - "<text text-anchor=\"start\" x=\"192.4\" y=\"-214.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">DataFrame</text>\n", + "<!-- spend->spend_mean -->\n", + "<g id=\"edge3\" class=\"edge\">\n", + "<title>spend->spend_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M367.97,-210.58C380.36,-208.83 395.24,-206.73 409.84,-204.67\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" 
points=\"409.99,-208.18 419.4,-203.32 409.01,-201.25 409.99,-208.18\"/>\n", "</g>\n", - "<!-- base_df->spend -->\n", + "<!-- spend->spend_per_signup -->\n", "<g id=\"edge4\" class=\"edge\">\n", - "<title>base_df->spend</title>\n", - "<path fill=\"none\" stroke=\"black\" d=\"M271.7,-217.12C279.6,-214.14 287.8,-211.04 295.59,-208.1\"/>\n", - "<polygon fill=\"black\" stroke=\"black\" points=\"296.73,-211.41 304.85,-204.61 294.26,-204.86 296.73,-211.41\"/>\n", - "</g>\n", - "<!-- signups -->\n", - "<g id=\"node7\" class=\"node\">\n", - "<title>signups</title>\n", - "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M361.8,-326.6C361.8,-326.6 312.45,-326.6 312.45,-326.6 306.45,-326.6 300.45,-320.6 300.45,-314.6 300.45,-314.6 300.45,-275 300.45,-275 300.45,-269 306.45,-263 312.45,-263 312.45,-263 361.8,-263 361.8,-263 367.8,-263 373.8,-269 373.8,-275 373.8,-275 373.8,-314.6 373.8,-314.6 373.8,-320.6 367.8,-326.6 361.8,-326.6\"/>\n", - "<text text-anchor=\"start\" x=\"311.25\" y=\"-303.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">signups</text>\n", - "<text text-anchor=\"start\" x=\"318\" y=\"-275.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "<title>spend->spend_per_signup</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M368.09,-241.6C378.81,-250.83 391.13,-261.04 402.8,-269.8 406.07,-272.25 409.46,-274.72 412.91,-277.17\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"410.7,-279.89 420.9,-282.74 414.7,-274.15 410.7,-279.89\"/>\n", "</g>\n", - "<!-- base_df->signups -->\n", + "<!-- spend->avg_3wk_spend -->\n", "<g id=\"edge8\" class=\"edge\">\n", - "<title>base_df->signups</title>\n", - "<path fill=\"none\" stroke=\"black\" d=\"M271.7,-258.61C277.72,-261.99 283.91,-265.47 289.96,-268.87\"/>\n", - "<polygon fill=\"black\" stroke=\"black\" points=\"288.21,-271.9 298.64,-273.75 291.64,-265.8 288.21,-271.9\"/>\n", - "</g>\n", - "<!-- spend_zero_mean_unit_variance 
-->\n", - "<g id=\"node9\" class=\"node\">\n", - "<title>spend_zero_mean_unit_variance</title>\n", - "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M965.1,-93.6C965.1,-93.6 754.5,-93.6 754.5,-93.6 748.5,-93.6 742.5,-87.6 742.5,-81.6 742.5,-81.6 742.5,-42 742.5,-42 742.5,-36 748.5,-30 754.5,-30 754.5,-30 965.1,-30 965.1,-30 971.1,-30 977.1,-36 977.1,-42 977.1,-42 977.1,-81.6 977.1,-81.6 977.1,-87.6 971.1,-93.6 965.1,-93.6\"/>\n", - "<text text-anchor=\"start\" x=\"753.3\" y=\"-70.5\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">spend_zero_mean_unit_variance</text>\n", - "<text text-anchor=\"start\" x=\"840.67\" y=\"-42.5\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", - "</g>\n", - "<!-- spend_std_dev->spend_zero_mean_unit_variance -->\n", - "<g id=\"edge12\" class=\"edge\">\n", - "<title>spend_std_dev->spend_zero_mean_unit_variance</title>\n", - "<path fill=\"none\" stroke=\"black\" d=\"M533.79,-36.42C586.12,-40.51 664.26,-46.61 730.81,-51.81\"/>\n", - "<polygon fill=\"black\" stroke=\"black\" points=\"730.49,-55.29 740.73,-52.58 731.03,-48.31 730.49,-55.29\"/>\n", - "</g>\n", - "<!-- signups->spend_per_signup -->\n", - "<g id=\"edge2\" class=\"edge\">\n", - "<title>signups->spend_per_signup</title>\n", - "<path fill=\"none\" stroke=\"black\" d=\"M374.16,-300.43C379.66,-301.29 385.51,-302.2 391.51,-303.14\"/>\n", - "<polygon fill=\"black\" stroke=\"black\" points=\"390.53,-306.53 400.95,-304.61 391.61,-299.61 390.53,-306.53\"/>\n", + "<title>spend->avg_3wk_spend</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M368.18,-184.38C378.71,-174.43 390.88,-163.67 402.8,-154.8 403.94,-153.95 405.1,-153.1 406.28,-152.26\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"408.03,-155.3 414.3,-146.76 404.07,-149.53 408.03,-155.3\"/>\n", "</g>\n", - "<!-- spend_zero_mean->spend_zero_mean_unit_variance -->\n", - "<g id=\"edge11\" class=\"edge\">\n", - 
"<title>spend_zero_mean->spend_zero_mean_unit_variance</title>\n", - "<path fill=\"none\" stroke=\"black\" d=\"M713.75,-82.1C719.34,-81.32 725.09,-80.51 730.92,-79.7\"/>\n", - "<polygon fill=\"black\" stroke=\"black\" points=\"731.3,-83.18 740.72,-78.33 730.33,-76.25 731.3,-83.18\"/>\n", + "<!-- spend->spend_std_dev -->\n", + "<g id=\"edge10\" class=\"edge\">\n", + "<title>spend->spend_std_dev</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M346.61,-182.58C356.11,-151.53 374.05,-104.69 402.8,-72.8 403.5,-72.02 404.22,-71.26 404.95,-70.5\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"407.23,-73.16 412.27,-63.83 402.52,-67.99 407.23,-73.16\"/>\n", "</g>\n", "<!-- _base_df_inputs -->\n", "<g id=\"node10\" class=\"node\">\n", "<title>_base_df_inputs</title>\n", - "<polygon fill=\"#ffffff\" stroke=\"black\" stroke-dasharray=\"5,2\" points=\"152.6,-256.1 0,-256.1 0,-211.5 152.6,-211.5 152.6,-256.1\"/>\n", - "<text text-anchor=\"start\" x=\"14.8\" y=\"-228\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">base_df_location</text>\n", - "<text text-anchor=\"start\" x=\"122.8\" y=\"-228\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">str</text>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" stroke-dasharray=\"5,2\" points=\"152.6,-278.1 0,-278.1 0,-233.5 152.6,-233.5 152.6,-278.1\"/>\n", + "<text text-anchor=\"start\" x=\"14.8\" y=\"-250\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">base_df_location</text>\n", + "<text text-anchor=\"start\" x=\"122.8\" y=\"-250\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">str</text>\n", "</g>\n", "<!-- _base_df_inputs->base_df -->\n", - "<g id=\"edge6\" class=\"edge\">\n", + "<g id=\"edge9\" class=\"edge\">\n", "<title>_base_df_inputs->base_df</title>\n", - "<path fill=\"none\" stroke=\"black\" d=\"M152.72,-233.8C158.49,-233.8 164.25,-233.8 169.87,-233.8\"/>\n", - "<polygon fill=\"black\" stroke=\"black\" points=\"169.62,-237.3 179.62,-233.8 169.62,-230.3 
169.62,-237.3\"/>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M152.72,-255.8C158.49,-255.8 164.25,-255.8 169.87,-255.8\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"169.62,-259.3 179.62,-255.8 169.62,-252.3 169.62,-259.3\"/>\n", "</g>\n", "<!-- input -->\n", "<g id=\"node11\" class=\"node\">\n", "<title>input</title>\n", - "<polygon fill=\"#ffffff\" stroke=\"black\" stroke-dasharray=\"5,2\" points=\"103.3,-311.1 49.3,-311.1 49.3,-274.5 103.3,-274.5 103.3,-311.1\"/>\n", - "<text text-anchor=\"middle\" x=\"76.3\" y=\"-287\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">input</text>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" stroke-dasharray=\"5,2\" points=\"103.3,-333.1 49.3,-333.1 49.3,-296.5 103.3,-296.5 103.3,-333.1\"/>\n", + "<text text-anchor=\"middle\" x=\"76.3\" y=\"-309\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">input</text>\n", "</g>\n", "<!-- function -->\n", "<g id=\"node12\" class=\"node\">\n", "<title>function</title>\n", - "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M98.72,-366.1C98.72,-366.1 53.87,-366.1 53.87,-366.1 47.87,-366.1 41.87,-360.1 41.87,-354.1 41.87,-354.1 41.87,-341.5 41.87,-341.5 41.87,-335.5 47.87,-329.5 53.87,-329.5 53.87,-329.5 98.72,-329.5 98.72,-329.5 104.72,-329.5 110.72,-335.5 110.72,-341.5 110.72,-341.5 110.72,-354.1 110.72,-354.1 110.72,-360.1 104.72,-366.1 98.72,-366.1\"/>\n", - "<text text-anchor=\"middle\" x=\"76.3\" y=\"-342\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">function</text>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M98.72,-388.1C98.72,-388.1 53.87,-388.1 53.87,-388.1 47.87,-388.1 41.87,-382.1 41.87,-376.1 41.87,-376.1 41.87,-363.5 41.87,-363.5 41.87,-357.5 47.87,-351.5 53.87,-351.5 53.87,-351.5 98.72,-351.5 98.72,-351.5 104.72,-351.5 110.72,-357.5 110.72,-363.5 110.72,-363.5 110.72,-376.1 110.72,-376.1 110.72,-382.1 104.72,-388.1 98.72,-388.1\"/>\n", + "<text text-anchor=\"middle\" x=\"76.3\" y=\"-364\" font-family=\"Helvetica,sans-Serif\" 
font-size=\"14.00\">function</text>\n", "</g>\n", "</g>\n", "</svg>\n" ], "text/plain": [ - "<graphviz.graphs.Digraph at 0x159d3d340>" + "<graphviz.graphs.Digraph at 0x7f819937bb50>" ] }, "metadata": {}, @@ -777,7 +779,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "hamilton", "language": "python", "name": "python3" }, @@ -791,7 +793,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/examples/polars/with_columns/DAG_DataFrame.png b/examples/polars/with_columns/DAG_DataFrame.png new file mode 100644 index 000000000..cc3d713b7 Binary files /dev/null and b/examples/polars/with_columns/DAG_DataFrame.png differ diff --git a/examples/polars/with_columns/DAG_lazy.png b/examples/polars/with_columns/DAG_lazy.png new file mode 100644 index 000000000..d2566fd7b Binary files /dev/null and b/examples/polars/with_columns/DAG_lazy.png differ diff --git a/examples/polars/with_columns/README b/examples/polars/with_columns/README new file mode 100644 index 000000000..86db77204 --- /dev/null +++ b/examples/polars/with_columns/README @@ -0,0 +1,8 @@ +# Using with_columns with Polars + +We show the ability to use the familiar `with_columns` from `polars`. Supported for both `pl.DataFrame` and `pl.LazyFrame`. + +To see the example look at the notebook. + +![image info](./DAG_DataFrame.png) +![image info](./DAG_lazy.png) diff --git a/examples/polars/with_columns/my_functions.py b/examples/polars/with_columns/my_functions.py new file mode 100644 index 000000000..3b2c401b9 --- /dev/null +++ b/examples/polars/with_columns/my_functions.py @@ -0,0 +1,51 @@ +import polars as pl + +from hamilton.function_modifiers import config + +""" +Notes: + 1. This file is used for all the [ray|dask|spark]/hello_world examples. + 2. 
It therefore show cases how you can write something once and not only scale it, but port it + to different frameworks with ease! +""" + + +@config.when(case="millions") +def avg_3wk_spend__millions(spend: pl.Series) -> pl.Series: + """Rolling 3 week average spend.""" + return ( + spend.to_frame("spend").select(pl.col("spend").rolling_mean(window_size=3) / 1e6) + ).to_series(0) + + +@config.when(case="thousands") +def avg_3wk_spend__thousands(spend: pl.Series) -> pl.Series: + """Rolling 3 week average spend.""" + return ( + spend.to_frame("spend").select(pl.col("spend").rolling_mean(window_size=3) / 1e3) + ).to_series(0) + + +def spend_per_signup(spend: pl.Series, signups: pl.Series) -> pl.Series: + """The cost per signup in relation to spend.""" + return spend / signups + + +def spend_mean(spend: pl.Series) -> float: + """Shows function creating a scalar. In this case it computes the mean of the entire column.""" + return spend.mean() + + +def spend_zero_mean(spend: pl.Series, spend_mean: float) -> pl.Series: + """Shows function that takes a scalar. In this case to zero mean spend.""" + return spend - spend_mean + + +def spend_std_dev(spend: pl.Series) -> float: + """Function that computes the standard deviation of the spend column.""" + return spend.std() + + +def spend_zero_mean_unit_variance(spend_zero_mean: pl.Series, spend_std_dev: float) -> pl.Series: + """Function showing one way to make spend have zero mean and unit variance.""" + return spend_zero_mean / spend_std_dev diff --git a/examples/polars/with_columns/my_functions_lazy.py b/examples/polars/with_columns/my_functions_lazy.py new file mode 100644 index 000000000..4b65b2ac2 --- /dev/null +++ b/examples/polars/with_columns/my_functions_lazy.py @@ -0,0 +1,47 @@ +import polars as pl + +from hamilton.function_modifiers import config + +""" +Notes: + 1. This file is used for all the [ray|dask|spark]/hello_world examples. + 2. 
It therefore show cases how you can write something once and not only scale it, but port it + to different frameworks with ease! +""" + + +@config.when(case="millions") +def avg_3wk_spend__millions(spend: pl.Expr) -> pl.Expr: + """Rolling 3 week average spend.""" + return spend.rolling_mean(window_size=3) / 1e6 + + +@config.when(case="thousands") +def avg_3wk_spend__thousands(spend: pl.Expr) -> pl.Expr: + """Rolling 3 week average spend.""" + return spend.rolling_mean(window_size=3) / 1e3 + + +def spend_per_signup(spend: pl.Expr, signups: pl.Expr) -> pl.Expr: + """The cost per signup in relation to spend.""" + return spend / signups + + +def spend_mean(spend: pl.Expr) -> float: + """Shows function creating a scalar. In this case it computes the mean of the entire column.""" + return spend.mean() + + +def spend_zero_mean(spend: pl.Expr, spend_mean: float) -> pl.Expr: + """Shows function that takes a scalar. In this case to zero mean spend.""" + return spend - spend_mean + + +def spend_std_dev(spend: pl.Expr) -> float: + """Function that computes the standard deviation of the spend column.""" + return spend.std() + + +def spend_zero_mean_unit_variance(spend_zero_mean: pl.Expr, spend_std_dev: float) -> pl.Expr: + """Function showing one way to make spend have zero mean and unit variance.""" + return spend_zero_mean / spend_std_dev diff --git a/examples/polars/with_columns/notebook.ipynb b/examples/polars/with_columns/notebook.ipynb new file mode 100644 index 000000000..39bd66d35 --- /dev/null +++ b/examples/polars/with_columns/notebook.ipynb @@ -0,0 +1,1239 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Execute this cell to install dependencies\n", + "%pip install sf-hamilton[visualization]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example of using with_columns for Polars DataFrame [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/polars/with_columns/notebook.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dagworks-inc/hamilton/blob/main/examples/polars/with_columns/notebook.ipynb)\n", + "\n", + "This allows you to efficiently run groups of map operations on a dataframe.\n", + "Here's an example of calling it -- if you've seen `@subdag`, you should be familiar with the concepts." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jernejfrank/miniconda3/envs/hamilton/lib/python3.10/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. 
pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "%reload_ext hamilton.plugins.jupyter_magic\n", + "from hamilton import driver\n", + "import my_functions\n", + "\n", + "my_builder = driver.Builder().with_modules(my_functions).with_config({\"case\":\"thousands\"})\n", + "output_node = [\"final_df\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n", + "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n", + " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n", + "<!-- Generated by graphviz version 12.1.2 (20240928.0832)\n", + " -->\n", + "<!-- Pages: 1 -->\n", + "<svg width=\"1323pt\" height=\"521pt\"\n", + " viewBox=\"0.00 0.00 1322.70 520.92\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n", + "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 516.92)\">\n", + "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-516.92 1318.7,-516.92 1318.7,4 -4,4\"/>\n", + "<g id=\"clust1\" class=\"cluster\">\n", + "<title>cluster__legend</title>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"8,-142 8,-326 92.85,-326 92.85,-142 8,-142\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-308.7\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">Legend</text>\n", + "</g>\n", + "<!-- case -->\n", + "<g id=\"node1\" class=\"node\">\n", + "<title>case</title>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"83.55,-50 11.3,-50 11.3,0 89.55,0 89.55,-44 83.55,-50\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"83.55,-50 83.55,-44\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"89.55,-44 83.55,-44\"/>\n", + "<text text-anchor=\"start\" x=\"35.42\" y=\"-33.7\" font-family=\"Helvetica,sans-Serif\" 
font-weight=\"bold\" font-size=\"14.00\">case</text>\n", + "<text text-anchor=\"start\" x=\"19.3\" y=\"-5.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">thousands</text>\n", + "</g>\n", + "<!-- final_df.spend -->\n", + "<g id=\"node2\" class=\"node\">\n", + "<title>final_df.spend</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M231.45,-311.8C231.45,-311.8 142.35,-311.8 142.35,-311.8 136.35,-311.8 130.35,-305.8 130.35,-299.8 130.35,-299.8 130.35,-260.2 130.35,-260.2 130.35,-254.2 136.35,-248.2 142.35,-248.2 142.35,-248.2 231.45,-248.2 231.45,-248.2 237.45,-248.2 243.45,-254.2 243.45,-260.2 243.45,-260.2 243.45,-299.8 243.45,-299.8 243.45,-305.8 237.45,-311.8 231.45,-311.8\"/>\n", + "<text text-anchor=\"start\" x=\"141.15\" y=\"-288.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend</text>\n", + "<text text-anchor=\"start\" x=\"167.77\" y=\"-260.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- final_df.spend_std_dev -->\n", + "<g id=\"node5\" class=\"node\">\n", + "<title>final_df.spend_std_dev</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M460.18,-393.8C460.18,-393.8 312.57,-393.8 312.57,-393.8 306.57,-393.8 300.57,-387.8 300.57,-381.8 300.57,-381.8 300.57,-342.2 300.57,-342.2 300.57,-336.2 306.57,-330.2 312.57,-330.2 312.57,-330.2 460.18,-330.2 460.18,-330.2 466.18,-330.2 472.18,-336.2 472.18,-342.2 472.18,-342.2 472.18,-381.8 472.18,-381.8 472.18,-387.8 466.18,-393.8 460.18,-393.8\"/>\n", + "<text text-anchor=\"start\" x=\"311.38\" y=\"-370.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_std_dev</text>\n", + "<text text-anchor=\"start\" x=\"373.62\" y=\"-342.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">float</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_std_dev -->\n", + "<g id=\"edge5\" 
class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_std_dev</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M243.71,-305.89C255.14,-311.02 267.14,-316.28 278.45,-321 282.2,-322.57 286.04,-324.15 289.92,-325.72\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"288.28,-328.84 298.87,-329.33 290.9,-322.35 288.28,-328.84\"/>\n", + "</g>\n", + "<!-- final_df.spend_per_signup -->\n", + "<g id=\"node7\" class=\"node\">\n", + "<title>final_df.spend_per_signup</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M471.05,-191.8C471.05,-191.8 301.7,-191.8 301.7,-191.8 295.7,-191.8 289.7,-185.8 289.7,-179.8 289.7,-179.8 289.7,-140.2 289.7,-140.2 289.7,-134.2 295.7,-128.2 301.7,-128.2 301.7,-128.2 471.05,-128.2 471.05,-128.2 477.05,-128.2 483.05,-134.2 483.05,-140.2 483.05,-140.2 483.05,-179.8 483.05,-179.8 483.05,-185.8 477.05,-191.8 471.05,-191.8\"/>\n", + "<text text-anchor=\"start\" x=\"300.5\" y=\"-168.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_per_signup</text>\n", + "<text text-anchor=\"start\" x=\"367.25\" y=\"-140.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_per_signup -->\n", + "<g id=\"edge6\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_per_signup</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M222.31,-248.04C238.55,-234 258.67,-218.01 278.45,-206 283.34,-203.03 288.47,-200.15 293.7,-197.38\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"295.22,-200.54 302.54,-192.89 292.04,-194.3 295.22,-200.54\"/>\n", + "</g>\n", + "<!-- final_df.__append -->\n", + "<g id=\"node9\" class=\"node\">\n", + "<title>final_df.__append</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M1183.85,-398.8C1183.85,-398.8 1071.5,-398.8 1071.5,-398.8 1065.5,-398.8 1059.5,-392.8 1059.5,-386.8 1059.5,-386.8 1059.5,-347.2 1059.5,-347.2 1059.5,-341.2 
1065.5,-335.2 1071.5,-335.2 1071.5,-335.2 1183.85,-335.2 1183.85,-335.2 1189.85,-335.2 1195.85,-341.2 1195.85,-347.2 1195.85,-347.2 1195.85,-386.8 1195.85,-386.8 1195.85,-392.8 1189.85,-398.8 1183.85,-398.8\"/>\n", + "<text text-anchor=\"start\" x=\"1070.3\" y=\"-375.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.__append</text>\n", + "<text text-anchor=\"start\" x=\"1093.55\" y=\"-347.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">DataFrame</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.__append -->\n", + "<g id=\"edge9\" class=\"edge\">\n", + "<title>final_df.spend->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M243.75,-250.87C254.98,-246.12 266.89,-241.84 278.45,-239 425.34,-202.97 467.29,-210.32 618.23,-220\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean -->\n", + "<g id=\"node10\" class=\"node\">\n", + "<title>final_df.spend_zero_mean</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M703.15,-469.8C703.15,-469.8 535.3,-469.8 535.3,-469.8 529.3,-469.8 523.3,-463.8 523.3,-457.8 523.3,-457.8 523.3,-418.2 523.3,-418.2 523.3,-412.2 529.3,-406.2 535.3,-406.2 535.3,-406.2 703.15,-406.2 703.15,-406.2 709.15,-406.2 715.15,-412.2 715.15,-418.2 715.15,-418.2 715.15,-457.8 715.15,-457.8 715.15,-463.8 709.15,-469.8 703.15,-469.8\"/>\n", + "<text text-anchor=\"start\" x=\"534.1\" y=\"-446.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_zero_mean</text>\n", + "<text text-anchor=\"start\" x=\"600.1\" y=\"-418.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_zero_mean -->\n", + "<g id=\"edge16\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_zero_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M191.41,-312.19C198.5,-359.52 219.28,-446.45 278.45,-485 358.34,-537.05 
471.23,-505.6 544.57,-474.74\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"545.91,-477.98 553.71,-470.8 543.14,-471.55 545.91,-477.98\"/>\n", + "</g>\n", + "<!-- final_df.spend_mean -->\n", + "<g id=\"node11\" class=\"node\">\n", + "<title>final_df.spend_mean</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M452.68,-475.8C452.68,-475.8 320.07,-475.8 320.07,-475.8 314.07,-475.8 308.07,-469.8 308.07,-463.8 308.07,-463.8 308.07,-424.2 308.07,-424.2 308.07,-418.2 314.07,-412.2 320.07,-412.2 320.07,-412.2 452.68,-412.2 452.68,-412.2 458.68,-412.2 464.68,-418.2 464.68,-424.2 464.68,-424.2 464.68,-463.8 464.68,-463.8 464.68,-469.8 458.68,-475.8 452.68,-475.8\"/>\n", + "<text text-anchor=\"start\" x=\"318.88\" y=\"-452.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_mean</text>\n", + "<text text-anchor=\"start\" x=\"373.62\" y=\"-424.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">float</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_mean -->\n", + "<g id=\"edge18\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M204.56,-312.04C220.31,-339.53 246.29,-378.49 278.45,-403 284.4,-407.53 290.91,-411.62 297.69,-415.29\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"295.92,-418.32 306.42,-419.7 299.07,-412.07 295.92,-418.32\"/>\n", + "</g>\n", + "<!-- final_df.avg_3wk_spend -->\n", + "<g id=\"node12\" class=\"node\">\n", + "<title>final_df.avg_3wk_spend</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M482.3,-311.8C482.3,-311.8 290.45,-311.8 290.45,-311.8 284.45,-311.8 278.45,-305.8 278.45,-299.8 278.45,-299.8 278.45,-260.2 278.45,-260.2 278.45,-254.2 284.45,-248.2 290.45,-248.2 290.45,-248.2 482.3,-248.2 482.3,-248.2 488.3,-248.2 494.3,-254.2 494.3,-260.2 494.3,-260.2 494.3,-299.8 494.3,-299.8 494.3,-305.8 488.3,-311.8 482.3,-311.8\"/>\n", + "<text 
text-anchor=\"start\" x=\"289.25\" y=\"-288.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.avg_3wk_spend: case</text>\n", + "<text text-anchor=\"start\" x=\"367.25\" y=\"-260.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.avg_3wk_spend -->\n", + "<g id=\"edge19\" class=\"edge\">\n", + "<title>final_df.spend->final_df.avg_3wk_spend</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M243.7,-280C251.06,-280 258.82,-280 266.76,-280\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"266.57,-283.5 276.57,-280 266.57,-276.5 266.57,-283.5\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean_unit_variance -->\n", + "<g id=\"node3\" class=\"node\">\n", + "<title>final_df.spend_zero_mean_unit_variance</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M1018.5,-398.8C1018.5,-398.8 756.15,-398.8 756.15,-398.8 750.15,-398.8 744.15,-392.8 744.15,-386.8 744.15,-386.8 744.15,-347.2 744.15,-347.2 744.15,-341.2 750.15,-335.2 756.15,-335.2 756.15,-335.2 1018.5,-335.2 1018.5,-335.2 1024.5,-335.2 1030.5,-341.2 1030.5,-347.2 1030.5,-347.2 1030.5,-386.8 1030.5,-386.8 1030.5,-392.8 1024.5,-398.8 1018.5,-398.8\"/>\n", + "<text text-anchor=\"start\" x=\"754.95\" y=\"-375.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_zero_mean_unit_variance</text>\n", + "<text text-anchor=\"start\" x=\"868.2\" y=\"-347.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean_unit_variance->final_df.__append -->\n", + "<g id=\"edge14\" class=\"edge\">\n", + "<title>final_df.spend_zero_mean_unit_variance->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M1030.76,-367C1036.54,-367 1042.25,-367 1047.83,-367\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1047.55,-370.5 
1057.55,-367 1047.55,-363.5 1047.55,-370.5\"/>\n", + "</g>\n", + "<!-- final_df -->\n", + "<g id=\"node4\" class=\"node\">\n", + "<title>final_df</title>\n", + "<path fill=\"#ffc857\" stroke=\"black\" d=\"M1302.7,-398.8C1302.7,-398.8 1236.85,-398.8 1236.85,-398.8 1230.85,-398.8 1224.85,-392.8 1224.85,-386.8 1224.85,-386.8 1224.85,-347.2 1224.85,-347.2 1224.85,-341.2 1230.85,-335.2 1236.85,-335.2 1236.85,-335.2 1302.7,-335.2 1302.7,-335.2 1308.7,-335.2 1314.7,-341.2 1314.7,-347.2 1314.7,-347.2 1314.7,-386.8 1314.7,-386.8 1314.7,-392.8 1308.7,-398.8 1302.7,-398.8\"/>\n", + "<text text-anchor=\"start\" x=\"1245.78\" y=\"-375.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df</text>\n", + "<text text-anchor=\"start\" x=\"1235.65\" y=\"-347.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">DataFrame</text>\n", + "</g>\n", + "<!-- final_df.spend_std_dev->final_df.spend_zero_mean_unit_variance -->\n", + "<g id=\"edge3\" class=\"edge\">\n", + "<title>final_df.spend_std_dev->final_df.spend_zero_mean_unit_variance</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M472.58,-362.85C543.7,-363.57 646.89,-364.6 732.42,-365.46\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"732.28,-368.96 742.31,-365.56 732.35,-361.96 732.28,-368.96\"/>\n", + "</g>\n", + "<!-- initial_df -->\n", + "<g id=\"node6\" class=\"node\">\n", + "<title>initial_df</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M83.35,-131.8C83.35,-131.8 17.5,-131.8 17.5,-131.8 11.5,-131.8 5.5,-125.8 5.5,-119.8 5.5,-119.8 5.5,-80.2 5.5,-80.2 5.5,-74.2 11.5,-68.2 17.5,-68.2 17.5,-68.2 83.35,-68.2 83.35,-68.2 89.35,-68.2 95.35,-74.2 95.35,-80.2 95.35,-80.2 95.35,-119.8 95.35,-119.8 95.35,-125.8 89.35,-131.8 83.35,-131.8\"/>\n", + "<text text-anchor=\"start\" x=\"22.67\" y=\"-108.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">initial_df</text>\n", + "<text text-anchor=\"start\" x=\"16.3\" 
y=\"-80.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">DataFrame</text>\n", + "</g>\n", + "<!-- initial_df->final_df.spend -->\n", + "<g id=\"edge1\" class=\"edge\">\n", + "<title>initial_df->final_df.spend</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M89.93,-132.1C91.82,-134.04 93.64,-136.02 95.35,-138 122.03,-168.87 146.53,-208.28 163.3,-237.75\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"160.21,-239.4 168.15,-246.4 166.31,-235.97 160.21,-239.4\"/>\n", + "</g>\n", + "<!-- final_df.signups -->\n", + "<g id=\"node8\" class=\"node\">\n", + "<title>final_df.signups</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M237.45,-131.8C237.45,-131.8 136.35,-131.8 136.35,-131.8 130.35,-131.8 124.35,-125.8 124.35,-119.8 124.35,-119.8 124.35,-80.2 124.35,-80.2 124.35,-74.2 130.35,-68.2 136.35,-68.2 136.35,-68.2 237.45,-68.2 237.45,-68.2 243.45,-68.2 249.45,-74.2 249.45,-80.2 249.45,-80.2 249.45,-119.8 249.45,-119.8 249.45,-125.8 243.45,-131.8 237.45,-131.8\"/>\n", + "<text text-anchor=\"start\" x=\"135.15\" y=\"-108.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.signups</text>\n", + "<text text-anchor=\"start\" x=\"167.77\" y=\"-80.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- initial_df->final_df.signups -->\n", + "<g id=\"edge8\" class=\"edge\">\n", + "<title>initial_df->final_df.signups</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M95.83,-100C101.33,-100 107.06,-100 112.85,-100\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"112.5,-103.5 122.5,-100 112.5,-96.5 112.5,-103.5\"/>\n", + "</g>\n", + "<!-- initial_df->final_df.__append -->\n", + "<g id=\"edge15\" class=\"edge\">\n", + "<title>initial_df->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M620.23,-220C776.95,-229.59 950.98,-292.31 1048.79,-332.66\"/>\n", + "<polygon fill=\"black\" 
stroke=\"black\" points=\"1047.2,-335.79 1057.78,-336.4 1049.89,-329.33 1047.2,-335.79\"/>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M387.38,-100C433.3,-114.84 450.42,-98.89 494.3,-119 558.89,-148.61 548.22,-207.83 618.23,-220\"/>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M95.79,-70.39C104.89,-65.69 114.66,-61.54 124.35,-59 237.95,-29.25 273.63,-63.89 385.38,-100\"/>\n", + "</g>\n", + "<!-- final_df.spend_per_signup->final_df.__append -->\n", + "<g id=\"edge12\" class=\"edge\">\n", + "<title>final_df.spend_per_signup->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M478.73,-192.28C520.63,-204.86 571.32,-216.99 618.23,-220\"/>\n", + "</g>\n", + "<!-- final_df.signups->final_df.spend_per_signup -->\n", + "<g id=\"edge7\" class=\"edge\">\n", + "<title>final_df.signups->final_df.spend_per_signup</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M249.84,-118.82C258.96,-121.59 268.57,-124.51 278.29,-127.46\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"277.26,-130.81 287.85,-130.37 279.29,-124.11 277.26,-130.81\"/>\n", + "</g>\n", + "<!-- final_df.signups->final_df.__append -->\n", + "<g id=\"edge10\" class=\"edge\">\n", + "<title>final_df.signups->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M249.92,-90.37C289.6,-86.54 341.66,-85.87 385.38,-100\"/>\n", + "</g>\n", + "<!-- final_df.__append->final_df -->\n", + "<g id=\"edge4\" class=\"edge\">\n", + "<title>final_df.__append->final_df</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M1196.26,-367C1202,-367 1207.75,-367 1213.38,-367\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1213.17,-370.5 1223.17,-367 1213.17,-363.5 1213.17,-370.5\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean->final_df.spend_zero_mean_unit_variance -->\n", + "<g id=\"edge2\" class=\"edge\">\n", + "<title>final_df.spend_zero_mean->final_df.spend_zero_mean_unit_variance</title>\n", + "<path fill=\"none\" stroke=\"black\" 
d=\"M715.54,-412.57C728.24,-409.18 741.44,-405.66 754.62,-402.14\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"755.46,-405.54 764.22,-399.58 753.65,-398.78 755.46,-405.54\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean->final_df.__append -->\n", + "<g id=\"edge13\" class=\"edge\">\n", + "<title>final_df.spend_zero_mean->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M715.28,-439.84C799.32,-439.41 924.66,-433.68 1030.5,-408 1036.4,-406.57 1042.4,-404.81 1048.37,-402.84\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1049.36,-406.2 1057.63,-399.58 1047.04,-399.6 1049.36,-406.2\"/>\n", + "</g>\n", + "<!-- final_df.spend_mean->final_df.spend_zero_mean -->\n", + "<g id=\"edge17\" class=\"edge\">\n", + "<title>final_df.spend_mean->final_df.spend_zero_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M464.89,-441.99C479.85,-441.6 495.78,-441.18 511.49,-440.77\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"511.45,-444.28 521.36,-440.52 511.27,-437.28 511.45,-444.28\"/>\n", + "</g>\n", + "<!-- final_df.avg_3wk_spend->final_df.__append -->\n", + "<g id=\"edge11\" class=\"edge\">\n", + "<title>final_df.avg_3wk_spend->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M462.35,-247.73C506.94,-231.78 565.02,-216.59 618.23,-220\"/>\n", + "</g>\n", + "<!-- config -->\n", + "<g id=\"node13\" class=\"node\">\n", + "<title>config</title>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"71.42,-186 23.42,-186 23.42,-150 77.42,-150 77.42,-180 71.42,-186\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"71.42,-186 71.42,-180\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"77.42,-180 71.42,-180\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-162.2\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">config</text>\n", + "</g>\n", + "<!-- function -->\n", + "<g id=\"node14\" class=\"node\">\n", + "<title>function</title>\n", + "<path 
fill=\"#b4d8e4\" stroke=\"black\" d=\"M72.85,-240.3C72.85,-240.3 28,-240.3 28,-240.3 22,-240.3 16,-234.3 16,-228.3 16,-228.3 16,-215.7 16,-215.7 16,-209.7 22,-203.7 28,-203.7 28,-203.7 72.85,-203.7 72.85,-203.7 78.85,-203.7 84.85,-209.7 84.85,-215.7 84.85,-215.7 84.85,-228.3 84.85,-228.3 84.85,-234.3 78.85,-240.3 72.85,-240.3\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-216.2\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">function</text>\n", + "</g>\n", + "<!-- output -->\n", + "<g id=\"node15\" class=\"node\">\n", + "<title>output</title>\n", + "<path fill=\"#ffc857\" stroke=\"black\" d=\"M67.97,-295.3C67.97,-295.3 32.87,-295.3 32.87,-295.3 26.87,-295.3 20.87,-289.3 20.87,-283.3 20.87,-283.3 20.87,-270.7 20.87,-270.7 20.87,-264.7 26.87,-258.7 32.87,-258.7 32.87,-258.7 67.97,-258.7 67.97,-258.7 73.97,-258.7 79.97,-264.7 79.97,-270.7 79.97,-270.7 79.97,-283.3 79.97,-283.3 79.97,-289.3 73.97,-295.3 67.97,-295.3\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-271.2\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">output</text>\n", + "</g>\n", + "</g>\n", + "</svg>\n" + ], + "text/plain": [ + "<graphviz.graphs.Digraph at 0x7f85ca94ea10>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%cell_to_module with_columns_example --builder my_builder --display --execute output_node\n", + "import polars as pl\n", + "from hamilton.plugins.h_polars import with_columns\n", + "import my_functions\n", + "\n", + "output_columns = [\n", + " \"spend\",\n", + " \"signups\",\n", + " \"avg_3wk_spend\",\n", + " \"spend_per_signup\",\n", + " \"spend_zero_mean_unit_variance\",\n", + "]\n", + "\n", + "def initial_df()->pl.DataFrame:\n", + " return pl.DataFrame(\n", + " { \n", + " \"signups\": pl.Series([1, 10, 50, 100, 200, 400]),\n", + " \"spend\": pl.Series([10, 10, 20, 40, 40, 50])*1e6,\n", + " }\n", + " )\n", + "\n", + "# the with_columns call\n", + "@with_columns(\n", + " *[my_functions],\n", + " 
columns_to_pass=[\"spend\", \"signups\"], # The columns to select from the dataframe\n", + " # select=output_columns, # The columns to append to the dataframe\n", + " # config_required = [\"a\"]\n", + ")\n", + "def final_df(initial_df: pl.DataFrame) -> pl.DataFrame:\n", + " return initial_df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (6, 6)\n", + "┌─────────┬───────┬───────────────┬──────────────────┬─────────────────┬───────────────────────────┐\n", + "│ signups ┆ spend ┆ avg_3wk_spend ┆ spend_per_signup ┆ spend_zero_mean ┆ spend_zero_mean_unit_vari │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ ance │\n", + "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ --- │\n", + "│ ┆ ┆ ┆ ┆ ┆ f64 │\n", + "╞═════════╪═══════╪═══════════════╪══════════════════╪═════════════════╪═══════════════════════════╡\n", + "│ 1 ┆ 1e7 ┆ null ┆ 1e7 ┆ -1.8333e7 ┆ -1.064405 │\n", + "│ 10 ┆ 1e7 ┆ null ┆ 1e6 ┆ -1.8333e7 ┆ -1.064405 │\n", + "│ 50 ┆ 2e7 ┆ 13.333333 ┆ 400000.0 ┆ -8.3333e6 ┆ -0.483821 │\n", + "│ 100 ┆ 4e7 ┆ 23.333333 ┆ 400000.0 ┆ 1.1667e7 ┆ 0.677349 │\n", + "│ 200 ┆ 4e7 ┆ 33.333333 ┆ 200000.0 ┆ 1.1667e7 ┆ 0.677349 │\n", + "│ 400 ┆ 5e7 ┆ 43.333333 ┆ 125000.0 ┆ 2.1667e7 ┆ 1.257934 │\n", + "└─────────┴───────┴───────────────┴──────────────────┴─────────────────┴───────────────────────────┘\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n", + "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n", + " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n", + "<!-- Generated by graphviz version 12.1.2 (20240928.0832)\n", + " -->\n", + "<!-- Pages: 1 -->\n", + "<svg width=\"1323pt\" height=\"521pt\"\n", + " viewBox=\"0.00 0.00 1322.70 520.92\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n", + "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 516.92)\">\n", 
+ "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-516.92 1318.7,-516.92 1318.7,4 -4,4\"/>\n", + "<g id=\"clust1\" class=\"cluster\">\n", + "<title>cluster__legend</title>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"8,-142 8,-326 92.85,-326 92.85,-142 8,-142\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-308.7\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">Legend</text>\n", + "</g>\n", + "<!-- case -->\n", + "<g id=\"node1\" class=\"node\">\n", + "<title>case</title>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"75.3,-50 19.55,-50 19.55,0 81.3,0 81.3,-44 75.3,-50\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"75.3,-50 75.3,-44\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"81.3,-44 75.3,-44\"/>\n", + "<text text-anchor=\"start\" x=\"35.42\" y=\"-33.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">case</text>\n", + "<text text-anchor=\"start\" x=\"27.55\" y=\"-5.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">millions</text>\n", + "</g>\n", + "<!-- final_df.spend -->\n", + "<g id=\"node2\" class=\"node\">\n", + "<title>final_df.spend</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M231.45,-311.8C231.45,-311.8 142.35,-311.8 142.35,-311.8 136.35,-311.8 130.35,-305.8 130.35,-299.8 130.35,-299.8 130.35,-260.2 130.35,-260.2 130.35,-254.2 136.35,-248.2 142.35,-248.2 142.35,-248.2 231.45,-248.2 231.45,-248.2 237.45,-248.2 243.45,-254.2 243.45,-260.2 243.45,-260.2 243.45,-299.8 243.45,-299.8 243.45,-305.8 237.45,-311.8 231.45,-311.8\"/>\n", + "<text text-anchor=\"start\" x=\"141.15\" y=\"-288.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend</text>\n", + "<text text-anchor=\"start\" x=\"167.77\" y=\"-260.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- final_df.spend_std_dev -->\n", + "<g 
id=\"node5\" class=\"node\">\n", + "<title>final_df.spend_std_dev</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M460.18,-393.8C460.18,-393.8 312.57,-393.8 312.57,-393.8 306.57,-393.8 300.57,-387.8 300.57,-381.8 300.57,-381.8 300.57,-342.2 300.57,-342.2 300.57,-336.2 306.57,-330.2 312.57,-330.2 312.57,-330.2 460.18,-330.2 460.18,-330.2 466.18,-330.2 472.18,-336.2 472.18,-342.2 472.18,-342.2 472.18,-381.8 472.18,-381.8 472.18,-387.8 466.18,-393.8 460.18,-393.8\"/>\n", + "<text text-anchor=\"start\" x=\"311.38\" y=\"-370.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_std_dev</text>\n", + "<text text-anchor=\"start\" x=\"373.62\" y=\"-342.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">float</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_std_dev -->\n", + "<g id=\"edge5\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_std_dev</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M243.71,-305.89C255.14,-311.02 267.14,-316.28 278.45,-321 282.2,-322.57 286.04,-324.15 289.92,-325.72\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"288.28,-328.84 298.87,-329.33 290.9,-322.35 288.28,-328.84\"/>\n", + "</g>\n", + "<!-- final_df.spend_per_signup -->\n", + "<g id=\"node7\" class=\"node\">\n", + "<title>final_df.spend_per_signup</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M471.05,-191.8C471.05,-191.8 301.7,-191.8 301.7,-191.8 295.7,-191.8 289.7,-185.8 289.7,-179.8 289.7,-179.8 289.7,-140.2 289.7,-140.2 289.7,-134.2 295.7,-128.2 301.7,-128.2 301.7,-128.2 471.05,-128.2 471.05,-128.2 477.05,-128.2 483.05,-134.2 483.05,-140.2 483.05,-140.2 483.05,-179.8 483.05,-179.8 483.05,-185.8 477.05,-191.8 471.05,-191.8\"/>\n", + "<text text-anchor=\"start\" x=\"300.5\" y=\"-168.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_per_signup</text>\n", + "<text text-anchor=\"start\" x=\"367.25\" 
y=\"-140.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_per_signup -->\n", + "<g id=\"edge6\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_per_signup</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M222.31,-248.04C238.55,-234 258.67,-218.01 278.45,-206 283.34,-203.03 288.47,-200.15 293.7,-197.38\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"295.22,-200.54 302.54,-192.89 292.04,-194.3 295.22,-200.54\"/>\n", + "</g>\n", + "<!-- final_df.__append -->\n", + "<g id=\"node9\" class=\"node\">\n", + "<title>final_df.__append</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M1183.85,-398.8C1183.85,-398.8 1071.5,-398.8 1071.5,-398.8 1065.5,-398.8 1059.5,-392.8 1059.5,-386.8 1059.5,-386.8 1059.5,-347.2 1059.5,-347.2 1059.5,-341.2 1065.5,-335.2 1071.5,-335.2 1071.5,-335.2 1183.85,-335.2 1183.85,-335.2 1189.85,-335.2 1195.85,-341.2 1195.85,-347.2 1195.85,-347.2 1195.85,-386.8 1195.85,-386.8 1195.85,-392.8 1189.85,-398.8 1183.85,-398.8\"/>\n", + "<text text-anchor=\"start\" x=\"1070.3\" y=\"-375.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.__append</text>\n", + "<text text-anchor=\"start\" x=\"1093.55\" y=\"-347.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">DataFrame</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.__append -->\n", + "<g id=\"edge9\" class=\"edge\">\n", + "<title>final_df.spend->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M243.75,-250.87C254.98,-246.12 266.89,-241.84 278.45,-239 425.34,-202.97 467.29,-210.32 618.23,-220\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean -->\n", + "<g id=\"node10\" class=\"node\">\n", + "<title>final_df.spend_zero_mean</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M703.15,-469.8C703.15,-469.8 535.3,-469.8 535.3,-469.8 529.3,-469.8 523.3,-463.8 
523.3,-457.8 523.3,-457.8 523.3,-418.2 523.3,-418.2 523.3,-412.2 529.3,-406.2 535.3,-406.2 535.3,-406.2 703.15,-406.2 703.15,-406.2 709.15,-406.2 715.15,-412.2 715.15,-418.2 715.15,-418.2 715.15,-457.8 715.15,-457.8 715.15,-463.8 709.15,-469.8 703.15,-469.8\"/>\n", + "<text text-anchor=\"start\" x=\"534.1\" y=\"-446.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_zero_mean</text>\n", + "<text text-anchor=\"start\" x=\"600.1\" y=\"-418.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_zero_mean -->\n", + "<g id=\"edge16\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_zero_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M191.41,-312.19C198.5,-359.52 219.28,-446.45 278.45,-485 358.34,-537.05 471.23,-505.6 544.57,-474.74\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"545.91,-477.98 553.71,-470.8 543.14,-471.55 545.91,-477.98\"/>\n", + "</g>\n", + "<!-- final_df.spend_mean -->\n", + "<g id=\"node11\" class=\"node\">\n", + "<title>final_df.spend_mean</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M452.68,-475.8C452.68,-475.8 320.07,-475.8 320.07,-475.8 314.07,-475.8 308.07,-469.8 308.07,-463.8 308.07,-463.8 308.07,-424.2 308.07,-424.2 308.07,-418.2 314.07,-412.2 320.07,-412.2 320.07,-412.2 452.68,-412.2 452.68,-412.2 458.68,-412.2 464.68,-418.2 464.68,-424.2 464.68,-424.2 464.68,-463.8 464.68,-463.8 464.68,-469.8 458.68,-475.8 452.68,-475.8\"/>\n", + "<text text-anchor=\"start\" x=\"318.88\" y=\"-452.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_mean</text>\n", + "<text text-anchor=\"start\" x=\"373.62\" y=\"-424.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">float</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_mean -->\n", + "<g id=\"edge18\" class=\"edge\">\n", + 
"<title>final_df.spend->final_df.spend_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M204.56,-312.04C220.31,-339.53 246.29,-378.49 278.45,-403 284.4,-407.53 290.91,-411.62 297.69,-415.29\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"295.92,-418.32 306.42,-419.7 299.07,-412.07 295.92,-418.32\"/>\n", + "</g>\n", + "<!-- final_df.avg_3wk_spend -->\n", + "<g id=\"node12\" class=\"node\">\n", + "<title>final_df.avg_3wk_spend</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M482.3,-311.8C482.3,-311.8 290.45,-311.8 290.45,-311.8 284.45,-311.8 278.45,-305.8 278.45,-299.8 278.45,-299.8 278.45,-260.2 278.45,-260.2 278.45,-254.2 284.45,-248.2 290.45,-248.2 290.45,-248.2 482.3,-248.2 482.3,-248.2 488.3,-248.2 494.3,-254.2 494.3,-260.2 494.3,-260.2 494.3,-299.8 494.3,-299.8 494.3,-305.8 488.3,-311.8 482.3,-311.8\"/>\n", + "<text text-anchor=\"start\" x=\"289.25\" y=\"-288.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.avg_3wk_spend: case</text>\n", + "<text text-anchor=\"start\" x=\"367.25\" y=\"-260.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.avg_3wk_spend -->\n", + "<g id=\"edge19\" class=\"edge\">\n", + "<title>final_df.spend->final_df.avg_3wk_spend</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M243.7,-280C251.06,-280 258.82,-280 266.76,-280\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"266.57,-283.5 276.57,-280 266.57,-276.5 266.57,-283.5\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean_unit_variance -->\n", + "<g id=\"node3\" class=\"node\">\n", + "<title>final_df.spend_zero_mean_unit_variance</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M1018.5,-398.8C1018.5,-398.8 756.15,-398.8 756.15,-398.8 750.15,-398.8 744.15,-392.8 744.15,-386.8 744.15,-386.8 744.15,-347.2 744.15,-347.2 744.15,-341.2 750.15,-335.2 756.15,-335.2 756.15,-335.2 1018.5,-335.2 
1018.5,-335.2 1024.5,-335.2 1030.5,-341.2 1030.5,-347.2 1030.5,-347.2 1030.5,-386.8 1030.5,-386.8 1030.5,-392.8 1024.5,-398.8 1018.5,-398.8\"/>\n", + "<text text-anchor=\"start\" x=\"754.95\" y=\"-375.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_zero_mean_unit_variance</text>\n", + "<text text-anchor=\"start\" x=\"868.2\" y=\"-347.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean_unit_variance->final_df.__append -->\n", + "<g id=\"edge14\" class=\"edge\">\n", + "<title>final_df.spend_zero_mean_unit_variance->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M1030.76,-367C1036.54,-367 1042.25,-367 1047.83,-367\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1047.55,-370.5 1057.55,-367 1047.55,-363.5 1047.55,-370.5\"/>\n", + "</g>\n", + "<!-- final_df -->\n", + "<g id=\"node4\" class=\"node\">\n", + "<title>final_df</title>\n", + "<path fill=\"#ffc857\" stroke=\"black\" d=\"M1302.7,-398.8C1302.7,-398.8 1236.85,-398.8 1236.85,-398.8 1230.85,-398.8 1224.85,-392.8 1224.85,-386.8 1224.85,-386.8 1224.85,-347.2 1224.85,-347.2 1224.85,-341.2 1230.85,-335.2 1236.85,-335.2 1236.85,-335.2 1302.7,-335.2 1302.7,-335.2 1308.7,-335.2 1314.7,-341.2 1314.7,-347.2 1314.7,-347.2 1314.7,-386.8 1314.7,-386.8 1314.7,-392.8 1308.7,-398.8 1302.7,-398.8\"/>\n", + "<text text-anchor=\"start\" x=\"1245.78\" y=\"-375.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df</text>\n", + "<text text-anchor=\"start\" x=\"1235.65\" y=\"-347.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">DataFrame</text>\n", + "</g>\n", + "<!-- final_df.spend_std_dev->final_df.spend_zero_mean_unit_variance -->\n", + "<g id=\"edge3\" class=\"edge\">\n", + "<title>final_df.spend_std_dev->final_df.spend_zero_mean_unit_variance</title>\n", + "<path fill=\"none\" 
stroke=\"black\" d=\"M472.58,-362.85C543.7,-363.57 646.89,-364.6 732.42,-365.46\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"732.28,-368.96 742.31,-365.56 732.35,-361.96 732.28,-368.96\"/>\n", + "</g>\n", + "<!-- initial_df -->\n", + "<g id=\"node6\" class=\"node\">\n", + "<title>initial_df</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M83.35,-131.8C83.35,-131.8 17.5,-131.8 17.5,-131.8 11.5,-131.8 5.5,-125.8 5.5,-119.8 5.5,-119.8 5.5,-80.2 5.5,-80.2 5.5,-74.2 11.5,-68.2 17.5,-68.2 17.5,-68.2 83.35,-68.2 83.35,-68.2 89.35,-68.2 95.35,-74.2 95.35,-80.2 95.35,-80.2 95.35,-119.8 95.35,-119.8 95.35,-125.8 89.35,-131.8 83.35,-131.8\"/>\n", + "<text text-anchor=\"start\" x=\"22.67\" y=\"-108.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">initial_df</text>\n", + "<text text-anchor=\"start\" x=\"16.3\" y=\"-80.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">DataFrame</text>\n", + "</g>\n", + "<!-- initial_df->final_df.spend -->\n", + "<g id=\"edge1\" class=\"edge\">\n", + "<title>initial_df->final_df.spend</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M89.93,-132.1C91.82,-134.04 93.64,-136.02 95.35,-138 122.03,-168.87 146.53,-208.28 163.3,-237.75\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"160.21,-239.4 168.15,-246.4 166.31,-235.97 160.21,-239.4\"/>\n", + "</g>\n", + "<!-- final_df.signups -->\n", + "<g id=\"node8\" class=\"node\">\n", + "<title>final_df.signups</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M237.45,-131.8C237.45,-131.8 136.35,-131.8 136.35,-131.8 130.35,-131.8 124.35,-125.8 124.35,-119.8 124.35,-119.8 124.35,-80.2 124.35,-80.2 124.35,-74.2 130.35,-68.2 136.35,-68.2 136.35,-68.2 237.45,-68.2 237.45,-68.2 243.45,-68.2 249.45,-74.2 249.45,-80.2 249.45,-80.2 249.45,-119.8 249.45,-119.8 249.45,-125.8 243.45,-131.8 237.45,-131.8\"/>\n", + "<text text-anchor=\"start\" x=\"135.15\" y=\"-108.7\" 
font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.signups</text>\n", + "<text text-anchor=\"start\" x=\"167.77\" y=\"-80.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Series</text>\n", + "</g>\n", + "<!-- initial_df->final_df.signups -->\n", + "<g id=\"edge8\" class=\"edge\">\n", + "<title>initial_df->final_df.signups</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M95.83,-100C101.33,-100 107.06,-100 112.85,-100\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"112.5,-103.5 122.5,-100 112.5,-96.5 112.5,-103.5\"/>\n", + "</g>\n", + "<!-- initial_df->final_df.__append -->\n", + "<g id=\"edge15\" class=\"edge\">\n", + "<title>initial_df->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M620.23,-220C776.95,-229.59 950.98,-292.31 1048.79,-332.66\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1047.2,-335.79 1057.78,-336.4 1049.89,-329.33 1047.2,-335.79\"/>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M387.38,-100C433.3,-114.84 450.42,-98.89 494.3,-119 558.89,-148.61 548.22,-207.83 618.23,-220\"/>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M95.79,-70.39C104.89,-65.69 114.66,-61.54 124.35,-59 237.95,-29.25 273.63,-63.89 385.38,-100\"/>\n", + "</g>\n", + "<!-- final_df.spend_per_signup->final_df.__append -->\n", + "<g id=\"edge12\" class=\"edge\">\n", + "<title>final_df.spend_per_signup->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M478.73,-192.28C520.63,-204.86 571.32,-216.99 618.23,-220\"/>\n", + "</g>\n", + "<!-- final_df.signups->final_df.spend_per_signup -->\n", + "<g id=\"edge7\" class=\"edge\">\n", + "<title>final_df.signups->final_df.spend_per_signup</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M249.84,-118.82C258.96,-121.59 268.57,-124.51 278.29,-127.46\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"277.26,-130.81 287.85,-130.37 279.29,-124.11 277.26,-130.81\"/>\n", + 
"</g>\n", + "<!-- final_df.signups->final_df.__append -->\n", + "<g id=\"edge10\" class=\"edge\">\n", + "<title>final_df.signups->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M249.92,-90.37C289.6,-86.54 341.66,-85.87 385.38,-100\"/>\n", + "</g>\n", + "<!-- final_df.__append->final_df -->\n", + "<g id=\"edge4\" class=\"edge\">\n", + "<title>final_df.__append->final_df</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M1196.26,-367C1202,-367 1207.75,-367 1213.38,-367\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1213.17,-370.5 1223.17,-367 1213.17,-363.5 1213.17,-370.5\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean->final_df.spend_zero_mean_unit_variance -->\n", + "<g id=\"edge2\" class=\"edge\">\n", + "<title>final_df.spend_zero_mean->final_df.spend_zero_mean_unit_variance</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M715.54,-412.57C728.24,-409.18 741.44,-405.66 754.62,-402.14\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"755.46,-405.54 764.22,-399.58 753.65,-398.78 755.46,-405.54\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean->final_df.__append -->\n", + "<g id=\"edge13\" class=\"edge\">\n", + "<title>final_df.spend_zero_mean->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M715.28,-439.84C799.32,-439.41 924.66,-433.68 1030.5,-408 1036.4,-406.57 1042.4,-404.81 1048.37,-402.84\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1049.36,-406.2 1057.63,-399.58 1047.04,-399.6 1049.36,-406.2\"/>\n", + "</g>\n", + "<!-- final_df.spend_mean->final_df.spend_zero_mean -->\n", + "<g id=\"edge17\" class=\"edge\">\n", + "<title>final_df.spend_mean->final_df.spend_zero_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M464.89,-441.99C479.85,-441.6 495.78,-441.18 511.49,-440.77\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"511.45,-444.28 521.36,-440.52 511.27,-437.28 511.45,-444.28\"/>\n", + "</g>\n", + "<!-- 
final_df.avg_3wk_spend->final_df.__append -->\n", + "<g id=\"edge11\" class=\"edge\">\n", + "<title>final_df.avg_3wk_spend->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M462.35,-247.73C506.94,-231.78 565.02,-216.59 618.23,-220\"/>\n", + "</g>\n", + "<!-- config -->\n", + "<g id=\"node13\" class=\"node\">\n", + "<title>config</title>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"71.42,-186 23.42,-186 23.42,-150 77.42,-150 77.42,-180 71.42,-186\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"71.42,-186 71.42,-180\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"77.42,-180 71.42,-180\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-162.2\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">config</text>\n", + "</g>\n", + "<!-- function -->\n", + "<g id=\"node14\" class=\"node\">\n", + "<title>function</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M72.85,-240.3C72.85,-240.3 28,-240.3 28,-240.3 22,-240.3 16,-234.3 16,-228.3 16,-228.3 16,-215.7 16,-215.7 16,-209.7 22,-203.7 28,-203.7 28,-203.7 72.85,-203.7 72.85,-203.7 78.85,-203.7 84.85,-209.7 84.85,-215.7 84.85,-215.7 84.85,-228.3 84.85,-228.3 84.85,-234.3 78.85,-240.3 72.85,-240.3\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-216.2\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">function</text>\n", + "</g>\n", + "<!-- output -->\n", + "<g id=\"node15\" class=\"node\">\n", + "<title>output</title>\n", + "<path fill=\"#ffc857\" stroke=\"black\" d=\"M67.97,-295.3C67.97,-295.3 32.87,-295.3 32.87,-295.3 26.87,-295.3 20.87,-289.3 20.87,-283.3 20.87,-283.3 20.87,-270.7 20.87,-270.7 20.87,-264.7 26.87,-258.7 32.87,-258.7 32.87,-258.7 67.97,-258.7 67.97,-258.7 73.97,-258.7 79.97,-264.7 79.97,-270.7 79.97,-270.7 79.97,-283.3 79.97,-283.3 79.97,-289.3 73.97,-295.3 67.97,-295.3\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-271.2\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">output</text>\n", 
+ "</g>\n", + "</g>\n", + "</svg>\n" + ], + "text/plain": [ + "<graphviz.graphs.Digraph at 0x7f85ca94c0d0>" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import with_columns_example\n", + "dr = driver.Builder().with_modules(my_functions, with_columns_example).with_config({\"case\":\"millions\"}).build()\n", + "print(dr.execute(final_vars=[\"final_df\"])[\"final_df\"])\n", + "dr.visualize_execution(final_vars=[\"final_df\"])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example of using with_columns for Polars LazyFrame\n", + "\n", + "This allows you to efficiently run groups of map operations on a dataframe.\n", + "Here's an example of calling it -- if you've seen `@subdag`, you should be familiar with the concepts." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext hamilton.plugins.jupyter_magic\n", + "from hamilton import driver\n", + "import my_functions_lazy\n", + "\n", + "my_builder_lazy = driver.Builder().with_modules(my_functions_lazy).with_config({\"case\":\"thousands\"})\n", + "output_node = [\"final_df\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n", + "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n", + " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n", + "<!-- Generated by graphviz version 12.1.2 (20240928.0832)\n", + " -->\n", + "<!-- Pages: 1 -->\n", + "<svg width=\"1323pt\" height=\"521pt\"\n", + " viewBox=\"0.00 0.00 1322.70 520.92\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n", + "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 516.92)\">\n", + "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-516.92 1318.7,-516.92 1318.7,4 -4,4\"/>\n", + 
"<g id=\"clust1\" class=\"cluster\">\n", + "<title>cluster__legend</title>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"8,-142 8,-326 92.85,-326 92.85,-142 8,-142\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-308.7\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">Legend</text>\n", + "</g>\n", + "<!-- case -->\n", + "<g id=\"node1\" class=\"node\">\n", + "<title>case</title>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"83.55,-50 11.3,-50 11.3,0 89.55,0 89.55,-44 83.55,-50\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"83.55,-50 83.55,-44\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"89.55,-44 83.55,-44\"/>\n", + "<text text-anchor=\"start\" x=\"35.42\" y=\"-33.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">case</text>\n", + "<text text-anchor=\"start\" x=\"19.3\" y=\"-5.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">thousands</text>\n", + "</g>\n", + "<!-- final_df.spend -->\n", + "<g id=\"node2\" class=\"node\">\n", + "<title>final_df.spend</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M231.45,-311.8C231.45,-311.8 142.35,-311.8 142.35,-311.8 136.35,-311.8 130.35,-305.8 130.35,-299.8 130.35,-299.8 130.35,-260.2 130.35,-260.2 130.35,-254.2 136.35,-248.2 142.35,-248.2 142.35,-248.2 231.45,-248.2 231.45,-248.2 237.45,-248.2 243.45,-254.2 243.45,-260.2 243.45,-260.2 243.45,-299.8 243.45,-299.8 243.45,-305.8 237.45,-311.8 231.45,-311.8\"/>\n", + "<text text-anchor=\"start\" x=\"141.15\" y=\"-288.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend</text>\n", + "<text text-anchor=\"start\" x=\"173.02\" y=\"-260.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Expr</text>\n", + "</g>\n", + "<!-- final_df.spend_std_dev -->\n", + "<g id=\"node5\" class=\"node\">\n", + "<title>final_df.spend_std_dev</title>\n", + "<path fill=\"#b4d8e4\" 
stroke=\"black\" d=\"M460.18,-393.8C460.18,-393.8 312.57,-393.8 312.57,-393.8 306.57,-393.8 300.57,-387.8 300.57,-381.8 300.57,-381.8 300.57,-342.2 300.57,-342.2 300.57,-336.2 306.57,-330.2 312.57,-330.2 312.57,-330.2 460.18,-330.2 460.18,-330.2 466.18,-330.2 472.18,-336.2 472.18,-342.2 472.18,-342.2 472.18,-381.8 472.18,-381.8 472.18,-387.8 466.18,-393.8 460.18,-393.8\"/>\n", + "<text text-anchor=\"start\" x=\"311.38\" y=\"-370.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_std_dev</text>\n", + "<text text-anchor=\"start\" x=\"373.62\" y=\"-342.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">float</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_std_dev -->\n", + "<g id=\"edge5\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_std_dev</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M243.71,-305.89C255.14,-311.02 267.14,-316.28 278.45,-321 282.2,-322.57 286.04,-324.15 289.92,-325.72\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"288.28,-328.84 298.87,-329.33 290.9,-322.35 288.28,-328.84\"/>\n", + "</g>\n", + "<!-- final_df.spend_per_signup -->\n", + "<g id=\"node7\" class=\"node\">\n", + "<title>final_df.spend_per_signup</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M471.05,-191.8C471.05,-191.8 301.7,-191.8 301.7,-191.8 295.7,-191.8 289.7,-185.8 289.7,-179.8 289.7,-179.8 289.7,-140.2 289.7,-140.2 289.7,-134.2 295.7,-128.2 301.7,-128.2 301.7,-128.2 471.05,-128.2 471.05,-128.2 477.05,-128.2 483.05,-134.2 483.05,-140.2 483.05,-140.2 483.05,-179.8 483.05,-179.8 483.05,-185.8 477.05,-191.8 471.05,-191.8\"/>\n", + "<text text-anchor=\"start\" x=\"300.5\" y=\"-168.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_per_signup</text>\n", + "<text text-anchor=\"start\" x=\"372.5\" y=\"-140.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Expr</text>\n", 
+ "</g>\n", + "<!-- final_df.spend->final_df.spend_per_signup -->\n", + "<g id=\"edge6\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_per_signup</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M222.31,-248.04C238.55,-234 258.67,-218.01 278.45,-206 283.34,-203.03 288.47,-200.15 293.7,-197.38\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"295.22,-200.54 302.54,-192.89 292.04,-194.3 295.22,-200.54\"/>\n", + "</g>\n", + "<!-- final_df.__append -->\n", + "<g id=\"node9\" class=\"node\">\n", + "<title>final_df.__append</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M1183.85,-398.8C1183.85,-398.8 1071.5,-398.8 1071.5,-398.8 1065.5,-398.8 1059.5,-392.8 1059.5,-386.8 1059.5,-386.8 1059.5,-347.2 1059.5,-347.2 1059.5,-341.2 1065.5,-335.2 1071.5,-335.2 1071.5,-335.2 1183.85,-335.2 1183.85,-335.2 1189.85,-335.2 1195.85,-341.2 1195.85,-347.2 1195.85,-347.2 1195.85,-386.8 1195.85,-386.8 1195.85,-392.8 1189.85,-398.8 1183.85,-398.8\"/>\n", + "<text text-anchor=\"start\" x=\"1070.3\" y=\"-375.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.__append</text>\n", + "<text text-anchor=\"start\" x=\"1093.55\" y=\"-347.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">LazyFrame</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.__append -->\n", + "<g id=\"edge9\" class=\"edge\">\n", + "<title>final_df.spend->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M243.75,-250.87C254.98,-246.12 266.89,-241.84 278.45,-239 425.34,-202.97 467.29,-210.32 618.23,-220\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean -->\n", + "<g id=\"node10\" class=\"node\">\n", + "<title>final_df.spend_zero_mean</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M703.15,-469.8C703.15,-469.8 535.3,-469.8 535.3,-469.8 529.3,-469.8 523.3,-463.8 523.3,-457.8 523.3,-457.8 523.3,-418.2 523.3,-418.2 523.3,-412.2 529.3,-406.2 535.3,-406.2 535.3,-406.2 
703.15,-406.2 703.15,-406.2 709.15,-406.2 715.15,-412.2 715.15,-418.2 715.15,-418.2 715.15,-457.8 715.15,-457.8 715.15,-463.8 709.15,-469.8 703.15,-469.8\"/>\n", + "<text text-anchor=\"start\" x=\"534.1\" y=\"-446.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_zero_mean</text>\n", + "<text text-anchor=\"start\" x=\"605.35\" y=\"-418.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Expr</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_zero_mean -->\n", + "<g id=\"edge16\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_zero_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M191.41,-312.19C198.5,-359.52 219.28,-446.45 278.45,-485 358.34,-537.05 471.23,-505.6 544.57,-474.74\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"545.91,-477.98 553.71,-470.8 543.14,-471.55 545.91,-477.98\"/>\n", + "</g>\n", + "<!-- final_df.spend_mean -->\n", + "<g id=\"node11\" class=\"node\">\n", + "<title>final_df.spend_mean</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M452.68,-475.8C452.68,-475.8 320.07,-475.8 320.07,-475.8 314.07,-475.8 308.07,-469.8 308.07,-463.8 308.07,-463.8 308.07,-424.2 308.07,-424.2 308.07,-418.2 314.07,-412.2 320.07,-412.2 320.07,-412.2 452.68,-412.2 452.68,-412.2 458.68,-412.2 464.68,-418.2 464.68,-424.2 464.68,-424.2 464.68,-463.8 464.68,-463.8 464.68,-469.8 458.68,-475.8 452.68,-475.8\"/>\n", + "<text text-anchor=\"start\" x=\"318.88\" y=\"-452.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_mean</text>\n", + "<text text-anchor=\"start\" x=\"373.62\" y=\"-424.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">float</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_mean -->\n", + "<g id=\"edge18\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" 
d=\"M204.56,-312.04C220.31,-339.53 246.29,-378.49 278.45,-403 284.4,-407.53 290.91,-411.62 297.69,-415.29\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"295.92,-418.32 306.42,-419.7 299.07,-412.07 295.92,-418.32\"/>\n", + "</g>\n", + "<!-- final_df.avg_3wk_spend -->\n", + "<g id=\"node12\" class=\"node\">\n", + "<title>final_df.avg_3wk_spend</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M482.3,-311.8C482.3,-311.8 290.45,-311.8 290.45,-311.8 284.45,-311.8 278.45,-305.8 278.45,-299.8 278.45,-299.8 278.45,-260.2 278.45,-260.2 278.45,-254.2 284.45,-248.2 290.45,-248.2 290.45,-248.2 482.3,-248.2 482.3,-248.2 488.3,-248.2 494.3,-254.2 494.3,-260.2 494.3,-260.2 494.3,-299.8 494.3,-299.8 494.3,-305.8 488.3,-311.8 482.3,-311.8\"/>\n", + "<text text-anchor=\"start\" x=\"289.25\" y=\"-288.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.avg_3wk_spend: case</text>\n", + "<text text-anchor=\"start\" x=\"372.5\" y=\"-260.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Expr</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.avg_3wk_spend -->\n", + "<g id=\"edge19\" class=\"edge\">\n", + "<title>final_df.spend->final_df.avg_3wk_spend</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M243.7,-280C251.06,-280 258.82,-280 266.76,-280\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"266.57,-283.5 276.57,-280 266.57,-276.5 266.57,-283.5\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean_unit_variance -->\n", + "<g id=\"node3\" class=\"node\">\n", + "<title>final_df.spend_zero_mean_unit_variance</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M1018.5,-398.8C1018.5,-398.8 756.15,-398.8 756.15,-398.8 750.15,-398.8 744.15,-392.8 744.15,-386.8 744.15,-386.8 744.15,-347.2 744.15,-347.2 744.15,-341.2 750.15,-335.2 756.15,-335.2 756.15,-335.2 1018.5,-335.2 1018.5,-335.2 1024.5,-335.2 1030.5,-341.2 1030.5,-347.2 1030.5,-347.2 1030.5,-386.8 1030.5,-386.8 
1030.5,-392.8 1024.5,-398.8 1018.5,-398.8\"/>\n", + "<text text-anchor=\"start\" x=\"754.95\" y=\"-375.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_zero_mean_unit_variance</text>\n", + "<text text-anchor=\"start\" x=\"873.45\" y=\"-347.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Expr</text>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean_unit_variance->final_df.__append -->\n", + "<g id=\"edge14\" class=\"edge\">\n", + "<title>final_df.spend_zero_mean_unit_variance->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M1030.76,-367C1036.54,-367 1042.25,-367 1047.83,-367\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1047.55,-370.5 1057.55,-367 1047.55,-363.5 1047.55,-370.5\"/>\n", + "</g>\n", + "<!-- final_df -->\n", + "<g id=\"node4\" class=\"node\">\n", + "<title>final_df</title>\n", + "<path fill=\"#ffc857\" stroke=\"black\" d=\"M1302.7,-398.8C1302.7,-398.8 1236.85,-398.8 1236.85,-398.8 1230.85,-398.8 1224.85,-392.8 1224.85,-386.8 1224.85,-386.8 1224.85,-347.2 1224.85,-347.2 1224.85,-341.2 1230.85,-335.2 1236.85,-335.2 1236.85,-335.2 1302.7,-335.2 1302.7,-335.2 1308.7,-335.2 1314.7,-341.2 1314.7,-347.2 1314.7,-347.2 1314.7,-386.8 1314.7,-386.8 1314.7,-392.8 1308.7,-398.8 1302.7,-398.8\"/>\n", + "<text text-anchor=\"start\" x=\"1245.78\" y=\"-375.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df</text>\n", + "<text text-anchor=\"start\" x=\"1235.65\" y=\"-347.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">LazyFrame</text>\n", + "</g>\n", + "<!-- final_df.spend_std_dev->final_df.spend_zero_mean_unit_variance -->\n", + "<g id=\"edge3\" class=\"edge\">\n", + "<title>final_df.spend_std_dev->final_df.spend_zero_mean_unit_variance</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M472.58,-362.85C543.7,-363.57 646.89,-364.6 732.42,-365.46\"/>\n", + "<polygon 
fill=\"black\" stroke=\"black\" points=\"732.28,-368.96 742.31,-365.56 732.35,-361.96 732.28,-368.96\"/>\n", + "</g>\n", + "<!-- initial_df -->\n", + "<g id=\"node6\" class=\"node\">\n", + "<title>initial_df</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M83.35,-131.8C83.35,-131.8 17.5,-131.8 17.5,-131.8 11.5,-131.8 5.5,-125.8 5.5,-119.8 5.5,-119.8 5.5,-80.2 5.5,-80.2 5.5,-74.2 11.5,-68.2 17.5,-68.2 17.5,-68.2 83.35,-68.2 83.35,-68.2 89.35,-68.2 95.35,-74.2 95.35,-80.2 95.35,-80.2 95.35,-119.8 95.35,-119.8 95.35,-125.8 89.35,-131.8 83.35,-131.8\"/>\n", + "<text text-anchor=\"start\" x=\"22.67\" y=\"-108.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">initial_df</text>\n", + "<text text-anchor=\"start\" x=\"16.3\" y=\"-80.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">LazyFrame</text>\n", + "</g>\n", + "<!-- initial_df->final_df.spend -->\n", + "<g id=\"edge1\" class=\"edge\">\n", + "<title>initial_df->final_df.spend</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M89.93,-132.1C91.82,-134.04 93.64,-136.02 95.35,-138 122.03,-168.87 146.53,-208.28 163.3,-237.75\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"160.21,-239.4 168.15,-246.4 166.31,-235.97 160.21,-239.4\"/>\n", + "</g>\n", + "<!-- final_df.signups -->\n", + "<g id=\"node8\" class=\"node\">\n", + "<title>final_df.signups</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M237.45,-131.8C237.45,-131.8 136.35,-131.8 136.35,-131.8 130.35,-131.8 124.35,-125.8 124.35,-119.8 124.35,-119.8 124.35,-80.2 124.35,-80.2 124.35,-74.2 130.35,-68.2 136.35,-68.2 136.35,-68.2 237.45,-68.2 237.45,-68.2 243.45,-68.2 249.45,-74.2 249.45,-80.2 249.45,-80.2 249.45,-119.8 249.45,-119.8 249.45,-125.8 243.45,-131.8 237.45,-131.8\"/>\n", + "<text text-anchor=\"start\" x=\"135.15\" y=\"-108.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.signups</text>\n", + "<text 
text-anchor=\"start\" x=\"173.02\" y=\"-80.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Expr</text>\n", + "</g>\n", + "<!-- initial_df->final_df.signups -->\n", + "<g id=\"edge8\" class=\"edge\">\n", + "<title>initial_df->final_df.signups</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M95.83,-100C101.33,-100 107.06,-100 112.85,-100\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"112.5,-103.5 122.5,-100 112.5,-96.5 112.5,-103.5\"/>\n", + "</g>\n", + "<!-- initial_df->final_df.__append -->\n", + "<g id=\"edge15\" class=\"edge\">\n", + "<title>initial_df->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M620.23,-220C776.95,-229.59 950.98,-292.31 1048.79,-332.66\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1047.2,-335.79 1057.78,-336.4 1049.89,-329.33 1047.2,-335.79\"/>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M387.38,-100C433.3,-114.84 450.42,-98.89 494.3,-119 558.89,-148.61 548.22,-207.83 618.23,-220\"/>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M95.79,-70.39C104.89,-65.69 114.66,-61.54 124.35,-59 237.95,-29.25 273.63,-63.89 385.38,-100\"/>\n", + "</g>\n", + "<!-- final_df.spend_per_signup->final_df.__append -->\n", + "<g id=\"edge12\" class=\"edge\">\n", + "<title>final_df.spend_per_signup->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M478.73,-192.28C520.63,-204.86 571.32,-216.99 618.23,-220\"/>\n", + "</g>\n", + "<!-- final_df.signups->final_df.spend_per_signup -->\n", + "<g id=\"edge7\" class=\"edge\">\n", + "<title>final_df.signups->final_df.spend_per_signup</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M249.84,-118.82C258.96,-121.59 268.57,-124.51 278.29,-127.46\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"277.26,-130.81 287.85,-130.37 279.29,-124.11 277.26,-130.81\"/>\n", + "</g>\n", + "<!-- final_df.signups->final_df.__append -->\n", + "<g id=\"edge10\" class=\"edge\">\n", + 
"<title>final_df.signups->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M249.92,-90.37C289.6,-86.54 341.66,-85.87 385.38,-100\"/>\n", + "</g>\n", + "<!-- final_df.__append->final_df -->\n", + "<g id=\"edge4\" class=\"edge\">\n", + "<title>final_df.__append->final_df</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M1196.26,-367C1202,-367 1207.75,-367 1213.38,-367\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1213.17,-370.5 1223.17,-367 1213.17,-363.5 1213.17,-370.5\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean->final_df.spend_zero_mean_unit_variance -->\n", + "<g id=\"edge2\" class=\"edge\">\n", + "<title>final_df.spend_zero_mean->final_df.spend_zero_mean_unit_variance</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M715.54,-412.57C728.24,-409.18 741.44,-405.66 754.62,-402.14\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"755.46,-405.54 764.22,-399.58 753.65,-398.78 755.46,-405.54\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean->final_df.__append -->\n", + "<g id=\"edge13\" class=\"edge\">\n", + "<title>final_df.spend_zero_mean->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M715.28,-439.84C799.32,-439.41 924.66,-433.68 1030.5,-408 1036.4,-406.57 1042.4,-404.81 1048.37,-402.84\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1049.36,-406.2 1057.63,-399.58 1047.04,-399.6 1049.36,-406.2\"/>\n", + "</g>\n", + "<!-- final_df.spend_mean->final_df.spend_zero_mean -->\n", + "<g id=\"edge17\" class=\"edge\">\n", + "<title>final_df.spend_mean->final_df.spend_zero_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M464.89,-441.99C479.85,-441.6 495.78,-441.18 511.49,-440.77\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"511.45,-444.28 521.36,-440.52 511.27,-437.28 511.45,-444.28\"/>\n", + "</g>\n", + "<!-- final_df.avg_3wk_spend->final_df.__append -->\n", + "<g id=\"edge11\" class=\"edge\">\n", + 
"<title>final_df.avg_3wk_spend->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M462.35,-247.73C506.94,-231.78 565.02,-216.59 618.23,-220\"/>\n", + "</g>\n", + "<!-- config -->\n", + "<g id=\"node13\" class=\"node\">\n", + "<title>config</title>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"71.42,-186 23.42,-186 23.42,-150 77.42,-150 77.42,-180 71.42,-186\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"71.42,-186 71.42,-180\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"77.42,-180 71.42,-180\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-162.2\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">config</text>\n", + "</g>\n", + "<!-- function -->\n", + "<g id=\"node14\" class=\"node\">\n", + "<title>function</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M72.85,-240.3C72.85,-240.3 28,-240.3 28,-240.3 22,-240.3 16,-234.3 16,-228.3 16,-228.3 16,-215.7 16,-215.7 16,-209.7 22,-203.7 28,-203.7 28,-203.7 72.85,-203.7 72.85,-203.7 78.85,-203.7 84.85,-209.7 84.85,-215.7 84.85,-215.7 84.85,-228.3 84.85,-228.3 84.85,-234.3 78.85,-240.3 72.85,-240.3\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-216.2\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">function</text>\n", + "</g>\n", + "<!-- output -->\n", + "<g id=\"node15\" class=\"node\">\n", + "<title>output</title>\n", + "<path fill=\"#ffc857\" stroke=\"black\" d=\"M67.97,-295.3C67.97,-295.3 32.87,-295.3 32.87,-295.3 26.87,-295.3 20.87,-289.3 20.87,-283.3 20.87,-283.3 20.87,-270.7 20.87,-270.7 20.87,-264.7 26.87,-258.7 32.87,-258.7 32.87,-258.7 67.97,-258.7 67.97,-258.7 73.97,-258.7 79.97,-264.7 79.97,-270.7 79.97,-270.7 79.97,-283.3 79.97,-283.3 79.97,-289.3 73.97,-295.3 67.97,-295.3\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-271.2\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">output</text>\n", + "</g>\n", + "</g>\n", + "</svg>\n" + ], + "text/plain": [ + "<graphviz.graphs.Digraph at 
0x7f85ca923cd0>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%cell_to_module with_columns_lazy_example --builder my_builder_lazy --display --execute output_node\n", + "import polars as pl\n", + "from hamilton.plugins.h_polars_lazyframe import with_columns\n", + "import my_functions_lazy\n", + "\n", + "output_columns = [\n", + " \"spend\",\n", + " \"signups\",\n", + " \"avg_3wk_spend\",\n", + " \"spend_per_signup\",\n", + " \"spend_zero_mean_unit_variance\",\n", + "]\n", + "\n", + "def initial_df()->pl.LazyFrame:\n", + " return pl.DataFrame(\n", + " { \n", + " \"signups\": pl.Series([1, 10, 50, 100, 200, 400]),\n", + " \"spend\": pl.Series([10, 10, 20, 40, 40, 50])*1e6,\n", + " }\n", + " ).lazy()\n", + "\n", + "# the with_columns call\n", + "@with_columns(\n", + " *[my_functions_lazy],\n", + " columns_to_pass=[\"spend\", \"signups\"], # The columns to select from the dataframe\n", + " # select=output_columns, # The columns to append to the dataframe\n", + " # config_required = [\"a\"]\n", + ")\n", + "def final_df(initial_df: pl.LazyFrame) -> pl.LazyFrame:\n", + " return initial_df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (6, 6)\n", + "┌─────────┬───────┬───────────────┬──────────────────┬─────────────────┬───────────────────────────┐\n", + "│ signups ┆ spend ┆ avg_3wk_spend ┆ spend_per_signup ┆ spend_zero_mean ┆ spend_zero_mean_unit_vari │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ ance │\n", + "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ --- │\n", + "│ ┆ ┆ ┆ ┆ ┆ f64 │\n", + "╞═════════╪═══════╪═══════════════╪══════════════════╪═════════════════╪═══════════════════════════╡\n", + "│ 1 ┆ 1e7 ┆ null ┆ 1e7 ┆ -1.8333e7 ┆ -1.064405 │\n", + "│ 10 ┆ 1e7 ┆ null ┆ 1e6 ┆ -1.8333e7 ┆ -1.064405 │\n", + "│ 50 ┆ 2e7 ┆ 13.333333 ┆ 400000.0 ┆ -8.3333e6 ┆ -0.483821 │\n", + "│ 100 ┆ 4e7 ┆ 23.333333 ┆ 400000.0 ┆ 1.1667e7 ┆ 0.677349 
│\n", + "│ 200 ┆ 4e7 ┆ 33.333333 ┆ 200000.0 ┆ 1.1667e7 ┆ 0.677349 │\n", + "│ 400 ┆ 5e7 ┆ 43.333333 ┆ 125000.0 ┆ 2.1667e7 ┆ 1.257934 │\n", + "└─────────┴───────┴───────────────┴──────────────────┴─────────────────┴───────────────────────────┘\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n", + "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n", + " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n", + "<!-- Generated by graphviz version 12.1.2 (20240928.0832)\n", + " -->\n", + "<!-- Pages: 1 -->\n", + "<svg width=\"1323pt\" height=\"521pt\"\n", + " viewBox=\"0.00 0.00 1322.70 520.92\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n", + "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 516.92)\">\n", + "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-516.92 1318.7,-516.92 1318.7,4 -4,4\"/>\n", + "<g id=\"clust1\" class=\"cluster\">\n", + "<title>cluster__legend</title>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"8,-142 8,-326 92.85,-326 92.85,-142 8,-142\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-308.7\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">Legend</text>\n", + "</g>\n", + "<!-- case -->\n", + "<g id=\"node1\" class=\"node\">\n", + "<title>case</title>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"75.3,-50 19.55,-50 19.55,0 81.3,0 81.3,-44 75.3,-50\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"75.3,-50 75.3,-44\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"81.3,-44 75.3,-44\"/>\n", + "<text text-anchor=\"start\" x=\"35.42\" y=\"-33.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">case</text>\n", + "<text text-anchor=\"start\" x=\"27.55\" y=\"-5.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">millions</text>\n", + "</g>\n", + "<!-- final_df.spend -->\n", + "<g 
id=\"node2\" class=\"node\">\n", + "<title>final_df.spend</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M231.45,-311.8C231.45,-311.8 142.35,-311.8 142.35,-311.8 136.35,-311.8 130.35,-305.8 130.35,-299.8 130.35,-299.8 130.35,-260.2 130.35,-260.2 130.35,-254.2 136.35,-248.2 142.35,-248.2 142.35,-248.2 231.45,-248.2 231.45,-248.2 237.45,-248.2 243.45,-254.2 243.45,-260.2 243.45,-260.2 243.45,-299.8 243.45,-299.8 243.45,-305.8 237.45,-311.8 231.45,-311.8\"/>\n", + "<text text-anchor=\"start\" x=\"141.15\" y=\"-288.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend</text>\n", + "<text text-anchor=\"start\" x=\"173.02\" y=\"-260.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Expr</text>\n", + "</g>\n", + "<!-- final_df.spend_std_dev -->\n", + "<g id=\"node5\" class=\"node\">\n", + "<title>final_df.spend_std_dev</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M460.18,-393.8C460.18,-393.8 312.57,-393.8 312.57,-393.8 306.57,-393.8 300.57,-387.8 300.57,-381.8 300.57,-381.8 300.57,-342.2 300.57,-342.2 300.57,-336.2 306.57,-330.2 312.57,-330.2 312.57,-330.2 460.18,-330.2 460.18,-330.2 466.18,-330.2 472.18,-336.2 472.18,-342.2 472.18,-342.2 472.18,-381.8 472.18,-381.8 472.18,-387.8 466.18,-393.8 460.18,-393.8\"/>\n", + "<text text-anchor=\"start\" x=\"311.38\" y=\"-370.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_std_dev</text>\n", + "<text text-anchor=\"start\" x=\"373.62\" y=\"-342.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">float</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_std_dev -->\n", + "<g id=\"edge5\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_std_dev</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M243.71,-305.89C255.14,-311.02 267.14,-316.28 278.45,-321 282.2,-322.57 286.04,-324.15 289.92,-325.72\"/>\n", + "<polygon fill=\"black\" 
stroke=\"black\" points=\"288.28,-328.84 298.87,-329.33 290.9,-322.35 288.28,-328.84\"/>\n", + "</g>\n", + "<!-- final_df.spend_per_signup -->\n", + "<g id=\"node7\" class=\"node\">\n", + "<title>final_df.spend_per_signup</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M471.05,-191.8C471.05,-191.8 301.7,-191.8 301.7,-191.8 295.7,-191.8 289.7,-185.8 289.7,-179.8 289.7,-179.8 289.7,-140.2 289.7,-140.2 289.7,-134.2 295.7,-128.2 301.7,-128.2 301.7,-128.2 471.05,-128.2 471.05,-128.2 477.05,-128.2 483.05,-134.2 483.05,-140.2 483.05,-140.2 483.05,-179.8 483.05,-179.8 483.05,-185.8 477.05,-191.8 471.05,-191.8\"/>\n", + "<text text-anchor=\"start\" x=\"300.5\" y=\"-168.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_per_signup</text>\n", + "<text text-anchor=\"start\" x=\"372.5\" y=\"-140.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Expr</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_per_signup -->\n", + "<g id=\"edge6\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_per_signup</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M222.31,-248.04C238.55,-234 258.67,-218.01 278.45,-206 283.34,-203.03 288.47,-200.15 293.7,-197.38\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"295.22,-200.54 302.54,-192.89 292.04,-194.3 295.22,-200.54\"/>\n", + "</g>\n", + "<!-- final_df.__append -->\n", + "<g id=\"node9\" class=\"node\">\n", + "<title>final_df.__append</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M1183.85,-398.8C1183.85,-398.8 1071.5,-398.8 1071.5,-398.8 1065.5,-398.8 1059.5,-392.8 1059.5,-386.8 1059.5,-386.8 1059.5,-347.2 1059.5,-347.2 1059.5,-341.2 1065.5,-335.2 1071.5,-335.2 1071.5,-335.2 1183.85,-335.2 1183.85,-335.2 1189.85,-335.2 1195.85,-341.2 1195.85,-347.2 1195.85,-347.2 1195.85,-386.8 1195.85,-386.8 1195.85,-392.8 1189.85,-398.8 1183.85,-398.8\"/>\n", + "<text text-anchor=\"start\" x=\"1070.3\" y=\"-375.7\" 
font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.__append</text>\n", + "<text text-anchor=\"start\" x=\"1093.55\" y=\"-347.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">LazyFrame</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.__append -->\n", + "<g id=\"edge9\" class=\"edge\">\n", + "<title>final_df.spend->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M243.75,-250.87C254.98,-246.12 266.89,-241.84 278.45,-239 425.34,-202.97 467.29,-210.32 618.23,-220\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean -->\n", + "<g id=\"node10\" class=\"node\">\n", + "<title>final_df.spend_zero_mean</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M703.15,-469.8C703.15,-469.8 535.3,-469.8 535.3,-469.8 529.3,-469.8 523.3,-463.8 523.3,-457.8 523.3,-457.8 523.3,-418.2 523.3,-418.2 523.3,-412.2 529.3,-406.2 535.3,-406.2 535.3,-406.2 703.15,-406.2 703.15,-406.2 709.15,-406.2 715.15,-412.2 715.15,-418.2 715.15,-418.2 715.15,-457.8 715.15,-457.8 715.15,-463.8 709.15,-469.8 703.15,-469.8\"/>\n", + "<text text-anchor=\"start\" x=\"534.1\" y=\"-446.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_zero_mean</text>\n", + "<text text-anchor=\"start\" x=\"605.35\" y=\"-418.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Expr</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_zero_mean -->\n", + "<g id=\"edge16\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_zero_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M191.41,-312.19C198.5,-359.52 219.28,-446.45 278.45,-485 358.34,-537.05 471.23,-505.6 544.57,-474.74\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"545.91,-477.98 553.71,-470.8 543.14,-471.55 545.91,-477.98\"/>\n", + "</g>\n", + "<!-- final_df.spend_mean -->\n", + "<g id=\"node11\" class=\"node\">\n", + 
"<title>final_df.spend_mean</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M452.68,-475.8C452.68,-475.8 320.07,-475.8 320.07,-475.8 314.07,-475.8 308.07,-469.8 308.07,-463.8 308.07,-463.8 308.07,-424.2 308.07,-424.2 308.07,-418.2 314.07,-412.2 320.07,-412.2 320.07,-412.2 452.68,-412.2 452.68,-412.2 458.68,-412.2 464.68,-418.2 464.68,-424.2 464.68,-424.2 464.68,-463.8 464.68,-463.8 464.68,-469.8 458.68,-475.8 452.68,-475.8\"/>\n", + "<text text-anchor=\"start\" x=\"318.88\" y=\"-452.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_mean</text>\n", + "<text text-anchor=\"start\" x=\"373.62\" y=\"-424.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">float</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.spend_mean -->\n", + "<g id=\"edge18\" class=\"edge\">\n", + "<title>final_df.spend->final_df.spend_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M204.56,-312.04C220.31,-339.53 246.29,-378.49 278.45,-403 284.4,-407.53 290.91,-411.62 297.69,-415.29\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"295.92,-418.32 306.42,-419.7 299.07,-412.07 295.92,-418.32\"/>\n", + "</g>\n", + "<!-- final_df.avg_3wk_spend -->\n", + "<g id=\"node12\" class=\"node\">\n", + "<title>final_df.avg_3wk_spend</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M482.3,-311.8C482.3,-311.8 290.45,-311.8 290.45,-311.8 284.45,-311.8 278.45,-305.8 278.45,-299.8 278.45,-299.8 278.45,-260.2 278.45,-260.2 278.45,-254.2 284.45,-248.2 290.45,-248.2 290.45,-248.2 482.3,-248.2 482.3,-248.2 488.3,-248.2 494.3,-254.2 494.3,-260.2 494.3,-260.2 494.3,-299.8 494.3,-299.8 494.3,-305.8 488.3,-311.8 482.3,-311.8\"/>\n", + "<text text-anchor=\"start\" x=\"289.25\" y=\"-288.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.avg_3wk_spend: case</text>\n", + "<text text-anchor=\"start\" x=\"372.5\" y=\"-260.7\" font-family=\"Helvetica,sans-Serif\" 
font-style=\"italic\" font-size=\"14.00\">Expr</text>\n", + "</g>\n", + "<!-- final_df.spend->final_df.avg_3wk_spend -->\n", + "<g id=\"edge19\" class=\"edge\">\n", + "<title>final_df.spend->final_df.avg_3wk_spend</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M243.7,-280C251.06,-280 258.82,-280 266.76,-280\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"266.57,-283.5 276.57,-280 266.57,-276.5 266.57,-283.5\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean_unit_variance -->\n", + "<g id=\"node3\" class=\"node\">\n", + "<title>final_df.spend_zero_mean_unit_variance</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M1018.5,-398.8C1018.5,-398.8 756.15,-398.8 756.15,-398.8 750.15,-398.8 744.15,-392.8 744.15,-386.8 744.15,-386.8 744.15,-347.2 744.15,-347.2 744.15,-341.2 750.15,-335.2 756.15,-335.2 756.15,-335.2 1018.5,-335.2 1018.5,-335.2 1024.5,-335.2 1030.5,-341.2 1030.5,-347.2 1030.5,-347.2 1030.5,-386.8 1030.5,-386.8 1030.5,-392.8 1024.5,-398.8 1018.5,-398.8\"/>\n", + "<text text-anchor=\"start\" x=\"754.95\" y=\"-375.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.spend_zero_mean_unit_variance</text>\n", + "<text text-anchor=\"start\" x=\"873.45\" y=\"-347.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Expr</text>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean_unit_variance->final_df.__append -->\n", + "<g id=\"edge14\" class=\"edge\">\n", + "<title>final_df.spend_zero_mean_unit_variance->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M1030.76,-367C1036.54,-367 1042.25,-367 1047.83,-367\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1047.55,-370.5 1057.55,-367 1047.55,-363.5 1047.55,-370.5\"/>\n", + "</g>\n", + "<!-- final_df -->\n", + "<g id=\"node4\" class=\"node\">\n", + "<title>final_df</title>\n", + "<path fill=\"#ffc857\" stroke=\"black\" d=\"M1302.7,-398.8C1302.7,-398.8 1236.85,-398.8 1236.85,-398.8 
1230.85,-398.8 1224.85,-392.8 1224.85,-386.8 1224.85,-386.8 1224.85,-347.2 1224.85,-347.2 1224.85,-341.2 1230.85,-335.2 1236.85,-335.2 1236.85,-335.2 1302.7,-335.2 1302.7,-335.2 1308.7,-335.2 1314.7,-341.2 1314.7,-347.2 1314.7,-347.2 1314.7,-386.8 1314.7,-386.8 1314.7,-392.8 1308.7,-398.8 1302.7,-398.8\"/>\n", + "<text text-anchor=\"start\" x=\"1245.78\" y=\"-375.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df</text>\n", + "<text text-anchor=\"start\" x=\"1235.65\" y=\"-347.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">LazyFrame</text>\n", + "</g>\n", + "<!-- final_df.spend_std_dev->final_df.spend_zero_mean_unit_variance -->\n", + "<g id=\"edge3\" class=\"edge\">\n", + "<title>final_df.spend_std_dev->final_df.spend_zero_mean_unit_variance</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M472.58,-362.85C543.7,-363.57 646.89,-364.6 732.42,-365.46\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"732.28,-368.96 742.31,-365.56 732.35,-361.96 732.28,-368.96\"/>\n", + "</g>\n", + "<!-- initial_df -->\n", + "<g id=\"node6\" class=\"node\">\n", + "<title>initial_df</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M83.35,-131.8C83.35,-131.8 17.5,-131.8 17.5,-131.8 11.5,-131.8 5.5,-125.8 5.5,-119.8 5.5,-119.8 5.5,-80.2 5.5,-80.2 5.5,-74.2 11.5,-68.2 17.5,-68.2 17.5,-68.2 83.35,-68.2 83.35,-68.2 89.35,-68.2 95.35,-74.2 95.35,-80.2 95.35,-80.2 95.35,-119.8 95.35,-119.8 95.35,-125.8 89.35,-131.8 83.35,-131.8\"/>\n", + "<text text-anchor=\"start\" x=\"22.67\" y=\"-108.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">initial_df</text>\n", + "<text text-anchor=\"start\" x=\"16.3\" y=\"-80.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">LazyFrame</text>\n", + "</g>\n", + "<!-- initial_df->final_df.spend -->\n", + "<g id=\"edge1\" class=\"edge\">\n", + "<title>initial_df->final_df.spend</title>\n", + "<path 
fill=\"none\" stroke=\"black\" d=\"M89.93,-132.1C91.82,-134.04 93.64,-136.02 95.35,-138 122.03,-168.87 146.53,-208.28 163.3,-237.75\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"160.21,-239.4 168.15,-246.4 166.31,-235.97 160.21,-239.4\"/>\n", + "</g>\n", + "<!-- final_df.signups -->\n", + "<g id=\"node8\" class=\"node\">\n", + "<title>final_df.signups</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M237.45,-131.8C237.45,-131.8 136.35,-131.8 136.35,-131.8 130.35,-131.8 124.35,-125.8 124.35,-119.8 124.35,-119.8 124.35,-80.2 124.35,-80.2 124.35,-74.2 130.35,-68.2 136.35,-68.2 136.35,-68.2 237.45,-68.2 237.45,-68.2 243.45,-68.2 249.45,-74.2 249.45,-80.2 249.45,-80.2 249.45,-119.8 249.45,-119.8 249.45,-125.8 243.45,-131.8 237.45,-131.8\"/>\n", + "<text text-anchor=\"start\" x=\"135.15\" y=\"-108.7\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">final_df.signups</text>\n", + "<text text-anchor=\"start\" x=\"173.02\" y=\"-80.7\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Expr</text>\n", + "</g>\n", + "<!-- initial_df->final_df.signups -->\n", + "<g id=\"edge8\" class=\"edge\">\n", + "<title>initial_df->final_df.signups</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M95.83,-100C101.33,-100 107.06,-100 112.85,-100\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"112.5,-103.5 122.5,-100 112.5,-96.5 112.5,-103.5\"/>\n", + "</g>\n", + "<!-- initial_df->final_df.__append -->\n", + "<g id=\"edge15\" class=\"edge\">\n", + "<title>initial_df->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M620.23,-220C776.95,-229.59 950.98,-292.31 1048.79,-332.66\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1047.2,-335.79 1057.78,-336.4 1049.89,-329.33 1047.2,-335.79\"/>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M387.38,-100C433.3,-114.84 450.42,-98.89 494.3,-119 558.89,-148.61 548.22,-207.83 618.23,-220\"/>\n", + "<path fill=\"none\" 
stroke=\"black\" d=\"M95.79,-70.39C104.89,-65.69 114.66,-61.54 124.35,-59 237.95,-29.25 273.63,-63.89 385.38,-100\"/>\n", + "</g>\n", + "<!-- final_df.spend_per_signup->final_df.__append -->\n", + "<g id=\"edge12\" class=\"edge\">\n", + "<title>final_df.spend_per_signup->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M478.73,-192.28C520.63,-204.86 571.32,-216.99 618.23,-220\"/>\n", + "</g>\n", + "<!-- final_df.signups->final_df.spend_per_signup -->\n", + "<g id=\"edge7\" class=\"edge\">\n", + "<title>final_df.signups->final_df.spend_per_signup</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M249.84,-118.82C258.96,-121.59 268.57,-124.51 278.29,-127.46\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"277.26,-130.81 287.85,-130.37 279.29,-124.11 277.26,-130.81\"/>\n", + "</g>\n", + "<!-- final_df.signups->final_df.__append -->\n", + "<g id=\"edge10\" class=\"edge\">\n", + "<title>final_df.signups->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M249.92,-90.37C289.6,-86.54 341.66,-85.87 385.38,-100\"/>\n", + "</g>\n", + "<!-- final_df.__append->final_df -->\n", + "<g id=\"edge4\" class=\"edge\">\n", + "<title>final_df.__append->final_df</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M1196.26,-367C1202,-367 1207.75,-367 1213.38,-367\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1213.17,-370.5 1223.17,-367 1213.17,-363.5 1213.17,-370.5\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean->final_df.spend_zero_mean_unit_variance -->\n", + "<g id=\"edge2\" class=\"edge\">\n", + "<title>final_df.spend_zero_mean->final_df.spend_zero_mean_unit_variance</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M715.54,-412.57C728.24,-409.18 741.44,-405.66 754.62,-402.14\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"755.46,-405.54 764.22,-399.58 753.65,-398.78 755.46,-405.54\"/>\n", + "</g>\n", + "<!-- final_df.spend_zero_mean->final_df.__append -->\n", + "<g 
id=\"edge13\" class=\"edge\">\n", + "<title>final_df.spend_zero_mean->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M715.28,-439.84C799.32,-439.41 924.66,-433.68 1030.5,-408 1036.4,-406.57 1042.4,-404.81 1048.37,-402.84\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"1049.36,-406.2 1057.63,-399.58 1047.04,-399.6 1049.36,-406.2\"/>\n", + "</g>\n", + "<!-- final_df.spend_mean->final_df.spend_zero_mean -->\n", + "<g id=\"edge17\" class=\"edge\">\n", + "<title>final_df.spend_mean->final_df.spend_zero_mean</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M464.89,-441.99C479.85,-441.6 495.78,-441.18 511.49,-440.77\"/>\n", + "<polygon fill=\"black\" stroke=\"black\" points=\"511.45,-444.28 521.36,-440.52 511.27,-437.28 511.45,-444.28\"/>\n", + "</g>\n", + "<!-- final_df.avg_3wk_spend->final_df.__append -->\n", + "<g id=\"edge11\" class=\"edge\">\n", + "<title>final_df.avg_3wk_spend->final_df.__append</title>\n", + "<path fill=\"none\" stroke=\"black\" d=\"M462.35,-247.73C506.94,-231.78 565.02,-216.59 618.23,-220\"/>\n", + "</g>\n", + "<!-- config -->\n", + "<g id=\"node13\" class=\"node\">\n", + "<title>config</title>\n", + "<polygon fill=\"#ffffff\" stroke=\"black\" points=\"71.42,-186 23.42,-186 23.42,-150 77.42,-150 77.42,-180 71.42,-186\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"71.42,-186 71.42,-180\"/>\n", + "<polyline fill=\"none\" stroke=\"black\" points=\"77.42,-180 71.42,-180\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-162.2\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">config</text>\n", + "</g>\n", + "<!-- function -->\n", + "<g id=\"node14\" class=\"node\">\n", + "<title>function</title>\n", + "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M72.85,-240.3C72.85,-240.3 28,-240.3 28,-240.3 22,-240.3 16,-234.3 16,-228.3 16,-228.3 16,-215.7 16,-215.7 16,-209.7 22,-203.7 28,-203.7 28,-203.7 72.85,-203.7 72.85,-203.7 78.85,-203.7 84.85,-209.7 84.85,-215.7 84.85,-215.7 
84.85,-228.3 84.85,-228.3 84.85,-234.3 78.85,-240.3 72.85,-240.3\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-216.2\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">function</text>\n", + "</g>\n", + "<!-- output -->\n", + "<g id=\"node15\" class=\"node\">\n", + "<title>output</title>\n", + "<path fill=\"#ffc857\" stroke=\"black\" d=\"M67.97,-295.3C67.97,-295.3 32.87,-295.3 32.87,-295.3 26.87,-295.3 20.87,-289.3 20.87,-283.3 20.87,-283.3 20.87,-270.7 20.87,-270.7 20.87,-264.7 26.87,-258.7 32.87,-258.7 32.87,-258.7 67.97,-258.7 67.97,-258.7 73.97,-258.7 79.97,-264.7 79.97,-270.7 79.97,-270.7 79.97,-283.3 79.97,-283.3 79.97,-289.3 73.97,-295.3 67.97,-295.3\"/>\n", + "<text text-anchor=\"middle\" x=\"50.42\" y=\"-271.2\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">output</text>\n", + "</g>\n", + "</g>\n", + "</svg>\n" + ], + "text/plain": [ + "<graphviz.graphs.Digraph at 0x7f85ca94dab0>" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import with_columns_lazy_example\n", + "from hamilton import base\n", + "from hamilton.plugins import h_polars\n", + "\n", + "dr = (\n", + " driver.Builder()\n", + " .with_adapter(\n", + " adapter=base.SimplePythonGraphAdapter(result_builder=h_polars.PolarsDataFrameResult()))\n", + " .with_modules(my_functions_lazy, with_columns_lazy_example)\n", + " .with_config({\"case\":\"millions\"})\n", + " .build()\n", + " )\n", + "print(dr.execute(final_vars=[\"final_df\"]))\n", + "dr.visualize_execution(final_vars=[\"final_df\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "hamilton", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": 
class with_columns(with_columns_base):
    """Initializes a with_columns decorator for polars.

    This allows you to efficiently run groups of map operations on a dataframe. We support
    both eager and lazy mode in polars. In case of using eager mode the type should be
    pl.DataFrame and the subsequent operations run on columns with type pl.Series.

    Here's an example of calling in eager mode -- if you've seen ``@subdag``, you should be familiar with
    the concepts:

    .. code-block:: python

        # my_module.py
        def a_b_average(a: pl.Series, b: pl.Series) -> pl.Series:
            return (a + b) / 2


    .. code-block:: python

        # with_columns_module.py
        def a_plus_b(a: pl.Series, b: pl.Series) -> pl.Series:
            return a + b


        # the with_columns call
        @with_columns(
            *[my_module],  # Load from any module
            *[a_plus_b],  # or list operations directly
            columns_to_pass=["a", "b"],  # The columns to pass from the dataframe to
            # the subdag
            select=["a_plus_b", "a_b_average"],  # The columns to append to the dataframe
        )
        def final_df(initial_df: pl.DataFrame) -> pl.DataFrame:
            # process, or just return unprocessed
            ...

    In this instance the ``initial_df`` would get two columns added: ``a_plus_b`` and ``a_b_average``.

    Note that the operation is "append", meaning that the columns that are selected are appended
    onto the dataframe.

    If the function takes multiple dataframes, the dataframe input to process will always be
    the first argument. This will be passed to the subdag, transformed, and passed back to the function.
    This follows the hamilton rule of reference by parameter name. To demonstrate this, in the code
    above, the dataframe that is passed to the subdag is `initial_df`. That is transformed
    by the subdag, and then returned as the final dataframe.

    You can read it as:

    "final_df is a function that transforms the upstream dataframe initial_df, running the transformations
    from my_module. It starts with the columns a and b, and then adds the columns
    a_plus_b and a_b_average to the dataframe. It then returns the dataframe, and does some processing on it."

    In case you need more flexibility you can alternatively use ``on_input``, for example,

    .. code-block:: python

        # with_columns_module.py
        def a_from_df() -> pl.Expr:
            return pl.col("a").alias("a") / 100

        def b_from_df() -> pl.Expr:
            return pl.col("b").alias("b") / 100


        # the with_columns call
        @with_columns(
            *[my_module],
            on_input="initial_df",
            select=["a_from_df", "b_from_df", "a_plus_b", "a_b_average"],
        )
        def final_df(initial_df: pl.DataFrame) -> pl.DataFrame:
            # process, or just return unprocessed
            ...

    the above would output a dataframe where the two columns ``a`` and ``b`` get
    overwritten.
    """

    def __init__(
        self,
        *load_from: Union[Callable, ModuleType],
        columns_to_pass: List[str] = None,
        pass_dataframe_as: str = None,
        on_input: str = None,
        select: List[str] = None,
        namespace: str = None,
        config_required: List[str] = None,
    ):
        """Instantiates a ``@with_columns`` decorator.

        :param load_from: The functions or modules that will be used to generate the group of map operations.
        :param columns_to_pass: The initial schema of the dataframe. This is used to determine which
            upstream inputs should be taken from the dataframe, and which shouldn't. Note that, if this is
            left empty, we will assume that all dependencies come from the dataframe.
            This cannot be used in conjunction with on_input.
        :param pass_dataframe_as: Not supported for polars. Must be left as ``None``; any other
            value raises ``NotImplementedError``.
        :param on_input: The name of the dataframe that we're modifying, as known to the subdag.
            If you pass this in, you are responsible for extracting columns out. If not provided, you have
            to pass columns_to_pass in, and we will extract the columns out on the first parameter for you.
        :param select: The end nodes that represent columns to be appended to the original dataframe
            via with_columns. Existing columns will be overridden. The selected nodes need to have the
            corresponding column type, in this case pl.Series, to be appended to the original dataframe.
        :param namespace: The namespace of the nodes, so they don't clash with the global namespace
            and so this can be reused. If its left out, there will be no namespace (in which case you'll want
            to be careful about repeating it/reusing the nodes in other parts of the DAG.)
        :param config_required: the list of config keys that are required to resolve any functions. Pass in None\
            if you want the functions/modules to have access to all possible config.
        :raises NotImplementedError: if ``pass_dataframe_as`` is provided.
        """

        if pass_dataframe_as is not None:
            raise NotImplementedError(
                "We currently do not support pass_dataframe_as for polars. Please reach out if you need this "
                "functionality."
            )

        super().__init__(
            *load_from,
            columns_to_pass=columns_to_pass,
            on_input=on_input,
            select=select,
            namespace=namespace,
            config_required=config_required,
            dataframe_type=DATAFRAME_TYPE,
        )

    def _create_column_nodes(
        self, fn: Callable, inject_parameter: str, params: Dict[str, Type[Type]]
    ) -> List[node.Node]:
        """Builds one node per entry of ``self.initial_schema``, extracted from the dataframe.

        :param fn: The decorated function (not used here).
        :param inject_parameter: Name of the dataframe parameter to extract the columns from.
        :param params: Mapping of the decorated function's parameter names to their types.
        :return: The nodes produced by ``extract_columns``, without the first one.
        """
        output_type = params[inject_parameter]

        def temp_fn(**kwargs) -> Any:
            return kwargs[inject_parameter]

        # We recreate the df node to use extract columns
        temp_node = node.Node(
            name=inject_parameter,
            typ=output_type,
            callabl=temp_fn,
            input_types={inject_parameter: output_type},
        )

        extract_columns_decorator = extract_columns(*self.initial_schema)

        out_nodes = extract_columns_decorator.transform_node(temp_node, config={}, fn=temp_fn)
        # Presumably the first returned node is the recreated dataframe node itself, which is
        # why it is skipped -- TODO confirm against extract_columns.transform_node.
        return out_nodes[1:]

    def get_initial_nodes(
        self, fn: Callable, params: Dict[str, Type[Type]]
    ) -> Tuple[str, Collection[node.Node]]:
        """Selects the correct dataframe and optionally extracts out columns.

        :param fn: The decorated function.
        :param params: Mapping of the decorated function's parameter names to their types.
        :return: The name of the dataframe parameter and the column nodes extracted from it. The
            node collection is empty when ``self.target_dataframe`` is set.
        """
        inject_parameter = _default_inject_parameter(fn=fn, target_dataframe=self.target_dataframe)
        with_columns_base.validate_dataframe(
            fn=fn,
            inject_parameter=inject_parameter,
            params=params,
            required_type=self.dataframe_type,
        )

        initial_nodes = (
            []
            if self.target_dataframe is not None
            else self._create_column_nodes(fn=fn, inject_parameter=inject_parameter, params=params)
        )

        return inject_parameter, initial_nodes

    def get_subdag_nodes(self, fn: Callable, config: Dict[str, Any]) -> Collection[node.Node]:
        """Collects the nodes of ``self.subdag_functions`` for the given config."""
        return subdag.collect_nodes(config, self.subdag_functions)

    def chain_subdag_nodes(
        self, fn: Callable, inject_parameter: str, generated_nodes: Collection[node.Node]
    ) -> Tuple[Collection[node.Node], str]:
        """Node that adds to / overrides columns for the original dataframe based on selected output.

        :param fn: The decorated function (not used here).
        :param inject_parameter: Name of the dataframe parameter the columns are appended to.
        :param generated_nodes: The nodes generated so far for this decorator.
        :return: All generated nodes plus the new ``_append`` merge node, and the merge node's name.
        """
        # Looked up once: used both for the default selection and for the merge node's inputs.
        column_type = registry.get_column_type_from_df_type(self.dataframe_type)

        if self.select is None:
            # Default to every generated node whose type is the column type.
            # NOTE: the resolved default is stored on the decorator instance.
            self.select = [
                candidate.name for candidate in generated_nodes if candidate.type == column_type
            ]

        def new_callable(**kwargs) -> Any:
            df = kwargs[inject_parameter]
            columns_to_append = {column: kwargs[column] for column in self.select}
            return df.with_columns(**columns_to_append)

        input_map = {column: column_type for column in self.select}
        input_map[inject_parameter] = self.dataframe_type
        merge_node = node.Node(
            name="_append",
            typ=self.dataframe_type,
            callabl=new_callable,
            input_types=input_map,
        )
        # Unpack instead of ``+`` so any collection works, not only a list.
        output_nodes = [*generated_nodes, merge_node]
        return output_nodes, merge_node.name

    def validate(self, fn: Callable):
        """Checks that the dataframe parameter of ``fn`` has the required dataframe type.

        :param fn: The decorated function.
        """
        inject_parameter = _default_inject_parameter(fn=fn, target_dataframe=self.target_dataframe)
        params = get_type_hints(fn)
        with_columns_base.validate_dataframe(
            fn=fn,
            inject_parameter=inject_parameter,
            params=params,
            required_type=self.dataframe_type,
        )
Union, get_type_hints import polars as pl -from hamilton import base +from hamilton import base, node, registry +from hamilton.function_modifiers.expanders import extract_columns +from hamilton.function_modifiers.recursive import ( + _default_inject_parameter, + subdag, + with_columns_base, +) +from hamilton.plugins.polars_lazyframe_extensions import DATAFRAME_TYPE class PolarsLazyFrameResult(base.ResultMixin): @@ -45,3 +53,214 @@ def build_result( def output_type(self) -> Type: return pl.LazyFrame + + +class with_columns(with_columns_base): + """Initializes a with_columns decorator for polars. + + This allows you to efficiently run groups of map operations on a dataframe. We support + both eager and lazy mode in polars. For lazy execution, use pl.LazyFrame and the subsequent + operations should be typed as pl.Expr. See examples/polars/with_columns for a practical + implementation in both variations. + + The lazy execution would be: + + .. code-block:: python + + # my_module.py + def a_b_average(a: pl.Expr, b: pl.Expr) -> pl.Expr: + return (a + b) / 2 + + + .. code-block:: python + + # with_columns_module.py + def a_plus_b(a: pl.Expr, b: pl.Expr) -> pl.Expr: + return a + b + + + # the with_columns call + @with_columns( + *[my_module], # Load from any module + *[a_plus_b], # or list operations directly + columns_to_pass=["a_from_df", "b_from_df"], # The columns to pass from the dataframe to + # the subdag + select=["a_plus_b", "a_b_average"], # The columns to append to the dataframe + ) + def final_df(initial_df: pl.LazyFrame) -> pl.LazyFrame: + # process, or just return unprocessed + ... + + Note that the operation is "append", meaning that the columns that are selected are appended + onto the dataframe. + + If the function takes multiple dataframes, the dataframe input to process will always be + the first argument. This will be passed to the subdag, transformed, and passed back to the function. + This follows the hamilton rule of reference by parameter name. 
To demonstrate this, in the code + above, the dataframe that is passed to the subdag is `initial_df`. That is transformed + by the subdag, and then returned as the final dataframe. + + You can read it as: + + "final_df is a function that transforms the upstream dataframe initial_df, running the transformations + from my_module. It starts with the columns a_from_df and b_from_df, and then adds the columns + a_plus_b and a_b_average to the dataframe. It then returns the dataframe, and does some processing on it." + + In case you need more flexibility you can alternatively use ``on_input``, for example, + + .. code-block:: python + + # with_columns_module.py + def a_from_df() -> pl.Expr: + return pl.col("a").alias("a") / 100 + + def b_from_df() -> pl.Expr: + return pl.col("b").alias("b") / 100 + + + # the with_columns call + @with_columns( + *[my_module], + on_input="initial_df", + select=["a_from_df", "b_from_df", "a_plus_b", "a_b_average"], + ) + def final_df(initial_df: pl.LazyFrame) -> pl.LazyFrame: + # process, or just return unprocessed + ... + + the above would output a dataframe where the two columns ``a`` and ``b`` get + overwritten. + """ + + def __init__( + self, + *load_from: Union[Callable, ModuleType], + columns_to_pass: List[str] = None, + pass_dataframe_as: str = None, + on_input: str = None, + select: List[str] = None, + namespace: str = None, + config_required: List[str] = None, + ): + """Instantiates a ``@with_columns`` decorator. + + :param load_from: The functions or modules that will be used to generate the group of map operations. + :param columns_to_pass: The initial schema of the dataframe. This is used to determine which + upstream inputs should be taken from the dataframe, and which shouldn't. Note that, if this is + left empty (and external_inputs is as well), we will assume that all dependencies come + from the dataframe. This cannot be used in conjunction with on_input.
+ :param on_input: The name of the dataframe that we're modifying, as known to the subdag. + If you pass this in, you are responsible for extracting columns out. If not provided, you have + to pass columns_to_pass in, and we will extract the columns out on the first parameter for you. + :param select: The end nodes that represent columns to be appended to the original dataframe + via with_columns. Existing columns will be overridden. The selected nodes need to have the + corresponding column type, in this case pl.Expr, to be appended to the original dataframe. + :param namespace: The namespace of the nodes, so they don't clash with the global namespace + and so this can be reused. If it's left out, there will be no namespace (in which case you'll want + to be careful about repeating it/reusing the nodes in other parts of the DAG.) + :param config_required: the list of config keys that are required to resolve any functions. Pass in None\ + if you want the functions/modules to have access to all possible config. + """ + + if pass_dataframe_as is not None: + raise NotImplementedError( + "We currently do not support pass_dataframe_as for pandas. Please reach out if you need this " + "functionality."
+ ) + + super().__init__( + *load_from, + columns_to_pass=columns_to_pass, + on_input=on_input, + select=select, + namespace=namespace, + config_required=config_required, + dataframe_type=DATAFRAME_TYPE, + ) + + def _create_column_nodes( + self, fn: Callable, inject_parameter: str, params: Dict[str, Type[Type]] + ) -> List[node.Node]: + output_type = params[inject_parameter] + + def temp_fn(**kwargs) -> Any: + return kwargs[inject_parameter] + + # We recreate the df node to use extract columns + temp_node = node.Node( + name=inject_parameter, + typ=output_type, + callabl=temp_fn, + input_types={inject_parameter: output_type}, + ) + + extract_columns_decorator = extract_columns(*self.initial_schema) + + out_nodes = extract_columns_decorator.transform_node(temp_node, config={}, fn=temp_fn) + return out_nodes[1:] + + def get_initial_nodes( + self, fn: Callable, params: Dict[str, Type[Type]] + ) -> Tuple[str, Collection[node.Node]]: + """Selects the correct dataframe and optionally extracts out columns.""" + inject_parameter = _default_inject_parameter(fn=fn, target_dataframe=self.target_dataframe) + + with_columns_base.validate_dataframe( + fn=fn, + inject_parameter=inject_parameter, + params=params, + required_type=self.dataframe_type, + ) + + initial_nodes = ( + [] + if self.target_dataframe is not None + else self._create_column_nodes(fn=fn, inject_parameter=inject_parameter, params=params) + ) + + return inject_parameter, initial_nodes + + def get_subdag_nodes(self, fn: Callable, config: Dict[str, Any]) -> Collection[node.Node]: + return subdag.collect_nodes(config, self.subdag_functions) + + def chain_subdag_nodes( + self, fn: Callable, inject_parameter: str, generated_nodes: Collection[node.Node] + ) -> node.Node: + "Node that adds to / overrides columns for the original dataframe based on selected output." 
+ + if self.select is None: + self.select = [ + sink_node.name + for sink_node in generated_nodes + if sink_node.type == registry.get_column_type_from_df_type(self.dataframe_type) + ] + + def new_callable(**kwargs) -> Any: + df = kwargs[inject_parameter] + columns_to_append = {} + for column in self.select: + columns_to_append[column] = kwargs[column] + + return df.with_columns(**columns_to_append) + + column_type = registry.get_column_type_from_df_type(self.dataframe_type) + input_map = {column: column_type for column in self.select} + input_map[inject_parameter] = self.dataframe_type + merge_node = node.Node( + name="_append", + typ=self.dataframe_type, + callabl=new_callable, + input_types=input_map, + ) + output_nodes = generated_nodes + [merge_node] + return output_nodes, merge_node.name + + def validate(self, fn: Callable): + inject_parameter = _default_inject_parameter(fn=fn, target_dataframe=self.target_dataframe) + params = get_type_hints(fn) + with_columns_base.validate_dataframe( + fn=fn, + inject_parameter=inject_parameter, + params=params, + required_type=self.dataframe_type, + ) diff --git a/plugin_tests/h_polars/__init__.py b/plugin_tests/h_polars/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/plugin_tests/h_polars/conftest.py b/plugin_tests/h_polars/conftest.py new file mode 100644 index 000000000..bc5ef5b5a --- /dev/null +++ b/plugin_tests/h_polars/conftest.py @@ -0,0 +1,4 @@ +from hamilton import telemetry + +# disable telemetry for all tests! 
+telemetry.disable_telemetry() diff --git a/plugin_tests/h_polars/resources/__init__.py b/plugin_tests/h_polars/resources/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/plugin_tests/h_polars/resources/with_columns_end_to_end.py b/plugin_tests/h_polars/resources/with_columns_end_to_end.py new file mode 100644 index 000000000..a893818fc --- /dev/null +++ b/plugin_tests/h_polars/resources/with_columns_end_to_end.py @@ -0,0 +1,68 @@ +import polars as pl + +from hamilton.function_modifiers import config +from hamilton.plugins.h_polars import with_columns + + +def upstream_factor() -> int: + return 3 + + +def initial_df() -> pl.DataFrame: + return pl.DataFrame({"col_1": [1, 2, 3, 4], "col_2": [11, 12, 13, 14], "col_3": [1, 1, 1, 1]}) + + +def subtract_1_from_2(col_1: pl.Series, col_2: pl.Series) -> pl.Series: + return col_2 - col_1 + + +@config.when(factor=5) +def multiply_3__by_5(col_3: pl.Series) -> pl.Series: + return col_3 * 5 + + +@config.when(factor=7) +def multiply_3__by_7(col_3: pl.Series) -> pl.Series: + return col_3 * 7 + + +def add_1_by_user_adjustment_factor(col_1: pl.Series, user_factor: int) -> pl.Series: + return col_1 + user_factor + + +def multiply_2_by_upstream_3(col_2: pl.Series, upstream_factor: int) -> pl.Series: + return col_2 * upstream_factor + + +@with_columns( + subtract_1_from_2, + multiply_3__by_5, + multiply_3__by_7, + add_1_by_user_adjustment_factor, + multiply_2_by_upstream_3, + columns_to_pass=["col_1", "col_2", "col_3"], + select=[ + "subtract_1_from_2", + "multiply_3", + "add_1_by_user_adjustment_factor", + "multiply_2_by_upstream_3", + ], + namespace="some_subdag", +) +def final_df(initial_df: pl.DataFrame) -> pl.DataFrame: + return initial_df + + +def col_3(initial_df: pl.DataFrame) -> pl.Series: + return pl.Series([0, 2, 4, 6]) + + +@with_columns( + col_3, + multiply_3__by_5, + multiply_3__by_7, + on_input="initial_df", + select=["col_3", "multiply_3"], +) +def final_df_2(initial_df: pl.DataFrame) -> 
pl.DataFrame: + return initial_df diff --git a/plugin_tests/h_polars/resources/with_columns_end_to_end_lazy.py b/plugin_tests/h_polars/resources/with_columns_end_to_end_lazy.py new file mode 100644 index 000000000..367cfacf4 --- /dev/null +++ b/plugin_tests/h_polars/resources/with_columns_end_to_end_lazy.py @@ -0,0 +1,80 @@ +import polars as pl + +from hamilton.function_modifiers import config +from hamilton.plugins.h_polars_lazyframe import with_columns + + +def upstream_factor() -> int: + return 3 + + +def initial_df() -> pl.LazyFrame: + return pl.DataFrame( + {"col_1": [1, 2, 3, 4], "col_2": [11, 12, 13, 14], "col_3": [1, 1, 1, 1]} + ).lazy() + + +def subtract_1_from_2(col_1: pl.Expr, col_2: pl.Expr) -> pl.Expr: + return col_2 - col_1 + + +@config.when(factor=5) +def multiply_3__by_5(col_3: pl.Expr) -> pl.Expr: + return col_3 * 5 + + +@config.when(factor=7) +def multiply_3__by_7(col_3: pl.Expr) -> pl.Expr: + return col_3 * 7 + + +def add_1_by_user_adjustment_factor(col_1: pl.Expr, user_factor: int) -> pl.Expr: + return col_1 + user_factor + + +def multiply_2_by_upstream_3(col_2: pl.Expr, upstream_factor: int) -> pl.Expr: + return col_2 * upstream_factor + + +@with_columns( + subtract_1_from_2, + multiply_3__by_5, + multiply_3__by_7, + add_1_by_user_adjustment_factor, + multiply_2_by_upstream_3, + columns_to_pass=["col_1", "col_2", "col_3"], + select=[ + "subtract_1_from_2", + "multiply_3", + "add_1_by_user_adjustment_factor", + "multiply_2_by_upstream_3", + ], + namespace="some_subdag", +) +def final_df(initial_df: pl.LazyFrame) -> pl.LazyFrame: + return initial_df + + +def col_1(initial_df: pl.LazyFrame) -> pl.Expr: + return pl.col("col_1") + + +@config.when(factor=5) +def multiply_1__by_5(col_1: pl.Expr) -> pl.Expr: + return col_1 * 5 + + +@config.when_not(factor=5) +def multiply_1__by_1(col_1: pl.Expr) -> pl.Expr: + return col_1 * 1 + + +@with_columns( + col_1, + multiply_1__by_5, + multiply_1__by_1, + on_input="initial_df", + select=["col_1", "multiply_1"], 
+) +def final_df_2(initial_df: pl.LazyFrame) -> pl.LazyFrame: + return initial_df diff --git a/plugin_tests/h_polars/test_with_columns.py b/plugin_tests/h_polars/test_with_columns.py new file mode 100644 index 000000000..151347fb7 --- /dev/null +++ b/plugin_tests/h_polars/test_with_columns.py @@ -0,0 +1,265 @@ +import polars as pl +import pytest +from polars.testing import assert_frame_equal + +from hamilton import driver, node +from hamilton.function_modifiers.base import NodeInjector +from hamilton.plugins.h_polars import with_columns + +from .resources import with_columns_end_to_end + + +def test_create_column_nodes_pass_dataframe(): + def dummy_fn_with_columns(col_1: pl.Series) -> pl.Series: + return col_1 + 100 + + def target_fn(some_var: int, upstream_df: pl.DataFrame) -> pl.DataFrame: + return upstream_df + + dummy_node = node.Node.from_fn(target_fn) + + decorator = with_columns( + dummy_fn_with_columns, on_input="upstream_df", select=["dummy_fn_with_columns"] + ) + + injectable_params = NodeInjector.find_injectable_params([dummy_node]) + inject_parameter, initial_nodes = decorator.get_initial_nodes( + fn=target_fn, params=injectable_params + ) + + assert inject_parameter == "upstream_df" + assert len(initial_nodes) == 0 + + +def test_create_column_nodes_extract_single_columns(): + def dummy_fn_with_columns(col_1: pl.Series) -> pl.Series: + return col_1 + 100 + + def dummy_df() -> pl.DataFrame: + return pl.DataFrame({"col_1": [1, 2, 3, 4], "col_2": [11, 12, 13, 14]}) + + def target_fn(upstream_df: pl.DataFrame) -> pl.DataFrame: + return upstream_df + + dummy_node = node.Node.from_fn(target_fn) + + decorator = with_columns( + dummy_fn_with_columns, columns_to_pass=["col_1"], select=["dummy_fn_with_columns"] + ) + injectable_params = NodeInjector.find_injectable_params([dummy_node]) + + inject_parameter, initial_nodes = decorator.get_initial_nodes( + fn=target_fn, params=injectable_params + ) + + assert inject_parameter == "upstream_df" + assert 
len(initial_nodes) == 1 + assert initial_nodes[0].name == "col_1" + assert initial_nodes[0].type == pl.Series + pl.testing.assert_series_equal( + initial_nodes[0].callable(upstream_df=dummy_df()), + pl.Series([1, 2, 3, 4]), + check_names=False, + ) + + +def test_create_column_nodes_extract_multiple_columns(): + def dummy_fn_with_columns(col_1: pl.Series) -> pl.Series: + return col_1 + 100 + + def dummy_df() -> pl.DataFrame: + return pl.DataFrame({"col_1": [1, 2, 3, 4], "col_2": [11, 12, 13, 14]}) + + def target_fn(upstream_df: pl.DataFrame) -> pl.DataFrame: + return upstream_df + + dummy_node = node.Node.from_fn(target_fn) + + decorator = with_columns( + dummy_fn_with_columns, columns_to_pass=["col_1", "col_2"], select=["dummy_fn_with_columns"] + ) + injectable_params = NodeInjector.find_injectable_params([dummy_node]) + + inject_parameter, initial_nodes = decorator.get_initial_nodes( + fn=target_fn, params=injectable_params + ) + + assert inject_parameter == "upstream_df" + assert len(initial_nodes) == 2 + assert initial_nodes[0].name == "col_1" + assert initial_nodes[1].name == "col_2" + assert initial_nodes[0].type == pl.Series + assert initial_nodes[1].type == pl.Series + pl.testing.assert_series_equal( + initial_nodes[0].callable(upstream_df=dummy_df()), + pl.Series([1, 2, 3, 4]), + check_names=False, + ) + pl.testing.assert_series_equal( + initial_nodes[1].callable(upstream_df=dummy_df()), + pl.Series([11, 12, 13, 14]), + check_names=False, + ) + + +def test_no_matching_select_column_error(): + def dummy_fn_with_columns(col_1: pl.Series) -> pl.Series: + return col_1 + 100 + + def target_fn(upstream_df: pl.DataFrame) -> pl.DataFrame: + return upstream_df + + dummy_node = node.Node.from_fn(target_fn) + select = "wrong_column" + + decorator = with_columns( + dummy_fn_with_columns, columns_to_pass=["col_1", "col_2"], select=select + ) + injectable_params = NodeInjector.find_injectable_params([dummy_node]) + + with pytest.raises(ValueError): + 
decorator.inject_nodes(injectable_params, {}, fn=target_fn) + + +def test_append_into_original_df(): + def dummy_fn_with_columns(col_1: pl.Series) -> pl.Series: + return col_1 + 100 + + def dummy_df() -> pl.DataFrame: + return pl.DataFrame({"col_1": [1, 2, 3, 4], "col_2": [11, 12, 13, 14]}) + + def target_fn(upstream_df: pl.DataFrame) -> pl.DataFrame: + return upstream_df + + decorator = with_columns( + dummy_fn_with_columns, columns_to_pass=["col_1", "col_2"], select=["dummy_fn_with_columns"] + ) + + output_nodes, _ = decorator.chain_subdag_nodes( + fn=target_fn, inject_parameter="upstream_df", generated_nodes=[] + ) + merge_node = output_nodes[-1] + + output_df = merge_node.callable( + upstream_df=dummy_df(), + dummy_fn_with_columns=dummy_fn_with_columns(col_1=pl.Series([1, 2, 3, 4])), + ) + assert merge_node.name == "__append" + assert merge_node.type == pl.DataFrame + + pl.testing.assert_series_equal(output_df["col_1"], pl.Series([1, 2, 3, 4]), check_names=False) + pl.testing.assert_series_equal( + output_df["col_2"], pl.Series([11, 12, 13, 14]), check_names=False + ) + pl.testing.assert_series_equal( + output_df["dummy_fn_with_columns"], pl.Series([101, 102, 103, 104]), check_names=False + ) + + +def test_override_original_column_in_df(): + def dummy_df() -> pl.DataFrame: + return pl.DataFrame({"col_1": [1, 2, 3, 4], "col_2": [11, 12, 13, 14]}) + + def target_fn(upstream_df: pl.DataFrame) -> pl.DataFrame: + return upstream_df + + def col_1() -> pl.Series: + return pl.col("col_1") * 100 + + decorator = with_columns(col_1, on_input="upstream_df", select=["col_1"]) + + output_nodes, _ = decorator.chain_subdag_nodes( + fn=target_fn, inject_parameter="upstream_df", generated_nodes=[] + ) + merge_node = output_nodes[-1] + + output_df = merge_node.callable(upstream_df=dummy_df(), col_1=col_1()) + assert merge_node.name == "__append" + assert merge_node.type == pl.DataFrame + + pl.testing.assert_series_equal( + output_df["col_1"], pl.Series([100, 200, 300, 400]), 
check_names=False + ) + pl.testing.assert_series_equal( + output_df["col_2"], pl.Series([11, 12, 13, 14]), check_names=False + ) + + +def test_assign_custom_namespace_with_columns(): + def dummy_fn_with_columns(col_1: pl.Series) -> pl.Series: + return col_1 + 100 + + def target_fn(upstream_df: pl.DataFrame) -> pl.DataFrame: + return upstream_df + + dummy_node = node.Node.from_fn(target_fn) + decorator = with_columns( + dummy_fn_with_columns, + columns_to_pass=["col_1", "col_2"], + select=["dummy_fn_with_columns"], + namespace="dummy_namespace", + ) + nodes_ = decorator.transform_dag([dummy_node], {}, target_fn) + + assert nodes_[0].name == "target_fn" + assert nodes_[1].name == "dummy_namespace.dummy_fn_with_columns" + assert nodes_[2].name == "dummy_namespace.col_1" + assert nodes_[3].name == "dummy_namespace.__append" + + +def test_end_to_end_with_columns_automatic_extract(): + config_5 = { + "factor": 5, + } + dr = driver.Builder().with_modules(with_columns_end_to_end).with_config(config_5).build() + result = dr.execute(final_vars=["final_df"], inputs={"user_factor": 1000})["final_df"] + + expected_df = pl.DataFrame( + { + "col_1": [1, 2, 3, 4], + "col_2": [11, 12, 13, 14], + "col_3": [1, 1, 1, 1], + "subtract_1_from_2": [10, 10, 10, 10], + "multiply_3": [5, 5, 5, 5], + "add_1_by_user_adjustment_factor": [1001, 1002, 1003, 1004], + "multiply_2_by_upstream_3": [33, 36, 39, 42], + } + ) + pl.testing.assert_frame_equal(result, expected_df) + + config_7 = { + "factor": 7, + } + dr = driver.Builder().with_modules(with_columns_end_to_end).with_config(config_7).build() + result = dr.execute(final_vars=["final_df"], inputs={"user_factor": 1000})["final_df"] + + expected_df = pl.DataFrame( + { + "col_1": [1, 2, 3, 4], + "col_2": [11, 12, 13, 14], + "col_3": [1, 1, 1, 1], + "subtract_1_from_2": [10, 10, 10, 10], + "multiply_3": [7, 7, 7, 7], + "add_1_by_user_adjustment_factor": [1001, 1002, 1003, 1004], + "multiply_2_by_upstream_3": [33, 36, 39, 42], + } + ) + 
assert_frame_equal(result, expected_df) + + +def test_end_to_end_with_columns_pass_dataframe(): + config_5 = { + "factor": 5, + } + dr = driver.Builder().with_modules(with_columns_end_to_end).with_config(config_5).build() + + result = dr.execute(final_vars=["final_df_2"])["final_df_2"] + expected_df = pl.DataFrame( + { + "col_1": [1, 2, 3, 4], + "col_2": [11, 12, 13, 14], + "col_3": [0, 2, 4, 6], + "multiply_3": [0, 10, 20, 30], + } + ) + assert_frame_equal(result, expected_df) diff --git a/plugin_tests/h_polars/test_with_columns_lazy.py b/plugin_tests/h_polars/test_with_columns_lazy.py new file mode 100644 index 000000000..2cb52c4db --- /dev/null +++ b/plugin_tests/h_polars/test_with_columns_lazy.py @@ -0,0 +1,64 @@ +import polars as pl +from polars.testing import assert_frame_equal + +from hamilton import driver + +from .resources import with_columns_end_to_end_lazy + + +def test_end_to_end_with_columns_automatic_extract_lazy(): + config_5 = { + "factor": 5, + } + dr = driver.Builder().with_modules(with_columns_end_to_end_lazy).with_config(config_5).build() + result = dr.execute(final_vars=["final_df"], inputs={"user_factor": 1000})["final_df"] + + expected_df = pl.DataFrame( + { + "col_1": [1, 2, 3, 4], + "col_2": [11, 12, 13, 14], + "col_3": [1, 1, 1, 1], + "subtract_1_from_2": [10, 10, 10, 10], + "multiply_3": [5, 5, 5, 5], + "add_1_by_user_adjustment_factor": [1001, 1002, 1003, 1004], + "multiply_2_by_upstream_3": [33, 36, 39, 42], + } + ) + pl.testing.assert_frame_equal(result.collect(), expected_df) + + config_7 = { + "factor": 7, + } + dr = driver.Builder().with_modules(with_columns_end_to_end_lazy).with_config(config_7).build() + result = dr.execute(final_vars=["final_df"], inputs={"user_factor": 1000})["final_df"] + + expected_df = pl.DataFrame( + { + "col_1": [1, 2, 3, 4], + "col_2": [11, 12, 13, 14], + "col_3": [1, 1, 1, 1], + "subtract_1_from_2": [10, 10, 10, 10], + "multiply_3": [7, 7, 7, 7], + "add_1_by_user_adjustment_factor": [1001, 1002, 1003, 
1004], + "multiply_2_by_upstream_3": [33, 36, 39, 42], + } + ) + assert_frame_equal(result.collect(), expected_df) + + +def test_end_to_end_with_columns_pass_dataframe_lazy(): + config_5 = { + "factor": 5, + } + dr = driver.Builder().with_modules(with_columns_end_to_end_lazy).with_config(config_5).build() + + result = dr.execute(final_vars=["final_df_2"])["final_df_2"] + expected_df = pl.DataFrame( + { + "col_1": [1, 2, 3, 4], + "col_2": [11, 12, 13, 14], + "col_3": [1, 1, 1, 1], + "multiply_1": [5, 10, 15, 20], + } + ) + assert_frame_equal(result.collect(), expected_df)