Skip to content

Commit

Permalink
Modify notebook with simplified API
Browse files Browse the repository at this point in the history
Signed-off-by: Maroun Touma <[email protected]>
  • Loading branch information
touma-I committed Nov 18, 2024
1 parent 981930c commit 3481667
Show file tree
Hide file tree
Showing 2 changed files with 247 additions and 63 deletions.
220 changes: 220 additions & 0 deletions transforms/language/html2parquet/notebooks/html2parquet-V0.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c4f9c952-cb3b-40f1-bfb5-00d9a43a5715",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"# Install the pinned toolkit packages plus pandas.\n",
"# %pip (rather than !pip) installs into the environment of the running kernel,\n",
"# not whichever pip happens to be first on the shell PATH.\n",
"%pip install data-prep-toolkit==0.2.2.dev2\n",
"%pip install 'data-prep-toolkit-transforms[html2parquet]==0.2.2.dev2'\n",
"%pip install pandas"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "20663a67-5aa1-4b61-b989-94201613e41f",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"\n",
"from data_processing.runtime.pure_python import PythonTransformLauncher\n",
"from data_processing.utils import ParamsUtils\n",
"from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration\n",
"\n",
"# Local data access: input files come from ./input, results go to ./output.\n",
"local_conf = {\n",
"    \"input_folder\": \"input\",\n",
"    \"output_folder\": \"output\",\n",
"}\n",
"\n",
"params = {\n",
"    # Data access. Only required parameters are specified\n",
"    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
"    # A plain list literal; no need to round-trip it through ast.literal_eval.\n",
"    \"data_files_to_use\": [\".html\"],\n",
"}\n",
"\n",
"# Expose the parameters as command-line style arguments before launching.\n",
"sys.argv = ParamsUtils.dict_to_req(d=params)\n",
"# create launcher\n",
"launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())\n",
"# launch\n",
"return_code = launcher.launch()\n",
"# Fail loudly so later cells never read stale output from an earlier run.\n",
"# NOTE(review): assumes launch() returns 0 on success - confirm against the launcher API.\n",
"if return_code != 0:\n",
"    raise RuntimeError(f\"html2parquet transform failed with return code {return_code}\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e75f6922-eb0f-4164-a536-f96393e04604",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4d2354db-1bb3-4a71-98df-f0f148af3a02",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"11:36:04 INFO - html2parquet parameters are : {'output_format': <html2parquet_output_format.MARKDOWN: 'markdown'>, 'favor_precision': <html2parquet_favor_precision.TRUE: 'True'>, 'favor_recall': <html2parquet_favor_recall.TRUE: 'True'>}\n",
"11:36:04 INFO - pipeline id pipeline_id\n",
"11:36:04 INFO - code location None\n",
"11:36:04 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n",
"11:36:04 INFO - data factory data_ max_files -1, n_sample -1\n",
"11:36:04 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.html'], files to checkpoint ['.parquet']\n",
"11:36:04 INFO - orchestrator html2parquet started at 2024-11-18 11:36:04\n",
"11:36:04 INFO - Number of files is 1, source profile {'max_file_size': 0.2035503387451172, 'min_file_size': 0.2035503387451172, 'total_file_size': 0.2035503387451172}\n",
"11:36:04 INFO - Completed 1 files (100.0%) in 0.004 min\n",
"11:36:04 INFO - Done processing 1 files, waiting for flush() completion.\n",
"11:36:04 INFO - done flushing in 0.0 sec\n",
"11:36:04 INFO - Completed execution in 0.004 min, execution result 0\n"
]
}
],
"source": [
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e2bee8da-c566-4e45-bca1-354dfd04b0df",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>document</th>\n",
" <th>contents</th>\n",
" <th>document_id</th>\n",
" <th>size</th>\n",
" <th>date_acquired</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ai-alliance-index.html</td>\n",
" <td>ai-alliance-index.html</td>\n",
" <td>![](https://images.prismic.io/ai-alliance/Ztf3...</td>\n",
" <td>f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121...</td>\n",
" <td>394</td>\n",
" <td>2024-11-18T11:36:04.040169</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title document \\\n",
"0 ai-alliance-index.html ai-alliance-index.html \n",
"\n",
" contents \\\n",
"0 ![](https://images.prismic.io/ai-alliance/Ztf3... \n",
"\n",
" document_id size \\\n",
"0 f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121... 394 \n",
"\n",
" date_acquired \n",
"0 2024-11-18T11:36:04.040169 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import pyarrow.parquet as pq\n",
"\n",
"# Load the parquet file from the output folder and display it as a DataFrame.\n",
"table = pq.read_table('output/ai-alliance-index.parquet')\n",
"table.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "cde6e37d-c437-490f-8e01-f4f51a123484",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'![](https://images.prismic.io/ai-alliance/Ztf3gLzzk9ZrW8v8_caliopensourceslide.jpg?auto=format%2Ccompress&fit=max&w=3840)\\n\\n## Open Source AI Demo Night\\n\\nThe AI Alliance, in collaboration with Cerebral Valley and Ollama, hosted Open Source AI Demo Night in San Francisco, bringing together more than 200+ developers and innovators to showcase and celebrate the latest advances in open-source AI.'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the full, untruncated 'contents' value of the row labelled 0.\n",
"table.to_pandas().loc[0, 'contents']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2fd0d13b-1ff6-4988-91fb-52c25ba998c8",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "587e43ee-7b51-4a9c-8bf2-0a23e309a7ae",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
90 changes: 27 additions & 63 deletions transforms/language/html2parquet/notebooks/html2parquet.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,88 +4,52 @@
"cell_type": "code",
"execution_count": 1,
"id": "c4f9c952-cb3b-40f1-bfb5-00d9a43a5715",
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"%%capture\n",
"!pip install data-prep-toolkit==0.2.2.dev2\n",
"!pip install 'data-prep-toolkit-transforms[html2parquet]==0.2.2.dev2'\n",
"!pip install 'data-prep-toolkit-transforms[html2parquet]'\n",
"!pip install pandas"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "20663a67-5aa1-4b61-b989-94201613e41f",
"metadata": {},
"outputs": [],
"source": [
"from data_processing.runtime.pure_python import PythonTransformLauncher\n",
"from data_processing.utils import ParamsUtils\n",
"\n",
"from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e75f6922-eb0f-4164-a536-f96393e04604",
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"\n",
"# create parameters\n",
"local_conf = {\n",
" \"input_folder\": \"input\",\n",
" \"output_folder\": \"output\",\n",
"}\n",
"\n",
"params = {\n",
" # Data access. Only required parameters are specified\n",
" \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
" \"data_files_to_use\": ast.literal_eval(\"['.html']\"),\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4d2354db-1bb3-4a71-98df-f0f148af3a02",
"id": "76469c87-cfd0-4acd-b57f-36edc52018ff",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"17:09:40 INFO - html2parquet parameters are : {'output_format': <html2parquet_output_format.MARKDOWN: 'markdown'>, 'favor_precision': <html2parquet_favor_precision.TRUE: 'True'>, 'favor_recall': <html2parquet_favor_recall.TRUE: 'True'>}\n",
"17:09:40 INFO - pipeline id pipeline_id\n",
"17:09:40 INFO - code location None\n",
"17:09:40 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n",
"17:09:40 INFO - data factory data_ max_files -1, n_sample -1\n",
"17:09:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.html'], files to checkpoint ['.parquet']\n",
"17:09:40 INFO - orchestrator html2parquet started at 2024-11-13 17:09:40\n",
"17:09:40 INFO - Number of files is 1, source profile {'max_file_size': 0.2035503387451172, 'min_file_size': 0.2035503387451172, 'total_file_size': 0.2035503387451172}\n",
"17:09:47 INFO - Completed 1 files (100.0%) in 0.111 min\n",
"17:09:47 INFO - Done processing 1 files, waiting for flush() completion.\n",
"17:09:47 INFO - done flushing in 0.0 sec\n",
"17:09:47 INFO - Completed execution in 0.111 min, execution result 0\n"
"11:38:33 INFO - html2parquet parameters are : {'output_format': <html2parquet_output_format.MARKDOWN: 'markdown'>, 'favor_precision': <html2parquet_favor_precision.TRUE: 'True'>, 'favor_recall': <html2parquet_favor_recall.TRUE: 'True'>}\n",
"11:38:33 INFO - pipeline id pipeline_id\n",
"11:38:33 INFO - code location None\n",
"11:38:33 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n",
"11:38:33 INFO - data factory data_ max_files -1, n_sample -1\n",
"11:38:33 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.html'], files to checkpoint ['.parquet']\n",
"11:38:33 INFO - orchestrator html2parquet started at 2024-11-18 11:38:33\n",
"11:38:33 INFO - Number of files is 1, source profile {'max_file_size': 0.2035503387451172, 'min_file_size': 0.2035503387451172, 'total_file_size': 0.2035503387451172}\n",
"11:38:33 INFO - Completed 1 files (100.0%) in 0.003 min\n",
"11:38:33 INFO - Done processing 1 files, waiting for flush() completion.\n",
"11:38:33 INFO - done flushing in 0.0 sec\n",
"11:38:33 INFO - Completed execution in 0.003 min, execution result 0\n"
]
}
],
"source": [
"\n",
"import sys\n",
"sys.argv = ParamsUtils.dict_to_req(d=(params))\n",
"# create launcher\n",
"launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())\n",
"# launch\n",
"return_code = launcher.launch()\n"
"from dpk_html2parquet.transform_python import Html2ParquetRuntime\n",
"x=Html2ParquetRuntime(input_folder= \"input\", \n",
" output_folder= \"output\", \n",
" data_files_to_use=['.html']).transform()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 3,
"id": "e2bee8da-c566-4e45-bca1-354dfd04b0df",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -126,7 +90,7 @@
" <td>![](https://images.prismic.io/ai-alliance/Ztf3...</td>\n",
" <td>f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121...</td>\n",
" <td>394</td>\n",
" <td>2024-11-13T17:09:40.947095</td>\n",
" <td>2024-11-18T11:38:33.448019</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand All @@ -143,10 +107,10 @@
"0 f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121... 394 \n",
"\n",
" date_acquired \n",
"0 2024-11-13T17:09:40.947095 "
"0 2024-11-18T11:38:33.448019 "
]
},
"execution_count": 5,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -160,7 +124,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"id": "cde6e37d-c437-490f-8e01-f4f51a123484",
"metadata": {},
"outputs": [
Expand All @@ -170,7 +134,7 @@
"'![](https://images.prismic.io/ai-alliance/Ztf3gLzzk9ZrW8v8_caliopensourceslide.jpg?auto=format%2Ccompress&fit=max&w=3840)\\n\\n## Open Source AI Demo Night\\n\\nThe AI Alliance, in collaboration with Cerebral Valley and Ollama, hosted Open Source AI Demo Night in San Francisco, bringing together more than 200+ developers and innovators to showcase and celebrate the latest advances in open-source AI.'"
]
},
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
Expand Down

0 comments on commit 3481667

Please sign in to comment.