-
Notifications
You must be signed in to change notification settings - Fork 171
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Maroun Touma <[email protected]>
- Loading branch information
Showing
2 changed files
with
247 additions
and
63 deletions.
There are no files selected for viewing
220 changes: 220 additions & 0 deletions
220
transforms/language/html2parquet/notebooks/html2parquet-V0.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,220 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "c4f9c952-cb3b-40f1-bfb5-00d9a43a5715", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"%%capture\n", | ||
"# Install the toolkit, the html2parquet transform extra and pandas.\n", | ||
"# %pip (rather than !pip) installs into the environment of the running kernel.\n", | ||
"%pip install data-prep-toolkit==0.2.2.dev2\n", | ||
"%pip install 'data-prep-toolkit-transforms[html2parquet]==0.2.2.dev2'\n", | ||
"%pip install pandas" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "20663a67-5aa1-4b61-b989-94201613e41f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import sys\n", | ||
"\n", | ||
"from data_processing.runtime.pure_python import PythonTransformLauncher\n", | ||
"from data_processing.utils import ParamsUtils\n", | ||
"from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration\n", | ||
"\n", | ||
"# Local data access: inputs come from input/, results are written to output/.\n", | ||
"local_conf = {\n", | ||
"    \"input_folder\": \"input\",\n", | ||
"    \"output_folder\": \"output\",\n", | ||
"}\n", | ||
"\n", | ||
"# Data access. Only required parameters are specified.\n", | ||
"params = {\n", | ||
"    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", | ||
"    # Restrict the run to .html input files.\n", | ||
"    \"data_files_to_use\": ['.html'],\n", | ||
"}\n", | ||
"\n", | ||
"# Hand the parameters to the launcher through sys.argv.\n", | ||
"sys.argv = ParamsUtils.dict_to_req(d=params)\n", | ||
"\n", | ||
"# Create the launcher and run the transform.\n", | ||
"launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())\n", | ||
"return_code = launcher.launch()\n", | ||
"\n", | ||
"# Stop here on failure instead of failing later on a missing Parquet file.\n", | ||
"if return_code != 0:\n", | ||
"    raise RuntimeError(f\"html2parquet transform failed with return code {return_code}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "e75f6922-eb0f-4164-a536-f96393e04604", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "4d2354db-1bb3-4a71-98df-f0f148af3a02", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"11:36:04 INFO - html2parquet parameters are : {'output_format': <html2parquet_output_format.MARKDOWN: 'markdown'>, 'favor_precision': <html2parquet_favor_precision.TRUE: 'True'>, 'favor_recall': <html2parquet_favor_recall.TRUE: 'True'>}\n", | ||
"11:36:04 INFO - pipeline id pipeline_id\n", | ||
"11:36:04 INFO - code location None\n", | ||
"11:36:04 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n", | ||
"11:36:04 INFO - data factory data_ max_files -1, n_sample -1\n", | ||
"11:36:04 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.html'], files to checkpoint ['.parquet']\n", | ||
"11:36:04 INFO - orchestrator html2parquet started at 2024-11-18 11:36:04\n", | ||
"11:36:04 INFO - Number of files is 1, source profile {'max_file_size': 0.2035503387451172, 'min_file_size': 0.2035503387451172, 'total_file_size': 0.2035503387451172}\n", | ||
"11:36:04 INFO - Completed 1 files (100.0%) in 0.004 min\n", | ||
"11:36:04 INFO - Done processing 1 files, waiting for flush() completion.\n", | ||
"11:36:04 INFO - done flushing in 0.0 sec\n", | ||
"11:36:04 INFO - Completed execution in 0.004 min, execution result 0\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "e2bee8da-c566-4e45-bca1-354dfd04b0df", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>title</th>\n", | ||
" <th>document</th>\n", | ||
" <th>contents</th>\n", | ||
" <th>document_id</th>\n", | ||
" <th>size</th>\n", | ||
" <th>date_acquired</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>ai-alliance-index.html</td>\n", | ||
" <td>ai-alliance-index.html</td>\n", | ||
" <td>![](https://images.prismic.io/ai-alliance/Ztf3...</td>\n", | ||
" <td>f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121...</td>\n", | ||
" <td>394</td>\n", | ||
" <td>2024-11-18T11:36:04.040169</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" title document \\\n", | ||
"0 ai-alliance-index.html ai-alliance-index.html \n", | ||
"\n", | ||
" contents \\\n", | ||
"0 ![](https://images.prismic.io/ai-alliance/Ztf3... \n", | ||
"\n", | ||
" document_id size \\\n", | ||
"0 f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121... 394 \n", | ||
"\n", | ||
" date_acquired \n", | ||
"0 2024-11-18T11:36:04.040169 " | ||
] | ||
}, | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import pyarrow.parquet as pq\n", | ||
"\n", | ||
"# Read the Parquet file from the output folder and display it as a pandas DataFrame.\n", | ||
"parquet_path = 'output/ai-alliance-index.parquet'\n", | ||
"table = pq.read_table(parquet_path)\n", | ||
"table.to_pandas()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "cde6e37d-c437-490f-8e01-f4f51a123484", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'![](https://images.prismic.io/ai-alliance/Ztf3gLzzk9ZrW8v8_caliopensourceslide.jpg?auto=format%2Ccompress&fit=max&w=3840)\\n\\n## Open Source AI Demo Night\\n\\nThe AI Alliance, in collaboration with Cerebral Valley and Ollama, hosted Open Source AI Demo Night in San Francisco, bringing together more than 200+ developers and innovators to showcase and celebrate the latest advances in open-source AI.'" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Show the full 'contents' value for row 0 of the table loaded earlier.\n", | ||
"contents_column = table.to_pandas()['contents']\n", | ||
"contents_column[0]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "2fd0d13b-1ff6-4988-91fb-52c25ba998c8", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "587e43ee-7b51-4a9c-8bf2-0a23e309a7ae", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters