forked from intel/llm-on-ray
Commit
Add Near dedup support in tools (intel#53)
* Add new folder for near_dedup process
* Replace with datasketch MinHash calculation
* Add slimpajama codes as submodule
* Update near-dedup codes
* Pythonize notebook
* Dedup is completed and verified on PILE
* Remove modelzoo
* Clean up and move all utils support to RecDP; now installing pyrecdp alone resolves all dependency issues
* Update README
* Update README and fix typo

Signed-off-by: Xue, Chendi <[email protected]>
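Per the commit message, the near-dedup pass is built on the datasketch library's MinHash. As a rough, hypothetical sketch of that technique (not the code added by this commit; `num_perm`, the shingle width, and the 0.8 Jaccard threshold are assumed values), MinHash LSH deduplication looks like this:

```python
# Hypothetical illustration of MinHash-based near dedup with the datasketch
# library; num_perm, shingle width, and threshold are assumptions, not
# values taken from this commit.
from datasketch import MinHash, MinHashLSH

def shingles(text, width=5):
    # Word-level n-grams; near-duplicate documents share most of these.
    words = text.split()
    return {" ".join(words[i:i + width]) for i in range(max(1, len(words) - width + 1))}

def find_near_duplicates(docs, threshold=0.8, num_perm=128):
    # Return indices of docs whose estimated Jaccard similarity with an
    # earlier doc exceeds the threshold.
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    duplicates = set()
    for i, text in enumerate(docs):
        m = MinHash(num_perm=num_perm)
        for s in shingles(text):
            m.update(s.encode("utf-8"))
        if lsh.query(m):            # any earlier near-duplicate?
            duplicates.add(i)
        else:
            lsh.insert(str(i), m)
    return duplicates
```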
Showing 19 changed files with 5,204 additions and 0 deletions.
447 changes: 447 additions & 0 deletions
tools/near_dedup/PILE_notebooks/EuroParl_near_dedup.ipynb
Large diffs are not rendered by default.
421 changes: 421 additions & 0 deletions
tools/near_dedup/PILE_notebooks/PUBMED_near_dedup.ipynb
Large diffs are not rendered by default.
350 changes: 350 additions & 0 deletions
tools/near_dedup/PILE_notebooks/PhilArch_near_dedup.ipynb
Large diffs are not rendered by default.
236 changes: 236 additions & 0 deletions
tools/near_dedup/PILE_notebooks/apply_deduplication.ipynb
@@ -0,0 +1,236 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "dd13cba6-6f8f-40b4-a956-e707f8fcb877",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
      "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
      " warnings.warn(\n",
      "apply duplicates.pickle to create new data started ...\n",
      "resetting to 1 for number of processes\n",
      "100%|████████████████████████████████████████████| 1/1 [03:05<00:00, 185.44s/it]\n",
      "apply duplicates.pickle to create new data took 185.49705739098135 sec\n"
     ]
    }
   ],
   "source": [
    "input_path = \"/home/vmagent/app/PILE\"\n",
    "output_path = \"/home/vmagent/app/PILE_output/\"\n",
    "\n",
    "bucket = \"FreeLaw\"\n",
    "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "58008b35-e595-47e5-be56-78a9927c24be",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
      "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
      " warnings.warn(\n",
      "apply duplicates.pickle to create new data started ...\n",
      "resetting to 1 for number of processes\n",
      "100%|█████████████████████████████████████████████| 1/1 [01:07<00:00, 67.05s/it]\n",
      "apply duplicates.pickle to create new data took 67.08336020295974 sec\n"
     ]
    }
   ],
   "source": [
    "bucket = \"PUBMED\"\n",
    "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4465fc9b-a38e-4e26-b6bf-8d21a77873d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
      "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
      " warnings.warn(\n",
      "apply duplicates.pickle to create new data started ...\n",
      "resetting to 45 for number of processes\n",
      "100%|███████████████████████████████████████████| 45/45 [00:42<00:00, 1.06it/s]\n",
      "apply duplicates.pickle to create new data took 42.78813537606038 sec\n"
     ]
    }
   ],
   "source": [
    "bucket = \"pile_uspto\"\n",
    "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b7636688-8ea7-4f87-968a-30ca11011f78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
      "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
      " warnings.warn(\n",
      "apply duplicates.pickle to create new data started ...\n",
      "resetting to 1 for number of processes\n",
      "100%|█████████████████████████████████████████████| 1/1 [00:25<00:00, 25.91s/it]\n",
      "apply duplicates.pickle to create new data took 25.924333511968143 sec\n"
     ]
    }
   ],
   "source": [
    "bucket = \"EuroParl\"\n",
    "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "70809848-c1f5-491c-9ac0-af964a269f01",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
      "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
      " warnings.warn(\n",
      "apply duplicates.pickle to create new data started ...\n",
      "resetting to 1 for number of processes\n",
      "100%|█████████████████████████████████████████████| 1/1 [00:11<00:00, 11.25s/it]\n",
      "apply duplicates.pickle to create new data took 11.300638337968849 sec\n"
     ]
    }
   ],
   "source": [
    "bucket = \"NIH\"\n",
    "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8590f524-0b7a-4641-8bb4-2fa88221a344",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
      "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
      " warnings.warn(\n",
      "apply duplicates.pickle to create new data started ...\n",
      "resetting to 1 for number of processes\n",
      "100%|█████████████████████████████████████████████| 1/1 [00:14<00:00, 14.57s/it]\n",
      "apply duplicates.pickle to create new data took 14.592552542919293 sec\n"
     ]
    }
   ],
   "source": [
    "bucket = \"PhilArch\"\n",
    "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4c1c2536-9562-4122-987a-9471ce5825a4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
      "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
      " warnings.warn(\n",
      "apply duplicates.pickle to create new data started ...\n",
      "resetting to 1 for number of processes\n",
      "100%|█████████████████████████████████████████████| 1/1 [00:04<00:00, 4.84s/it]\n",
      "apply duplicates.pickle to create new data took 4.862166045000777 sec\n"
     ]
    }
   ],
   "source": [
    "bucket = \"hn\"\n",
    "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "f9b4c02f-db61-4c70-bdc9-5dc8e65a28fa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
      "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
      " warnings.warn(\n",
      "apply duplicates.pickle to create new data started ...\n",
      "resetting to 48 for number of processes\n",
      "100%|███████████████████████████████████████████| 48/48 [01:38<00:00, 2.05s/it]\n",
      "apply duplicates.pickle to create new data took 98.7749809169909 sec\n"
     ]
    }
   ],
   "source": [
    "bucket = \"pmc\"\n",
    "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46f4a90d-8f46-4c2b-bbd7-e9dc6b85b6dc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
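Each cell above repeats the same `dedup_convert.py` invocation with a different bucket. A compact equivalent (a sketch only, assuming the same directory layout and working directory as in the notebook) would loop over the buckets:

```python
# Sketch: run the dedup_convert.py command shown in the notebook cells
# over every PILE bucket; paths and flags mirror the cells above.
import subprocess

input_path = "/home/vmagent/app/PILE"
output_path = "/home/vmagent/app/PILE_output"
buckets = ["FreeLaw", "PUBMED", "pile_uspto", "EuroParl", "NIH", "PhilArch", "hn", "pmc"]

for bucket in buckets:
    subprocess.run(
        [
            "python", "dedup_convert.py",
            "-d", f"{input_path}/{bucket}",
            "-f", f"{output_path}/{bucket}/deduplicate/duplicates.pickle",
            "-o", f"{output_path}/{bucket}/output",
        ],
        cwd="..",  # the notebook cells run `cd ../` before invoking the script
        check=True,
    )
```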