Commit

Add Near dedup support in tools (intel#53)
* Add new folder for near_dedup process

Signed-off-by: Xue, Chendi <[email protected]>

* Replace with datasketch MinHash calculation (see the sketch after this message)

Signed-off-by: Xue, Chendi <[email protected]>

* Add SlimPajama code as a submodule

Signed-off-by: Xue, Chendi <[email protected]>

* Update near-dedup code

Signed-off-by: Xue, Chendi <[email protected]>

* Pythonize notebook

Signed-off-by: Xue, Chendi <[email protected]>

* Dedup completed and verified on PILE

Signed-off-by: Xue, Chendi <[email protected]>

* remove modelzoo

Signed-off-by: Xue, Chendi <[email protected]>

* Clean up and move all utils support into RecDP

Now installing pyrecdp alone resolves all dependency issues

Signed-off-by: Xue, Chendi <[email protected]>

* Update README

Signed-off-by: Xue, Chendi <[email protected]>

* Update README and fix typo

Signed-off-by: Xue, Chendi <[email protected]>

---------

Signed-off-by: Xue, Chendi <[email protected]>
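
Note: the "Replace with datasketch MinHash calculation" bullet above refers to the datasketch library. A minimal sketch of how near-dedup fingerprinting with datasketch typically looks is shown below; the shingle width, `num_perm`, and `threshold` values are illustrative assumptions, not the settings used in this commit.

```python
# Minimal near-dedup sketch using the datasketch library.
# num_perm, threshold, and the 5-word shingle width are illustrative
# assumptions, not the values used by this commit's pipeline.
from datasketch import MinHash, MinHashLSH

def minhash_of(text, num_perm=128, width=5):
    """Build a MinHash fingerprint from word shingles of a document."""
    words = text.split()
    shingles = {" ".join(words[i:i + width])
                for i in range(max(1, len(words) - width + 1))}
    m = MinHash(num_perm=num_perm)
    for s in shingles:
        m.update(s.encode("utf8"))
    return m

docs = {
    "doc1": "the quick brown fox jumps over the lazy dog today",
    "doc2": "the quick brown fox jumps over the lazy dog again",
}

# The LSH index returns documents whose estimated Jaccard similarity
# to the query exceeds the threshold, i.e. near-duplicate candidates.
lsh = MinHashLSH(threshold=0.8, num_perm=128)
for key, text in docs.items():
    lsh.insert(key, minhash_of(text))

print(lsh.query(minhash_of(docs["doc1"])))  # candidates near doc1
```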
xuechendi authored Aug 31, 2023
1 parent 2d7bfbe commit 5883b03
Showing 19 changed files with 5,204 additions and 0 deletions.
447 changes: 447 additions & 0 deletions tools/near_dedup/PILE_notebooks/EuroParl_near_dedup.ipynb


422 changes: 422 additions & 0 deletions tools/near_dedup/PILE_notebooks/NIH_near_dedup.ipynb


421 changes: 421 additions & 0 deletions tools/near_dedup/PILE_notebooks/PUBMED_near_dedup.ipynb


350 changes: 350 additions & 0 deletions tools/near_dedup/PILE_notebooks/PhilArch_near_dedup.ipynb


250 changes: 250 additions & 0 deletions tools/near_dedup/PILE_notebooks/analysis.ipynb


236 changes: 236 additions & 0 deletions tools/near_dedup/PILE_notebooks/apply_deduplication.ipynb
@@ -0,0 +1,236 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "dd13cba6-6f8f-40b4-a956-e707f8fcb877",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 1 for number of processes\n",
"100%|████████████████████████████████████████████| 1/1 [03:05<00:00, 185.44s/it]\n",
"apply duplicates.pickle to create new data took 185.49705739098135 sec\n"
]
}
],
"source": [
"input_path = \"/home/vmagent/app/PILE\"\n",
"output_path = \"/home/vmagent/app/PILE_output/\"\n",
"\n",
"bucket = \"FreeLaw\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "58008b35-e595-47e5-be56-78a9927c24be",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 1 for number of processes\n",
"100%|█████████████████████████████████████████████| 1/1 [01:07<00:00, 67.05s/it]\n",
"apply duplicates.pickle to create new data took 67.08336020295974 sec\n"
]
}
],
"source": [
"bucket = \"PUBMED\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4465fc9b-a38e-4e26-b6bf-8d21a77873d0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 45 for number of processes\n",
"100%|███████████████████████████████████████████| 45/45 [00:42<00:00, 1.06it/s]\n",
"apply duplicates.pickle to create new data took 42.78813537606038 sec\n"
]
}
],
"source": [
"bucket = \"pile_uspto\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b7636688-8ea7-4f87-968a-30ca11011f78",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 1 for number of processes\n",
"100%|█████████████████████████████████████████████| 1/1 [00:25<00:00, 25.91s/it]\n",
"apply duplicates.pickle to create new data took 25.924333511968143 sec\n"
]
}
],
"source": [
"bucket = \"EuroParl\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "70809848-c1f5-491c-9ac0-af964a269f01",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 1 for number of processes\n",
"100%|█████████████████████████████████████████████| 1/1 [00:11<00:00, 11.25s/it]\n",
"apply duplicates.pickle to create new data took 11.300638337968849 sec\n"
]
}
],
"source": [
"bucket = \"NIH\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8590f524-0b7a-4641-8bb4-2fa88221a344",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 1 for number of processes\n",
"100%|█████████████████████████████████████████████| 1/1 [00:14<00:00, 14.57s/it]\n",
"apply duplicates.pickle to create new data took 14.592552542919293 sec\n"
]
}
],
"source": [
"bucket = \"PhilArch\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "4c1c2536-9562-4122-987a-9471ce5825a4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 1 for number of processes\n",
"100%|█████████████████████████████████████████████| 1/1 [00:04<00:00, 4.84s/it]\n",
"apply duplicates.pickle to create new data took 4.862166045000777 sec\n"
]
}
],
"source": [
"bucket = \"hn\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f9b4c02f-db61-4c70-bdc9-5dc8e65a28fa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 48 for number of processes\n",
"100%|███████████████████████████████████████████| 48/48 [01:38<00:00, 2.05s/it]\n",
"apply duplicates.pickle to create new data took 98.7749809169909 sec\n"
]
}
],
"source": [
"bucket = \"pmc\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46f4a90d-8f46-4c2b-bbd7-e9dc6b85b6dc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
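
Each cell above runs `dedup_convert.py`, which applies a `duplicates.pickle` produced by the near-dedup pass to rewrite a bucket without its duplicate documents. Conceptually, the apply step looks like the sketch below; the pickle layout (a set of `"filename@line_idx"` keys) and the jsonl input format are assumptions for illustration, not the script's actual internals.

```python
# Hypothetical sketch of applying duplicates.pickle to filter a bucket.
# The pickle layout (a set of "filename@line_idx" keys) and the jsonl
# format are assumptions; the real dedup_convert.py may differ.
import pickle
from pathlib import Path

def apply_dedup(input_dir, duplicates_pickle, output_dir):
    with open(duplicates_pickle, "rb") as f:
        duplicates = pickle.load(f)  # assumed: set of "filename@line_idx"
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    for src in Path(input_dir).glob("*.jsonl"):
        # Stream each file, keeping only lines not flagged as duplicates.
        with open(src) as fin, open(out / src.name, "w") as fout:
            for idx, line in enumerate(fin):
                if f"{src.name}@{idx}" not in duplicates:
                    fout.write(line)

apply_dedup("/home/vmagent/app/PILE/NIH",
            "/home/vmagent/app/PILE_output/NIH/deduplicate/duplicates.pickle",
            "/home/vmagent/app/PILE_output/NIH/output")
```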
