Commit

Add Near dedup support in tools (intel#53)
* Add new folder for near_dedup process

Signed-off-by: Xue, Chendi <[email protected]>

* Replace with datasketch MinHash calculation (see the sketch after this message)

Signed-off-by: Xue, Chendi <[email protected]>

* Add SlimPajama code as a submodule

Signed-off-by: Xue, Chendi <[email protected]>

* Update near-dedup code

Signed-off-by: Xue, Chendi <[email protected]>

* Pythonize notebook

Signed-off-by: Xue, Chendi <[email protected]>

* Dedup completed and verified on PILE

Signed-off-by: Xue, Chendi <[email protected]>

* remove modelzoo

Signed-off-by: Xue, Chendi <[email protected]>

* Clean up and move all utils support into RecDP

Now installing pyrecdp alone resolves all dependency issues

Signed-off-by: Xue, Chendi <[email protected]>

* Update README

Signed-off-by: Xue, Chendi <[email protected]>

* Update README and fix typo

Signed-off-by: Xue, Chendi <[email protected]>

---------

Signed-off-by: Xue, Chendi <[email protected]>
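
Note: the "Replace with datasketch MinHash calculation" bullet above refers to the datasketch library. A minimal sketch of how near-dedup fingerprinting with datasketch typically looks is shown below; the shingle width, `num_perm`, and `threshold` values are illustrative assumptions, not the settings used in this commit.

```python
# Minimal near-dedup sketch using the datasketch library.
# num_perm, threshold, and the 5-word shingle width are illustrative
# assumptions, not the values used by this commit's pipeline.
from datasketch import MinHash, MinHashLSH

def minhash_of(text, num_perm=128, width=5):
    """Build a MinHash fingerprint from word shingles of a document."""
    words = text.split()
    shingles = {" ".join(words[i:i + width])
                for i in range(max(1, len(words) - width + 1))}
    m = MinHash(num_perm=num_perm)
    for s in shingles:
        m.update(s.encode("utf8"))
    return m

docs = {
    "doc1": "the quick brown fox jumps over the lazy dog today",
    "doc2": "the quick brown fox jumps over the lazy dog again",
}

# The LSH index returns documents whose estimated Jaccard similarity
# to the query exceeds the threshold, i.e. near-duplicate candidates.
lsh = MinHashLSH(threshold=0.8, num_perm=128)
for key, text in docs.items():
    lsh.insert(key, minhash_of(text))

print(lsh.query(minhash_of(docs["doc1"])))  # candidates near doc1
```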
xuechendi authored Aug 31, 2023
1 parent 2d7bfbe commit 5883b03
Showing 19 changed files with 5,204 additions and 0 deletions.
447 changes: 447 additions & 0 deletions tools/near_dedup/PILE_notebooks/EuroParl_near_dedup.ipynb


422 changes: 422 additions & 0 deletions tools/near_dedup/PILE_notebooks/NIH_near_dedup.ipynb


421 changes: 421 additions & 0 deletions tools/near_dedup/PILE_notebooks/PUBMED_near_dedup.ipynb


350 changes: 350 additions & 0 deletions tools/near_dedup/PILE_notebooks/PhilArch_near_dedup.ipynb


250 changes: 250 additions & 0 deletions tools/near_dedup/PILE_notebooks/analysis.ipynb


236 changes: 236 additions & 0 deletions tools/near_dedup/PILE_notebooks/apply_deduplication.ipynb
@@ -0,0 +1,236 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "dd13cba6-6f8f-40b4-a956-e707f8fcb877",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 1 for number of processes\n",
"100%|████████████████████████████████████████████| 1/1 [03:05<00:00, 185.44s/it]\n",
"apply duplicates.pickle to create new data took 185.49705739098135 sec\n"
]
}
],
"source": [
"input_path = \"/home/vmagent/app/PILE\"\n",
"output_path = \"/home/vmagent/app/PILE_output/\"\n",
"\n",
"bucket = \"FreeLaw\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "58008b35-e595-47e5-be56-78a9927c24be",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 1 for number of processes\n",
"100%|█████████████████████████████████████████████| 1/1 [01:07<00:00, 67.05s/it]\n",
"apply duplicates.pickle to create new data took 67.08336020295974 sec\n"
]
}
],
"source": [
"bucket = \"PUBMED\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4465fc9b-a38e-4e26-b6bf-8d21a77873d0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 45 for number of processes\n",
"100%|███████████████████████████████████████████| 45/45 [00:42<00:00, 1.06it/s]\n",
"apply duplicates.pickle to create new data took 42.78813537606038 sec\n"
]
}
],
"source": [
"bucket = \"pile_uspto\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b7636688-8ea7-4f87-968a-30ca11011f78",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 1 for number of processes\n",
"100%|█████████████████████████████████████████████| 1/1 [00:25<00:00, 25.91s/it]\n",
"apply duplicates.pickle to create new data took 25.924333511968143 sec\n"
]
}
],
"source": [
"bucket = \"EuroParl\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "70809848-c1f5-491c-9ac0-af964a269f01",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 1 for number of processes\n",
"100%|█████████████████████████████████████████████| 1/1 [00:11<00:00, 11.25s/it]\n",
"apply duplicates.pickle to create new data took 11.300638337968849 sec\n"
]
}
],
"source": [
"bucket = \"NIH\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8590f524-0b7a-4641-8bb4-2fa88221a344",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 1 for number of processes\n",
"100%|█████████████████████████████████████████████| 1/1 [00:14<00:00, 14.57s/it]\n",
"apply duplicates.pickle to create new data took 14.592552542919293 sec\n"
]
}
],
"source": [
"bucket = \"PhilArch\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "4c1c2536-9562-4122-987a-9471ce5825a4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 1 for number of processes\n",
"100%|█████████████████████████████████████████████| 1/1 [00:04<00:00, 4.84s/it]\n",
"apply duplicates.pickle to create new data took 4.862166045000777 sec\n"
]
}
],
"source": [
"bucket = \"hn\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f9b4c02f-db61-4c70-bdc9-5dc8e65a28fa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n",
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"apply duplicates.pickle to create new data started ...\n",
"resetting to 48 for number of processes\n",
"100%|███████████████████████████████████████████| 48/48 [01:38<00:00, 2.05s/it]\n",
"apply duplicates.pickle to create new data took 98.7749809169909 sec\n"
]
}
],
"source": [
"bucket = \"pmc\"\n",
"! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46f4a90d-8f46-4c2b-bbd7-e9dc6b85b6dc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
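
Each cell above runs `dedup_convert.py`, which applies a `duplicates.pickle` produced by the near-dedup pass to rewrite a bucket without its duplicate documents. Conceptually, the apply step looks like the sketch below; the pickle layout (a set of `"filename@line_idx"` keys) and the jsonl input format are assumptions for illustration, not the script's actual internals.

```python
# Hypothetical sketch of applying duplicates.pickle to filter a bucket.
# The pickle layout (a set of "filename@line_idx" keys) and the jsonl
# format are assumptions; the real dedup_convert.py may differ.
import pickle
from pathlib import Path

def apply_dedup(input_dir, duplicates_pickle, output_dir):
    with open(duplicates_pickle, "rb") as f:
        duplicates = pickle.load(f)  # assumed: set of "filename@line_idx"
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    for src in Path(input_dir).glob("*.jsonl"):
        # Stream each file, keeping only lines not flagged as duplicates.
        with open(src) as fin, open(out / src.name, "w") as fout:
            for idx, line in enumerate(fin):
                if f"{src.name}@{idx}" not in duplicates:
                    fout.write(line)

apply_dedup("/home/vmagent/app/PILE/NIH",
            "/home/vmagent/app/PILE_output/NIH/deduplicate/duplicates.pickle",
            "/home/vmagent/app/PILE_output/NIH/output")
```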
