diff --git a/tools/near_dedup/PILE_notebooks/EuroParl_near_dedup.ipynb b/tools/near_dedup/PILE_notebooks/EuroParl_near_dedup.ipynb new file mode 100644 index 000000000..4f9778881 --- /dev/null +++ b/tools/near_dedup/PILE_notebooks/EuroParl_near_dedup.ipynb @@ -0,0 +1,447 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e01a822b-abf8-4327-ae5c-9723ad11c0ab", + "metadata": {}, + "source": [ + "## Import and define" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24ffc3f0-f5c7-460e-b400-b35b48f54b0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\t\t\t\n", + "\t\t" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Will assign 48 cores and 308492 M memory for spark\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23/08/30 16:59:52 WARN Utils: Your hostname, sr414 resolves to a loopback address: 127.0.1.1; using 10.1.2.14 instead (on interface enp134s0f1)\n", + "23/08/30 16:59:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel).\n", + "23/08/30 16:59:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "per core memory size is 6.276 GB and shuffle_disk maximum capacity is 8589934592.000 GB\n" + ] + } + ], + "source": [ + "import sys\n", + "cur_path = \"/home/vmagent/app\"\n", + "sys.path.append(cur_path)\n", + "\n", + "from near_dedup import *" + ] + }, + { + "cell_type": "markdown", + "id": "3a51e611-df76-4e69-86b4-addf91ce4306", + "metadata": {}, + "source": [ + "## Configure paths and parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f54ef8ee-4a10-475e-a83d-6a9a5f5fed07", + "metadata": {}, + "outputs": [], + "source": [ + "rdp = SparkDataProcessor()\n", + "spark=rdp.spark " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "54869228-a39c-441b-b757-ba337dfdd8d5", + "metadata": {}, + "outputs": [], + "source": [ + "data_files = get_data_files('/home/vmagent/app/PILE/EuroParl')\n", + "dup_dir = \"/home/vmagent/app/PILE_output/EuroParl/deduplicate\"\n", + "ngram_size = 13\n", + "num_perm = 256\n", + "bands = 9\n", + "ranges = 13" + ] + }, + { + "cell_type": "markdown", + "id": "67299966-7ae2-492a-bea6-195721f5ee9f", + "metadata": {}, + "source": [ + "## Load data into Spark" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bd3432d2-9f18-4b49-9747-6b4730e97100", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID started ...\n", + "/home/vmagent/app/PILE/EuroParl/EuroParliamentProceedings_1996_2011.jsonl\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 3:=====================================================> (195 + 5) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID took 16.729979965020902 
sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "with Timer(\"Load data with RowID\"):\n", + " df = read_json(data_files, spark).cache()\n", + " total_length = df.count()" + ] + }, + { + "cell_type": "markdown", + "id": "5a0f56b2-ebdc-4933-81fd-399e3234ea81", + "metadata": {}, + "source": [ + "## Get minHashLSH edges" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7a5ba001-0adc-49c5-b0fc-c539499318bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_bands is 9, ranges is 13\n", + "generate minHashLsh started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 14:=============================================> (173 + 27) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate minHashLsh took 271.9689698460279 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "pipeline = minHashLSH_prepare(df, num_perm, ngram_size, bands, ranges)\n", + "with Timer(\"generate minHashLsh\"):\n", + " if os.path.exists(dup_dir):\n", + " shutil.rmtree(dup_dir, ignore_errors=True)\n", + " results = pipeline.saveAsTextFile(dup_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "82df9de1-8118-4d9b-9c44-4ce4da3039a9", + "metadata": {}, + "outputs": [], + "source": [ + "spark.stop()" + ] + }, + { + "cell_type": "markdown", + "id": "0116c740-a373-469c-937e-bcedb20f71d9", + "metadata": {}, + "source": [ + "## Generate connected components" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b6c69644-a12c-433d-9eaf-8632c63c042b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate_connected_components all started ...\n", + "Started graph building\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"loop on file: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 203/203 [00:00<00:00, 6960.22it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Failed to process /home/vmagent/app/PILE_output/EuroParl/deduplicate/duplicates.pickle, error is 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", + "Failed to process /home/vmagent/app/PILE_output/EuroParl/deduplicate/connected_components.pickle, error is 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", + "length of the set of duplicates: 902 0.034215688705444336\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 902/902 [00:00<00:00, 861594.67it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of connected components: 340 0.03926563262939453\n", + "Graph generated duplicates list!!! 
0.041168928146362305\n", + "generate_connected_components all took 0.045084498007781804 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_connected_components all\"):\n", + " dup_connected_args = argparse.Namespace()\n", + " dup_connected_args.input_dir = dup_dir\n", + " dup_connected_args.out_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " generate_connected_components.generate_connected_components_mp(dup_connected_args)" + ] + }, + { + "cell_type": "markdown", + "id": "5abadeea-2aed-4de0-9508-6f17d735adf2", + "metadata": {}, + "source": [ + "## convert as duplicates dict" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2dea1212-989f-4544-a087-0bfb1b40c664", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate_duplicates_dict all started ...\n", + "Processing duplicates!!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 340/340 [00:00<00:00, 175970.31it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of duplicate documents that will be removed: 886\n", + "generate_duplicates_dict all took 0.008639081963337958 sec\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_duplicates_dict all\"):\n", + " dup_docs = os.path.join(dup_dir, \"duplicates.pickle\")\n", + " dup_dict_args = argparse.Namespace()\n", + " dup_dict_args.input_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " dup_dict_args.out_file = dup_docs\n", + " generate_duplicates_dict.generate_duplicates(dup_dict_args)" + ] + }, + { + "cell_type": "markdown", + "id": "dbc0610b-6820-4beb-b71b-ab04f76ef97c", + "metadata": {}, + "source": [ + "## View result" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 7, + "id": "e056210c-907c-4938-b439-d31a4824eecb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completed!!\n", + " total processed 69814 documents\n", + " total detected 886 duplicated documents\n", + " duplicate ratio is 0.01269086429655943\n" + ] + } + ], + "source": [ + "dup_dict = pickle.load(open(os.path.join(dup_dir, \"duplicates.pickle\"), 'rb'))\n", + "dup_sum = 0\n", + "for _, v in dup_dict.items():\n", + " dup_sum += len(list(v))\n", + "\n", + "print(f\"Completed!!\")\n", + "print(f\" total processed {total_length} documents\")\n", + "print(f\" total detected {dup_sum} duplicated documents\")\n", + "print(f\" duplicate ratio is {dup_sum/total_length}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tools/near_dedup/PILE_notebooks/NIH_near_dedup.ipynb b/tools/near_dedup/PILE_notebooks/NIH_near_dedup.ipynb new file mode 100644 index 000000000..bafb2eebd --- /dev/null +++ b/tools/near_dedup/PILE_notebooks/NIH_near_dedup.ipynb @@ -0,0 +1,422 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e01a822b-abf8-4327-ae5c-9723ad11c0ab", + "metadata": {}, + "source": [ + "## Import and define" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24ffc3f0-f5c7-460e-b400-b35b48f54b0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\t\t\t\n", + "\t\t" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sys\n", + "cur_path = \"/home/vmagent/app\"\n", + "sys.path.append(cur_path)\n", + "\n", + "from near_dedup import *" + ] + }, + { + "cell_type": "markdown", + "id": "3a51e611-df76-4e69-86b4-addf91ce4306", + "metadata": {}, + "source": [ + "## Configure paths and parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "54869228-a39c-441b-b757-ba337dfdd8d5", + "metadata": {}, + "outputs": [], + "source": [ + "data_files = get_data_files('/home/vmagent/app/PILE/NIH')\n", + "dup_dir = \"/home/vmagent/app/PILE_output/NIH/deduplicate\"\n", + "ngram_size = 13\n", + "num_perm = 256\n", + "bands = 9\n", + "ranges = 13" + ] + }, + { + "cell_type": "markdown", + "id": "67299966-7ae2-492a-bea6-195721f5ee9f", + "metadata": {}, + "source": [ + "## Load data into Spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be318620-6fe1-44c6-991e-da208f8da1af", + "metadata": {}, + "outputs": [], + "source": [ + "rdp = SparkDataProcessor()\n", + "spark=rdp.spark " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bd3432d2-9f18-4b49-9747-6b4730e97100", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID started ...\n", + "/home/vmagent/app/PILE/NIH/NIH_ExPORTER_awarded_grant_text.jsonl\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 3:=============================================> (170 + 
30) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID took 14.274296162999235 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "with Timer(\"Load data with RowID\"):\n", + " df = read_json(data_files, spark).cache()\n", + " total_length = df.count()" + ] + }, + { + "cell_type": "markdown", + "id": "5a0f56b2-ebdc-4933-81fd-399e3234ea81", + "metadata": {}, + "source": [ + "## Get minHashLSH edges" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7a5ba001-0adc-49c5-b0fc-c539499318bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_bands is 9, ranges is 13\n", + "generate minHashLsh started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 14:==============================================> (177 + 23) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate minHashLsh took 114.12813645100687 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "pipeline = minHashLSH_prepare(df, num_perm, ngram_size, bands, ranges)\n", + "with Timer(\"generate minHashLsh\"):\n", + " if os.path.exists(dup_dir):\n", + " shutil.rmtree(dup_dir, ignore_errors=True)\n", + " results = pipeline.saveAsTextFile(dup_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aff3047a-6f75-4529-9bbf-e02d8545e7eb", + "metadata": {}, + "outputs": [], + "source": [ + "spark.stop()" + ] + }, + { + "cell_type": "markdown", + "id": "0116c740-a373-469c-937e-bcedb20f71d9", + "metadata": {}, + "source": [ + "## Generate connected components" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b6c69644-a12c-433d-9eaf-8632c63c042b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"generate_connected_components all started ...\n", + "Started graph building\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loop on file: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 203/203 [00:00<00:00, 1362.17it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Failed to process /home/vmagent/app/PILE_output/NIH/deduplicate/duplicates.pickle, error is 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", + "Failed to process /home/vmagent/app/PILE_output/NIH/deduplicate/connected_components.pickle, error is 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", + "length of the set of duplicates: 103640 0.15403461456298828\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 103640/103640 [00:00<00:00, 879390.49it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of connected components: 83410 0.4011833667755127\n", + "Graph generated duplicates list!!! 
0.4935333728790283\n", + "generate_connected_components all took 0.5348551459610462 sec\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_connected_components all\"):\n", + " dup_connected_args = argparse.Namespace()\n", + " dup_connected_args.input_dir = dup_dir\n", + " dup_connected_args.out_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " generate_connected_components.generate_connected_components_mp(dup_connected_args)" + ] + }, + { + "cell_type": "markdown", + "id": "5abadeea-2aed-4de0-9508-6f17d735adf2", + "metadata": {}, + "source": [ + "## convert as duplicates dict" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2dea1212-989f-4544-a087-0bfb1b40c664", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate_duplicates_dict all started ...\n", + "Processing duplicates!!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 83410/83410 [00:00<00:00, 871147.90it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of duplicate documents that will be removed: 98299\n", + "generate_duplicates_dict all took 0.24805862794164568 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_duplicates_dict all\"):\n", + " dup_docs = os.path.join(dup_dir, \"duplicates.pickle\")\n", + " dup_dict_args = argparse.Namespace()\n", + " dup_dict_args.input_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " dup_dict_args.out_file = dup_docs\n", + " generate_duplicates_dict.generate_duplicates(dup_dict_args)" + ] + }, + { + "cell_type": "markdown", + "id": "dbc0610b-6820-4beb-b71b-ab04f76ef97c", + "metadata": {}, + "source": [ + "## View result" + ] + }, + { + 
"cell_type": "code", + "execution_count": 7, + "id": "e056210c-907c-4938-b439-d31a4824eecb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completed!!\n", + " total processed 939661 documents\n", + " total detected 98299 duplicated documents\n", + " duplicate ratio is 0.1046111310355543\n" + ] + } + ], + "source": [ + "dup_dict = pickle.load(open(os.path.join(dup_dir, \"duplicates.pickle\"), 'rb'))\n", + "dup_sum = 0\n", + "for _, v in dup_dict.items():\n", + " dup_sum += len(list(v))\n", + "\n", + "print(f\"Completed!!\")\n", + "print(f\" total processed {total_length} documents\")\n", + "print(f\" total detected {dup_sum} duplicated documents\")\n", + "print(f\" duplicate ratio is {dup_sum/total_length}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tools/near_dedup/PILE_notebooks/PUBMED_near_dedup.ipynb b/tools/near_dedup/PILE_notebooks/PUBMED_near_dedup.ipynb new file mode 100644 index 000000000..c9db8fb9f --- /dev/null +++ b/tools/near_dedup/PILE_notebooks/PUBMED_near_dedup.ipynb @@ -0,0 +1,421 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e01a822b-abf8-4327-ae5c-9723ad11c0ab", + "metadata": {}, + "source": [ + "## Import and define" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24ffc3f0-f5c7-460e-b400-b35b48f54b0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\t\t\t\n", + "\t\t" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sys\n", + "cur_path = \"/home/vmagent/app\"\n", + "sys.path.append(cur_path)\n", + "from near_dedup import *\n" + ] + }, + { + "cell_type": "markdown", + "id": "3a51e611-df76-4e69-86b4-addf91ce4306", + "metadata": {}, + "source": [ + "## Configurate DIR" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "54869228-a39c-441b-b757-ba337dfdd8d5", + "metadata": {}, + "outputs": [], + "source": [ + "data_files = get_data_files('/home/vmagent/app/PILE/PUBMED')\n", + "dup_dir = \"/home/vmagent/app/PILE_output/PUBMED/deduplicate\"\n", + "ngram_size = 13\n", + "num_perm = 256\n", + "bands = 9\n", + "ranges = 13" + ] + }, + { + "cell_type": "markdown", + "id": "67299966-7ae2-492a-bea6-195721f5ee9f", + "metadata": {}, + "source": [ + "## Load data into Spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d80742b3-9989-4c94-9953-383795826456", + "metadata": {}, + "outputs": [], + "source": [ + "rdp = SparkDataProcessor()\n", + "spark=rdp.spark " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bd3432d2-9f18-4b49-9747-6b4730e97100", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID started ...\n", + "/home/vmagent/app/PILE/PUBMED/PUBMED_title_abstracts_2019_baseline.jsonl\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 
3:====================================================> (192 + 8) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID took 51.56134534499142 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "with Timer(\"Load data with RowID\"):\n", + " df = read_json(data_files, spark).cache()\n", + " total_length = df.count()" + ] + }, + { + "cell_type": "markdown", + "id": "5a0f56b2-ebdc-4933-81fd-399e3234ea81", + "metadata": {}, + "source": [ + "## Get minHashLSH edges" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7a5ba001-0adc-49c5-b0fc-c539499318bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_bands is 9, ranges is 13\n", + "generate minHashLsh started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 14:==================================================> (189 + 11) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate minHashLsh took 1617.5527220640797 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "pipeline = minHashLSH_prepare(df, num_perm, ngram_size, bands, ranges)\n", + "with Timer(\"generate minHashLsh\"):\n", + " if os.path.exists(dup_dir):\n", + " shutil.rmtree(dup_dir, ignore_errors=True)\n", + " results = pipeline.saveAsTextFile(dup_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c40de55-ae3a-4747-90df-554cc737d206", + "metadata": {}, + "outputs": [], + "source": [ + "spark.stop()" + ] + }, + { + "cell_type": "markdown", + "id": "0116c740-a373-469c-937e-bcedb20f71d9", + "metadata": {}, + "source": [ + "## Generate connected components" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b6c69644-a12c-433d-9eaf-8632c63c042b", + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "generate_connected_components all started ...\n", + "Started graph building\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loop on file: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 203/203 [00:00<00:00, 3169.55it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Failed to process /home/vmagent/app/PILE_output/PUBMED/deduplicate/duplicates.pickle, error is 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", + "Failed to process /home/vmagent/app/PILE_output/PUBMED/deduplicate/connected_components.pickle, error is 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", + "length of the set of duplicates: 27003 0.06862425804138184\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27003/27003 [00:00<00:00, 1095388.52it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of connected components: 15384 0.10237360000610352\n", + "Graph generated duplicates list!!! 
0.11692953109741211\n", + "generate_connected_components all took 0.12613834207877517 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_connected_components all\"):\n", + " dup_connected_args = argparse.Namespace()\n", + " dup_connected_args.input_dir = dup_dir\n", + " dup_connected_args.out_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " generate_connected_components.generate_connected_components_mp(dup_connected_args)" + ] + }, + { + "cell_type": "markdown", + "id": "5abadeea-2aed-4de0-9508-6f17d735adf2", + "metadata": {}, + "source": [ + "## convert as duplicates dict" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2dea1212-989f-4544-a087-0bfb1b40c664", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate_duplicates_dict all started ...\n", + "Processing duplicates!!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15384/15384 [00:00<00:00, 774370.22it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of duplicate documents that will be removed: 26417\n", + "generate_duplicates_dict all took 0.035620245966129005 sec\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_duplicates_dict all\"):\n", + " dup_docs = os.path.join(dup_dir, \"duplicates.pickle\")\n", + " dup_dict_args = argparse.Namespace()\n", + " dup_dict_args.input_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " dup_dict_args.out_file = dup_docs\n", + " generate_duplicates_dict.generate_duplicates(dup_dict_args)" + ] + }, + { + "cell_type": "markdown", + "id": "dbc0610b-6820-4beb-b71b-ab04f76ef97c", + "metadata": {}, + "source": [ + "## View result" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 7, + "id": "e056210c-907c-4938-b439-d31a4824eecb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completed!!\n", + " total processed 15518009 documents\n", + " total detected 26417 duplicated documents\n", + " duplicate ratio is 0.0017023446757892717\n" + ] + } + ], + "source": [ + "dup_dict = pickle.load(open(os.path.join(dup_dir, \"duplicates.pickle\"), 'rb'))\n", + "dup_sum = 0\n", + "for _, v in dup_dict.items():\n", + " dup_sum += len(list(v))\n", + "\n", + "print(f\"Completed!!\")\n", + "print(f\" total processed {total_length} documents\")\n", + "print(f\" total detected {dup_sum} duplicated documents\")\n", + "print(f\" duplicate ratio is {dup_sum/total_length}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tools/near_dedup/PILE_notebooks/PhilArch_near_dedup.ipynb b/tools/near_dedup/PILE_notebooks/PhilArch_near_dedup.ipynb new file mode 100644 index 000000000..8289fa47f --- /dev/null +++ b/tools/near_dedup/PILE_notebooks/PhilArch_near_dedup.ipynb @@ -0,0 +1,350 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e01a822b-abf8-4327-ae5c-9723ad11c0ab", + "metadata": {}, + "source": [ + "## Import and define" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24ffc3f0-f5c7-460e-b400-b35b48f54b0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\t\t\t\n", + "\t\t" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sys\n", + "cur_path = \"/home/vmagent/app\"\n", + "sys.path.append(cur_path)\n", + "from near_dedup import *\n" + ] + }, + { + "cell_type": "markdown", + "id": "3a51e611-df76-4e69-86b4-addf91ce4306", + "metadata": {}, + "source": [ + "## Configure paths and parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "54869228-a39c-441b-b757-ba337dfdd8d5", + "metadata": {}, + "outputs": [], + "source": [ + "data_files = get_data_files('/home/vmagent/app/PILE/PhilArch')\n", + "dup_dir = \"/home/vmagent/app/PILE_output/PhilArch/deduplicate\"\n", + "ngram_size = 13\n", + "num_perm = 256\n", + "bands = 9\n", + "ranges = 13" + ] + }, + { + "cell_type": "markdown", + "id": "67299966-7ae2-492a-bea6-195721f5ee9f", + "metadata": {}, + "source": [ + "## Load data into Spark" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "da18c763-634b-4333-9c5b-6bfe7c325ec2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Will assign 48 cores and 308492 M memory for spark\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23/08/31 16:15:23 WARN Utils: Your hostname, sr414 resolves to a loopback address: 127.0.1.1; using 10.1.2.14 instead (on interface enp134s0f1)\n", + "23/08/31 16:15:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level 
use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "23/08/31 16:15:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "per core memory size is 6.276 GB and shuffle_disk maximum capacity is 8589934592.000 GB\n" + ] + } + ], + "source": [ + "rdp = SparkDataProcessor()\n", + "spark=rdp.spark " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bd3432d2-9f18-4b49-9747-6b4730e97100", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID started ...\n", + "/home/vmagent/app/PILE/PhilArch/PhilArchive.jsonl\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 3:=================================================> (183 + 17) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID took 13.072971040033735 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "with Timer(\"Load data with RowID\"):\n", + " df = read_json(data_files, spark).cache()\n", + " total_length = df.count()" + ] + }, + { + "cell_type": "markdown", + "id": "5a0f56b2-ebdc-4933-81fd-399e3234ea81", + "metadata": {}, + "source": [ + "## Get minHashLSH edges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a5ba001-0adc-49c5-b0fc-c539499318bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_bands is 9, ranges is 13\n", + "generate minHashLsh started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 12:=========================> (96 + 48) / 200]\r" + ] + } + ], + "source": [ + "pipeline = minHashLSH_prepare(df, num_perm, ngram_size, bands, ranges)\n", + "with Timer(\"generate minHashLsh\"):\n", + " 
if os.path.exists(dup_dir):\n", + " shutil.rmtree(dup_dir, ignore_errors=True)\n", + " results = pipeline.saveAsTextFile(dup_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba4b0b14-7976-4a31-80db-2ed2fe6ec3ed", + "metadata": {}, + "outputs": [], + "source": [ + "spark.stop()" + ] + }, + { + "cell_type": "markdown", + "id": "0116c740-a373-469c-937e-bcedb20f71d9", + "metadata": {}, + "source": [ + "## Generate connected components" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c69644-a12c-433d-9eaf-8632c63c042b", + "metadata": {}, + "outputs": [], + "source": [ + "with Timer(f\"generate_connected_components all\"):\n", + " dup_connected_args = argparse.Namespace()\n", + " dup_connected_args.input_dir = dup_dir\n", + " dup_connected_args.out_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " generate_connected_components.generate_connected_components_mp(dup_connected_args)" + ] + }, + { + "cell_type": "markdown", + "id": "5abadeea-2aed-4de0-9508-6f17d735adf2", + "metadata": {}, + "source": [ + "## Convert to duplicates dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2dea1212-989f-4544-a087-0bfb1b40c664", + "metadata": {}, + "outputs": [], + "source": [ + "with Timer(f\"generate_duplicates_dict all\"):\n", + " dup_docs = os.path.join(dup_dir, \"duplicates.pickle\")\n", + " dup_dict_args = argparse.Namespace()\n", + " dup_dict_args.input_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " dup_dict_args.out_file = dup_docs\n", + " generate_duplicates_dict.generate_duplicates(dup_dict_args)" + ] + }, + { + "cell_type": "markdown", + "id": "dbc0610b-6820-4beb-b71b-ab04f76ef97c", + "metadata": {}, + "source": [ + "## View result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e056210c-907c-4938-b439-d31a4824eecb", + "metadata": {}, + "outputs": [], + "source": [ + "dup_dict = 
pickle.load(open(os.path.join(dup_dir, \"duplicates.pickle\"), 'rb'))\n", + "dup_sum = 0\n", + "for _, v in dup_dict.items():\n", + " dup_sum += len(list(v))\n", + "\n", + "print(f\"Completed!!\")\n", + "print(f\" total processed {total_length} documents\")\n", + "print(f\" total detected {dup_sum} duplicated documents\")\n", + "print(f\" duplicate ratio is {dup_sum/total_length}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tools/near_dedup/PILE_notebooks/analysis.ipynb b/tools/near_dedup/PILE_notebooks/analysis.ipynb new file mode 100644 index 000000000..da18d1fa3 --- /dev/null +++ b/tools/near_dedup/PILE_notebooks/analysis.ipynb @@ -0,0 +1,250 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "6f6a7b1a-c1e5-41d4-a9e1-06708fd83352", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. 
pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\t\t\t\n", + "\t\t" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sys\n", + "cur_path = \"/home/vmagent/app\"\n", + "sys.path.append(cur_path)\n", + "! cp -r {cur_path}/near_dedup.py /usr/local/lib/python3.10/dist-packages/\n", + "! cp -r {cur_path}/third_party /usr/local/lib/python3.10/dist-packages/\n", + "\n", + "from near_dedup import *" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "00d759c0-0cfa-4685-821a-2231a091a4de", + "metadata": {}, + "outputs": [], + "source": [ + "path = '/home/vmagent/app/PILE/NIH'\n", + "dup_dir = \"/home/vmagent/app/PILE_output/NIH/deduplicate\"\n", + "from third_party.generate_duplicates_dict import *\n", + "input_file = os.path.join(dup_dir, \"connected_components.pickle\")\n", + "with open(input_file, \"rb\") as fin:\n", + " components, n_components, reversed_mapper = pickle.load(fin)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "8451e410-318e-4209-8a55-b49b2ce598d8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "left is\n", + " NIH_ExPORTER_awarded_grant_text.jsonl@656163\n", + "right is\n", + " NIH_ExPORTER_awarded_grant_text.jsonl@551098\n", + "NIH_ExPORTER_awarded_grant_text.jsonl@656163 content is \n", + "{\"meta\": {\"APPLICATION_ID\": 8134498}, \"text\": \"The bacterial twin-arginine translocation (Tat) system exports proteins across the cytoplasmic membrane. Unlike most well-studied protein translocation systems, which transport \\\"linearized,\\\" or unfolded, polypeptides across a membrane, the Tat system translocates fully folded and assembled proteins and protein complexes. The Tat system transports many proteins that must assemble complex metallo-redox centers before transport. 
In some cases, the quaternary contacts between distinct subunits must be established before an assembled protein complex can be transported. Since the bacterial cytoplasmic membrane supports ion gradients, a major unresolved question is how large protein complexes >100 kDa can be transported across this membrane by the Tat machinery without collapsing the proton motive force used to make ATP. Due to its ability to transport large protein structures that must be fully folded before export, the Tat machinery is potentially important for the biotechnology industry as a system to bacterially express protein therapeutics that require a cytoplasm for maturation. Products could be recovered directly from the growth medium. Though the Tat transport system is not required for growth in all organisms that encode it, it is responsible for the export of a number of bacterial virulence factors, and the absence of a functional Tat system often leads to growth defects. Considering that the Tat system is found in many bacteria, but not found in animals, including humans, the Tat system is likely to be an excellent target for antibiotic development. Currently, the mechanism of Tat translocation is poorly understood. Three membrane proteins, TatA, TatB and TatC comprise the membrane translocase, forming numerous oligomeric complexes within the membrane. The transmembrane electric field is essential for driving efficient transport, presumably through a gated-pore. The common method to characterize protein translocation systems involves trapping a cargo protein during transport, that is, to form translocation intermediates. The Tat machinery has thus far resisted this approach. The Specific Aim of this proposal is to develop a high-throughput screening assay that will be used to search for candidate inhibitors of Tat transport. Positive hits from the primary screen will be validated using secondary screens and in vitro assays. 
Bona fide Tat transport inhibitors will be used to assist with mechanistic studies of Tat transport, and will be evaluated for pharmaceutical potential. PUBLIC HEALTH RELEVANCE: RELEVANCE: This proposal seeks inhibitors of the bacterial twin-arginine translocation (Tat) system, protein secretion machinery that is responsible for the export of a number of bacterial virulence factors, and that contributes to efficient bacterial growth. These inhibitors will be used to assist with future mechanistic studies of Tat transport, and will be evaluated for their possible pharmaceutical potential. Understanding the mechanism of Tat transport is essential for utilizing this unique system for the bacterial expression of protein therapeutics that requires a cytoplasm for maturation.\"}\n", + "NIH_ExPORTER_awarded_grant_text.jsonl@551098 content is \n", + "{\"meta\": {\"APPLICATION_ID\": 7617460}, \"text\": \"The bacterial twin-arginine translocation (Tat) system exports proteins across the cytoplasmic membrane. Unlike most well-studied protein translocation systems, which transport \\\"linearized,\\\" or unfolded, polypeptides across a membrane, the Tat system translocates fully folded and assembled proteins and protein complexes. The Tat system transports many proteins that must assemble complex metallo-redox centers before transport. In some cases, the quaternary contacts between distinct subunits must be established before an assembled protein complex can be transported. Since the bacterial cytoplasmic membrane supports ion gradients, a major unresolved question is how large protein complexes > 100 kDa can be transported across this membrane by the Tat machinery without collapsing the proton motive force used to make ATP. 
Due to its ability to transport large protein structures that must be fully folded before export, the Tat machinery is potentially important for the biotechnology industry as a system to bacterially express protein therapeutics that require a cytoplasm for maturation. Products could be recovered directly from the growth medium. Though the Tat transport system is not required for growth in all organisms that encode it, it is responsible for the export of a number of bacterial virulence factors, and the absence of a functional Tat system often leads to growth defects. Considering that the Tat system is found in many bacteria, but not found in animals, including humans, the Tat system is likely to be an excellent target for antibiotic development. Currently, the mechanism of Tat translocation is poorly understood. Three membrane proteins, TatA, TatB and TatC comprise the membrane translocase, forming numerous oligomeric complexes within the membrane. The transmembrane electric field is essential for driving efficient transport, presumably through a gated-pore. The common method to characterize protein translocation systems involves trapping a cargo protein during transport, that is, to form translocation intermediates. The Tat machinery has thus far resisted this approach. The Specific Aim of this proposal is to develop a high-throughput screening assay that will be used to search for candidate inhibitors of Tat transport. Positive hits from the primary screen will be validated using secondary screens and in vitro assays. Bona fide Tat transport inhibitors will be used to assist with mechanistic studies of Tat transport, and will be evaluated for pharmaceutical potential. [unreadable] PUBLIC HEALTH RELEVANCE: RELEVANCE: This proposal seeks inhibitors of the bacterial twin-arginine translocation (Tat) system, protein secretion machinery that is responsible for the export of a number of bacterial virulence factors, and that contributes to efficient bacterial growth. 
These inhibitors will be used to assist with future mechanistic studies of Tat transport, and will be evaluated for their possible pharmaceutical potential. Understanding the mechanism of Tat transport is essential for utilizing this unique system for the bacterial expression of protein therapeutics that requires a cytoplasm for maturation. [unreadable] [unreadable] [unreadable]\"}\n" + ] + } + ], + "source": [ + "idx = 1000\n", + "left = []\n", + "right = []\n", + "for rid, component in enumerate(components):\n", + " if rid > idx:\n", + " break\n", + " if rid != idx:\n", + " continue\n", + " print(\"left is\")\n", + " left.append(reversed_mapper[component[0]])\n", + " print(f\" {left[0]}\")\n", + " print(\"right is\")\n", + " for j in range(1, len(component)):\n", + " doc = reversed_mapper[component[j]]\n", + " right.append(doc)\n", + " print(f\" {doc}\")\n", + "file_name, n_row = left[0].split(\"@\")\n", + "print(left[0], \"content is \")\n", + "! sed -n {n_row}p {path}/{file_name}\n", + "for r in right:\n", + " print(r, \"content is \")\n", + " file_name, n_row = r.split(\"@\")\n", + " ! 
sed -n {n_row}p {path}/{file_name}" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "9df23cd2-7f52-4821-8578-f2bf1e04238c", + "metadata": {}, + "outputs": [], + "source": [ + "path = '/home/vmagent/app/PILE/pile_uspto'\n", + "dup_dir = \"/home/vmagent/app/PILE_output/pile_uspto/deduplicate\"\n", + "from third_party.generate_duplicates_dict import *\n", + "input_file = os.path.join(dup_dir, \"connected_components.pickle\")\n", + "with open(input_file, \"rb\") as fin:\n", + " components, n_components, reversed_mapper = pickle.load(fin)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "d583ffd2-a241-4604-9700-49da294d18d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "left is\n", + " data_4_time1600295577_2000.jsonl@132944\n", + "right is\n", + " data_17_time1600253218_1993.jsonl@24671\n", + " data_3_time1600293502_1999.jsonl@53846\n", + " data_0_time1600288933_1996.jsonl@52814\n", + "data_4_time1600295577_2000.jsonl@132944 content is \n", + "{\"text\": \"In 1981, documentation began on the disease that became known as Acquired Immune Deficiency Syndrome (AIDS), as well as its forerunner AIDS Related Complex (ARC). In 1983, the cause of the disease AIDS was established as a virus named the Human Immunodeficiency Virus type 1 (HIV-1). Usually, a person infected with the virus will eventually develop AIDS; in all known cases of AIDS the final outcome has always been death.\\nThe disease AIDS is the end result of an HIV-1 virus following its own complex life cycle. The virion life cycle begins with the virion attaching itself to the host human T-4 lymphocyte immune cell through the bonding of a glycoprotein on the surface of the virion's protective coat with the CD4 glycoprotein on the lymphocyte cell. Once attached, the virion sheds its glycoprotein coat, penetrates into the membrane of the host cell, and uncoats its RNA. 
The virion enzyme, reverse transcriptase, directs the process of transcribing the RNA into single stranded DNA. The viral RNA is degraded and a second DNA strand is created. The now double-stranded DNA is integrated into the human cell's genes and those genes are used for cell reproduction.\\nAt this point, the human cell carries out its reproductive process by using its own RNA polymerase to transcribe the integrated DNA into viral RNA. The viral RNA is translated into glycoproteins, structural proteins, and viral enzymes, which assemble with the viral RNA intact. When the host cell finishes the reproductive step, a new virion cell, not a T-4 lymphocyte, buds forth. The number of HIV-1 virus cells thus grows while the number of T-4 lymphocytes decline.\\nThe typical human immune system response, killing the invading virion, is taxed because a large portion of the virion's life cycle is spent in a latent state within the immune cell. In addition, viral reverse transcriptase, the enzyme used in making a new virion cell, is not very specific, and causes transcription mistakes that result in continually changed glycoproteins on the surface of the viral protective coat. This lack of specificity decreases the immune system's effectiveness because antibodies specifically produced against one glycoprotein may be useless against another, hence reducing the number of antibodies available to fight the virus. The virus continues to grow while the immune response system continues to weaken. 
Eventually, the HIV largely holds free reign over the body's immune system, allowing opportunistic infections to set in and ensuring that, without the administration of antiviral agents and/or immunomodulators, death will result.\\nThere are three critical points in the virus' life cycle which have been identified as targets for antiviral drugs: (1) the initial attachment of the virion to the T-4 lymphocyte, or macrophage, site, (2) the transcription of viral RNA to viral DNA, and (3) the assemblage of the new virion cell during reproduction.\\nInhibition of the virus at the second critical point, the viral RNA to viral DNA transcription process, has provided the bulk of the therapies used in treating AIDS. This transcription must occur for the virion to reproduce because the virion's genes are encoded in RNA; the host cell reads only DNA. By introducing drugs that block the reverse transcriptase from completing the formation of viral DNA, HIV-1 replication can be stopped.\\nNucleoside analogs, such as 3'-azido-3'-deoxythymidine (AZT), 2',3'-dideoxycytidine (DDC), 2',3'-dideoxythymidinene (D4T), 2',3'-dideoxyinosine (DDI), and various fluoro-derivatives of these nucleosides are relatively effective in halting HIV replication at the reverse transcriptase stage. Another promising reverse transcriptase inhibitor is 2',3'-dideoxy-3'-thia-cytidine (BCH-189), which contains an oxathiolane ring substituting for the sugar moiety in the nucleoside.\\nAZT is a successful anti-HIV drug because it sabotages the formation of viral DNA inside the host T-4 lymphocyte cell. When AZT enters the cell, cellular kinases activate AZT by phosphorylation to AZT triphosphate. AZT triphosphate then competes with natural thymidine nucleosides for the receptor site of HIV reverse transcriptase enzyme. The natural nucleoside possesses two reactive ends, the first for attachment to the previous nucleoside and the second for linking to the next nucleoside. 
The AZT molecule has only the first reactive end; once inside the HIV enzyme site, the AZT azide group terminates viral DNA formation because the azide cannot make the 3',5'-phosphodiester with the ribose moiety of the following nucleoside.\\nAZT's clinical benefits include increased longevity, reduced frequency and severity of opportunistic infections, and increased peripheral CD4 lymphocyte count. Immunosorbent assays for viral p24, an antigen used to track HIV-1 activity, show a significant decrease with use of AZT. However, AZT's benefits must be weighed against the severe adverse reactions of bone marrow suppression, nausea, myalgia, insomnia, severe headaches, anemia, peripheral neuropathy, and seizures. Furthermore, these adverse side effects occur immediately after treatment begins whereas a minimum of six weeks of therapy is necessary to realize AZT's benefits.\\nBoth DDC and D4T are potent inhibitors of HIV replication with activities comparable (D4T) or superior (DDC) to AZT. However, both DDC and D4T are converted to their 5' triphosphates less efficiently than their natural analogs and are resistent to deaminases and phosphorylases. Clinically, both compounds are toxic. Currently, DDI is used to conjunction with AZT to treat AIDS. However, DDI's side effects include sporadic pancreatis and peripheral neuropathy. Initial tests on 3'-fluoro-2'-3'-dideoxythymidine show that its anti-viral activity is comparable to that of AZT.\\nRecent tests on BCH-189 have shown that it possesses anti-HIV activity similar to AZT and DDC, but without the cell toxicity which causes the debilitating side effects of AZT and DDC. 
A sufficient quantity of BCH-189 is needed to allow clinical testing and treatment using the drug.\\nThe commonly-used chemical approaches for synthesizing nucleosides or nucleoside analogs can be classified into two broad categories: (1) those which modify intact nucleosides by altering the carbohydrate, the base, or both and (2) those which modify carbohydrates and incorporate the base, or its synthetic precursor, at a suitable stage in the synthesis. Because BCH-189 substitutes a sulfur atom for a carbon atom in the carbohydrate ring, the second approach is more feasible. The most important factor in this latter strategy involves delivering the base from the .beta.-face of the carbohydrate ring in the glycosylation reaction because only the .beta.-isomers exhibit useful biological activity.\\nIt is well known in the art that the stereoselective introduction of bases to the anomeric centers of carbohydrates can be controlled by capitalizing on the neighboring group participation of a 2-substituent on the carbohydrate ring (Chem. Ber. 114:1234 (1981)). However, BCH-189 and its analogs do not possess a 2-substitutent and, therefore, cannot utilize this procedure unless additional steps to introduce a functional group that is both directing and disposable are incorporated into the synthesis. These added steps would lower the overall efficiency of the synthesis.\\nIt is also well known in the art that \\\"considerable amounts of the undesired .alpha.-nucleosides are always formed during the synthesis of 2'-deoxyribosides\\\" (Chem. Ber. 114:1234, 1244 (1981)). Furthermore, this reference teaches that the use of simple Friedel-Crafts catalysts like SnCl.sub.4 in nucleoside syntheses produces undesirable emulsions upon the workup of the reaction mixture, generates complex mixtures of the .alpha. and .beta.-isomers, and leads to stable .delta.-complexes between the SnCl.sub.4 and the more basic silyated heterocycles such as silyated cytosine. 
These complexes lead to longer reaction times, lower yields, and production of the undesired unnatural N-3-nucleosides. Thus, the prior art teaches the use of trimethysilyl triflate or trimethylsilyl perchlorate as a catalyst during the coupling of pyrimidine bases with a carbohydrate ring to achieve high yields of the biologically active .beta.-isomers. However, the use of these catalysts to synthesize BCH-189 or BCH-189 analogs does not produce the .beta.-isomer preferentially; these reactions result in approximately a 50:50 ratio of the isomers.\\nThus, there exists a need for an efficient synthetic route to BCH-189 and its analogs. There also exists a need for a stereoselective synthetic route to the biologically active isomer of these compounds, .beta.-BCH-189 and related .beta.-analogs. Furthermore, there exists a need for a stereoselective synthetic route to enantiomerically-enriched .beta.-BCH-189 because the other enantiomer is inactive and, therefore, represents a 50% impurity.\", \"meta\": {\"bibliographic_information\": {\"Patent Number\": \"061537519\", \"Series Code\": \"9\", \"Application Number\": \"3379108\", \"Application Type\": \"1\", \"Art unit\": \"164\", \"Application Filing Date\": \"19990622\", \"Title of Invention\": \"Method and compositions for the synthesis of BCH-189 and related compounds\", \"Issue Date\": \"20001128\", \"Number of Claims\": \"1\", \"Exemplary Claim Number(s)\": \"1\", \"Assistant Examiner\": \"McKenzie; Thomas\", \"Primary Examiner\": \"Ford; John M.\", \"Number of Drawing Sheets\": \"4\", \"Number of figures\": \"4\"}, \"source_file\": \"https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2000/pftaps20001128_wk48.zip\", \"abstract\": \"The present invention relates to a method of preparing BCH-189 and various analogs of BCH-189 from inexpensive precursors with the option of introducing functionality as needed. 
This synthetic route allows the stereoselective preparation of the biologically active isomer of these compounds, .beta.-BCH-189 and related compounds. Furthermore, the steochemistry at the nucleoside 4' position can be controlled to produce enantiomerically-enriched .beta.-BCH-189 and its analogs.\", \"citations\": [{\"Patent number\": \"4000137\", \"Issue date\": \"19761200\", \"Patentee name\": \"Dvonoch et al.\", \"US classification\": \"260252\"}, {\"Patent number\": \"4336381\", \"Issue date\": \"19820600\", \"Patentee name\": \"Nagata et al.\", \"US classification\": \"544313\"}, {\"Patent number\": \"4861759\", \"Issue date\": \"19890800\", \"Patentee name\": \"Mitsuya et al.\", \"US classification\": \"514 46\"}, {\"Patent number\": \"4879277\", \"Issue date\": \"19891100\", \"Patentee name\": \"Mitsuya et al.\", \"US classification\": \"514 49\"}, {\"Patent number\": \"4916122\", \"Issue date\": \"19900400\", \"Patentee name\": \"Chu et al.\", \"US classification\": \"514 50\"}, {\"Patent number\": \"4963533\", \"Issue date\": \"19901000\", \"Patentee name\": \"de Clerq et al.\", \"US classification\": \"514 49\"}, {\"Patent number\": \"5011774\", \"Issue date\": \"19910400\", \"Patentee name\": \"Farina et al.\", \"US classification\": \"435 87\"}, {\"Patent number\": \"5041449\", \"Issue date\": \"19910800\", \"Patentee name\": \"Belleau et al.\", \"US classification\": \"544317\"}, {\"Patent number\": \"5047407\", \"Issue date\": \"19910900\", \"Patentee name\": \"Belleau et al.\", \"US classification\": \"514254\"}, {\"Patent number\": \"5059690\", \"Issue date\": \"19911000\", \"Patentee name\": \"Zahler et al.\", \"US classification\": \"544317\"}, {\"Patent number\": \"5204466\", \"Issue date\": \"19930400\", \"Patentee name\": \"Liotta et al.\", \"US classification\": \"544317\"}, {\"Patent number\": \"5538975\", \"Issue date\": \"19960700\", \"Patentee name\": \"Dionne\"}, {\"Patent number\": \"5539116\", \"Issue date\": \"19960700\", \"Patentee 
name\": \"Liotta et al.\", \"US classification\": \"544317\"}, {\"Patent number\": \"5618820\", \"Issue date\": \"19970400\", \"Patentee name\": \"Dionne\"}, {\"Patent number\": \"5639787\", \"Issue date\": \"19970600\", \"Patentee name\": \"Mansour\"}, {\"Patent number\": \"5663320\", \"Issue date\": \"19970900\", \"Patentee name\": \"Mansour\"}, {\"Patent number\": \"5684164\", \"Issue date\": \"19971100\", \"Patentee name\": \"Belleau et al.\", \"US classification\": \"549 30\"}, {\"Patent number\": \"5696254\", \"Issue date\": \"19971200\", \"Patentee name\": \"Mansour\"}, {\"Patent number\": \"5700937\", \"Issue date\": \"19971200\", \"Patentee name\": \"Liotta\"}, {\"Patent number\": \"5728575\", \"Issue date\": \"19980300\", \"Patentee name\": \"Liotta\"}, {\"Patent number\": \"5744596\", \"Issue date\": \"19980400\", \"Patentee name\": \"Mansour\"}, {\"Patent number\": \"5756706\", \"Issue date\": \"19980500\", \"Patentee name\": \"Mansour\"}, {\"Patent number\": \"5814639\", \"Issue date\": \"19980600\", \"Patentee name\": \"Liotta\"}, {\"Patent number\": \"5827727\", \"Issue date\": \"19981000\", \"Patentee name\": \"Liotta\"}, {\"Patent number\": \"5892025\", \"Issue date\": \"19990600\", \"Patentee name\": \"Liotta\"}, {\"Patent number\": \"5914331\", \"Issue date\": \"19990600\", \"Patentee name\": \"Liotta\"}, {\"Patent number\": \"5914400\", \"Issue date\": \"19990600\", \"Patentee name\": \"Liotta et al.\", \"US classification\": \"544314\"}], \"assignees\": [{\"inventor name\": \"Emory University\", \"City\": \"Atlanta\", \"State\": \"GA\", \"Assignee type code\": \"02\"}], \"classifications\": [{\"OCL\": [\"544319\"], \"EDF\": [\"7\"], \"ICL\": [\"C07D41104\"], \"FSC\": [\"544\"], \"FSS\": [\"319\"]}], \"inventors\": [{\"inventor name\": \"Liotta; Dennis C.\", \"City\": \"McDonough\", \"State\": \"GA\"}, {\"inventor name\": \"Choi; Woo-Baeg\", \"City\": \"North Brunswick\", \"State\": \"NJ\"}]}}\n", + "data_17_time1600253218_1993.jsonl@24671 
content is \n", + "{\"text\": \"The present invention relates to methods and compositions for preparing antiviral nucleoside analogs, particularly BCH-189 (2',3'-dideoxy-3'-thia-cytidine). More particularly, the invention relates to the selective synthesis of the .beta.-isomer of BCH-189 and related compounds as well as the selective synthesis of enantiomerically-enriched BCH-189 and related compounds.\\nIn 1981, documentation began on the disease that became known as Acquired Immune Deficiency Syndrome (AIDS), as well as its forerunner AIDS Related Complex (ARC). In 1983, the cause of the disease AIDS was established as a virus named the Human Immunodeficiency Virus type 1 (HIV-1). Usually, a person infected with the virus will eventually develop AIDS; in all known cases of AIDS the final outcome has always been death.\\nThe disease AIDS is the end result of an HIV-1 virus following its own complex life cycle. The virion life cycle begins with the virion attaching itself to the host human T-4 lymphocyte immune cell through the bonding of a glycoprotein on the surface of the virion's protective coat with the CD4 glycoprotein on the lymphocyte cell. Once attached, the virion sheds its glycoprotein coat, penetrates into the membrane of the host cell, and uncoats its RNA. The virion enzyme, reverse transcriptase, directs the process of transcribing the RNA into single stranded DNA. The viral RNA is degraded and a second DNA strand is created. The now double-stranded DNA is integrated into the human cell's genes and those genes are used for cell reproduction.\\nAt this point, the human cell carries out its reproductive process by using its own RNA polymerase to transcribe the integrated DNA into viral RNA. The viral RNA is translated into glycoproteins, structural proteins, and viral enzymes, which assemble with the viral RNA intact. When the host cell finishes the reproductive step, a new virion cell, not a T-4 lymphocyte, buds forth. 
The number of HIV-1 virus cells thus grows while the number of T-4 lymphocytes decline.\\nThe typical human immune system response, killing the invading virion, is taxed because a large portion of the virion's life cycle is spent in a latent state within the immune cell. In addition, viral reverse transcriptase, the enzyme used in making a new virion cell, is not very specific, and causes transcription mistakes that result in continually changed glycoproteins on the surface of the viral protective coat. This lack of specificity decreases the immune system's effectiveness because antibodies specifically produced against one glycoprotein may be useless against another, hence reducing the number of antibodies available to fight the virus. The virus continues to grow while the immune response system continues to weaken. Eventually, the HIV largely holds free reign over the body's immune system, allowing opportunistic infections to set in and ensuring that, without the administration of antiviral agents and/or immunomodulators, death will result.\\nThere are three critical points in the virus's life cycle which have been identified as targets for antiviral drugs: (1) the initial attachment of the virion to the T-4 lymphocyte, or macrophage, site, (2) the transcription of viral RNA to viral DNA, and (3) the assemblage of the new virion cell during reproduction.\\nInhibition of the virus at the second critical point, the viral RNA to viral DNA transcription process, has provided the bulk of the therapies used in treating AIDS. This transcription must occur for the virion to reproduce because the virion's genes are encoded in RNA; the host cell reads only DNA. 
By introducing drugs that block the reverse transcriptase from completing the formation of viral DNA, HIV-1 replication can be stopped.\\nNucleoside analogs, such as 3'-azido-3'-deoxythymidine (AZT), 2',3'-dideoxycytidine (DDC), 2',3'-dideoxythymidinene (D4T), 2',3'-dideoxyinosine (DDI), and various fluoro-derivatives of these nucleosides are relatively effective in halting HIV replication at the reverse transcriptase stage. Another promising reverse transcriptase inhibitor is 2',3'-dideoxy-3'-thia-cytidine (BCH-189), which contains an oxathiolane ring substituting for the sugar moiety in the nucleoside.\\nAZT is a successful anti-HIV drug because it sabotages the formation of viral DNA inside the host T-4 lymphocyte cell. When AZT enters the cell, cellular kinases activate AZT by phosphorylation to AZT triphosphate. AZT triphosphate then competes with natural thymidine nucleosides for the receptor site of HIV reverse transcriptase enzyme. The natural nucleoside possesses two reactive ends, the first for attachment to the previous nucleoside and the second for linking to the next nucleoside. The AZT molecule has only the first reactive end; once inside the HIV enzyme site, the AZT azide group terminates viral DNA formation because the azide cannot make the 3',5'-phosphodiester with the ribose moiety of the following nucleoside.\\nAZT's clinical benefits include increased longevity, reduced frequency and severity of opportunistic infections, and increased peripheral CD4 lymphocyte count. Immunosorbent assays for viral p24, an antigen used to track HIV-1 activity, show a significant decrease with use of AZT. However, AZT's benefits must be weighed against the severe adverse reactions of bone marrow suppression, nausea, myalgia, insomnia, severe headaches, anemia, peripheral neuropathy, and seizures. 
Furthermore, these adverse side effects occur immediately after treatment begins whereas a minimum of six weeks of therapy is necessary to realize AZT's benefits.\\nBoth DDC and D4T are potent inhibitors of HIV replication with activities comparable (D4T) or superior (DDC) to AZT. However, both DDC and D4T are converted to their 5' triphosphates less efficiently than their natural analogs and are resistent to deaminases and phosphorylases. Clinically, both compounds are toxic. Currently, DDI is used in conjunction with AZT to treat AIDS. However, DDI's side effects include sporadic pancreatis and peripheral neuropathy. Initial tests on 3'-fluoro-2'-3'-dideoxythymidine show that its anti-viral activity is comparable to that of AZT.\\nRecent tests on BCH-189 have shown that it possesses anti-HIV activity similar to AZT and DDC, but without the cell toxicity which causes the debilitating side effects of AZT and DDC. A sufficient quantity of BCH-189 is needed to allow clinical testing and treatment using the drug.\\nThe commonly-used chemical approaches for synthesizing nucleosides or nucleoside analogs can be classified into two broad categories: (1) those which modify intact nucleosides by altering the carbohydrate, the base, or both and (2) those which modify carbohydrates and incorporate the base, or its synthetic precursor, at a suitable stage in the synthesis. Because BCH-189 substitutes a sulfur atom for a carbon atom in the carbohydrate ring, the second approach is more feasible. The most important factor in this latter strategy involves delivering the base from the .beta.-face of the carbohydrate ring in the glycosylation reaction because only the .beta.-isomers exhibit useful biological activity.\\nIt is well known in the art that the stereoselective introduction of bases to the anomeric centers of carbohydrates can be controlled by capitalizing on the neighboring group participation of a 2-substituent on the carbohydrate ring (Chem. Ber. 114:1234 (1981)). 
However, BCH-189 and its analogs do not possess a 2-substitutent and, therefore, cannot utilize this procedure unless additional steps to introduce a functional group that is both directing and disposable are incorporated into the synthesis. These added steps would lower the overall efficiency of the synthesis.\\nIt is also well known in the art that \\\"considerable amounts of the undesired .alpha.-nucleosides are always formed during the synthesis of 2'-deoxyribosides\\\" (Chem. Ber. 114:1234, 1244 (1981)). Furthermore, this reference teaches that the use of simple Friedel-Crafts catalysts like SnCl.sub.4 in nucleoside syntheses produces undesirable emulsions upon the workup of the reaction mixture, generates complex mixtures of the .alpha. and .beta.-isomers, and leads to stable o-complexes between the SnCl.sub.4 and the more basic silyated heterocycles such as silyated cytosine. These complexes lead to longer reaction times, lower yields, and production of the undesired unnatural N-3-nucleosides. Thus, the prior art teaches the use of trimethysilyl triflate or trimethylsilyl perchlorate as a catalyst during the coupling of pyrimidine bases with a carbohydrate ring to achieve high yields of the biologically active .beta.-isomers. However, the use of these catalysts to synthesize BCH-189 or BCH-189 analogs does not produce the .beta.-isomer preferentially; these reactions result in approximately a 50:50 ratio of the isomers.\\nThus, there exists a need for an efficient synthetic route to BCH-189 and its analogs. There also exists a need for a stereoselective synthetic route to the biologically active isomer of these compounds, .beta.-BCH-189 and related .beta.-analogs. 
Furthermore, there exists a need for a stereoselective synthetic route to enantiomerically-enriched .beta.-BCH-189 because the other enantiomer is inactive and, therefore, represents a 50% impurity.\", \"meta\": {\"bibliographic_information\": {\"Patent Number\": \"052044661\", \"Series Code\": \"7\", \"Application Number\": \"4733185\", \"Application Type\": \"1\", \"Art unit\": \"122\", \"Application Filing Date\": \"19900201\", \"Title of Invention\": \"Method and compositions for the synthesis of BCH-189 and related compounds\", \"Issue Date\": \"19930420\", \"Number of Claims\": \"11\", \"Exemplary Claim Number(s)\": \"1\", \"Primary Examiner\": \"Tsang; Cecilia\", \"Number of Drawing Sheets\": \"4\", \"Number of figures\": \"4\"}, \"source_file\": \"https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/1993/pftaps19930420_wk16.zip\", \"abstract\": \"The present invention relates to a method of preparing BCH-189 and various analogs of BCH-189 from inexpensive precursors with the option of introducing functionality as needed. This synthetic route allows the stereoselective preparation of the biologically active isomer of these compounds, .beta.-BCH-189 and related compounds. 
Furthermore, the steochemistry at the nucleoside 4' position can be controlled to produce enantiomerically-enriched .beta.-BCH-189 and its analogs.\", \"citations\": [{\"Patent number\": \"5047407\", \"Issue date\": \"19910900\", \"Patentee name\": \"Belleau et al.\", \"US classification\": \"544310\"}], \"assignees\": [{\"inventor name\": \"Emory University\", \"City\": \"Atlanta\", \"State\": \"GA\", \"Assignee type code\": \"02\"}], \"classifications\": [{\"OCL\": [\"544317\"], \"XCL\": [\"544310\", \"XCL 544212\", \"XCL 544276\", \"XCL 544277\", \"XCL 544313\", \"XCL 544229\", \"XCL 544314\", \"XCL 549 4\", \"XCL 549 30\"], \"EDF\": [\"5\"], \"ICL\": [\"C07D41104\", \"ICL C07D47300\", \"ICL C07F 502\"], \"FSC\": [\"549\", \"FSC 544\"], \"FSS\": [\"310;4;30\", \"FSS 212;276;277;313;314;229\"]}], \"inventors\": [{\"inventor name\": \"Liotta; Dennis C.\", \"City\": \"Stone Mountain\", \"State\": \"GA\"}, {\"inventor name\": \"Choi; Woo-Baeg\", \"City\": \"Atlanta\", \"State\": \"GA\"}]}}\n", + "data_3_time1600293502_1999.jsonl@53846 content is \n", + "{\"text\": \"In 1981, documentation began on the disease that became known as Acquired Immune Deficiency Syndrome (AIDS), as well as its forerunner AIDS Related Complex (ARC). In 1983, the cause of the disease AIDS was established as a virus named the Human Immunodeficiency Virus type 1 (HIV-1). Usually, a person infected with the virus will eventually develop AIDS; in all known cases of AIDS the final outcome has always been death.\\nThe disease AIDS is the end result of an HIV-1 virus following its own complex life cycle. The virion life cycle begins with the virion attaching itself to the host human T-4 lymphocyte immune cell through the bonding of a glycoprotein on the surface of the virion's protective coat with the CD4 glycoprotein on the lymphocyte cell. Once attached, the virion sheds its glycoprotein coat, penetrates into the membrane of the host cell, and uncoats its RNA. 
The virion enzyme, reverse transcriptase, directs the process of transcribing the RNA into single stranded DNA. The viral RNA is degraded and a second DNA strand is created. The now double-stranded DNA is integrated into the human cell's genes and those genes are used for cell reproduction.\\nAt this point, the human cell carries out its reproductive process by using its own RNA polymerase to transcribe the integrated DNA into viral RNA. The viral RNA is translated into glycoproteins, structural proteins, and viral enzymes, which assemble with the viral RNA intact. When the host cell finishes the reproductive step, a new virion cell, not a T-4 lymphocyte, buds forth. The number of HIV-1 virus cells thus grows while the number of T-4 lymphocytes decline.\\nThe typical human immune system response, killing the invading virion, is taxed because a large portion of the virion's life cycle is spent in a latent state within the immune cell. In addition, viral reverse transcriptase, the enzyme used in making a new virion cell, is not very specific, and causes transcription mistakes that result in continually changed glycoproteins on the surface of the viral protective coat. This lack of specificity decreases the immune system's effectiveness because antibodies specifically produced against one glycoprotein may be useless against another, hence reducing the number of antibodies available to fight the virus. The virus continues to grow while the immune response system continues to weaken. 
Eventually, the HIV largely holds free reign over the body's immune system, allowing opportunistic infections to set in and ensuring that, without the administration of antiviral agents and/or immunomodulators, death will result.\\nThere are three critical points in the virus' life cycle which have been identified as targets for antiviral drugs: (1) the initial attachment of the virion to the T-4 lymphocyte, or macrophage, site, (2) the transcription of viral RNA to viral DNA, and (3) the assemblage of the new virion cell during reproduction.\\nInhibition of the virus at the second critical point, the viral RNA to viral DNA transcription process, has provided the bulk of the therapies used in treating AIDS. This transcription must occur for the virion to reproduce because the virion's genes are encoded in RNA; the host cell reads only DNA. By introducing drugs that block the reverse transcriptase from completing the formation of viral DNA, HIV-1 replication can be stopped.\\nNucleoside analogs, such as 3'-azido-3'-deoxythymidine (AZT), 2',3'-dideoxycytidine (DDC), 2',3'-dideoxythymidinene (D4T), 2',3'-dideoxyinosine (DDI), and various fluoro-derivatives of these nucleosides are relatively effective in halting HIV replication at the reverse transcriptase stage. Another promising reverse transcriptase inhibitor is 2',3'-dideoxy-3'-thia-cytidine (BCH-189), which contains an oxathiolane ring substituting for the sugar moiety in the nucleoside.\\nAZT is a successful anti-HIV drug because it sabotages the formation of viral DNA inside the host T-4 lymphocyte cell. When AZT enters the cell, cellular kinases activate AZT by phosphorylation to AZT triphosphate. AZT triphosphate then competes with natural thymidine nucleosides for the receptor site of HIV reverse transcriptase enzyme. The natural nucleoside possesses two reactive ends, the first for attachment to the previous nucleoside and the second for linking to the next nucleoside. 
The AZT molecule has only the first reactive end; once inside the HIV enzyme site, the AZT azide group terminates viral DNA formation because the azide cannot make the 3',5'-phosphodiester with the ribose moiety of the following nucleoside.\\nAZT's clinical benefits include increased longevity, reduced frequency and severity of opportunistic infections, and increased peripheral CD4 lymphocyte count. Immunosorbent assays for viral p24, an antigen used to track HIV-1 activity, show a significant decrease with use of AZT. However, AZT's benefits must be weighed against the severe adverse reactions of bone marrow suppression, nausea, myalgia, insomnia, severe headaches, anemia, peripheral neuropathy, and seizures. Furthermore, these adverse side effects occur immediately after treatment begins whereas a minimum of six weeks of therapy is necessary to realize AZT's benefits.\\nBoth DDC and D4T are potent inhibitors of HIV replication with activities comparable (D4T) or superior (DDC) to AZT. However, both DDC and D4T are converted to their 5' triphosphates less efficiently than their natural analogs and are resistent to deaminases and phosphorylases. Clinically, both compounds are toxic. Currently, DDI is used to conjunction with AZT to treat AIDS. However, DDI's side effects include sporadic pancreatis and peripheral neuropathy. Initial tests on 3'-fluoro-2'-3'-dideoxythymidine show that its anti-viral activity is comparable to that of AZT.\\nRecent tests on BCH-189 have shown that it possesses anti-HIV activity similar to AZT and DDC, but without the cell toxicity which causes the debilitating side effects of AZT and DDC. 
A sufficient quantity of BCH-189 is needed to allow clinical testing and treatment using the drug.\\nThe commonly-used chemical approaches for synthesizing nucleosides or nucleoside analogs can be classified into two broad categories: (1) those which modify intact nucleosides by altering the carbohydrate, the base, or both and (2) those which modify carbohydrates and incorporate the base, or its synthetic precursor, at a suitable stage in the synthesis. Because BCH-189 substitutes a sulfur atom for a carbon atom in the carbohydrate ring, the second approach is more feasible. The most important factor in this latter strategy involves delivering the base from the .beta.-face of the carbohydrate ring in the glycosylation reaction because only the $isomers exhibit useful biological activity.\\nIt is well known in the art that the stereoselective introduction of bases to the anomeric centers of carbohydrates can be controlled by capitalizing on the neighboring group participation of a 2-substituent on the carbohydrate ring (Chem. Ber. 114:1234 (1981)). However, BCH-189 and its analogs do not possess a 2-substitutent and, therefore, cannot utilize this procedure unless additional steps to introduce a functional group that is both directing and disposable are incorporated into the synthesis. These added steps would lower the overall efficiency of the synthesis.\\nIt is also well known in the art that \\\"considerable amounts of the undesired .alpha.-nucleosides are always formed during the synthesis of 2'-deoxyribosides\\\" (Chem. Ber. 114:1234, 1244 (1981)). Furthermore, this reference teaches that the use of simple Friedel-Crafts catalysts like SnCl.sub.4 in nucleoside syntheses produces undesirable emulsions upon the workup of the reaction mixture, generates complex mixtures of the .alpha. and .beta.-isomers, and leads to stable .delta.-complexes between the SnCl.sub.4 and the more basic silyated heterocycles such as silyated cytosine. 
These complexes lead to longer reaction times, lower yields, and production of the undesired unnatural N-3-nucleosides. Thus, the prior art teaches the use of trimethysilyl triflate or trimethylsilyl perchlorate as a catalyst during the coupling of pyrimidine bases with a carbohydrate ring to achieve high yields of the biologically active .beta.-isomers. However, the use of these catalysts to synthesize BCH-189 or BCH-189 analogs does not produce the .beta.-isoner preferentially; these reactions result in approximately a 50:50 ratio of the isomers.\\nThus, there exists a need for an efficient synthetic route to BCH-189 and its analogs. There also exists a need for a stereoselective synthetic route to the biologically active isomer of these compounds, .beta.-BCH-189 and related .beta.-analogs. Furthermore, there exists a need for a stereoselective synthetic route to enantiomerically-enriched .beta.-BCH-189 because the other enantiomer is inactive and, therefore, represents a 50% impurity.\", \"meta\": {\"bibliographic_information\": {\"Patent Number\": \"059144009\", \"Series Code\": \"8\", \"Application Number\": \"4723457\", \"Application Type\": \"1\", \"Art unit\": \"161\", \"Application Filing Date\": \"19950607\", \"Title of Invention\": \"Method and compositions for the synthesis of BCH-189 and related compounds\", \"Issue Date\": \"19990622\", \"Number of Claims\": \"4\", \"Exemplary Claim Number(s)\": \"1\", \"Primary Examiner\": \"Ford; John M.\", \"Number of Drawing Sheets\": \"4\", \"Number of figures\": \"4\"}, \"source_file\": \"https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/1999/pftaps19990622_wk25.zip\", \"abstract\": \"The present invention relates to a method of preparing BCH-189 and various analogs of BCH-189 from inexpensive precursors with the option of introducing functionality as needed. 
This synthetic route allows the stereoselective preparation of the biologically active isomer of these compounds, .beta.-BCH-189 and related compounds. Furthermore, the steochemistry at the nucleoside 4' position can be controlled to produce enantiomerically-enriched .beta.-BCH-189 and its analogs.\", \"citations\": [{\"Patent number\": \"4000137\", \"Issue date\": \"19761200\", \"Patentee name\": \"Dvonoch et al.\", \"US classification\": \"260252\"}, {\"Patent number\": \"4336381\", \"Issue date\": \"19820600\", \"Patentee name\": \"Nagata et al.\", \"US classification\": \"544313\"}, {\"Patent number\": \"4861759\", \"Issue date\": \"19890800\", \"Patentee name\": \"Mitsuya et al.\", \"US classification\": \"514 46\"}, {\"Patent number\": \"4879277\", \"Issue date\": \"19891100\", \"Patentee name\": \"Mitsuya et al.\", \"US classification\": \"514 49\"}, {\"Patent number\": \"4916122\", \"Issue date\": \"19900400\", \"Patentee name\": \"Chu et al.\", \"US classification\": \"514 50\"}, {\"Patent number\": \"4963533\", \"Issue date\": \"19901000\", \"Patentee name\": \"de Clerq et al.\", \"US classification\": \"514 49\"}, {\"Patent number\": \"5011774\", \"Issue date\": \"19910400\", \"Patentee name\": \"Farina et al.\", \"US classification\": \"435 87\"}, {\"Patent number\": \"5041449\", \"Issue date\": \"19910800\", \"Patentee name\": \"Belleau et al.\", \"US classification\": \"544317\"}, {\"Patent number\": \"5059690\", \"Issue date\": \"19911000\", \"Patentee name\": \"Zahler et al.\", \"US classification\": \"544276\"}, {\"Patent number\": \"5204466\", \"Issue date\": \"19930400\", \"Patentee name\": \"Liotta et al.\", \"US classification\": \"544317\"}, {\"Patent number\": \"5539116\", \"Issue date\": \"19960700\", \"Patentee name\": \"Liotta et al.\", \"US classification\": \"544317\"}], \"assignees\": [{\"inventor name\": \"Emory University\", \"City\": \"Atlanta\", \"State\": \"GA\", \"Assignee type code\": \"02\"}], \"classifications\": [{\"OCL\": 
[\"544314\"], \"XCL\": [\"544317\", \"XCL 544310\", \"XCL 544313\", \"XCL 544229\"], \"EDF\": [\"6\"], \"ICL\": [\"C07D23934\", \"ICL C07D23936\", \"ICL C07D23954\"], \"FSC\": [\"544\"], \"FSS\": [\"310;317;313;229;314;274\"]}], \"inventors\": [{\"inventor name\": \"Liotta; Dennis C.\", \"City\": \"McDonough\", \"State\": \"GA\"}, {\"inventor name\": \"Choi; Woo-Baeg\", \"City\": \"North Brunswick\", \"State\": \"NY\"}]}}\n", + "data_0_time1600288933_1996.jsonl@52814 content is \n", + "{\"text\": \"The present invention relates to methods and compositions for preparing antiviral nucleoside analogs, particularly BCH-189 (2',3'-dideoxy-3'-thia-cytidine). More particularly, the invention relates to the selective synthesis of the .beta.-isomer of BCH-189 and related compounds as well as the selective synthesis of enantiomerically-enriched BCH-189 and related compounds.\\nIn 1981, documentation began on the disease that became known as Acquired Immune Deficiency Syndrome (AIDS), as well as its forerunner AIDS Related Complex (ARC). In 1983, the cause of the disease AIDS was established as a virus named the Human Immunodeficiency Virus type 1 (HIV-1). Usually, a person infected with the virus will eventually develop AIDS; in all known cases of AIDS the final outcome has always been death.\\nThe disease AIDS is the end result of an HIV-1 virus following its own complex life cycle. The virion life cycle begins with the virion attaching itself to the host human T-4 lymphocyte immune cell through the bonding of a glycoprotein on the surface of the virion's protective coat with the CD4 glycoprotein on the lymphocyte cell. Once attached, the virion sheds its glycoprotein coat, penetrates into the membrane of the host cell, and uncoats its RNA. The virion enzyme, reverse transcriptase, directs the process of transcribing the RNA into single stranded DNA. The viral RNA is degraded and a second DNA strand is created. 
The now double-stranded DNA is integrated into the human cell's genes and those genes are used for cell reproduction.\\nAt this point, the human cell carries out its reproductive process by using its own RNA polymerase to transcribe the integrated DNA into viral RNA. The viral RNA is translated into glycoproteins, structural proteins, and viral enzymes, which assemble with the viral RNA intact. When the host cell finishes the reproductive step, a new virion cell, not a T-4 lymphocyte, buds forth. The number of HIV-1 virus cells thus grows while the number of T-4 lymphocytes decline.\\nThe typical human immune system response, killing the invading virion, is taxed because a large portion of the virion's life cycle is spent in a latent state within the immune cell. In addition, viral reverse transcriptase, the enzyme used in making a new virion cell, is not very specific, and causes transcription mistakes that result in continually changed glycoproteins on the surface of the viral protective coat. This lack of specificity decreases the immune system's effectiveness because antibodies specifically produced against one glycoprotein may be useless against another, hence reducing the number of antibodies available to fight the virus. The virus continues to grow while the immune response system continues to weaken. 
Eventually, the HIV largely holds free reign over the body's immune system, allowing opportunistic infections to set in and ensuring that, without the administration of antiviral agents and/or immunomodulators, death will results.\\nThere are three critical points in the virus's life cycle which have been identified as targets for antiviral drugs: (1) the initial attachment of the virion to the T-4 lymphocyte, or macrophage, site, (2) the transcription of viral RNA to viral DNA, and (3) the assemblage of the new virion cell during reproduction.\\nInhibition of the virus at the second critical point, the viral RNA to vital DNA transcription process, has provided the bulk of the therapies used in treating AIDS. This transcription must occur for the virion to reproduce because the virion's genes are encoded in RNA; the host cell reads only DNA. By introducing drugs that block the reverse transcriptase from completing the formation of viral DNA, HIV-1 replication can be stopped.\\nNucleoside analogs, such as 3'-azido-3'-deoxythymidine (AZT), 2',3'-dideoxycytidine (DDC), 2',3'-dideoxythymidinene (D4T), 2',3'-dideoxyinosine (DDI), and various fluoro-derivatives of these nucleosides are relatively effective in halting HIV replication at the reverse transcriptase stage. Another promising reverse transcriptase inhibitor is 2',3'-dideoxy-3'-thia-cytidine (BCH-189), which contains an oxathiolane ring substituting for the sugar moiety in the nucleoside.\\nAZT is a successful anti-HIV drug because it sabotages the formation of viral DNA inside the host T-4 lymphocyte cell. When AZT enters the cell, cellular kinases activate AZT by phosphorylation to AZT triphosphate. AZT triphosphate then competes with natural thymidine nucleosides for the receptor site of HIV reverse transcriptase enzyme. The natural nucleoside possesses two reactive ends, the first for attachment to the previous nucleoside and the second for linking to the next nucleoside. 
The AZT molecule has only the first reactive end; once inside the HIV enzyme site, the AZT azide group terminates viral DNA formation because the azide cannot make the 3',5'-phosphodiester with the ribose moiety of the following nucleoside.\\nAZT's clinical benefits include increased longevity, reduced frequency and severity of opportunistic infections, and increased peripheral CD4 lymphocyte count. Immunosorbent assays for viral p24, an antigen used to track HIV-1 activity, show a significant decrease with use of AZT. However, AZT's benefits must be weighed against the severe adverse reactions of bone marrow suppression, nausea, myalgia, insomnia, severe headaches, anemia, peripheral neuropathy, and seizures. Furthermore, these adverse side effects occur immediately after treatment begins whereas a minimum of six weeks of therapy is necessary to realize AZT's benefits.\\nBoth DDC and D4T are potent inhibitors of HIV replication with activities comparable (D4T) or superior (DDC) to AZT. However, both DDC and D4T are converted to their 5' triphosphates less efficiently than their natural analogs and are resistent to deaminases and phosphorylases. Clinically, both compounds are toxic. Currently, DDI is used in conjunction with AZT to treat AIDS. However, DDI's side effects include sporadic pancreatis and peripheral neuropathy. Initial tests on 3'-fluoro-2'-3'-dideoxythymidine show that its anti-viral activity is comparable to that of AZT.\\nRecent tests on BCH-189 have shown that it possesses anti-HIV activity similar to AZT and DDC, but without the cell toxicity which causes the debilitating side effects of AZT and DDC. 
A sufficient quantity of BCH-189 is needed to allow clinical testing and treatment using the drug.\\nThe commonly-used chemical approaches for synthesizing nucleosides or nucleoside analogs can be classified into two broad categories: (1) those which modify intact nuceosides by altering the carbohydrate, the base, or both and (2) those which modify carbohydrates and incorporate the base, or its synthetic precursor, at a suitable stage in the synthesis. Because BCH-189 substitutes a sulfur atom for a carbon atom in the carbohydrate ring, the second approach is more feasible. The most important factor in this latter strategy involves delivering the base from the .beta.-face of the carbohydrate ring in the glycosylation reaction because only the .beta.-isomers exhibit useful biological activity.\\nIt is well known in the art that the stereoselective introduction of bases to the anomeric centers of carbohydrates can be controlled by capitalizing on the neighboring group participation of a 2-substituent on the carbohydrate ring (Chem. Ber. 114:1234 (1981)). However, BCH-189 and its analogs do not possess a 2-substitutent and, therefore, cannot utilize this procedure unless additional steps to introduce a functional group that is both directing and disposable are incorporated into the synthesis. These added steps would lower the overall efficiency of the synthesis.\\nIt is also well known in the art that \\\"considerable amounts of the undesired .alpha.-nucleosides are always formed during the synthesis of 2'-deoxyribosides\\\" (Chem. Ber. 114:1234, 1244 (1981)). Furthermore, this reference teaches that the use of simple Friedel-Crafts catalysts like SnCl.sub.4 in nucleoside syntheses produces undesirable emulsions upon the workup of the reaction mixture, generates complex mixtures of the .alpha. and .beta.-isomers, and leads to stable p-complexes between the SnCl.sub.4 and the more basic silyated heterocycles such as silyated cytosine. 
These complexes lead to longer reaction times, lower yields, and production of the undesired unnatural N-3-nucleosides. Thus, the prior art teaches the use of trimethysilyl triflate or trimethylsilyl perchlorate as a catalyst during the coupling of pyrimidine bases with a carbohydrate ring to achieve high yields of the biologically active .beta.-isomers. However, the use of these catalysts to synthesize BCH-189 or BCH-189 analogs does not produce the .beta.-isomer preferentially; these reactions result in approximately a 50:50 ratio of the isomers.\\nThus, there exists a need for an efficient synthetic route to BCH-189 and its analogs. There also exists a need for a stereoselective synthetic route to the biologically active isomer of these compounds, .beta.-BCH-189 and related .beta.-analogs. Furthermore, there exists a need for a stereoselective synthetic route to enantiomerically-enriched .beta.-BCH-189 because the other enantiomer is inactive and, therefore, represents a 50% impurity.\", \"meta\": {\"bibliographic_information\": {\"Patent Number\": \"055391168\", \"Series Code\": \"8\", \"Application Number\": \"0159921\", \"Application Type\": \"1\", \"Art unit\": \"122\", \"Application Filing Date\": \"19930210\", \"Title of Invention\": \"Method and compositions for the synthesis of BCH-189 and related compounds\", \"Issue Date\": \"19960723\", \"Number of Claims\": \"7\", \"Exemplary Claim Number(s)\": \"1\", \"Primary Examiner\": \"Tsang; Cecilia\", \"Number of Drawing Sheets\": \"4\", \"Number of figures\": \"4\", \"Disclaimer Date\": \"20140321\"}, \"source_file\": \"https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/1996/pftaps19960723_wk30.zip\", \"abstract\": \"The present invention relates to a method of preparing BCH-189 and various analogs of BCH-189 from inexpensive precursors with the option of introducing functionality as needed. 
This synthetic route allows the stereoselective preparation of the biologically active isomer of these compounds, .beta.-BCH-189 and related compounds. Furthermore, the steochemistry at the nucleoside 4' position can be controlled to produce enantiomerically-enriched .beta.-BCH-189 and its analogs.\", \"citations\": [{\"Patent number\": \"4000137\", \"Issue date\": \"19761200\", \"Patentee name\": \"Dvonoch et al.\", \"US classification\": \"260252\"}, {\"Patent number\": \"4336381\", \"Issue date\": \"19820600\", \"Patentee name\": \"Nagata et al.\", \"US classification\": \"544313\"}, {\"Patent number\": \"4861759\", \"Issue date\": \"19890800\", \"Patentee name\": \"Mitsuya et al.\", \"US classification\": \"514 49\"}, {\"Patent number\": \"4879277\", \"Issue date\": \"19891100\", \"Patentee name\": \"Mitsuya et al.\", \"US classification\": \"514 49\"}, {\"Patent number\": \"4916122\", \"Issue date\": \"19900400\", \"Patentee name\": \"Chu et al.\", \"US classification\": \"514 50\"}, {\"Patent number\": \"4963533\", \"Issue date\": \"19901000\", \"Patentee name\": \"de Clerq et al.\", \"US classification\": \"514 49\"}, {\"Patent number\": \"5011774\", \"Issue date\": \"19910400\", \"Patentee name\": \"Farina et al.\", \"US classification\": \"435 87\"}, {\"Patent number\": \"5041449\", \"Issue date\": \"19910800\", \"Patentee name\": \"Bellean et al.\", \"US classification\": \"544277\"}, {\"Patent number\": \"5047407\", \"Issue date\": \"19910900\", \"Patentee name\": \"Bellean et al.\", \"US classification\": \"514274\"}, {\"Patent number\": \"5059690\", \"Issue date\": \"19911000\", \"Patentee name\": \"Zahler et al.\", \"US classification\": \"544276\"}], \"assignees\": [{\"inventor name\": \"Emory University\", \"City\": \"Atlanta\", \"State\": \"GA\", \"Assignee type code\": \"02\"}], \"classifications\": [{\"OCL\": [\"544317\"], \"XCL\": [\"544229\", \"XCL 544313\", \"XCL 544314\", \"XCL 544318\"], \"EDF\": [\"6\"], \"ICL\": [\"C07D41104\", \"ICL C07F 
502\", \"ICL A61K 31505\"], \"FSC\": [\"544\"], \"FSS\": [\"313;314;317\"]}], \"inventors\": [{\"inventor name\": \"Liotta; Dennis C.\", \"City\": \"Stone Mountain\", \"State\": \"GA\"}, {\"inventor name\": \"Choi; Woo-Baeg\", \"City\": \"Atlanta\", \"State\": \"GA\"}]}}\n" + ] + } + ], + "source": [ + "idx = 1000\n", + "left = []\n", + "right = []\n", + "for rid, component in enumerate(components):\n", + " if rid > idx:\n", + " break\n", + " if rid != idx:\n", + " continue\n", + " print(\"left is\")\n", + " left.append(reversed_mapper[component[0]])\n", + " print(f\" {left[0]}\")\n", + " print(\"right is\")\n", + " for j in range(1, len(component)):\n", + " doc = reversed_mapper[component[j]]\n", + " right.append(doc)\n", + " print(f\" {doc}\")\n", + "file_name, n_row = left[0].split(\"@\")\n", + "print(left[0], \"content is \")\n", + "! sed -n {n_row}p {path}/{file_name}\n", + "for r in right:\n", + " print(r, \"content is \")\n", + " file_name, n_row = r.split(\"@\")\n", + " ! sed -n {n_row}p {path}/{file_name}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b03c214-a031-426e-bdb8-96c72101193d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tools/near_dedup/PILE_notebooks/apply_deduplication.ipynb b/tools/near_dedup/PILE_notebooks/apply_deduplication.ipynb new file mode 100644 index 000000000..6d9bd89a6 --- /dev/null +++ b/tools/near_dedup/PILE_notebooks/apply_deduplication.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": 
"dd13cba6-6f8f-40b4-a956-e707f8fcb877", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n", + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n", + "apply duplicates.pickle to create new data started ...\n", + "resetting to 1 for number of processes\n", + "100%|████████████████████████████████████████████| 1/1 [03:05<00:00, 185.44s/it]\n", + "apply duplicates.pickle to create new data took 185.49705739098135 sec\n" + ] + } + ], + "source": [ + "input_path = \"/home/vmagent/app/PILE\"\n", + "output_path = \"/home/vmagent/app/PILE_output/\"\n", + "\n", + "bucket = \"FreeLaw\"\n", + "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "58008b35-e595-47e5-be56-78a9927c24be", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n", + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. 
pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n", + "apply duplicates.pickle to create new data started ...\n", + "resetting to 1 for number of processes\n", + "100%|█████████████████████████████████████████████| 1/1 [01:07<00:00, 67.05s/it]\n", + "apply duplicates.pickle to create new data took 67.08336020295974 sec\n" + ] + } + ], + "source": [ + "bucket = \"PUBMED\"\n", + "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4465fc9b-a38e-4e26-b6bf-8d21a77873d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n", + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n", + "apply duplicates.pickle to create new data started ...\n", + "resetting to 45 for number of processes\n", + "100%|███████████████████████████████████████████| 45/45 [00:42<00:00, 1.06it/s]\n", + "apply duplicates.pickle to create new data took 42.78813537606038 sec\n" + ] + } + ], + "source": [ + "bucket = \"pile_uspto\"\n", + "! 
cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b7636688-8ea7-4f87-968a-30ca11011f78", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n", + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n", + "apply duplicates.pickle to create new data started ...\n", + "resetting to 1 for number of processes\n", + "100%|█████████████████████████████████████████████| 1/1 [00:25<00:00, 25.91s/it]\n", + "apply duplicates.pickle to create new data took 25.924333511968143 sec\n" + ] + } + ], + "source": [ + "bucket = \"EuroParl\"\n", + "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "70809848-c1f5-491c-9ac0-af964a269f01", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n", + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. 
pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n", + "apply duplicates.pickle to create new data started ...\n", + "resetting to 1 for number of processes\n", + "100%|█████████████████████████████████████████████| 1/1 [00:11<00:00, 11.25s/it]\n", + "apply duplicates.pickle to create new data took 11.300638337968849 sec\n" + ] + } + ], + "source": [ + "bucket = \"NIH\"\n", + "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8590f524-0b7a-4641-8bb4-2fa88221a344", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n", + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n", + "apply duplicates.pickle to create new data started ...\n", + "resetting to 1 for number of processes\n", + "100%|█████████████████████████████████████████████| 1/1 [00:14<00:00, 14.57s/it]\n", + "apply duplicates.pickle to create new data took 14.592552542919293 sec\n" + ] + } + ], + "source": [ + "bucket = \"PhilArch\"\n", + "! 
cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4c1c2536-9562-4122-987a-9471ce5825a4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n", + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n", + "apply duplicates.pickle to create new data started ...\n", + "resetting to 1 for number of processes\n", + "100%|█████████████████████████████████████████████| 1/1 [00:04<00:00, 4.84s/it]\n", + "apply duplicates.pickle to create new data took 4.862166045000777 sec\n" + ] + } + ], + "source": [ + "bucket = \"hn\"\n", + "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f9b4c02f-db61-4c70-bdc9-5dc8e65a28fa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n", + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. 
pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n", + "apply duplicates.pickle to create new data started ...\n", + "resetting to 48 for number of processes\n", + "100%|███████████████████████████████████████████| 48/48 [01:38<00:00, 2.05s/it]\n", + "apply duplicates.pickle to create new data took 98.7749809169909 sec\n" + ] + } + ], + "source": [ + "bucket = \"pmc\"\n", + "! cd ../; python dedup_convert.py -d {input_path}/{bucket} -f {output_path}/{bucket}/deduplicate/duplicates.pickle -o {output_path}/{bucket}/output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46f4a90d-8f46-4c2b-bbd7-e9dc6b85b6dc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tools/near_dedup/PILE_notebooks/freelaw_near_dedup.ipynb b/tools/near_dedup/PILE_notebooks/freelaw_near_dedup.ipynb new file mode 100644 index 000000000..05d5a4349 --- /dev/null +++ b/tools/near_dedup/PILE_notebooks/freelaw_near_dedup.ipynb @@ -0,0 +1,415 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e01a822b-abf8-4327-ae5c-9723ad11c0ab", + "metadata": {}, + "source": [ + "## Import and define" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24ffc3f0-f5c7-460e-b400-b35b48f54b0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\t\t\t\n", + "\t\t" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sys\n", + "cur_path = \"/home/vmagent/app\"\n", + "sys.path.append(cur_path)\n", + "\n", + "from near_dedup import *" + ] + }, + { + "cell_type": "markdown", + "id": "3a51e611-df76-4e69-86b4-addf91ce4306", + "metadata": {}, + "source": [ + "## Configure DIR" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "54869228-a39c-441b-b757-ba337dfdd8d5", + "metadata": {}, + "outputs": [], + "source": [ + "data_files = get_data_files('/home/vmagent/app/PILE/FreeLaw')\n", + "dup_dir = \"/home/vmagent/app/PILE_output/FreeLaw/deduplicate\"\n", + "ngram_size = 13\n", + "num_perm = 256\n", + "bands = 9\n", + "ranges = 13" + ] + }, + { + "cell_type": "markdown", + "id": "67299966-7ae2-492a-bea6-195721f5ee9f", + "metadata": {}, + "source": [ + "## Load data into Spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17e776fe-d214-43e8-80be-3b0bf8604e1d", + "metadata": {}, + "outputs": [], + "source": [ + "rdp = SparkDataProcessor()\n", + "spark=rdp.spark " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bd3432d2-9f18-4b49-9747-6b4730e97100", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID started ...\n", + "/home/vmagent/app/PILE/FreeLaw/FreeLaw_Opinions.jsonl\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 
3:======================================================>(199 + 1) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID took 92.56428317597602 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "with Timer(\"Load data with RowID\"):\n", + " df = read_json(data_files, spark).cache()\n", + " total_length = df.count()" + ] + }, + { + "cell_type": "markdown", + "id": "5a0f56b2-ebdc-4933-81fd-399e3234ea81", + "metadata": {}, + "source": [ + "## Get minHashLSH edges" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7a5ba001-0adc-49c5-b0fc-c539499318bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_bands is 9, ranges is 13\n", + "generate minHashLsh started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate minHashLsh took 2542.00012657803 sec\n" + ] + } + ], + "source": [ + "pipeline = minHashLSH_prepare(df, num_perm, ngram_size, bands, ranges)\n", + "with Timer(\"generate minHashLsh\"):\n", + " if os.path.exists(dup_dir):\n", + " shutil.rmtree(dup_dir, ignore_errors=True)\n", + " results = pipeline.saveAsTextFile(dup_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88a1e50b-a333-4e88-bb78-47a88b6fd6db", + "metadata": {}, + "outputs": [], + "source": [ + "spark.stop()" + ] + }, + { + "cell_type": "markdown", + "id": "0116c740-a373-469c-937e-bcedb20f71d9", + "metadata": {}, + "source": [ + "## Generate connected components" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b6c69644-a12c-433d-9eaf-8632c63c042b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate_connected_components all started ...\n", + "Started graph building\n" + ] + }, + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + "loop on file: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 203/203 [00:00<00:00, 1566.65it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Failed to process /home/vmagent/app/PILE_output/FreeLaw/deduplicate/duplicates.pickle, error is 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", + "Failed to process /home/vmagent/app/PILE_output/FreeLaw/deduplicate/connected_components.pickle, error is 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", + "length of the set of duplicates: 108593 0.13472366333007812\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108593/108593 [00:00<00:00, 1026996.80it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of connected components: 54997 0.3537757396697998\n", + "Graph generated duplicates list!!! 
0.42903733253479004\n", + "generate_connected_components all took 0.4670676819514483 sec\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_connected_components all\"):\n", + " dup_connected_args = argparse.Namespace()\n", + " dup_connected_args.input_dir = dup_dir\n", + " dup_connected_args.out_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " generate_connected_components.generate_connected_components_mp(dup_connected_args)" + ] + }, + { + "cell_type": "markdown", + "id": "5abadeea-2aed-4de0-9508-6f17d735adf2", + "metadata": {}, + "source": [ + "## convert as duplicates dict" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2dea1212-989f-4544-a087-0bfb1b40c664", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate_duplicates_dict all started ...\n", + "Processing duplicates!!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 54997/54997 [00:00<00:00, 645282.05it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of duplicate documents that will be removed: 101089\n", + "generate_duplicates_dict all took 0.13365483097732067 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_duplicates_dict all\"):\n", + " dup_docs = os.path.join(dup_dir, \"duplicates.pickle\")\n", + " dup_dict_args = argparse.Namespace()\n", + " dup_dict_args.input_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " dup_dict_args.out_file = dup_docs\n", + " generate_duplicates_dict.generate_duplicates(dup_dict_args)" + ] + }, + { + "cell_type": "markdown", + "id": "dbc0610b-6820-4beb-b71b-ab04f76ef97c", + "metadata": {}, + "source": [ + "## View result" + ] + }, + { + 
"cell_type": "code", + "execution_count": 7, + "id": "e056210c-907c-4938-b439-d31a4824eecb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completed!!\n", + " total processed 3562015 documents\n", + " total detected 101089 duplicated documents\n", + " duplicate ratio is 0.02837972327460721\n" + ] + } + ], + "source": [ + "dup_dict = pickle.load(open(os.path.join(dup_dir, \"duplicates.pickle\"), 'rb'))\n", + "dup_sum = 0\n", + "for _, v in dup_dict.items():\n", + " dup_sum += len(list(v))\n", + "\n", + "print(f\"Completed!!\")\n", + "print(f\" total processed {total_length} documents\")\n", + "print(f\" total detected {dup_sum} duplicated documents\")\n", + "print(f\" duplicate ratio is {dup_sum/total_length}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tools/near_dedup/PILE_notebooks/hn_near_dedup.ipynb b/tools/near_dedup/PILE_notebooks/hn_near_dedup.ipynb new file mode 100644 index 000000000..2510192a4 --- /dev/null +++ b/tools/near_dedup/PILE_notebooks/hn_near_dedup.ipynb @@ -0,0 +1,432 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e01a822b-abf8-4327-ae5c-9723ad11c0ab", + "metadata": {}, + "source": [ + "## Import and define" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24ffc3f0-f5c7-460e-b400-b35b48f54b0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\t\t\t\n", + "\t\t" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sys\n", + "cur_path = \"/home/vmagent/app\"\n", + "sys.path.append(cur_path)\n", + "\n", + "from near_dedup import * " + ] + }, + { + "cell_type": "markdown", + "id": "3a51e611-df76-4e69-86b4-addf91ce4306", + "metadata": {}, + "source": [ + "## Configure DIR" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "54869228-a39c-441b-b757-ba337dfdd8d5", + "metadata": {}, + "outputs": [], + "source": [ + "data_files = get_data_files('/home/vmagent/app/PILE/hn')\n", + "dup_dir = \"/home/vmagent/app/PILE_output/hn/deduplicate\"\n", + "ngram_size = 13\n", + "num_perm = 256\n", + "bands = 9\n", + "ranges = 13" + ] + }, + { + "cell_type": "markdown", + "id": "67299966-7ae2-492a-bea6-195721f5ee9f", + "metadata": {}, + "source": [ + "## Load data into Spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2a9bf4a-a43d-4ab2-955e-eb91ba675737", + "metadata": {}, + "outputs": [], + "source": [ + "rdp = SparkDataProcessor()\n", + "spark=rdp.spark" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bd3432d2-9f18-4b49-9747-6b4730e97100", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID started ...\n", + "/home/vmagent/app/PILE/hn/part_0.jsonl\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 3:======================================================>(197 
+ 3) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID took 14.270188356051221 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "with Timer(\"Load data with RowID\"):\n", + " df = read_json(data_files, spark).cache()\n", + " total_length = df.count()" + ] + }, + { + "cell_type": "markdown", + "id": "5a0f56b2-ebdc-4933-81fd-399e3234ea81", + "metadata": {}, + "source": [ + "## Get minHashLSH edges" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7a5ba001-0adc-49c5-b0fc-c539499318bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_bands is 9, ranges is 13\n", + "generate minHashLsh started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 14:================================================> (184 + 16) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate minHashLsh took 103.93559605104383 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "pipeline = minHashLSH_prepare(df, num_perm, ngram_size, bands, ranges)\n", + "with Timer(\"generate minHashLsh\"):\n", + " if os.path.exists(dup_dir):\n", + " shutil.rmtree(dup_dir, ignore_errors=True)\n", + " results = pipeline.saveAsTextFile(dup_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63f02224-2e7b-410b-a6f0-b3f1525339e9", + "metadata": {}, + "outputs": [], + "source": [ + "spark.stop()" + ] + }, + { + "cell_type": "markdown", + "id": "0116c740-a373-469c-937e-bcedb20f71d9", + "metadata": {}, + "source": [ + "## Generate connected components" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b6c69644-a12c-433d-9eaf-8632c63c042b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"generate_connected_components all started ...\n", + "Started graph building\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loop on file: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 203/203 [00:00<00:00, 5747.60it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Failed to process /home/vmagent/app/PILE_output/hn/deduplicate/duplicates.pickle, error is 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", + "Failed to process /home/vmagent/app/PILE_output/hn/deduplicate/connected_components.pickle, error is 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", + "length of the set of duplicates: 4853 0.04064679145812988\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4853/4853 [00:00<00:00, 1337382.21it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of connected components: 26 0.04944562911987305\n", + "Graph generated duplicates list!!! 
0.05143308639526367\n", + "generate_connected_components all took 0.05592275899834931 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_connected_components all\"):\n", + " dup_connected_args = argparse.Namespace()\n", + " dup_connected_args.input_dir = dup_dir\n", + " dup_connected_args.out_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " generate_connected_components.generate_connected_components_mp(dup_connected_args)" + ] + }, + { + "cell_type": "markdown", + "id": "5abadeea-2aed-4de0-9508-6f17d735adf2", + "metadata": {}, + "source": [ + "## convert as duplicates dict" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2dea1212-989f-4544-a087-0bfb1b40c664", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate_duplicates_dict all started ...\n", + "Processing duplicates!!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 6544.55it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of duplicate documents that will be removed: 4853\n", + "generate_duplicates_dict all took 0.009566550957970321 sec\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_duplicates_dict all\"):\n", + " dup_docs = os.path.join(dup_dir, \"duplicates.pickle\")\n", + " dup_dict_args = argparse.Namespace()\n", + " dup_dict_args.input_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " dup_dict_args.out_file = dup_docs\n", + " generate_duplicates_dict.generate_duplicates(dup_dict_args)" + ] + }, + { + "cell_type": "markdown", + "id": "dbc0610b-6820-4beb-b71b-ab04f76ef97c", + "metadata": {}, + "source": [ + "## View result" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 8, + "id": "e056210c-907c-4938-b439-d31a4824eecb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completed!!\n", + " total processed 373027 documents\n", + " total detected 4853 duplicated documents\n", + " duplicate ratio is 0.013009782133732948\n" + ] + } + ], + "source": [ + "dup_dict = pickle.load(open(os.path.join(dup_dir, \"duplicates.pickle\"), 'rb'))\n", + "dup_sum = 0\n", + "for _, v in dup_dict.items():\n", + " dup_sum += len(list(v))\n", + "\n", + "print(f\"Completed!!\")\n", + "print(f\" total processed {total_length} documents\")\n", + "print(f\" total detected {dup_sum} duplicated documents\")\n", + "print(f\" duplicate ratio is {dup_sum/total_length}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "053a9ddd-c398-4d17-a011-ed523c652bad", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tools/near_dedup/PILE_notebooks/pile_uspto_near_dedup.ipynb b/tools/near_dedup/PILE_notebooks/pile_uspto_near_dedup.ipynb new file mode 100644 index 000000000..2c4fca9e7 --- /dev/null +++ b/tools/near_dedup/PILE_notebooks/pile_uspto_near_dedup.ipynb @@ -0,0 +1,487 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e01a822b-abf8-4327-ae5c-9723ad11c0ab", + "metadata": {}, + "source": [ + "## Import and define" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24ffc3f0-f5c7-460e-b400-b35b48f54b0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\t\t\t\n", + "\t\t" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sys\n", + "cur_path = \"/home/vmagent/app\"\n", + "sys.path.append(cur_path)\n", + "from near_dedup import *\n" + ] + }, + { + "cell_type": "markdown", + "id": "3a51e611-df76-4e69-86b4-addf91ce4306", + "metadata": {}, + "source": [ + "## Configure DIR" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "54869228-a39c-441b-b757-ba337dfdd8d5", + "metadata": {}, + "outputs": [], + "source": [ + "data_files = get_data_files('/home/vmagent/app/PILE/pile_uspto')\n", + "dup_dir = \"/home/vmagent/app/PILE_output/pile_uspto/deduplicate\"\n", + "ngram_size = 13\n", + "num_perm = 256\n", + "bands = 9\n", + "ranges = 13" + ] + }, + { + "cell_type": "markdown", + "id": "67299966-7ae2-492a-bea6-195721f5ee9f", + "metadata": {}, + "source": [ + "## Load data into Spark" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "306ef8e3-6a7c-43d9-b4c4-1a941c1e141a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Will assign 48 cores and 308492 M memory for spark\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23/08/30 16:26:53 WARN Utils: Your hostname, sr414 resolves to a loopback address: 127.0.1.1; using 10.1.2.14 instead (on interface enp134s0f1)\n", + 
"23/08/30 16:26:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "23/08/30 16:26:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "per core memory size is 6.276 GB and shuffle_disk maximum capacity is 8589934592.000 GB\n" + ] + } + ], + "source": [ + "rdp = SparkDataProcessor()\n", + "spark=rdp.spark " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bd3432d2-9f18-4b49-9747-6b4730e97100", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID started ...\n", + "/home/vmagent/app/PILE/pile_uspto/data_0_time1600242225_1976.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_0_time1600288933_1996.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_0_time1600364422_2006.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_10_time1600247352_1986.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_10_time1600397964_2016.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_11_time1600248055_1987.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_11_time1600403125_2017.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_12_time1600248735_1988.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_12_time1600407882_2018.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_13_time1600249593_1989.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_13_time1600413654_2019.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_14_time1600250414_1990.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_14_time1600418007_2020.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_15_time1600251300_1991.jsonl\n", + 
"/home/vmagent/app/PILE/pile_uspto/data_16_time1600252232_1992.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_17_time1600253218_1993.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_18_time1600254256_1994.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_19_time1600255349_1995.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_1_time1600242733_1977.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_1_time1600290224_1997.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_1_time1600366285_2007.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_2_time1600243253_1978.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_2_time1600291526_1998.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_2_time1600368167_2008.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_3_time1600243638_1979.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_3_time1600293502_1999.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_3_time1600370287_2009.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_4_time1600244129_1980.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_4_time1600295577_2000.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_4_time1600373162_2010.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_5_time1600244662_1981.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_5_time1600297749_2001.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_5_time1600376227_2011.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_6_time1600245134_1982.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_6_time1600300737_2002.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_6_time1600379743_2012.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_7_time1600245601_1983.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_7_time1600303730_2003.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_7_time1600383780_2013.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_8_time1600246160_1984.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_8_time1600306688_2004.jsonl\n", 
+ "/home/vmagent/app/PILE/pile_uspto/data_8_time1600388299_2014.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_9_time1600246762_1985.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_9_time1600308480_2005.jsonl\n", + "/home/vmagent/app/PILE/pile_uspto/data_9_time1600393078_2015.jsonl\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 268:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID took 212.13308584492188 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "with Timer(\"Load data with RowID\"):\n", + " df = read_json(data_files, spark).cache()\n", + " total_length = df.count()" + ] + }, + { + "cell_type": "markdown", + "id": "5a0f56b2-ebdc-4933-81fd-399e3234ea81", + "metadata": {}, + "source": [ + "## Get minHashLSH edges" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7a5ba001-0adc-49c5-b0fc-c539499318bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_bands is 9, ranges is 13\n", + "generate minHashLsh started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate minHashLsh took 1390.2824745549588 sec\n" + ] + } + ], + "source": [ + "pipeline = minHashLSH_prepare(df, num_perm, ngram_size, bands, ranges)\n", + "with Timer(\"generate minHashLsh\"):\n", + " if os.path.exists(dup_dir):\n", + " shutil.rmtree(dup_dir, ignore_errors=True)\n", + " results = pipeline.saveAsTextFile(dup_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "50077ae0-131d-4930-9b6d-04f7d6c40bb3", + "metadata": {}, + "outputs": [], + "source": [ + "spark.stop()" + ] + }, + { + "cell_type": "markdown", + "id": "0116c740-a373-469c-937e-bcedb20f71d9", + "metadata": {}, + "source": [ + "## Generate connected 
components" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b6c69644-a12c-433d-9eaf-8632c63c042b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate_connected_components all started ...\n", + "Started graph building\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loop on file: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8801/8801 [00:01<00:00, 8729.32it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "length of the set of duplicates: 1220088 1.012829065322876\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1220088/1220088 [00:01<00:00, 706828.50it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of connected components: 613348 4.172353982925415\n", + "Graph generated duplicates list!!! 
5.222286939620972\n", + "generate_connected_components all took 5.889061606954783 sec\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_connected_components all\"):\n", + " dup_connected_args = argparse.Namespace()\n", + " dup_connected_args.input_dir = dup_dir\n", + " dup_connected_args.out_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " generate_connected_components.generate_connected_components_mp(dup_connected_args)" + ] + }, + { + "cell_type": "markdown", + "id": "5abadeea-2aed-4de0-9508-6f17d735adf2", + "metadata": {}, + "source": [ + "## convert as duplicates dict" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2dea1212-989f-4544-a087-0bfb1b40c664", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate_duplicates_dict all started ...\n", + "Processing duplicates!!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 613348/613348 [00:01<00:00, 553290.64it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of duplicate documents that will be removed: 1063551\n", + "generate_duplicates_dict all took 2.27969934605062 sec\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_duplicates_dict all\"):\n", + " dup_docs = os.path.join(dup_dir, \"duplicates.pickle\")\n", + " dup_dict_args = argparse.Namespace()\n", + " dup_dict_args.input_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " dup_dict_args.out_file = dup_docs\n", + " generate_duplicates_dict.generate_duplicates(dup_dict_args)" + ] + }, + { + "cell_type": "markdown", + "id": "dbc0610b-6820-4beb-b71b-ab04f76ef97c", + "metadata": {}, + "source": [ + "## View result" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": 
"e056210c-907c-4938-b439-d31a4824eecb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completed!!\n", + " total processed 5883037 documents\n", + " total detected 1063551 duplicated documents\n", + " duplicate ratio is 0.18078264678600525\n" + ] + } + ], + "source": [ + "dup_dict = pickle.load(open(os.path.join(dup_dir, \"duplicates.pickle\"), 'rb'))\n", + "dup_sum = 0\n", + "for _, v in dup_dict.items():\n", + " dup_sum += len(list(v))\n", + "\n", + "print(f\"Completed!!\")\n", + "print(f\" total processed {total_length} documents\")\n", + "print(f\" total detected {dup_sum} duplicated documents\")\n", + "print(f\" duplicate ratio is {dup_sum/total_length}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "053a9ddd-c398-4d17-a011-ed523c652bad", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tools/near_dedup/PILE_notebooks/pmc_near_dedup.ipynb b/tools/near_dedup/PILE_notebooks/pmc_near_dedup.ipynb new file mode 100644 index 000000000..3773e8b65 --- /dev/null +++ b/tools/near_dedup/PILE_notebooks/pmc_near_dedup.ipynb @@ -0,0 +1,511 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e01a822b-abf8-4327-ae5c-9723ad11c0ab", + "metadata": {}, + "source": [ + "## Import and define" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24ffc3f0-f5c7-460e-b400-b35b48f54b0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JAVA_HOME is not set, use default value of 
/usr/lib/jvm/java-8-openjdk-amd64/\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\t\t\t\n", + "\t\t" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sys\n", + "cur_path = \"/home/vmagent/app\"\n", + "sys.path.append(cur_path)\n", + "\n", + "from near_dedup import *\n" + ] + }, + { + "cell_type": "markdown", + "id": "3a51e611-df76-4e69-86b4-addf91ce4306", + "metadata": {}, + "source": [ + "## Configurate DIR" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "54869228-a39c-441b-b757-ba337dfdd8d5", + "metadata": {}, + "outputs": [], + "source": [ + "data_files = get_data_files('/home/vmagent/app/PILE/pmc')\n", + "dup_dir = \"/home/vmagent/app/PILE_output/pmc/deduplicate\"\n", + "ngram_size = 13\n", + "num_perm = 256\n", + "bands = 9\n", + "ranges = 13" + ] + }, + { + "cell_type": "markdown", + "id": "67299966-7ae2-492a-bea6-195721f5ee9f", + "metadata": {}, + "source": [ + "## Load data into Spark" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a3876729-c56a-4ad9-97f3-a85d77818444", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Will assign 48 cores and 308492 M memory for spark\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23/08/30 17:13:59 WARN Utils: Your hostname, sr414 resolves to a loopback address: 127.0.1.1; using 10.1.2.14 instead (on interface enp134s0f1)\n", + "23/08/30 17:13:59 WARN Utils: Set SPARK_LOCAL_IP if you need to 
bind to another address\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "23/08/30 17:14:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "per core memory size is 6.276 GB and shuffle_disk maximum capacity is 8589934592.000 GB\n" + ] + } + ], + "source": [ + "rdp = SparkDataProcessor()\n", + "spark=rdp.spark " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bd3432d2-9f18-4b49-9747-6b4730e97100", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID started ...\n", + "/home/vmagent/app/PILE/pmc/part_0.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_1.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_10.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_11.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_12.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_13.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_14.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_15.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_16.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_17.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_18.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_19.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_2.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_20.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_21.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_22.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_23.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_24.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_25.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_26.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_27.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_28.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_29.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_3.jsonl\n", + 
"/home/vmagent/app/PILE/pmc/part_30.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_31.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_32.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_33.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_34.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_35.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_36.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_37.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_38.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_39.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_4.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_40.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_41.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_42.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_43.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_44.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_45.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_46.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_47.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_5.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_6.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_7.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_8.jsonl\n", + "/home/vmagent/app/PILE/pmc/part_9.jsonl\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 144:===================================================(9600 + 0) / 9600]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Load data with RowID took 274.0298769559013 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "with Timer(\"Load data with RowID\"):\n", + " df = read_json(data_files, spark).cache()\n", + " total_length = df.count()" + ] + }, + { + "cell_type": "markdown", + "id": "5a0f56b2-ebdc-4933-81fd-399e3234ea81", + "metadata": {}, + "source": [ + "## Get minHashLSH edges" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7a5ba001-0adc-49c5-b0fc-c539499318bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "num_bands is 9, ranges is 13\n", + "generate minHashLsh started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate minHashLsh took 3619.6923032089835 sec\n" + ] + } + ], + "source": [ + "pipeline = minHashLSH_prepare(df, num_perm, ngram_size, bands, ranges)\n", + "with Timer(\"generate minHashLsh\"):\n", + " if os.path.exists(dup_dir):\n", + " shutil.rmtree(dup_dir, ignore_errors=True)\n", + " results = pipeline.saveAsTextFile(dup_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "50077ae0-131d-4930-9b6d-04f7d6c40bb3", + "metadata": {}, + "outputs": [], + "source": [ + "spark.stop()" + ] + }, + { + "cell_type": "markdown", + "id": "0116c740-a373-469c-937e-bcedb20f71d9", + "metadata": {}, + "source": [ + "## Generate connected components" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b6c69644-a12c-433d-9eaf-8632c63c042b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate_connected_components all started ...\n", + "Started graph building\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loop on file: 38%|███████████████████████████████████████████▎ | 3646/9603 [00:00<00:00, 36458.38it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Failed to process /home/vmagent/app/PILE_output/pmc/deduplicate/duplicates.pickle, error is 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loop on file: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9603/9603 [00:00<00:00, 36910.04it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Failed to process 
/home/vmagent/app/PILE_output/pmc/deduplicate/connected_components.pickle, error is 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte\n", + "length of the set of duplicates: 239010 0.2639467716217041\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 239010/239010 [00:00<00:00, 1167798.88it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of connected components: 1148 0.4844677448272705\n", + "Graph generated duplicates list!!! 0.5864162445068359\n", + "generate_connected_components all took 0.6859650410478935 sec\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_connected_components all\"):\n", + " dup_connected_args = argparse.Namespace()\n", + " dup_connected_args.input_dir = dup_dir\n", + " dup_connected_args.out_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " generate_connected_components.generate_connected_components_mp(dup_connected_args)" + ] + }, + { + "cell_type": "markdown", + "id": "5abadeea-2aed-4de0-9508-6f17d735adf2", + "metadata": {}, + "source": [ + "## convert as duplicates dict" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2dea1212-989f-4544-a087-0bfb1b40c664", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generate_duplicates_dict all started ...\n", + "Processing duplicates!!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1148/1148 [00:00<00:00, 9223.18it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of duplicate documents that will be removed: 234309\n", + "generate_duplicates_dict all took 
0.21165297203697264 sec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "with Timer(f\"generate_duplicates_dict all\"):\n", + " dup_docs = os.path.join(dup_dir, \"duplicates.pickle\")\n", + " dup_dict_args = argparse.Namespace()\n", + " dup_dict_args.input_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " dup_dict_args.out_file = dup_docs\n", + " generate_duplicates_dict.generate_duplicates(dup_dict_args)" + ] + }, + { + "cell_type": "markdown", + "id": "dbc0610b-6820-4beb-b71b-ab04f76ef97c", + "metadata": {}, + "source": [ + "## View result" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e056210c-907c-4938-b439-d31a4824eecb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completed!!\n", + " total processed 3097097 documents\n", + " total detected 234309 duplicated documents\n", + " duplicate ratio is 0.07565439506738084\n" + ] + } + ], + "source": [ + "dup_dict = pickle.load(open(os.path.join(dup_dir, \"duplicates.pickle\"), 'rb'))\n", + "dup_sum = 0\n", + "for _, v in dup_dict.items():\n", + " dup_sum += len(list(v))\n", + "\n", + "print(f\"Completed!!\")\n", + "print(f\" total processed {total_length} documents\")\n", + "print(f\" total detected {dup_sum} duplicated documents\")\n", + "print(f\" duplicate ratio is {dup_sum/total_length}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "053a9ddd-c398-4d17-a011-ed523c652bad", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + 
# Near Dedup

## Intro

Near Dedup detects duplicated documents and outputs them as a duplicates list.

Step 1. We use [DataSketch minHash](https://ekzhu.com/datasketch/minhash.html) as the base algorithm to calculate a (hash, band_id) pair for each document.

Step 2. We use Spark Groupby to find the local lists of documents sharing the same (hash, band_id) pair.

Step 3. We use SlimPajama [connected component graph](https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/dedup/generate_connected_components.py) to detect global lists and [generate duplication list](https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/dedup/generate_duplicates_dict.py) for documents sharing the same (hash, band_id) pair.

Step 4 (Optional). We apply the duplication list to the original files to eliminate duplicated documents.

Currently, only single-node runs are supported. Distributed runs will be supported via Ray in the near future.
if __name__ == "__main__":
    # Command-line entry point: convert a folder of raw text files into
    # partitioned *.jsonl files by delegating to text_to_jsonl_MP.
    parser = argparse.ArgumentParser(
        description="Convert raw text files to jsonl partitions."
    )
    # Both directories are mandatory; without them None would be forwarded to
    # text_to_jsonl_MP, so fail early with a clear argparse usage error instead.
    parser.add_argument("-d", dest="data_dir", type=str, required=True,
                        help="folder containing the raw text files")
    parser.add_argument("-o", dest="out_dir", type=str, required=True,
                        help="folder the *.jsonl output is written to")
    parser.add_argument("-n", dest="n_part", type=int, default=10,
                        help="number of output partitions (default: 10)")
    args = parser.parse_args()

    data_dir = args.data_dir
    out_dir = args.out_dir
    n_part = args.n_part

    # The timer label previously read "apply duplicates.pickle to create new
    # data" (copied from dedup_convert.py), which mislabelled this step in logs.
    with Timer("convert text files to jsonl"):
        text_to_jsonl_MP(data_dir, out_dir, n_part)
if __name__ == "__main__":
    # Command-line entry point: apply a duplicates pickle to the jsonl files in
    # the data folder and write the shrunk copies via shrink_document_MP.
    cli = argparse.ArgumentParser()
    cli.add_argument("-d", dest="data_dir", type=str)
    cli.add_argument("-f", dest="dup_dict", type=str, default=None)
    cli.add_argument("-o", dest="out_dir", type=str, default=None)
    args = cli.parse_args()

    data_dir = args.data_dir
    # Default locations live under <data_dir>/deduplicate.
    dup_dir = os.path.join(data_dir, "deduplicate")

    # Explicit -f / -o values win; otherwise fall back to the defaults.
    dup_dict = (
        args.dup_dict
        if args.dup_dict is not None
        else os.path.join(dup_dir, "duplicates.pickle")
    )
    out_dir = (
        args.out_dir
        if args.out_dir is not None
        else os.path.join(dup_dir, "output")
    )

    with Timer("apply duplicates.pickle to create new data"):
        shrink_document_MP(data_dir, dup_dict, out_dir)
def run(data_files, dup_dir, ngram_size, num_perm, bands, ranges, enable_ray):
    """Run the near-dedup pipeline and write its results under ``dup_dir``.

    Steps, in order:
      1. load ``data_files`` into a cached Spark DataFrame via ``read_json``;
      2. build the pipeline with ``minHashLSH_prepare`` and save it as text
         files in ``dup_dir`` (any existing ``dup_dir`` is removed first);
      3. write ``connected_components.pickle`` into ``dup_dir``;
      4. write ``duplicates.pickle`` into ``dup_dir``;
      5. print the processed / duplicated document counts and their ratio.

    Args:
        data_files: input files, as returned by ``get_data_files``.
        dup_dir: output directory; it is deleted and recreated.
        ngram_size, num_perm, bands, ranges: forwarded unchanged to
            ``minHashLSH_prepare``.
        enable_ray: when true, ``SparkDataProcessor`` is created with
            ``spark_mode='ray'``.

    Returns:
        None. Failures are reported on stdout ("Failed <error>") rather than
        raised, matching the previous behaviour.
    """
    if enable_ray:
        rdp = SparkDataProcessor(spark_mode='ray')
    else:
        rdp = SparkDataProcessor()
    spark = rdp.spark
    try:
        with Timer("Load data with RowID"):
            df = read_json(data_files, spark).cache()
            total_length = df.count()

        pipeline = minHashLSH_prepare(df, num_perm, ngram_size, bands, ranges)
        with Timer("generate minHashLsh"):
            # saveAsTextFile needs a non-existing target directory.
            if os.path.exists(dup_dir):
                shutil.rmtree(dup_dir, ignore_errors=True)
            pipeline.saveAsTextFile(dup_dir)

        components_file = os.path.join(dup_dir, "connected_components.pickle")
        with Timer("generate_connected_components all"):
            dup_connected_args = argparse.Namespace()
            dup_connected_args.input_dir = dup_dir
            dup_connected_args.out_file = components_file
            generate_connected_components.generate_connected_components_mp(
                dup_connected_args
            )

        dup_docs = os.path.join(dup_dir, "duplicates.pickle")
        with Timer("generate_duplicates_dict all"):
            dup_dict_args = argparse.Namespace()
            dup_dict_args.input_file = components_file
            dup_dict_args.out_file = dup_docs
            generate_duplicates_dict.generate_duplicates(dup_dict_args)

        # NOTE(review): pickle is only safe here because the file was produced
        # by this same run; never point this at an untrusted pickle.
        with open(dup_docs, 'rb') as dup_file:
            dup_dict = pickle.load(dup_file)
        dup_sum = sum(len(list(v)) for v in dup_dict.values())

        # An empty input folder must not turn the summary into a
        # ZeroDivisionError.
        dup_ratio = dup_sum / total_length if total_length else 0.0

        print("Completed!!")
        print(f"    total processed {total_length} documents")
        print(f"    total detected {dup_sum} duplicated documents")
        print(f"    duplicate ratio is {dup_ratio}")
    except Exception as e:
        print("Failed", e)
    finally:
        # Release the Spark session on success as well as on failure;
        # previously it was stopped only when an exception occurred.
        spark.stop()


if __name__ == "__main__":
    # Command-line entry point: parse the minHash-LSH parameters and run the
    # pipeline, writing results to <data_dir>/deduplicate.
    parser = argparse.ArgumentParser()
    # -d is mandatory: without it None would reach get_data_files/os.path.join.
    parser.add_argument("-d", dest="data_dir", type=str, required=True,
                        help="folder of *.jsonl files to deduplicate")
    parser.add_argument("--nperm", dest="num_perm", type=int, default=256)
    parser.add_argument("--ngram", dest="ngram_size", type=int, default=6)
    parser.add_argument("--bands", dest="bands", type=int, default=9)
    parser.add_argument("--ranges", dest="ranges", type=int, default=13)
    parser.add_argument("--enable_ray", dest="enable_ray", action='store_true', default=False)
    args = parser.parse_args()
    data_dir = args.data_dir

    data_files = get_data_files(data_dir)
    dup_dir = os.path.join(data_dir, "deduplicate")

    num_perm = args.num_perm
    ngram_size = args.ngram_size
    bands = args.bands
    ranges = args.ranges
    enable_ray = args.enable_ray
    with Timer(f"Generate duplicate dict for {data_dir}"):
        run(data_files, dup_dir, ngram_size, num_perm, bands, ranges, enable_ray)
def normalize_str(s):
    """Normalize text before n-gram tokenization (debug version: prints the input).

    Applies ``ftfy.fix_text`` with NFC normalization, lower-cases, strips every
    character in ``string.punctuation``, and collapses whitespace runs to a
    single space.
    """
    # NOTE(review): `string` and `re` are not imported explicitly in this cell;
    # presumably they arrive through the star imports above — confirm.
    print("original:", s)
    s = ftfy.fix_text(s, normalization="NFC")
    s = s.lower().translate(str.maketrans("", "", string.punctuation))
    s = re.sub(r"\s+", " ", s.strip())
    return s


def generate_hash_values_slimpj(content, idx, num_perm, ngram_size, hashranges, permutations, min_ngram_size):
    """Compute per-band MinHash signatures with datasketch ``MinHash`` (verbose debug).

    Args:
        content: raw document text; it is normalized with ``normalize_str``.
        idx: document identifier, echoed back in every returned tuple.
        num_perm: number of permutations passed to ``MinHash``.
        ngram_size: n-gram length passed to ``ngrams``.
        hashranges: iterable of ``(start, end)`` slices into the hash values,
            one per band.
        permutations: permutation parameters passed to ``MinHash``.
        min_ngram_size: unused here; kept so the signature matches
            ``generate_hash_values_bigcode``.

    Returns:
        A list of ``(band_idx, band_bytes, idx)`` tuples, one per hash range.
    """
    # 0. apply normalization to content
    content = normalize_str(content)
    print("after normalize: ", content)

    # Split once and reuse for both the debug print and the tokenization.
    words = NON_ALPHA.split(content)
    print("after split:", words)
    tokens = {" ".join(t) for t in ngrams(words, ngram_size)}
    print("after tokenize w/ split:", tokens)
    # Printed for comparison only: n-grams taken directly over the normalized string.
    print("after tokenize wo split:", {" ".join(t) for t in ngrams(content, ngram_size)})

    # 1. use datasketch MinHash with the supplied permutations
    m = MinHash(num_perm=num_perm, permutations=permutations, hashfunc=sha1_hash32)
    m.update_batch([token.encode("utf-8") for token in tokens])

    print(m.hashvalues.shape)
    print(m.hashvalues)

    # 2. map results to each band
    Hs = [bytes(m.hashvalues[start:end].byteswap().data) for start, end in hashranges]
    return [(band_idx, H, idx) for band_idx, H in enumerate(Hs)]


def generate_hash_values_bigcode(content, idx, num_perm, ngram_size, hashranges, permutations, min_ngram_size):
    """Compute per-band MinHash signatures with the hand-rolled numpy formula (debug).

    Takes the same arguments as ``generate_hash_values_slimpj``; here
    ``min_ngram_size`` is forwarded to ``ngrams`` and ``permutations`` is
    unpacked into the pair ``(a, b)``.

    Returns:
        A list of ``(band_idx, band_bytes, idx)`` tuples, one per hash range.
    """
    # 0. apply normalization to content
    content = normalize_str(content)
    tokens = {" ".join(t) for t in ngrams(NON_ALPHA.split(content), ngram_size, min_ngram_size)}

    # 1. permute every token hash as ((hv * a + b) % MERSENNE_PRIME) & MAX_HASH,
    #    then keep the column-wise minimum (MAX_HASH row is the identity element).
    a, b = permutations
    hashvalues = np.ones(num_perm, dtype=np.uint64) * MAX_HASH
    hv = np.array([sha1_hash32(token.encode("utf-8")) for token in tokens], dtype=np.uint64)
    phv = np.bitwise_and(((hv * np.tile(a, (len(hv), 1)).T).T + b) % MERSENNE_PRIME, MAX_HASH)
    hashvalues = np.vstack([phv, hashvalues]).min(axis=0)

    print(hashvalues.shape)
    print(hashvalues)

    # 2. map results to each band
    Hs = [bytes(hashvalues[start:end].byteswap().data) for start, end in hashranges]
    return [(band_idx, H, idx) for band_idx, H in enumerate(Hs)]


def get_permutation(threshold, num_perm, B = None, R = None):
    """Build the band hash ranges and random permutation parameters.

    Args:
        threshold: only used to derive ``B`` and ``R`` via ``optimal_param``
            when either of them is ``None``.
        num_perm: number of ``(a, b)`` pairs to draw from ``RNG``.
        B: number of bands, or ``None`` to derive it.
        R: hash values per band, or ``None`` to derive it.

    Returns:
        ``(HASH_RANGES, PERMUTATIONS, B, R)`` where ``HASH_RANGES`` is a list of
        ``B`` consecutive ``(start, end)`` pairs of width ``R`` and
        ``PERMUTATIONS`` is a ``uint64`` array of shape ``(2, num_perm)``.
    """
    if B is None or R is None:
        B, R = optimal_param(threshold, num_perm)
    HASH_RANGES = [(i * R, (i + 1) * R) for i in range(B)]
    PERMUTATIONS = np.array(
        [
            (
                RNG.randint(1, MERSENNE_PRIME, dtype=np.uint64),
                RNG.randint(0, MERSENNE_PRIME, dtype=np.uint64),
            )
            for _ in range(num_perm)
        ],
        dtype=np.uint64,
    ).T
    return HASH_RANGES, PERMUTATIONS, B, R


def read_json(data_files, spark_session=None):
    """Load JSON-lines files into one DataFrame of ``filename_docid``, ``text``, ``meta``.

    Each file is read as text, parsed with a ``text``/``meta`` string schema,
    and given a ``filename_docid`` of the form ``<basename>@<row id>`` where the
    row id comes from ``monotonically_increasing_id``. Per-file frames are
    unioned in input order.

    Args:
        data_files: iterable of file paths; must not be empty.
        spark_session: session used for reading. Defaults to the notebook-level
            ``spark`` variable, which must exist when the argument is omitted.

    Raises:
        ValueError: if ``data_files`` is empty (previously this surfaced as an
            UnboundLocalError on the return statement).
    """
    data_files = list(data_files)
    if not data_files:
        raise ValueError("read_json requires at least one input file")

    from pyspark.sql.types import StructType, StructField, StringType

    session = spark if spark_session is None else spark_session
    schema = StructType([
        StructField("text", StringType(), True),
        StructField("meta", StringType(), True)
    ])

    ret_df = None
    for filename in data_files:
        print(filename)
        df = session.read.text(filename)
        df = df.withColumn('jsonData', F.from_json(F.col('value'), schema)).select("jsonData.*")
        df = df.withColumn("__id__", F.monotonically_increasing_id())
        df = df.withColumn("filename", F.lit(os.path.basename(filename)))
        df = df.withColumn("filename_docid", F.concat_ws("@", "filename", "__id__"))
        df = df.select("filename_docid", "text", "meta")
        ret_df = df if ret_df is None else ret_df.union(df)
    return ret_df


def convert_to_slimPJ_fmt(first, second):
    """Return a one-element list holding the pair formatted as ``"<first> :: <second>"``."""
    return [f"{first} :: {second}"]
I'm writing a simple EventEmitter is ES5.\n", + "The objective is to ensure that all properties on EventEmitter instances are\n", + "non-writable and non-configurable.\"\n", + "After 6 hours of racking my brain I still can't figure out how to, increase the listenerCount, for example if the configurable descriptor is set to false.\n", + "Here's an example of what I have:\n", + "var eventEmitter = function(){\n", + " var listeners = listeners || 0;\n", + " var events = events || {};\n", + "\n", + " Object.defineProperties(this, {\n", + " listeners: {\n", + " value : 0,\n", + " configurable: false,\n", + " writable: false\n", + " },\n", + " events: {\n", + " value: {},\n", + " configurable : false,\n", + " writable: false\n", + " }\n", + " });\n", + " return this;\n", + "};\n", + "\n", + "\n", + "eventEmmitter.prototype.on = function(ev, cb) {\n", + " if (typeof ev !== 'string') throw new TypeError(\"Event should be type string\", \"index.js\", 6);\n", + " if (typeof cb !== 'function' || cb === null || cb === undefined) throw new TypeError(\"callback should be type function\", \"index.js\", 7);\n", + "\n", + " if (this.events[ev]){\n", + " this.events[ev].push(cb);\n", + " } else {\n", + " this.events[ev] = [cb];\n", + " }\n", + "\n", + " this.listeners ++;\n", + " return this;\n", + "};\n", + "\n", + "\n", + "A: I would recommend the use of an IIFE (immediatly invoked function expression):\n", + "var coolObj=(function(){\n", + "var public={};\n", + "var nonpublic={};\n", + "nonpublic.a=0;\n", + "public.getA=function(){nonpublic.a++;return nonpublic.a;};\n", + "\n", + "return public;\n", + "})();\n", + "\n", + "Now you can do:\n", + "coolObj.getA();//1\n", + "coolObj.getA();//2\n", + "coolObj.a;//undefined\n", + "coolObj.nonpublic;//undefined\n", + "coolObj.nonpublic.a;//undefined\n", + "\n", + "I know this is not the answer youve expected, but i think its the easiest way of doing sth like that.\n", + "\n", + "A: You can use a proxy which requires a key in order to 
define properties:\n", + "\n", + "\n", + "function createObject() {\n", + " var key = {configurable: true};\n", + " return [new Proxy({}, {\n", + " defineProperty(target, prop, desc) {\n", + " if (desc.value === key) {\n", + " return Reflect.defineProperty(target, prop, key);\n", + " }\n", + " }\n", + " }), key];\n", + "}\n", + "function func() {\n", + " var [obj, key] = createObject();\n", + " key.value = 0;\n", + " Reflect.defineProperty(obj, \"value\", {value: key});\n", + " key.value = function() {\n", + " key.value = obj.value + 1;\n", + " Reflect.defineProperty(obj, \"value\", {value: key});\n", + " };\n", + " Reflect.defineProperty(obj, \"increase\", {value: key});\n", + " return obj;\n", + "}\n", + "var obj = func();\n", + "console.log(obj.value); // 0\n", + "try { obj.value = 123; } catch(err) {}\n", + "try { Object.defineProperty(obj, \"value\", {value: 123}); } catch(err) {}\n", + "console.log(obj.value); // 0\n", + "obj.increase();\n", + "console.log(obj.value); // 1\n", + "\n", + "\n", + "\n", + "after normalize: q how to modify nonconfigurable nonwritable properties in javascript im writing a simple eventemitter is es5 the objective is to ensure that all properties on eventemitter instances are nonwritable and nonconfigurable after 6 hours of racking my brain i still cant figure out how to increase the listenercount for example if the configurable descriptor is set to false heres an example of what i have var eventemitter function var listeners listeners 0 var events events objectdefinepropertiesthis listeners value 0 configurable false writable false events value configurable false writable false return this eventemmitterprototypeon functionev cb if typeof ev string throw new typeerrorevent should be type string indexjs 6 if typeof cb function cb null cb undefined throw new typeerrorcallback should be type function indexjs 7 if thiseventsev thiseventsevpushcb else thiseventsev cb thislisteners return this a i would recommend the use of an iife 
immediatly invoked function expression var coolobjfunction var public var nonpublic nonpublica0 publicgetafunctionnonpublicareturn nonpublica return public now you can do coolobjgeta1 coolobjgeta2 coolobjaundefined coolobjnonpublicundefined coolobjnonpublicaundefined i know this is not the answer youve expected but i think its the easiest way of doing sth like that a you can use a proxy which requires a key in order to define properties function createobject var key configurable true return new proxy definepropertytarget prop desc if descvalue key return reflectdefinepropertytarget prop key key function func var obj key createobject keyvalue 0 reflectdefinepropertyobj value value key keyvalue function keyvalue objvalue 1 reflectdefinepropertyobj value value key reflectdefinepropertyobj increase value key return obj var obj func consolelogobjvalue 0 try objvalue 123 catcherr try objectdefinepropertyobj value value 123 catcherr consolelogobjvalue 0 objincrease consolelogobjvalue 1\n", + "after split: ['q', 'how', 'to', 'modify', 'nonconfigurable', 'nonwritable', 'properties', 'in', 'javascript', 'im', 'writing', 'a', 'simple', 'eventemitter', 'is', 'es5', 'the', 'objective', 'is', 'to', 'ensure', 'that', 'all', 'properties', 'on', 'eventemitter', 'instances', 'are', 'nonwritable', 'and', 'nonconfigurable', 'after', '6', 'hours', 'of', 'racking', 'my', 'brain', 'i', 'still', 'cant', 'figure', 'out', 'how', 'to', 'increase', 'the', 'listenercount', 'for', 'example', 'if', 'the', 'configurable', 'descriptor', 'is', 'set', 'to', 'false', 'heres', 'an', 'example', 'of', 'what', 'i', 'have', 'var', 'eventemitter', 'function', 'var', 'listeners', 'listeners', '0', 'var', 'events', 'events', 'objectdefinepropertiesthis', 'listeners', 'value', '0', 'configurable', 'false', 'writable', 'false', 'events', 'value', 'configurable', 'false', 'writable', 'false', 'return', 'this', 'eventemmitterprototypeon', 'functionev', 'cb', 'if', 'typeof', 'ev', 'string', 'throw', 'new', 
'typeerrorevent', 'should', 'be', 'type', 'string', 'indexjs', '6', 'if', 'typeof', 'cb', 'function', 'cb', 'null', 'cb', 'undefined', 'throw', 'new', 'typeerrorcallback', 'should', 'be', 'type', 'function', 'indexjs', '7', 'if', 'thiseventsev', 'thiseventsevpushcb', 'else', 'thiseventsev', 'cb', 'thislisteners', 'return', 'this', 'a', 'i', 'would', 'recommend', 'the', 'use', 'of', 'an', 'iife', 'immediatly', 'invoked', 'function', 'expression', 'var', 'coolobjfunction', 'var', 'public', 'var', 'nonpublic', 'nonpublica0', 'publicgetafunctionnonpublicareturn', 'nonpublica', 'return', 'public', 'now', 'you', 'can', 'do', 'coolobjgeta1', 'coolobjgeta2', 'coolobjaundefined', 'coolobjnonpublicundefined', 'coolobjnonpublicaundefined', 'i', 'know', 'this', 'is', 'not', 'the', 'answer', 'youve', 'expected', 'but', 'i', 'think', 'its', 'the', 'easiest', 'way', 'of', 'doing', 'sth', 'like', 'that', 'a', 'you', 'can', 'use', 'a', 'proxy', 'which', 'requires', 'a', 'key', 'in', 'order', 'to', 'define', 'properties', 'function', 'createobject', 'var', 'key', 'configurable', 'true', 'return', 'new', 'proxy', 'definepropertytarget', 'prop', 'desc', 'if', 'descvalue', 'key', 'return', 'reflectdefinepropertytarget', 'prop', 'key', 'key', 'function', 'func', 'var', 'obj', 'key', 'createobject', 'keyvalue', '0', 'reflectdefinepropertyobj', 'value', 'value', 'key', 'keyvalue', 'function', 'keyvalue', 'objvalue', '1', 'reflectdefinepropertyobj', 'value', 'value', 'key', 'reflectdefinepropertyobj', 'increase', 'value', 'key', 'return', 'obj', 'var', 'obj', 'func', 'consolelogobjvalue', '0', 'try', 'objvalue', '123', 'catcherr', 'try', 'objectdefinepropertyobj', 'value', 'value', '123', 'catcherr', 'consolelogobjvalue', '0', 'objincrease', 'consolelogobjvalue', '1']\n", + "after tokenize w/ split: {'iife immediatly invoked function expression', 'if descvalue key return reflectdefinepropertytarget', 'value 0 configurable false writable', 'type function indexjs 7 if', 'would recommend the 
use of', 'to modify nonconfigurable nonwritable properties', 'false heres an example of', 'function expression var coolobjfunction var', 'can do coolobjgeta1 coolobjgeta2 coolobjaundefined', 'to false heres an example', 'on eventemitter instances are nonwritable', 'you can use a proxy', 'cb undefined throw new typeerrorcallback', 'this is not the answer', 'return new proxy definepropertytarget prop', 'what i have var eventemitter', 'value key reflectdefinepropertyobj increase value', 'expression var coolobjfunction var public', 'ev string throw new typeerrorevent', 'still cant figure out how', 'reflectdefinepropertyobj increase value key return', 'a i would recommend the', 'new proxy definepropertytarget prop desc', 'and nonconfigurable after 6 hours', 'false writable false return this', 'this a i would recommend', '1 reflectdefinepropertyobj value value key', 'to increase the listenercount for', 'the easiest way of doing', 'example of what i have', 'be type string indexjs 6', 'null cb undefined throw new', 'var nonpublic nonpublica0 publicgetafunctionnonpublicareturn nonpublica', 'the configurable descriptor is set', 'a you can use a', 'var public var nonpublic nonpublica0', 'function createobject var key configurable', 'a simple eventemitter is es5', 'nonpublica0 publicgetafunctionnonpublicareturn nonpublica return public', 'know this is not the', 'racking my brain i still', 'be type function indexjs 7', 'nonwritable properties in javascript im', 'es5 the objective is to', 'think its the easiest way', '6 hours of racking my', 'objvalue 123 catcherr try objectdefinepropertyobj', 'is es5 the objective is', 'nonwritable and nonconfigurable after 6', 'easiest way of doing sth', 'eventemmitterprototypeon functionev cb if typeof', 'use a proxy which requires', 'function indexjs 7 if thiseventsev', 'public now you can do', 'func consolelogobjvalue 0 try objvalue', 'try objvalue 123 catcherr try', 'keyvalue function keyvalue objvalue 1', 'obj var obj func 
consolelogobjvalue', 'listeners value 0 configurable false', '6 if typeof cb function', 'properties function createobject var key', 'catcherr consolelogobjvalue 0 objincrease consolelogobjvalue', 'the listenercount for example if', 'typeerrorevent should be type string', 'objectdefinepropertiesthis listeners value 0 configurable', 'all properties on eventemitter instances', '0 var events events objectdefinepropertiesthis', 'cb null cb undefined throw', 'i would recommend the use', 'requires a key in order', 'throw new typeerrorevent should be', 'writable false return this eventemmitterprototypeon', 'key createobject keyvalue 0 reflectdefinepropertyobj', 'can use a proxy which', 'definepropertytarget prop desc if descvalue', 'to define properties function createobject', 'eventemitter instances are nonwritable and', 'proxy which requires a key', 'createobject var key configurable true', 'value key return obj var', 'var eventemitter function var listeners', 'javascript im writing a simple', 'configurable true return new proxy', 'of what i have var', 'descvalue key return reflectdefinepropertytarget prop', 'desc if descvalue key return', 'configurable false writable false events', 'cb thislisteners return this a', 'that a you can use', 'function keyvalue objvalue 1 reflectdefinepropertyobj', 'that all properties on eventemitter', 'this eventemmitterprototypeon functionev cb if', 'coolobjgeta2 coolobjaundefined coolobjnonpublicundefined coolobjnonpublicaundefined i', 'catcherr try objectdefinepropertyobj value value', 'events objectdefinepropertiesthis listeners value 0', 'key reflectdefinepropertyobj increase value key', 'events events objectdefinepropertiesthis listeners value', 'typeof cb function cb null', 'false events value configurable false', 'var events events objectdefinepropertiesthis listeners', 'configurable false writable false return', 'the use of an iife', 'false writable false events value', 'you can do coolobjgeta1 coolobjgeta2', 'nonconfigurable after 
6 hours of', 'writable false events value configurable', 'how to modify nonconfigurable nonwritable', 'in javascript im writing a', 'key configurable true return new', 'i know this is not', 'im writing a simple eventemitter', 'coolobjnonpublicundefined coolobjnonpublicaundefined i know this', 'var key configurable true return', 'true return new proxy definepropertytarget', 'in order to define properties', 'return public now you can', 'function func var obj key', 'is to ensure that all', 'thislisteners return this a i', 'recommend the use of an', 'of an iife immediatly invoked', 'invoked function expression var coolobjfunction', 'eventemitter is es5 the objective', 'events value configurable false writable', 'brain i still cant figure', 'nonpublica return public now you', 'have var eventemitter function var', 'reflectdefinepropertytarget prop key key function', 'prop desc if descvalue key', 'immediatly invoked function expression var', 'sth like that a you', 'key key function func var', 'set to false heres an', 'define properties function createobject var', 'reflectdefinepropertyobj value value key keyvalue', 'thiseventsev cb thislisteners return this', 'if typeof ev string throw', 'expected but i think its', 'value value key reflectdefinepropertyobj increase', 'a proxy which requires a', 'type string indexjs 6 if', 'how to increase the listenercount', 'publicgetafunctionnonpublicareturn nonpublica return public now', 'var listeners listeners 0 var', 'public var nonpublic nonpublica0 publicgetafunctionnonpublicareturn', 'but i think its the', 'objective is to ensure that', 'like that a you can', '123 catcherr try objectdefinepropertyobj value', 'value 123 catcherr consolelogobjvalue 0', 'if the configurable descriptor is', 'eventemitter function var listeners listeners', 'listeners listeners 0 var events', 'value value key keyvalue function', 'the answer youve expected but', 'cant figure out how to', 'its the easiest way of', 'q how to modify nonconfigurable', 
'instances are nonwritable and nonconfigurable', 'properties in javascript im writing', 'function var listeners listeners 0', 'ensure that all properties on', 'return this a i would', 'way of doing sth like', 'of doing sth like that', 'key return reflectdefinepropertytarget prop key', '0 reflectdefinepropertyobj value value key', 'value key keyvalue function keyvalue', 'is set to false heres', 'should be type string indexjs', 'key in order to define', 'if typeof cb function cb', '7 if thiseventsev thiseventsevpushcb else', 'throw new typeerrorcallback should be', 'an iife immediatly invoked function', 'var coolobjfunction var public var', 'return this eventemmitterprototypeon functionev cb', 'my brain i still cant', 'not the answer youve expected', 'a key in order to', 'listenercount for example if the', 'for example if the configurable', 'typeerrorcallback should be type function', 'key return obj var obj', 'simple eventemitter is es5 the', 'cb if typeof ev string', 'nonpublic nonpublica0 publicgetafunctionnonpublicareturn nonpublica return', 'value configurable false writable false', 'var obj key createobject keyvalue', 'objvalue 1 reflectdefinepropertyobj value value', 'thiseventsevpushcb else thiseventsev cb thislisteners', 'prop key key function func', 'out how to increase the', 'increase the listenercount for example', 'nonconfigurable nonwritable properties in javascript', 'key function func var obj', 'figure out how to increase', 'heres an example of what', 'i think its the easiest', 'indexjs 7 if thiseventsev thiseventsevpushcb', 'i still cant figure out', 'thiseventsev thiseventsevpushcb else thiseventsev cb', 'cb function cb null cb', 'keyvalue 0 reflectdefinepropertyobj value value', 'configurable descriptor is set to', 'do coolobjgeta1 coolobjgeta2 coolobjaundefined coolobjnonpublicundefined', 'should be type function indexjs', 'return obj var obj func', 'modify nonconfigurable nonwritable properties in', 'writing a simple eventemitter is', 
'coolobjnonpublicaundefined i know this is', 'false return this eventemmitterprototypeon functionev', 'after 6 hours of racking', 'an example of what i', 'functionev cb if typeof ev', 'key keyvalue function keyvalue objvalue', 'descriptor is set to false', 'of racking my brain i', 'listeners 0 var events events', 'new typeerrorcallback should be type', 'now you can do coolobjgeta1', 'answer youve expected but i', 'keyvalue objvalue 1 reflectdefinepropertyobj value', 'value value 123 catcherr consolelogobjvalue', '123 catcherr consolelogobjvalue 0 objincrease', 'undefined throw new typeerrorcallback should', 'increase value key return obj', 'new typeerrorevent should be type', 'if thiseventsev thiseventsevpushcb else thiseventsev', 'is not the answer youve', 'return reflectdefinepropertytarget prop key key', 'i have var eventemitter function', 'else thiseventsev cb thislisteners return', 'obj func consolelogobjvalue 0 try', 'string indexjs 6 if typeof', 'coolobjfunction var public var nonpublic', 'use of an iife immediatly', 'to ensure that all properties', 'properties on eventemitter instances are', 'indexjs 6 if typeof cb', 'youve expected but i think', 'consolelogobjvalue 0 objincrease consolelogobjvalue 1', 'the objective is to ensure', 'coolobjaundefined coolobjnonpublicundefined coolobjnonpublicaundefined i know', 'are nonwritable and nonconfigurable after', 'order to define properties function', 'obj key createobject keyvalue 0', 'typeof ev string throw new', 'consolelogobjvalue 0 try objvalue 123', 'coolobjgeta1 coolobjgeta2 coolobjaundefined coolobjnonpublicundefined coolobjnonpublicaundefined', 'func var obj key createobject', 'var obj func consolelogobjvalue 0', 'example if the configurable descriptor', 'createobject keyvalue 0 reflectdefinepropertyobj value', '0 configurable false writable false', 'hours of racking my brain', 'doing sth like that a', 'try objectdefinepropertyobj value value 123', 'function cb null cb undefined', 'proxy 
definepropertytarget prop desc if', 'reflectdefinepropertyobj value value key reflectdefinepropertyobj', '0 try objvalue 123 catcherr', 'which requires a key in', 'objectdefinepropertyobj value value 123 catcherr', 'string throw new typeerrorevent should'}\n", + "after tokenize wo split: {' a n e', 's e v e n', 's i o n ', ' r e t u', 'a l l p', 's 7 i', 'u l d b', ' a p r', 't e d b', ' h a v e', 'n e d c', 'r n n o', 'j a u n d', 'r o x y ', 'p d e s', '1 c o o', 'p e o f ', 'n d e x j', 'e c t i v', 'e r t i e', 'r e n o', 'u e f u', 'a m p l e', 'n s w e r', 'y p e e r', 'q h o w', 'b e t y', '2 c o o', ' o r d e', 'a n u s', 'x y d e', 'e 0 c', 'l c b ', 's e r e', ' k n o w', 'e 1 2 3', 'c t i v e', 'f t e r ', 'e n e r s', 'e 0 o', 'j v a r', 'i n e d ', ' i s t', 't e m i t', 't o r i', 'c a r e', 'a t a ', 'c a n u', 'n c r e', 'a n t f', 'o b j a u', 'x p r e s', 'r a b l e', ' a r e ', 'l i c v', 'f d o i', 'r e o u', ' i n c r', ' o f w', 'r e s a', 'e s t r', 'n o n c o', 'i c h r', ' 0 o b', 't h i s l', 'p t o r ', ' w r i t', 'p k e y', 'e c t v', ' w h i c', 'r r c o', 'l u e 0', 'o n e x', 'o r d e r', 's i n ', 'e y v a l', 'l l c a', '0 r e f', 'r f u n', 't h e l', 'p u b l i', 'n o b j', 'y k e y', 'n t s o', 't o e n', 'b u n d', 'm y b r', 'n n e w', 't s e v', 's i e s t', 'l e i f', 'c v a l u', 'l e o f', 'i n e p r', 'e w r i', 'i n j a', 'l u e f', ' n e w ', 'b l i c ', 'o b j f', 'f t h e', 'e t t o', 'k e y k', 'i c a r e', 'r d e r ', '0 t r y', 'b j a u n', 'e m m i t', 'a f u n c', ' h o w ', 'l e l o g', 'k e y i', 'i n k i', 'o w t h', 'y o u c', 'n c v a', 'o l e l o', 'b j i n c', 'k e y v a', 'o g o b j', ' 0 v a', 'r i n g ', 'n v o k e', 'i n d e x', 't i l l ', 'u s e o', ' r a c k', 'a l s e ', '3 c a t', ' r e q u', 'l y i n', 'j g e t a', ' h e r e', 'a t l y ', 'e w p r', 'e n t s', 't w a y', 'p e f u', 'i e s t h', 'c k s h', 'l u e 1', 'n s t a n', 't t o ', 'a l u e ', ' d o c', 'o f c b', 'e a t e o', 't h e o', 
'r i p t o', 'b e l s', 'm w r i', 'e r e s ', 'e p r o p', ' i s e', 'e m i t t', 'n t s e v', ' y o u ', 'f i n e ', 'e e x p', 'e f u n', 'g a s', 'e d c o', 'a y o f', 'e k e y', 'i l l c', 'e a f t', ' n u l l', 'j f u n c', 'g e t p', 'e x j s ', 'e a n d', 'p r o p ', 'p r o t o', ' t r y ', 't a l l', ' e v s', 's t a n c', ' c b n', ' b u t ', 'v o k e d', '7 i f ', 'h i s e', 'e s a r', 'e l s e ', 'c n o w', 'w n e w', 'c v a r', ' 0 r e', 'e r t y o', 'o i n g ', ' i n o', 'u e 1 2', 't i v e ', ' a n d ', 't o t y p', 'o b j n o', 'b l e n', 'o u c a', 'e t h i', 'n g t h', 's s i o n', ' t o i', 'l s e t', 'e o u t', '1 r e f', 'r o b j', ' l i k e', 'b j f u n', 't h i s ', 'e i s ', 's l i s t', 'c a r e t', 'i n g i', 'n c e s ', 'r y o u', ' i t h', 'e s t w', 'e t u r n', '0 c o n', 'f i n e d', ' b e t', 'a y o u', 's e t h', 't l y i', 'b l e p', 'n t f o', 'v a s c r', 'j e c t i', 't e r f', 'e i m m', 'w y o u', 'g s t h', 'r e x a', 'i c g e t', 'e v a l', 't e o b j', 'y o u v e', 'i k e t', 's f u n', 'e o f ', 'y b r a', 't f i g', 's n o t', 'n o n w r', 'i o n f', 't e r i', 't h o w', 'e l o g o', 's t h e', 'j v a l', 'l s e r', 'e s f u', ' 7 i f', 'c c o n', ' i w o', 's e e v', 'r t y o b', 't o m o', 'x j s 7', ' w o u l', 'w r i t i', 'l s e e', 'o m m e n', 'i c v a', 'v a r c', 'b l i c g', 'p e o n ', 's e c o', 'p e r t y', 'c o o l o', 't v a r', 'n f u n', 'e d f u', 'f l e c t', 'd e s c ', 'n g m y', 'l i c u n', 'u b l i c', 'r e a s e', 'c e s a', 'd e r t', 'x y w h', 'b j f u', 'a n i i', 'o u t h', 'y p e f', 'k n o w ', 'o b j i', 't e r p r', 'o t t h', 'c h e r r', 'i n g m', 'l i c g e', 'o x y w', 'j i n c r', 'd i k', 'u e 0 ', ' 0 c o', 'a n c e s', 'n e p r o', 'o n e v', 'l i s t e', 'v a l u e', 'e v c b', 'e a s i e', 'u r e o', 'a r l i', 'c t e d ', 'q u i r e', 'e e r r o', '1 2 3 c', 'd o i n g', 'o n e v ', 'o b j i n', 'l p r o', 's e v t', 'e c t d e', 't d e f i', 'e v e n t', 'w h i c h', 'e s a 
', 'i g u r a', 'c o u n t', ' t o f', 'r o r c a', 'i r e s ', 'a 0 p u', 'h r o w ', 'i s i s', 'e r s 0', 'd r e c', 'c b u n', ' b r a i', 'e e v e', ' e x a m', 'e s c v a', ' s t i l', 'r i s ', 'a p r o', ' o b j i', 'r r o r c', ' i f t', 'r l i s', 't s h o', 'n e d i', ' u n d e', 't a b l e', 'o u v e ', ' w a y ', 'o d e f', 't a r g e', 't h l i', 'b i f ', 'u e c o', 'r p r o t', 's h c b ', ' t h r o', 'r e v e n', 'a r e v', 'g o b j v', 'g t h r', 'e d e s', 's e w r', 'e s i n', 'o l o b j', 'e s a n', 'o b j k', 'o n c b', 'i n g a', 'b f u n', 'c r i p t', 'o i n c', 'y p e o f', 'n p u b l', 'a t e o b', 'j n o n p', 'p l e o', 'o u l d ', 'v e n t ', 'e s s i o', ' e x p r', 'd c o o', 'h e r r ', 'b l e f', 'i s t o', 't i n g ', 'e s c r i', 'a i n i', 'i n i ', 'u r n t', 'a n d o', 'i s l i s', 'n u s e', ' a l l ', ' i h a', ' a i ', 'o p e r t', 'c u n d e', 'j f u n', 'y i n v', 'o r e v e', 'c n o n', 'i c a u n', 'p r o x y', 'u e r e', 'i s s e', 'n d n o', 'v e n t e', 'p l e i', 'b j i n', 'i f t h', 't y p e ', 'e t a f u', 'd b u t', 's 5 t h', 'a n e x', 's e v p u', 'a r p u', '6 i f ', 'p u s h c', 'l o b j a', ' i k n', 'c a t c h', 'i v e i', 'd i a t l', 'l e n o', 'n e v e', 'h o w t', 'o p k e', ' a k e', 'n d o ', 'n c c o', ' o f r', 'r c o n', 'a r o b', 'a t a l', 'o c o o', 'c i f ', 'y o b j', ' a n s w', 'e r r t', ' s i m p', 'o t o t y', 'r n r e', 'h i s l', 'h e c o', 'p e e r r', 'd f u n', 'k e t h', 's v a l', 'n n o n', 't a f u n', ' i s t', 'r c a l l', 't i m ', 'b r a i n', 's u r e ', 'e o f c', 'c t d e f', 'f e i m', 'e r y o', 'e e a s', ' f u n c', 'u e o b', 'e r i s', 'f i g u r', 'e x p e c', ' e l s e', 'n e w p', 'n j a v', 'h e l i', 'i n e p', 'n r e f', 'a b l e ', 'o n s o l', 'g m y ', 'l l c b', 'e f l e c', 'u e v a', ' n o t ', 'j a v a s', 'l s e h', 'e s c i', 'y d e f', 'n c r e a', 'u n c t i', 'o n v a', 'f t y p', 'b l e t', 'l o b j f', ' 1 r e', 'e r s l', 'm i t t e', 'i o n e 
v', 'k i t s', 'e y k e', 'h i s a', ' a f t e', 't y t a r', 's t i l l', 'e f a l', 'u r n r', 't e n e r', 'n t h i', 'g e t a f', 't r u e ', 'a 2 c o', 'i c u n d', 'i k n o', ' d e s c', 'b j g e t', 'l e f a', 'i o n e', 'b a c k ', 'm e d i a', 'n w r i t', 'e x a m p', 'l e t r', 'j i n c', 'l u e o', 'n g s t', 'l e d e', 's a n ', 'o n i n', 'w t h i', ' n o w ', 'n e r c o', 'r y o b', 'a r g e t', 'n i s', 'a v e v', 'v c b ', 'b j n o n', 'a i w', 'e y c o', 'l u e v', 'r s o f', 'b l i c a', 't a 2 c', 'b l e d', ' 6 h o', 'e r s v', 'n t s h', ' p r o x', 'u i r e s', 'u e 1 ', ' c b t', 'a s e t', 'c a n t ', 'o n p u b', 'e y c r', 'e r i n', 'n s u r e', 'i f d e', 'b t h i', 'e 1 r', 'o b j g e', 'o n f i g', 'e w t y', 'g u r e ', 'e s 5 t', 'a n s w e', 'r n o n', 'h o u l d', 'k e y r', ' c o o l', 'n g a ', 'c t i o n', 'c t k e', 'u n c c', 'i t i n g', 'e o b j e', 'l o g o b', 'n t s v', 'y c r e', 'g u r a b', 'h i n k ', 'l e e v', 'i n s t a', 'e n o n', ' p r o p', 'r t r y', 'e n t s e', 'h r e q', ' o b j ', ' m o d i', ' o f a', 'w r i t a', 'o m o d', 'l d r e', ' c o n s', 'r e t h', 'k e y c', 'k e y f', ' r e f l', 'i m m e d', 'b u t i', 'p l e e', 'r s v a', 'u v e e', 's c v a l', 'v e i s', 'h e e a', 'b n u l', 's t h i s', 'u s e a', 'p t i m', 'r i t a b', 'u r s o', 'o r e x', 'o n f u', 'n e v c', 'a c k i n', 's t r i n', 'i o n n o', 'n p u b', 'o b j f u', ' t y p e', 'o n n o n', 'l i c a r', 'c a n d', 'v a r l', 'a u n d e', 't s e v p', 'h e u s', ' o u t ', 'i s l i', 'j v a l u', 'i o n i', 'w p r o', 'i p t o r', 'i i f e ', 'i n o r', 'i s e v e', 'c b n u', 't h e a', 'f d e s', 'd i f y ', ' a s i', ' c b u', 'i a t l y', 'u r n o', 's o l e l', 'e r c o u', 'c o n f i', ' t r u e', 'e r e t', 'd n o n', 'r t o ', 'n i n d', 'i o n v', 'o f w h', 'h i s i', 'r i t i n', 'i s t e n', 't y p e e', 'j k e y', 'u t h o', 'e d t h', 'e r t o', 'y f u n', 'r c o u n', ' h o u r', 'r p u b', 'c b f u', 'o n k e', 
'n e d t', ' i f d', 'e 0 r', 'a s i m', 'i p t i', 'e q u i r', 't i o n ', 't e m m i', 'd t h e', ' i s s', 'g e t a 1', 'a s i e s', 'r k e y', 'o r i s', 'c r e a t', 'l e p r', 'n v a r', ' c a n ', 'r e a t e', ' c o n f', 'r n t h', 'w e r y', 'e x p r e', 's e v c', 'e y f u', 'w a y o', 'e l i s', 'n e r s ', 'x j s 6', 'a s e v', 'o f d o', 't s v a', 'a r n o', 's c i f', 'w o u l d', 'h a t a', 's l i s', 'f c b ', 'k s h o', 'j e c t ', ' i m w', 'n e x a', 'e o n f', ' c b i', 't u r n ', 'i n v o k', 'f o r e', 'c r e a s', 'j e c t d', 'l e a n', 'f i n e p', 'r e t u r', 'p e s t', 'u l d r', ' e x p e', 'v a r k', 'c o m m e', 'h e r e s', 'o u n t ', ' e n s u', 'n c o n f', 's i m p l', ' t h i s', 'e c t k', ' i i f e', 'e d i a t', ' s h o u', 'e t a 1 ', 't i o n n', 't o i n', 'u l l c', ' n o n p', 'm m e d i', 'u r n p', 'o b j e c', ' t o d', 'n t e m i', 'f w h a', 'r n p u', 'y n o n', 'i n c r e', ' n o n w', 'e v a r', 'c t v a', 'i o n k', 'c a u n d', 'i g u r e', 't c h e r', 'a v a s c', 's h o u l', 'u n d e f', 'f e v ', 'e c o m m', 'e r 6 ', 's r e t', 'e o b j', 'i m w r', 'a k e y', 'y r e t', 'a n d n', 'l i k e ', 'p e c t e', ' s t h ', 'k e d f', 'w h a t ', 'r e f l e', 'r a c k i', ' i t s ', 'e f i n e', 'v t h i', '6 h o u', 'c b i f', 'e v t h', 'n k e y', 'o w y o', 't k e y', ' o f d', 's o f ', 'd e s c v', 'i e s i', 't h e e', 'l e c t d', 'u n c v', 'n e w t', 't i t', 'h o u r s', ' y o u v', ' t h i n', 's t h l', 'a r e n', 'y w h i', 'h e o b', 'f r a c', 'a l l b a', 'h i c h ', 'e a n s', 'y o f ', 't h e c', 'm m e n d', ' t o e', 'u r a b l', 'y r e f', 's a k', ' f a l s', 'l i c a u', 'n e p r', 'o k e d ', ' c b f', 'r t i e s', 's s e t', 'n c t i o', ' w h a t', 'r e s s i', 'n o t t', ' i n s t', ' f o r ', '5 t h e', 'n i i f', 'u r e t', ' l i s t', 'p e r t i', 'v s t r', 'i s e s', 'n o n p u', 'f t h i', 'a r k e', 's a r e', 't s t h', 'n f i g u', 'h a v e ', 'l i c a 0', 's e a ', 'y t a 
r g', 'e n e r c', ' c r e a', ' v a l u', 'i e s o', 't i e s t', 'r i p t ', 'n o w t', ' i s n', 'w t y p', 'l i c n', '0 o b j', ' k e y ', 'y p e s', 'v e v a', 'e d b u', 'n s o l e', 'a t i ', 'd t h r', 'e s o n', ' c a n t', ' n o n c', 'r s l i', 'i f t y', 'o f e v', 'p r e s s', 'f u n c ', ' e a s i', ' a n i', ' o n e', 't t h e', 'r o p d', ' t h e ', 'r e v e', ' 1 2 3 ', 'm p l e ', 's t o ', 's 6 i', 's e s 5', 's o b j', 'n t e m m', 'u r n n', 'r n n e', ' s e t ', 'y o b j ', ' o b j v', 't o f a', 'r 6 h', 't i h', ' r e c o', 'a c k s', 'n d t h', 'l o b j g', ' p u b l', 'b j k e', 'l u e c', 'i e s t ', 'x p e c t', 't t e r p', 'e t y p', 't s e v ', 't h r o w', 'r c o o', 'r e q u i', 'n o r d', 't r i n g', 'l i c a ', 'm o d i f', 't t e r ', 'o w t o', 'n o w y', 'r r t r', 'i c n o', 'n n o n p', 'y c o n', 'v a r o', ' d o i n', 'c a 0 p', 'd o c o', 'o d i f y', 'r s r e', 't i e s ', 'o b j v', 'y p e o n', ' f i g u', 'e n d t', ' t o m', ' e v e n', ' c a t c', 'u n t f', 'j s 6 ', 'i t s t', 'd b e ', 'i n g s', 'o t y p e', ' i n j', 'i s t i', 'v a r e', 'l b a c k', 'x a m p l', 'b l e a', 'i o n c', 's a i', 'e r p r o', ' i m m e', 'l c a n', 'o b j v a', 'i t t e r', 'h e a n', 'd e f i n', 's e t t', 'r o p k', 'i s e v', 'r r o r e', 'u e k e', 'b j e c t', 'v e n t s', ' i n v o', 'w t o ', 'c h r e', 'n e x p', 'l o b j n', 'l u e k', 's e o f', 'i n g t', ' s t r i', 'o n c r', ' k e y v', 'n g i n', 'o x y d', 'u c a n', 't s o b', 'n u l l ', 'd e s c r', 's 0 v', 'g i n d', 'r o t o t', 'o f a n', 'e p r o', 'k i n g ', 's w e r ', 'r e c o m', 'n k i t', 'e y r e', 'e u s e', 'a 1 c o', 'i e s f', 'o n w r i', 'r o w n', 'h i s e v', 't h a t ', 't h e u', ' e s 5 ', 'a r e t', 'r o p e r', 'e a s e ', 'n c b ', ' m y b', '0 v a r', ' i n d e', 'e a p', 'y i n ', 's o n ', 's e v e', 'v e e x', 'l l b a c', 'a s c r i', 'm e n d ', 'm m i t t', 'e r r c', 'n t f i', 't f o r', 'e s t h i', 'r s 0 ', 'n t s e', 'l 
l p r', 'e v s t', 'i f e i', 'i t h i', 'f u n c t', 'f y n o', 'o e n s', 'i c a 0 ', 'e c t e d', 'h l i k', 't h i s e', 'e t p r', 'c a l l b', '2 3 c a', 'e r r o r', 'c b e l', 't e r 6', ' u s e ', 'o o l o b', 'h i s l i', ' o b j e', 't a 1 c', 't a y', 'o r c a l', 'r g e t ', 's t e n e', 'b j v a', 't i o n e', 't p r o', 'e t h a', 'p r o p e', 'f a l s e', 'i m p l e', ' v a r ', 'o f a l', 'e t a 2 ', ' d e f i', 'i w o u', 's t w a', 't y o b j', 't h i n k', ' t h a t', 'e t h e', 'e r f u', 'r u e r', 'e y i n', 'e n s u r', 'i c a r', 's e v a', 'n d e f i', 'e t r u', 't r y o', 'o p d e', 's e h e', 'u t i ', 't o d e', 'o u r s ', 'g e t a 2', 'e r t y t', 'a s e c', 'y v a l u', 'c g e t a', 'h c b e', 'e d i ', 't y p e o', 'r n o b', ' 0 t r', ' j a v a', 'r a i n ', 'd e x j s', 'e r s r', 'i t a b l', 'r o r e v', 's c r i p', 't a n c e', 'e o f e', 'r t y t a', 'c b t h', 'b j v a l', 'h a t i', 'i s n o', 'e v p u s', 'a f t e r', 'c k i n g', 'e n t s ', 'l d b e', ' 6 i f', 'v a r p', ' a y o', 'o w n e', 'a r e t u', 'f a n ', 'a r c o', 'v a r n', 'i h a v', 'l s e w', 'j s 7 ', 'l e a f', 'e h e r', 'i s a ', 's i s ', '0 p u b', 'r i n s', 'v p u s h', 'c o n s o', 'a t c h e', 'e c o n', 'b l i c u', 'e n t e m', 'u s h c b', 'o f r a', 'o n c o n', 'e i f ', 'e 0 t', 'i f y n'}\n", + "(128,)\n", + "[ 3784462 37800422 2386799 12798583 19067780 35471985 5092384 21656912\n", + " 6270010 1085463 18127069 48604244 2576934 374215 28430437 18254380\n", + " 4102298 34211961 12290716 7631961 29572216 24709709 29912800 41048438\n", + " 27236136 1418300 12406498 2497947 4862465 25247775 47097720 29021636\n", + " 20339264 2232649 12859188 2729719 17801875 21089389 31907354 16606114\n", + " 69956623 13833904 10446519 77378211 8550234 46195874 288182 14496122\n", + " 11237319 6912996 23228712 16190126 64914739 7870255 40934646 924194\n", + " 29225892 19996530 6438690 18986291 3218965 16411597 11880766 5132621\n", + " 1052847 2415123 
59555528 3491442 15151896 7004468 10340142 32777136\n", + " 33778962 3260671 19172809 2359234 23976143 2416865 3125227 6480263\n", + " 20210992 27807156 3421181 5312099 10619587 9582 12716254 12379028\n", + " 10804802 2014329 48317165 19835788 20719591 5230161 9897841 98164\n", + " 9710948 17732724 27558813 3818146 9510059 5773255 34977961 18016812\n", + " 15674277 10223242 22205030 22428386 41746147 1300997 6531413 3921793\n", + " 6004933 2850113 25499809 16598362 414028 7644653 30133303 6577923\n", + " 14728135 4456431 736672 15122723 1921938 2848032 386889 9643633]\n", + "!!! using bigcode !!!\n", + "original: Q: How to modify non-configurable, non-writable properties in Javascript? I'm writing a simple EventEmitter is ES5.\n", + "The objective is to ensure that all properties on EventEmitter instances are\n", + "non-writable and non-configurable.\"\n", + "After 6 hours of racking my brain I still can't figure out how to, increase the listenerCount, for example if the configurable descriptor is set to false.\n", + "Here's an example of what I have:\n", + "var eventEmitter = function(){\n", + " var listeners = listeners || 0;\n", + " var events = events || {};\n", + "\n", + " Object.defineProperties(this, {\n", + " listeners: {\n", + " value : 0,\n", + " configurable: false,\n", + " writable: false\n", + " },\n", + " events: {\n", + " value: {},\n", + " configurable : false,\n", + " writable: false\n", + " }\n", + " });\n", + " return this;\n", + "};\n", + "\n", + "\n", + "eventEmmitter.prototype.on = function(ev, cb) {\n", + " if (typeof ev !== 'string') throw new TypeError(\"Event should be type string\", \"index.js\", 6);\n", + " if (typeof cb !== 'function' || cb === null || cb === undefined) throw new TypeError(\"callback should be type function\", \"index.js\", 7);\n", + "\n", + " if (this.events[ev]){\n", + " this.events[ev].push(cb);\n", + " } else {\n", + " this.events[ev] = [cb];\n", + " }\n", + "\n", + " this.listeners ++;\n", + " return this;\n", + 
"};\n", + "\n", + "\n", + "A: I would recommend the use of an IIFE (immediatly invoked function expression):\n", + "var coolObj=(function(){\n", + "var public={};\n", + "var nonpublic={};\n", + "nonpublic.a=0;\n", + "public.getA=function(){nonpublic.a++;return nonpublic.a;};\n", + "\n", + "return public;\n", + "})();\n", + "\n", + "Now you can do:\n", + "coolObj.getA();//1\n", + "coolObj.getA();//2\n", + "coolObj.a;//undefined\n", + "coolObj.nonpublic;//undefined\n", + "coolObj.nonpublic.a;//undefined\n", + "\n", + "I know this is not the answer youve expected, but i think its the easiest way of doing sth like that.\n", + "\n", + "A: You can use a proxy which requires a key in order to define properties:\n", + "\n", + "\n", + "function createObject() {\n", + " var key = {configurable: true};\n", + " return [new Proxy({}, {\n", + " defineProperty(target, prop, desc) {\n", + " if (desc.value === key) {\n", + " return Reflect.defineProperty(target, prop, key);\n", + " }\n", + " }\n", + " }), key];\n", + "}\n", + "function func() {\n", + " var [obj, key] = createObject();\n", + " key.value = 0;\n", + " Reflect.defineProperty(obj, \"value\", {value: key});\n", + " key.value = function() {\n", + " key.value = obj.value + 1;\n", + " Reflect.defineProperty(obj, \"value\", {value: key});\n", + " };\n", + " Reflect.defineProperty(obj, \"increase\", {value: key});\n", + " return obj;\n", + "}\n", + "var obj = func();\n", + "console.log(obj.value); // 0\n", + "try { obj.value = 123; } catch(err) {}\n", + "try { Object.defineProperty(obj, \"value\", {value: 123}); } catch(err) {}\n", + "console.log(obj.value); // 0\n", + "obj.increase();\n", + "console.log(obj.value); // 1\n", + "\n", + "\n", + "\n", + "(128,)\n", + "[ 3784462 37800422 2386799 12798583 19067780 35471985 5092384 21656912\n", + " 6270010 1085463 18127069 48604244 2576934 374215 28430437 18254380\n", + " 4102298 34211961 12290716 7631961 29572216 24709709 29912800 41048438\n", + " 27236136 1418300 12406498 2497947 
4862465 25247775 47097720 29021636\n", + " 20339264 2232649 12859188 2729719 17801875 21089389 31907354 16606114\n", + " 69956623 13833904 10446519 77378211 8550234 46195874 288182 14496122\n", + " 11237319 6912996 23228712 16190126 64914739 7870255 40934646 924194\n", + " 29225892 19996530 6438690 18986291 3218965 16411597 11880766 5132621\n", + " 1052847 2415123 59555528 3491442 15151896 7004468 10340142 32777136\n", + " 33778962 3260671 19172809 2359234 23976143 2416865 3125227 6480263\n", + " 20210992 27807156 3421181 5312099 10619587 9582 12716254 12379028\n", + " 10804802 2014329 48317165 19835788 20719591 5230161 9897841 98164\n", + " 9710948 17732724 27558813 3818146 9510059 5773255 34977961 18016812\n", + " 15674277 10223242 22205030 22428386 41746147 1300997 6531413 3921793\n", + " 6004933 2850113 25499809 16598362 414028 7644653 30133303 6577923\n", + " 14728135 4456431 736672 15122723 1921938 2848032 386889 9643633]\n", + "0\n", + "slimpj hash\n", + "(0, b'\\x00\\x00\\x00\\x00\\x009\\xbf\\x0e\\x00\\x00\\x00\\x00\\x02@\\xc9\\xe6\\x00\\x00\\x00\\x00\\x00$ko\\x00\\x00\\x00\\x00\\x00\\xc3Jw\\x00\\x00\\x00\\x00\\x01\"\\xf3\\x84\\x00\\x00\\x00\\x00\\x02\\x1dBq\\x00\\x00\\x00\\x00\\x00M\\xb4 \\x00\\x00\\x00\\x00\\x01JuP\\x00\\x00\\x00\\x00\\x00_\\xac:\\x00\\x00\\x00\\x00\\x00\\x10\\x90\\x17\\x00\\x00\\x00\\x00\\x01\\x14\\x98\\xdd\\x00\\x00\\x00\\x00\\x02\\xe5\\xa4T\\x00\\x00\\x00\\x00\\x00\\'R&', '/home/vmagent/app/test_data/stackexchange_sample.jsonl@0')\n", + "bigcode hash\n", + "(0, b'\\x00\\x00\\x00\\x00\\x009\\xbf\\x0e\\x00\\x00\\x00\\x00\\x02@\\xc9\\xe6\\x00\\x00\\x00\\x00\\x00$ko\\x00\\x00\\x00\\x00\\x00\\xc3Jw\\x00\\x00\\x00\\x00\\x01\"\\xf3\\x84\\x00\\x00\\x00\\x00\\x02\\x1dBq\\x00\\x00\\x00\\x00\\x00M\\xb4 \\x00\\x00\\x00\\x00\\x01JuP\\x00\\x00\\x00\\x00\\x00_\\xac:\\x00\\x00\\x00\\x00\\x00\\x10\\x90\\x17\\x00\\x00\\x00\\x00\\x01\\x14\\x98\\xdd\\x00\\x00\\x00\\x00\\x02\\xe5\\xa4T\\x00\\x00\\x00\\x00\\x00\\'R&', 
'/home/vmagent/app/test_data/stackexchange_sample.jsonl@0')\n", + "1\n", + "slimpj hash\n", + "(1, b'\\x00\\x00\\x00\\x00\\x00\\x05\\xb5\\xc7\\x00\\x00\\x00\\x00\\x01\\xb1\\xd0e\\x00\\x00\\x00\\x00\\x01\\x16\\x8a,\\x00\\x00\\x00\\x00\\x00>\\x98\\x9a\\x00\\x00\\x00\\x00\\x02\\n\\x08y\\x00\\x00\\x00\\x00\\x00\\xbb\\x8a\\x9c\\x00\\x00\\x00\\x00\\x00ttY\\x00\\x00\\x00\\x00\\x01\\xc3\\x98\\x9a\\x00\\x00\\x00\\x00\\x02\\n\\x08y\\x00\\x00\\x00\\x00\\x00\\xbb\\x8a\\x9c\\x00\\x00\\x00\\x00\\x00ttY\\x00\\x00\\x00\\x00\\x01\\xc3\\x00\\x00\\x00\\x00\\x00NQM\\x00\\x00\\x00\\x00\\x00\\x10\\x10\\xaf', '/home/vmagent/app/test_data/stackexchange_sample.jsonl@0')\n", + "bigcode hash\n", + "(4, b'\\x00\\x00\\x00\\x00\\x03\\xde\\x853\\x00\\x00\\x00\\x00\\x00x\\x17/\\x00\\x00\\x00\\x00\\x02p\\x9c\\xf6\\x00\\x00\\x00\\x00\\x00\\x0e\\x1a\"\\x00\\x00\\x00\\x00\\x01\\xbd\\xf3\\xa4\\x00\\x00\\x00\\x00\\x011\\x1fr\\x00\\x00\\x00\\x00\\x00b?\"\\x00\\x00\\x00\\x00\\x01!\\xb53\\x00\\x00\\x00\\x00\\x001\\x1e\\x15\\x00\\x00\\x00\\x00\\x00\\xfak\\xcd\\x00\\x00\\x00\\x00\\x00\\xb5I>\\x00\\x00\\x00\\x00\\x00NQM\\x00\\x00\\x00\\x00\\x00\\x10\\x10\\xaf', '/home/vmagent/app/test_data/stackexchange_sample.jsonl@0')\n", + "5\n", + "slimpj hash\n", + "(5, b'\\x00\\x00\\x00\\x00\\x00$\\xda\\x13\\x00\\x00\\x00\\x00\\x03\\x8c\\xbe\\xc8\\x00\\x00\\x00\\x00\\x005Fr\\x00\\x00\\x00\\x00\\x00\\xe73\\x18\\x00\\x00\\x00\\x00\\x00j\\xe14\\x00\\x00\\x00\\x00\\x00\\x9d\\xc7.\\x00\\x00\\x00\\x00\\x01\\xf4#\\xb0\\x00\\x00\\x00\\x00\\x02\\x03m\\x12\\x00\\x00\\x00\\x00\\x001\\xc0\\xff\\x00\\x00\\x00\\x00\\x01$\\x8d\\xc9\\x00\\x00\\x00\\x00\\x00#\\xff\\xc2\\x00\\x00\\x00\\x00\\x01m\\xd8\\xcf\\x00\\x00\\x00\\x00\\x00$\\xe0\\xe1', '/home/vmagent/app/test_data/stackexchange_sample.jsonl@0')\n", + "bigcode hash\n", + "(5, 
b'\\x00\\x00\\x00\\x00\\x00$\\xda\\x13\\x00\\x00\\x00\\x00\\x03\\x8c\\xbe\\xc8\\x00\\x00\\x00\\x00\\x005Fr\\x00\\x00\\x00\\x00\\x00\\xe73\\x18\\x00\\x00\\x00\\x00\\x00j\\xe14\\x00\\x00\\x00\\x00\\x00\\x9d\\xc7.\\x00\\x00\\x00\\x00\\x01\\xf4#\\xb0\\x00\\x00\\x00\\x00\\x02\\x03m\\x12\\x00\\x00\\x00\\x00\\x001\\xc0\\xff\\x00\\x00\\x00\\x00\\x01$\\x8d\\xc9\\x00\\x00\\x00\\x00\\x00#\\xff\\xc2\\x00\\x00\\x00\\x00\\x01m\\xd8\\xcf\\x00\\x00\\x00\\x00\\x00$\\xe0\\xe1', '/home/vmagent/app/test_data/stackexchange_sample.jsonl@0')\n", + "6\n", + "slimpj hash\n", + "(6, b'\\x00\\x00\\x00\\x00\\x00/\\xaf\\xeb\\x00\\x00\\x00\\x00\\x00b\\xe1\\x87\\x00\\x00\\x00\\x00\\x014e0\\x00\\x00\\x00\\x00\\x01\\xa8M\\xb4\\x00\\x00\\x00\\x00\\x0043\\xfd\\x00\\x00\\x00\\x00\\x00Q\\x0ec\\x00\\x00\\x00\\x00\\x00\\xa2\\n\\xc3\\x00\\x00\\x00\\x00\\x00\\x00%n\\x00\\x00\\x00\\x00\\x00\\xc2\\x08\\xde\\x00\\x00\\x00\\x00\\x00\\xbc\\xe3\\x94\\x00\\x00\\x00\\x00\\x00\\xa4\\xdeB\\x00\\x00\\x00\\x00\\x00\\x1e\\xbcy\\x00\\x00\\x00\\x00\\x02\\xe1B\\xed', '/home/vmagent/app/test_data/stackexchange_sample.jsonl@0')\n", + "bigcode hash\n", + "(6, b'\\x00\\x00\\x00\\x00\\x00/\\xaf\\xeb\\x00\\x00\\x00\\x00\\x00b\\xe1\\x87\\x00\\x00\\x00\\x00\\x014e0\\x00\\x00\\x00\\x00\\x01\\xa8M\\xb4\\x00\\x00\\x00\\x00\\x0043\\xfd\\x00\\x00\\x00\\x00\\x00Q\\x0ec\\x00\\x00\\x00\\x00\\x00\\xa2\\n\\xc3\\x00\\x00\\x00\\x00\\x00\\x00%n\\x00\\x00\\x00\\x00\\x00\\xc2\\x08\\xde\\x00\\x00\\x00\\x00\\x00\\xbc\\xe3\\x94\\x00\\x00\\x00\\x00\\x00\\xa4\\xdeB\\x00\\x00\\x00\\x00\\x00\\x1e\\xbcy\\x00\\x00\\x00\\x00\\x02\\xe1B\\xed', '/home/vmagent/app/test_data/stackexchange_sample.jsonl@0')\n", + "7\n", + "slimpj hash\n", + "(7, 
b\"\\x00\\x00\\x00\\x00\\x01.\\xab\\x8c\\x00\\x00\\x00\\x00\\x01<'\\xe7\\x00\\x00\\x00\\x00\\x00O\\xceQ\\x00\\x00\\x00\\x00\\x00\\x97\\x07q\\x00\\x00\\x00\\x00\\x00\\x01\\x7ft\\x00\\x00\\x00\\x00\\x00\\x94-d\\x00\\x00\\x00\\x00\\x01\\x0e\\x94t\\x00\\x00\\x00\\x00\\x01\\xa4\\x83\\x9d\\x00\\x00\\x00\\x00\\x00:B\\xa2\\x00\\x00\\x00\\x00\\x00\\x91\\x1c\\xab\\x00\\x00\\x00\\x00\\x00X\\x17\\xc7\\x00\\x00\\x00\\x00\\x02\\x15\\xb8\\xa9\\x00\\x00\\x00\\x00\\x01\\x12\\xea,\", '/home/vmagent/app/test_data/stackexchange_sample.jsonl@0')\n", + "bigcode hash\n", + "(7, b\"\\x00\\x00\\x00\\x00\\x01.\\xab\\x8c\\x00\\x00\\x00\\x00\\x01<'\\xe7\\x00\\x00\\x00\\x00\\x00O\\xceQ\\x00\\x00\\x00\\x00\\x00\\x97\\x07q\\x00\\x00\\x00\\x00\\x00\\x01\\x7ft\\x00\\x00\\x00\\x00\\x00\\x94-d\\x00\\x00\\x00\\x00\\x01\\x0e\\x94t\\x00\\x00\\x00\\x00\\x01\\xa4\\x83\\x9d\\x00\\x00\\x00\\x00\\x00:B\\xa2\\x00\\x00\\x00\\x00\\x00\\x91\\x1c\\xab\\x00\\x00\\x00\\x00\\x00X\\x17\\xc7\\x00\\x00\\x00\\x00\\x02\\x15\\xb8\\xa9\\x00\\x00\\x00\\x00\\x01\\x12\\xea,\", '/home/vmagent/app/test_data/stackexchange_sample.jsonl@0')\n", + "8\n", + "slimpj hash\n", + "(8, b'\\x00\\x00\\x00\\x00\\x00\\xef+\\xa5\\x00\\x00\\x00\\x00\\x00\\x9b\\xfe\\x8a\\x00\\x00\\x00\\x00\\x01R\\xd2f\\x00\\x00\\x00\\x00\\x01V:\\xe2\\x00\\x00\\x00\\x00\\x02|\\xfe\\xe3\\x00\\x00\\x00\\x00\\x00\\x13\\xda\\x05\\x00\\x00\\x00\\x00\\x00c\\xa9U\\x00\\x00\\x00\\x00\\x00;\\xd7\\x81\\x00\\x00\\x00\\x00\\x00[\\xa0\\xc5\\x00\\x00\\x00\\x00\\x00+}A\\x00\\x00\\x00\\x00\\x01\\x85\\x18\\xa1\\x00\\x00\\x00\\x00\\x00\\xfdEZ\\x00\\x00\\x00\\x00\\x00\\x06QL', '/home/vmagent/app/test_data/stackexchange_sample.jsonl@0')\n", + "bigcode hash\n", + "(8, 
b'\\x00\\x00\\x00\\x00\\x00\\xef+\\xa5\\x00\\x00\\x00\\x00\\x00\\x9b\\xfe\\x8a\\x00\\x00\\x00\\x00\\x01R\\xd2f\\x00\\x00\\x00\\x00\\x01V:\\xe2\\x00\\x00\\x00\\x00\\x02|\\xfe\\xe3\\x00\\x00\\x00\\x00\\x00\\x13\\xda\\x05\\x00\\x00\\x00\\x00\\x00c\\xa9U\\x00\\x00\\x00\\x00\\x00;\\xd7\\x81\\x00\\x00\\x00\\x00\\x00[\\xa0\\xc5\\x00\\x00\\x00\\x00\\x00+}A\\x00\\x00\\x00\\x00\\x01\\x85\\x18\\xa1\\x00\\x00\\x00\\x00\\x00\\xfdEZ\\x00\\x00\\x00\\x00\\x00\\x06QL', '/home/vmagent/app/test_data/stackexchange_sample.jsonl@0')\n" + ] + } + ], + "source": [ + "import jsonlines\n", + "data_files = [\"/home/vmagent/app/test_data/stackexchange_sample.jsonl\"]\n", + "\n", + "# ported implementation from slim, very different\n", + "# Define parameters\n", + "ngram_size = 5\n", + "num_perm = 256\n", + "threshold = 0.7\n", + "min_ngram_size = 5\n", + "B = None\n", + "R = None\n", + "# defined in SlimPajama Main.py\n", + "B = 9\n", + "R = 13\n", + "num_perm = 128\n", + "\n", + "HASH_RANGES, PERMUTATIONS, num_bands, ranges = get_permutation(threshold, num_perm, B, R)\n", + "print(f\"num_bands is {num_bands}, ranges is {ranges}\")\n", + "\n", + "def get_documents(file_path):\n", + " with jsonlines.open(file_path) as rdr:\n", + " for doc_id, doc in enumerate(rdr):\n", + " yield doc[\"text\"], file_path, doc_id\n", + "\n", + "content_list = []\n", + "print(\"!!! using slimPJ !!!\")\n", + "for doc in get_documents(data_files[0]):\n", + " content, file_path, doc_id = doc\n", + " ret_slimpj = generate_hash_values_slimpj(content=content,\n", + " idx=f\"{file_path}@{doc_id}\",\n", + " num_perm=num_perm,\n", + " ngram_size=ngram_size,\n", + " hashranges=HASH_RANGES,\n", + " permutations = PERMUTATIONS, \n", + " min_ngram_size = min_ngram_size)\n", + " break\n", + "\n", + "print(\"!!! 
using bigcode !!!\")\n", + "for doc in get_documents(data_files[0]):\n", + " content, file_path, doc_id = doc\n", + " ret_bigcode = generate_hash_values_bigcode(content=content,\n", + " idx=f\"{file_path}@{doc_id}\",\n", + " num_perm=num_perm,\n", + " ngram_size=ngram_size,\n", + " hashranges=HASH_RANGES,\n", + " permutations = PERMUTATIONS, \n", + " min_ngram_size = min_ngram_size)\n", + " break\n", + "\n", + "for idx in range(len(ret_slimpj)):\n", + " print(idx)\n", + " print(\"slimpj hash\")\n", + " print(ret_slimpj[idx])\n", + " print(\"bigcode hash\")\n", + " print(ret_bigcode[idx])\n" + ] + }, + { + "attachments": { + "478bc5bf-2815-4154-9ccf-7abf6e899ea6.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "8177f917-cc98-47ee-bfaa-d9877e1df804", + "metadata": {}, + "source": [ + "### Calculate minHashLSH\n", + "\n", + "reference is https://huggingface.co/blog/dedup#locality-sensitive-hashing\n", + "\n", + "### 1. step 1: gen minHash for doc_bands\n", + "\n", + "![image.png](attachment:478bc5bf-2815-4154-9ccf-7abf6e899ea6.png)" + ] + }, + { + "attachments": { + "93db8fe1-907e-42a4-aab5-250886145889.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "68386a3e-564a-4748-b6a0-ff1674e24e36", + "metadata": {}, + "source": [ + "### 2. 
Step2: gen edges\n", + "\n", + "Once we have the band_index, hash of the band and doc_ids, we need to find which docs are sharing band_index and band_value\n", + "![image.png](attachment:93db8fe1-907e-42a4-aab5-250886145889.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9b0a5296-c9c8-406a-9e40-8bf45cfcd260", + "metadata": {}, + "outputs": [], + "source": [ + "## ported implementation from slim, very different\n", + "# # Define parameters\n", + "# ngram_size = 5\n", + "# num_perm = 256\n", + "# threshold = 0.7\n", + "# min_ngram_size = 5\n", + "\n", + "# HASH_RANGES, PERMUTATIONS, num_bands, ranges = get_permutation(threshold, num_perm)\n", + "# print(f\"num_bands is {num_bands}, ranges is {ranges}\")\n", + "\n", + "# # group_by(x[0], x[1]) => band_idx, hash value\n", + "# # flatMap(x[1]) => a list of tuple(band_idx, hash value, doc_id) shares same (band_idx, hash value)\n", + "# # generate_edges(i[2]) input is doc_idx\n", + "# pipeline = (\n", + "# df.rdd.flatMap(\n", + "# lambda x: generate_hash_values_slimpj(\n", + "# content=x[1],\n", + "# idx=x[0],\n", + "# num_perm=num_perm,\n", + "# ngram_size=ngram_size,\n", + "# hashranges=HASH_RANGES\n", + "# )\n", + "# )\n", + "# .cache()\n", + "# .groupBy(lambda x: (x[0], x[1]))\n", + "# .flatMap(lambda x: generate_edges([(i[2]) for i in x[1]]))\n", + "# .flatMap(lambda x: convert_to_slimPJ_fmt(x[0], x[1]))\n", + "# .distinct()\n", + "# .cache()\n", + "# )\n", + "\n", + "# with Timer(\"generate minHashLsh Edges\"):\n", + "# if os.path.exists(dup_dir):\n", + "# shutil.rmtree(dup_dir, ignore_errors=True)\n", + "# results = pipeline.saveAsTextFile(dup_dir)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7a5ba001-0adc-49c5-b0fc-c539499318bd", + "metadata": {}, + "outputs": [], + "source": [ + "# # Define parameters\n", + "# ngram_size = 5\n", + "# num_perm = 256\n", + "# threshold = 0.7\n", + "# min_ngram_size = 5\n", + "# B = None\n", + "# R = None\n", + "# # defined in 
SlimPajama Main.py\n", + "# B = 9\n", + "# R = 13\n", + "# num_perm = 128\n", + "\n", + "# HASH_RANGES, PERMUTATIONS, num_bands, ranges = get_permutation(threshold, num_perm, B, R)\n", + "# print(f\"num_bands is {num_bands}, ranges is {ranges}\")\n", + "\n", + "# # group_by(x[0], x[1]) => band_idx, hash value\n", + "# # flatMap(x[1]) => a list of tuple(band_idx, hash value, doc_id) shares same (band_idx, hash value)\n", + "# # generate_edges(i[2]) input is doc_idx\n", + "# pipeline = (\n", + "# df.rdd.flatMap(\n", + "# lambda x: generate_hash_values_bigcode(\n", + "# content=x[1],\n", + "# idx=x[0],\n", + "# num_perm=num_perm,\n", + "# ngram_size=ngram_size,\n", + "# hashranges=HASH_RANGES,\n", + "# permutations = PERMUTATIONS, \n", + "# min_ngram_size = min_ngram_size\n", + "# )\n", + "# )\n", + "# .cache()\n", + "# .groupBy(lambda x: (x[0], x[1]))\n", + "# .flatMap(lambda x: generate_edges([(i[2]) for i in x[1]]))\n", + "# .flatMap(lambda x: convert_to_slimPJ_fmt(x[0], x[1]))\n", + "# .distinct()\n", + "# .cache()\n", + "# )\n", + "\n", + "# with Timer(\"generate minHashLsh Edges\"):\n", + "# if os.path.exists(dup_dir):\n", + "# shutil.rmtree(dup_dir, ignore_errors=True)\n", + "# results = pipeline.saveAsTextFile(dup_dir)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tools/near_dedup/template_notebooks/dedup_convert.ipynb b/tools/near_dedup/template_notebooks/dedup_convert.ipynb new file mode 100644 index 000000000..f97952b76 --- /dev/null +++ b/tools/near_dedup/template_notebooks/dedup_convert.ipynb @@ -0,0 +1,87 @@ +{ + "cells": [ + { + "cell_type": 
"markdown", + "id": "6cd83bf5-bd6e-4d1d-85b5-89a7d8e2ff95", + "metadata": {}, + "source": [ + "# Import and define" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4cd5492-7518-4331-abef-12a8ea561a1b", + "metadata": {}, + "outputs": [], + "source": [ + "from dedup_convert import *" + ] + }, + { + "cell_type": "markdown", + "id": "b9cbd6f6-2864-499d-814a-9df630898838", + "metadata": {}, + "source": [ + "# Config " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ded76a88-461e-4fae-a88f-6b0b0d66db7f", + "metadata": {}, + "outputs": [], + "source": [ + "data_dir = \"/home/vmagent/app/test_data/\"\n", + "dup_dir = os.path.join(data_dir, \"deduplicate\")\n", + "dup_dict = os.path.join(dup_dir, \"duplicates.pickle\")\n", + "out_dir = os.path.join(dup_dir, \"output\")" + ] + }, + { + "cell_type": "markdown", + "id": "bc18fbe4-5266-471b-9257-6bf1a94b2a4f", + "metadata": {}, + "source": [ + "# Run shink" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a20ea89-3fb8-4a63-9272-b61024d4fb9f", + "metadata": {}, + "outputs": [], + "source": [ + "with Timer(f\"apply duplicates.pickle to create new data\"):\n", + " dedup_args = argparse.Namespace()\n", + " dedup_args.data_dir = data_dir\n", + " dedup_args.out_dir = out_dir\n", + " dedup_args.dup_dict = dup_dict\n", + " shink_document_MP(dedup_args)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tools/near_dedup/template_notebooks/near_dedup.ipynb b/tools/near_dedup/template_notebooks/near_dedup.ipynb new file mode 100644 index 000000000..40975b4a3 --- 
/dev/null +++ b/tools/near_dedup/template_notebooks/near_dedup.ipynb @@ -0,0 +1,223 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e01a822b-abf8-4327-ae5c-9723ad11c0ab", + "metadata": {}, + "source": [ + "## Import and define" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24ffc3f0-f5c7-460e-b400-b35b48f54b0b", + "metadata": {}, + "outputs": [], + "source": [ + "from near_dedup import *\n", + "rdp = SparkDataProcessor()\n", + "spark=rdp.spark " + ] + }, + { + "cell_type": "markdown", + "id": "3a51e611-df76-4e69-86b4-addf91ce4306", + "metadata": {}, + "source": [ + "## Configure DIR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54869228-a39c-441b-b757-ba337dfdd8d5", + "metadata": {}, + "outputs": [], + "source": [ + "data_files = get_data_files('/home/vmagent/app/test_data/')\n", + "dup_dir = \"/home/vmagent/app/test_data/deduplicate\"\n", + "ngram_size = 6\n", + "num_perm = 256\n", + "threshold = 0.7\n", + "bands = 9\n", + "ranges = 13" + ] + }, + { + "cell_type": "markdown", + "id": "67299966-7ae2-492a-bea6-195721f5ee9f", + "metadata": {}, + "source": [ + "## Load data into Spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd3432d2-9f18-4b49-9747-6b4730e97100", + "metadata": {}, + "outputs": [], + "source": [ + "with Timer(\"Load data with RowID\"):\n", + " df = read_json(data_files, spark).cache()\n", + " total_length = df.count()" + ] + }, + { + "cell_type": "markdown", + "id": "5a0f56b2-ebdc-4933-81fd-399e3234ea81", + "metadata": {}, + "source": [ + "## Get minHashLSH edges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a5ba001-0adc-49c5-b0fc-c539499318bd", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline = minHashLSH_prepare(df, num_perm, ngram_size, bands, ranges)\n", + "with Timer(\"generate minHashLsh\"):\n", + " if os.path.exists(dup_dir):\n", + " shutil.rmtree(dup_dir, ignore_errors=True)\n", + " results = 
pipeline.saveAsTextFile(dup_dir)" + ] + }, + { + "cell_type": "markdown", + "id": "0116c740-a373-469c-937e-bcedb20f71d9", + "metadata": {}, + "source": [ + "## Generate connected components" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c69644-a12c-433d-9eaf-8632c63c042b", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "with Timer(f\"generate_connected_components all\"):\n", + " dup_connected_args = argparse.Namespace()\n", + " dup_connected_args.input_dir = dup_dir\n", + " dup_connected_args.out_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " generate_connected_components.generate_connected_components_mp(dup_connected_args)" + ] + }, + { + "cell_type": "markdown", + "id": "5abadeea-2aed-4de0-9508-6f17d735adf2", + "metadata": {}, + "source": [ + "## Convert to duplicates dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2dea1212-989f-4544-a087-0bfb1b40c664", + "metadata": {}, + "outputs": [], + "source": [ + "with Timer(f\"generate_duplicates_dict all\"):\n", + " dup_docs = os.path.join(dup_dir, \"duplicates.pickle\")\n", + " dup_dict_args = argparse.Namespace()\n", + " dup_dict_args.input_file = os.path.join(\n", + " dup_dir, \"connected_components.pickle\"\n", + " )\n", + " dup_dict_args.out_file = dup_docs\n", + " generate_duplicates_dict.generate_duplicates(dup_dict_args)" + ] + }, + { + "cell_type": "markdown", + "id": "dbc0610b-6820-4beb-b71b-ab04f76ef97c", + "metadata": {}, + "source": [ + "## View result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e056210c-907c-4938-b439-d31a4824eecb", + "metadata": {}, + "outputs": [], + "source": [ + "dup_dict = pickle.load(open(os.path.join(dup_dir, \"duplicates.pickle\"), 'rb'))\n", + "dup_sum = 0\n", + "for _, v in dup_dict.items():\n", + " dup_sum += len(list(v))\n", + "\n", + "print(f\"Completed!!\")\n", + "print(f\" total processed {total_length} documents\")\n", + 
"print(f\" total detected {dup_sum} duplicated documents\")\n", + "print(f\" duplicate ratio is {dup_sum/total_length}\")" + ] + }, + { + "attachments": { + "478bc5bf-2815-4154-9ccf-7abf6e899ea6.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "8177f917-cc98-47ee-bfaa-d9877e1df804", + "metadata": {}, + "source": [ + "### Calculate minHashLSH\n", + "\n", + "reference is https://huggingface.co/blog/dedup#locality-sensitive-hashing\n", + "\n", + "### 1. step 1: gen minHash for doc_bands\n", + "\n", + "![image.png](attachment:478bc5bf-2815-4154-9ccf-7abf6e899ea6.png)" + ] + }, + { + "attachments": { + "93db8fe1-907e-42a4-aab5-250886145889.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "68386a3e-564a-4748-b6a0-ff1674e24e36", + "metadata": {}, + "source": [ + "### 2. Step2: gen edges\n", + "\n", + "Once we have the band_index, hash of the band and doc_ids, we need to find which docs are sharing band_index and band_value\n", + "![image.png](attachment:93db8fe1-907e-42a4-aab5-250886145889.png)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}