Skip to content

Commit

Permalink
Text tools added, testing in progress
Browse files Browse the repository at this point in the history
  • Loading branch information
georgew79 committed Jun 6, 2023
1 parent 23b906a commit 3fd2942
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 9 deletions.
Binary file added Lib/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file added Lib/__pycache__/net_utils.cpython-310.pyc
Binary file not shown.
Binary file added Lib/__pycache__/text_utils.cpython-310.pyc
Binary file not shown.
24 changes: 17 additions & 7 deletions Lib/net_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import os

from tqdm import tqdm
from text_utils import clean_text
from .text_utils import clean_text

class ResponseException(Exception):
    """Signals that an HTTP response came back in a failed or unusable state."""
Expand Down Expand Up @@ -55,7 +55,8 @@ def download_file(path:str, save_file_path:str ='NONAME', unzipped_path:str = 'U
os.remove(save_file_path)

def download_wet_file(path:str, standard_encoding:str ='utf-8', error_handling:str ='ignore',
                      save_file_path:str ='NONAME.warc.wet.gz', unzipped_path:str = 'UNZIPPED.warc',
                      should_split:bool =False, should_capture:bool =False, should_print:bool =False) -> str | list[str]:
    '''
    This function pulls down the .wet gz file with the given filepath. The filepath
    provided **must** be a valid path to a common crawl .wet file.

    @path: URL of the Common Crawl .wet.gz file to download.
    @standard_encoding: Encoding used to decode each WARC record payload.
    @error_handling: 'ignore', 'replace', or 'strict'. Specifies how to handle decoding errors.
    @save_file_path: Choose where to save the file as text for processing. This is only the intermediary save.
    @unzipped_path: Choose where to save the unzipped file.
    @should_split: Flag for splitting text by newlines.
    @should_capture: Flag for capturing keyboard interrupts. Adding text can take a long
                     time, so if should_capture is true then a keyboard interrupt will just return
                     the text received so far.
    @should_print: Flag for extra printing, if it's available.
    @return: Returns a cleaned string, or a list of lines when should_split is True.
    '''
    download_file(path=path, save_file_path=save_file_path, unzipped_path=unzipped_path, should_print=should_print)

    # Written to disk, now get raw text. Accumulate the cleaned record payloads in
    # a list and join once at the end (repeated `text +=` is quadratic on big files).
    parts = []
    try:
        with warc.open(unzipped_path) as f:
            for record in tqdm(f):
                parts.append(clean_text(record.payload.read().decode(standard_encoding, error_handling)))
    except KeyboardInterrupt:
        # Bare `raise` re-raises the active interrupt with its original traceback;
        # when capturing, fall through and return whatever was read so far.
        if not should_capture:
            raise

    text = "".join(parts)
    return text.split('\n') if should_split else text

def pull_from_paths(index_path: str, save_path: str, should_print=False):
Expand Down
46 changes: 44 additions & 2 deletions Lib/text_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,48 @@
up for model processing
'''
import re

from re import sub
from typing import Callable, List

def __process_lines(text:str, language_reg:str) -> str:
    """Delete every character of *text* matched by the ``language_reg`` pattern."""
    pattern = re.compile(language_reg)
    return pattern.sub('', text)

def remove_html(text:str) -> str:
    """Strip any HTML/XML tags (non-greedy ``<...>`` spans) out of *text*."""
    return re.sub('<.*?>', '', text)

def clean_text(text:str, b_remove_html:bool =True, b_rm_nprintable:bool =True, b_split_nlines:bool =False,
func:Callable|None =None, language_reg:str ='[^a-zA-Z0-9 _]', **kwargs) -> str|List[str]:
'''
Function meant to handle cleaning text from common crawl.
@text: String meant to represent the text to clean
@b_remove_html: boolean of whether or not to remove any HTML information
@b_rm_nprintable: boolean of whether or not to remove any non printable characters
@b_split_nlines: boolean of whether or not to split by '\n'
@func: Extra preprocessing function pointer that you may wish to add.
Add any extra kwargs for the func as necessary. They will be passed in the kwargs dictionary.
I do assume that the text is the first argument to the function.
NOTE: Any alterations in the return type of the function you give will ALTER the return type of this function.
@language_reg: String for a regular expression pattern of what constitutes the language (ie what characters
to keep)
@return: Either return one long string with the text, or a list separated by newlines.
'''

if b_remove_html:
text = remove_html(text)

if b_rm_nprintable:
text = __process_lines(text, language_reg)

if func is not None:
text = func(text, **kwargs)

if b_split_nlines:
return text.split('\n')

return text

# NOTE(review): removed a dead duplicate `def clean_text(text:str) -> str: pass`
# stub that followed the real implementation above. A second `def` of the same
# name executes later and would shadow the full clean_text with one that always
# returns None, silently breaking every caller.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ Data from Common Crawl comes in .WARC, .WET, and .WAT formats, as described here

**This repository specifically focuses on converting common crawl plaintext to a usable dataset format.**

**Please note:** this repository is in active development as a side project, so progress is slow.

# Install

# Usage
67 changes: 67 additions & 0 deletions crawl_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from Lib.net_utils import download_wet_file"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"sample_url = 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/segments/1642320299852.23/wet/CC-MAIN-20220116093137-20220116123137-00000.warc.wet.gz'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "ResponseException",
"evalue": "Response failed with message REQUEST FOR https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/segments/1642320299852.23/wet/CC-MAIN-20220116093137-20220116123137-00000.warc.wet.gz FAILED WITH CODE 503, MEANING Service Unavailable",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mResponseException\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[3], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m text \u001b[39m=\u001b[39m download_wet_file(sample_url, should_capture\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, should_split\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n",
"File \u001b[1;32mc:\\Users\\George\\Documents\\Programming\\local_installs\\CommonCrawler\\Lib\\net_utils.py:78\u001b[0m, in \u001b[0;36mdownload_wet_file\u001b[1;34m(path, standard_encoding, error_handling, save_file_path, unzipped_path, should_split, should_capture, should_print)\u001b[0m\n\u001b[0;32m 57\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdownload_wet_file\u001b[39m(path:\u001b[39mstr\u001b[39m, standard_encoding:\u001b[39mstr\u001b[39m \u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mutf-8\u001b[39m\u001b[39m'\u001b[39m, error_handling:\u001b[39mstr\u001b[39m \u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mignore\u001b[39m\u001b[39m'\u001b[39m, \n\u001b[0;32m 58\u001b[0m save_file_path:\u001b[39mstr\u001b[39m \u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mNONAME.warc.wet.gz\u001b[39m\u001b[39m'\u001b[39m, unzipped_path:\u001b[39mstr\u001b[39m \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mUNZIPPED.warc\u001b[39m\u001b[39m'\u001b[39m,\n\u001b[0;32m 59\u001b[0m should_split:\u001b[39mbool\u001b[39m \u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, should_capture:\u001b[39mbool\u001b[39m \u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, should_print:\u001b[39mbool\u001b[39m \u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mstr\u001b[39m:\n\u001b[0;32m 60\u001b[0m \u001b[39m \u001b[39m\u001b[39m'''\u001b[39;00m\n\u001b[0;32m 61\u001b[0m \u001b[39m This function pulls down the .wet gz file with the given filepath. 
The filepath \u001b[39;00m\n\u001b[0;32m 62\u001b[0m \u001b[39m provided **must** be a valid path to a common crawl .wet file.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 75\u001b[0m \u001b[39m @return: Returns a cleaned string.\u001b[39;00m\n\u001b[0;32m 76\u001b[0m \u001b[39m '''\u001b[39;00m\n\u001b[1;32m---> 78\u001b[0m download_file(path\u001b[39m=\u001b[39;49mpath, save_file_path\u001b[39m=\u001b[39;49msave_file_path, unzipped_path\u001b[39m=\u001b[39;49munzipped_path, should_print\u001b[39m=\u001b[39;49mshould_print)\n\u001b[0;32m 80\u001b[0m \u001b[39m# Written to disk, now get raw text.\u001b[39;00m\n\u001b[0;32m 81\u001b[0m text \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\n",
"File \u001b[1;32mc:\\Users\\George\\Documents\\Programming\\local_installs\\CommonCrawler\\Lib\\net_utils.py:39\u001b[0m, in \u001b[0;36mdownload_file\u001b[1;34m(path, save_file_path, unzipped_path, should_print)\u001b[0m\n\u001b[0;32m 37\u001b[0m r \u001b[39m=\u001b[39m requests\u001b[39m.\u001b[39mget(path, stream\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m 38\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m r\u001b[39m.\u001b[39mok:\n\u001b[1;32m---> 39\u001b[0m \u001b[39mraise\u001b[39;00m ResponseException(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mREQUEST FOR \u001b[39m\u001b[39m{\u001b[39;00mpath\u001b[39m}\u001b[39;00m\u001b[39m FAILED WITH CODE \u001b[39m\u001b[39m{\u001b[39;00mr\u001b[39m.\u001b[39mstatus_code\u001b[39m}\u001b[39;00m\u001b[39m, MEANING \u001b[39m\u001b[39m{\u001b[39;00mr\u001b[39m.\u001b[39mreason\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 41\u001b[0m \u001b[39m# Got file, now write it to disk.\u001b[39;00m\n\u001b[0;32m 42\u001b[0m total \u001b[39m=\u001b[39m \u001b[39mint\u001b[39m(r\u001b[39m.\u001b[39mheaders\u001b[39m.\u001b[39mget(\u001b[39m'\u001b[39m\u001b[39mcontent-length\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m0\u001b[39m))\n",
"\u001b[1;31mResponseException\u001b[0m: Response failed with message REQUEST FOR https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/segments/1642320299852.23/wet/CC-MAIN-20220116093137-20220116123137-00000.warc.wet.gz FAILED WITH CODE 503, MEANING Service Unavailable"
]
}
],
"source": [
"text = download_wet_file(sample_url, should_capture=True, should_split=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 3fd2942

Please sign in to comment.