Skip to content

Commit

Permalink
Text tools added, testing in progress
Browse files Browse the repository at this point in the history
  • Loading branch information
georgew79 committed Jun 6, 2023
1 parent 23b906a commit 3fd2942
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 9 deletions.
Binary file added Lib/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file added Lib/__pycache__/net_utils.cpython-310.pyc
Binary file not shown.
Binary file added Lib/__pycache__/text_utils.cpython-310.pyc
Binary file not shown.
24 changes: 17 additions & 7 deletions Lib/net_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import os

from tqdm import tqdm
from text_utils import clean_text
from .text_utils import clean_text

class ResponseException(Exception):
    """Signals that an HTTP response came back in a failed or unusable state."""
Expand Down Expand Up @@ -55,7 +55,8 @@ def download_file(path:str, save_file_path:str ='NONAME', unzipped_path:str = 'U
os.remove(save_file_path)

def download_wet_file(path:str, standard_encoding:str ='utf-8', error_handling:str ='ignore',
                      save_file_path:str ='NONAME.warc.wet.gz', unzipped_path:str = 'UNZIPPED.warc',
                      should_split:bool =False, should_capture:bool =False, should_print:bool =False) -> str | list[str]:
    '''
    This function pulls down the .wet gz file with the given filepath. The filepath
    provided **must** be a valid path to a common crawl .wet file.

    @path: URL of the Common Crawl .wet.gz file to download.
    @standard_encoding: Encoding used to decode each WARC record payload.
    @error_handling: 'ignore', 'replace', or 'strict'. Specifies how to handle decoding errors.
    @save_file_path: Choose where to save the file as text for processing. This is only the intermediary save.
    @unzipped_path: Choose where to save the unzipped file.
    @should_split: Flag for splitting text by newlines.
    @should_capture: Flag for capturing keyboard interrupts. Adding text can take a long
                     time, so if should_capture is true then a keyboard interrupt will just return
                     the text received so far.
    @should_print: Flag for extra printing, if it's available.
    @return: Returns a cleaned string, or a list of lines when should_split is True.
    '''
    download_file(path=path, save_file_path=save_file_path, unzipped_path=unzipped_path, should_print=should_print)

    # Written to disk, now get raw text. Accumulate the cleaned record payloads in
    # a list and join once at the end (repeated `text +=` is quadratic on big files).
    parts = []
    try:
        with warc.open(unzipped_path) as f:
            for record in tqdm(f):
                parts.append(clean_text(record.payload.read().decode(standard_encoding, error_handling)))
    except KeyboardInterrupt:
        # Bare `raise` re-raises the active interrupt with its original traceback;
        # when capturing, fall through and return whatever was read so far.
        if not should_capture:
            raise

    text = "".join(parts)
    return text.split('\n') if should_split else text

def pull_from_paths(index_path: str, save_path: str, should_print=False):
Expand Down
46 changes: 44 additions & 2 deletions Lib/text_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,48 @@
up for model processing
'''
import re

from re import sub
from typing import Callable, List

def __process_lines(text:str, language_reg:str) -> str:
    """Delete every character of *text* matched by the ``language_reg`` pattern."""
    pattern = re.compile(language_reg)
    return pattern.sub('', text)

def remove_html(text:str) -> str:
    """Strip any HTML/XML tags (non-greedy ``<...>`` spans) out of *text*."""
    return re.sub('<.*?>', '', text)

def clean_text(text:str, b_remove_html:bool =True, b_rm_nprintable:bool =True, b_split_nlines:bool =False,
func:Callable|None =None, language_reg:str ='[^a-zA-Z0-9 _]', **kwargs) -> str|List[str]:
'''
Function meant to handle cleaning text from common crawl.
@text: String meant to represent the text to clean
@b_remove_html: boolean of whether or not to remove any HTML information
@b_rm_nprintable: boolean of whether or not to remove any non printable characters
@b_split_nlines: boolean of whether or not to split by '\n'
@func: Extra preprocessing function pointer that you may wish to add.
Add any extra kwargs for the func as necessary. They will be passed in the kwargs dictionary.
I do assume that the text is the first argument to the function.
NOTE: Any alterations in the return type of the function you give will ALTER the return type of this function.
@language_reg: String for a regular expression pattern of what constitutes the language (ie what characters
to keep)
@return: Either return one long string with the text, or a list separated by newlines.
'''

if b_remove_html:
text = remove_html(text)

if b_rm_nprintable:
text = __process_lines(text, language_reg)

if func is not None:
text = func(text, **kwargs)

if b_split_nlines:
return text.split('\n')

return text

# NOTE(review): removed a dead duplicate `def clean_text(text:str) -> str: pass`
# stub that followed the real implementation above. A second `def` of the same
# name executes later and would shadow the full clean_text with one that always
# returns None, silently breaking every caller.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ Data from Common Crawl comes in .WARC, .WET, and .WAT formats, as described here

**This repository specifically focuses on converting common crawl plaintext to a usable dataset format.**

**Please note:** this repository is in active development as a side project, so progress is slow.

# Install

# Usage
67 changes: 67 additions & 0 deletions crawl_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from Lib.net_utils import download_wet_file"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"sample_url = 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/segments/1642320299852.23/wet/CC-MAIN-20220116093137-20220116123137-00000.warc.wet.gz'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "ResponseException",
"evalue": "Response failed with message REQUEST FOR https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/segments/1642320299852.23/wet/CC-MAIN-20220116093137-20220116123137-00000.warc.wet.gz FAILED WITH CODE 503, MEANING Service Unavailable",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mResponseException\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[3], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m text \u001b[39m=\u001b[39m download_wet_file(sample_url, should_capture\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, should_split\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n",
"File \u001b[1;32mc:\\Users\\George\\Documents\\Programming\\local_installs\\CommonCrawler\\Lib\\net_utils.py:78\u001b[0m, in \u001b[0;36mdownload_wet_file\u001b[1;34m(path, standard_encoding, error_handling, save_file_path, unzipped_path, should_split, should_capture, should_print)\u001b[0m\n\u001b[0;32m 57\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdownload_wet_file\u001b[39m(path:\u001b[39mstr\u001b[39m, standard_encoding:\u001b[39mstr\u001b[39m \u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mutf-8\u001b[39m\u001b[39m'\u001b[39m, error_handling:\u001b[39mstr\u001b[39m \u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mignore\u001b[39m\u001b[39m'\u001b[39m, \n\u001b[0;32m 58\u001b[0m save_file_path:\u001b[39mstr\u001b[39m \u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mNONAME.warc.wet.gz\u001b[39m\u001b[39m'\u001b[39m, unzipped_path:\u001b[39mstr\u001b[39m \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mUNZIPPED.warc\u001b[39m\u001b[39m'\u001b[39m,\n\u001b[0;32m 59\u001b[0m should_split:\u001b[39mbool\u001b[39m \u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, should_capture:\u001b[39mbool\u001b[39m \u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, should_print:\u001b[39mbool\u001b[39m \u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mstr\u001b[39m:\n\u001b[0;32m 60\u001b[0m \u001b[39m \u001b[39m\u001b[39m'''\u001b[39;00m\n\u001b[0;32m 61\u001b[0m \u001b[39m This function pulls down the .wet gz file with the given filepath. 
The filepath \u001b[39;00m\n\u001b[0;32m 62\u001b[0m \u001b[39m provided **must** be a valid path to a common crawl .wet file.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 75\u001b[0m \u001b[39m @return: Returns a cleaned string.\u001b[39;00m\n\u001b[0;32m 76\u001b[0m \u001b[39m '''\u001b[39;00m\n\u001b[1;32m---> 78\u001b[0m download_file(path\u001b[39m=\u001b[39;49mpath, save_file_path\u001b[39m=\u001b[39;49msave_file_path, unzipped_path\u001b[39m=\u001b[39;49munzipped_path, should_print\u001b[39m=\u001b[39;49mshould_print)\n\u001b[0;32m 80\u001b[0m \u001b[39m# Written to disk, now get raw text.\u001b[39;00m\n\u001b[0;32m 81\u001b[0m text \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\n",
"File \u001b[1;32mc:\\Users\\George\\Documents\\Programming\\local_installs\\CommonCrawler\\Lib\\net_utils.py:39\u001b[0m, in \u001b[0;36mdownload_file\u001b[1;34m(path, save_file_path, unzipped_path, should_print)\u001b[0m\n\u001b[0;32m 37\u001b[0m r \u001b[39m=\u001b[39m requests\u001b[39m.\u001b[39mget(path, stream\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m 38\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m r\u001b[39m.\u001b[39mok:\n\u001b[1;32m---> 39\u001b[0m \u001b[39mraise\u001b[39;00m ResponseException(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mREQUEST FOR \u001b[39m\u001b[39m{\u001b[39;00mpath\u001b[39m}\u001b[39;00m\u001b[39m FAILED WITH CODE \u001b[39m\u001b[39m{\u001b[39;00mr\u001b[39m.\u001b[39mstatus_code\u001b[39m}\u001b[39;00m\u001b[39m, MEANING \u001b[39m\u001b[39m{\u001b[39;00mr\u001b[39m.\u001b[39mreason\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 41\u001b[0m \u001b[39m# Got file, now write it to disk.\u001b[39;00m\n\u001b[0;32m 42\u001b[0m total \u001b[39m=\u001b[39m \u001b[39mint\u001b[39m(r\u001b[39m.\u001b[39mheaders\u001b[39m.\u001b[39mget(\u001b[39m'\u001b[39m\u001b[39mcontent-length\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m0\u001b[39m))\n",
"\u001b[1;31mResponseException\u001b[0m: Response failed with message REQUEST FOR https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/segments/1642320299852.23/wet/CC-MAIN-20220116093137-20220116123137-00000.warc.wet.gz FAILED WITH CODE 503, MEANING Service Unavailable"
]
}
],
"source": [
"text = download_wet_file(sample_url, should_capture=True, should_split=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 3fd2942

Please sign in to comment.