Basic text grab working, filtering still needs work
georgew79 committed Jun 6, 2023
1 parent 3fd2942 commit b1ffd8b
Showing 7 changed files with 692 additions and 30 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
*.warc
*.warc.gz
*.gz
Binary file modified Lib/__pycache__/net_utils.cpython-310.pyc
Binary file not shown.
Binary file modified Lib/__pycache__/text_utils.cpython-310.pyc
Binary file not shown.
16 changes: 4 additions & 12 deletions Lib/net_utils.py
@@ -13,7 +13,7 @@
import os

from tqdm import tqdm
from .text_utils import clean_text
from .text_utils import process_wet

class ResponseException(Exception):
''' Raised when there is some issue in the response.'''
@@ -78,18 +78,10 @@ def download_wet_file(path:str, standard_encoding:str ='utf-8', error_handling:s
download_file(path=path, save_file_path=save_file_path, unzipped_path=unzipped_path, should_print=should_print)

# Written to disk, now get raw text.
text = ""
try:
with warc.open(unzipped_path) as f:
for i, record in enumerate(tqdm(f)):
text += clean_text(record.payload.read().decode(standard_encoding, error_handling))
except KeyboardInterrupt:
if not should_capture:
raise KeyboardInterrupt()
text = process_wet(unzipped_path, standard_encoding=standard_encoding, error_handling=error_handling,
should_capture=should_capture, should_split=should_split)

if should_split:
return text.split('\n')
return text
return text

def pull_from_paths(index_path: str, save_path: str, should_print=False):
'''
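For orientation, here is a minimal usage sketch of the refactored download_wet_file. The WET path is a placeholder, and only the keyword arguments visible in this diff (standard_encoding, error_handling, should_split, should_capture) are assumed:

```python
from Lib.net_utils import download_wet_file

# Placeholder path; real WET locations come from a Common Crawl wet.paths index.
wet_path = "crawl-data/CC-MAIN-2023-06/segments/.../file.warc.wet.gz"

# The download/unzip steps are unchanged; text extraction is now delegated to
# process_wet. With should_split=True the result is a list of cleaned lines,
# and should_capture=True turns a Ctrl-C into "return what was read so far".
lines = download_wet_file(wet_path,
                          standard_encoding='utf-8',
                          error_handling='ignore',
                          should_split=True,
                          should_capture=True)
```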
73 changes: 67 additions & 6 deletions Lib/text_utils.py
@@ -6,35 +6,55 @@
'''
import re
import warc

from re import sub
from typing import Callable, List
from tqdm import tqdm

def __process_lines(text:str, language_reg:str) -> str:
remove_nonprint = re.compile(language_reg)
return sub(remove_nonprint, '', text)

def default_filter(single_string:str) -> str:
'''
This is a default filter function meant to handle some basic filtering tasks.
Feel free to implement your own. This one is only meant for the
list-of-strings approach (should_split=True).
@single_string: The single string to check and filter in/out
@return: The string to keep, or None if it is whitespace-only
'''

if single_string.isspace():
return None
else:
return single_string

def remove_html(text:str) -> str:
remove_html_reg = re.compile('<.*?>')
return sub(remove_html_reg, '', text)

def clean_text(text:str, b_remove_html:bool =True, b_rm_nprintable:bool =True, b_split_nlines:bool =False,
func:Callable|None =None, language_reg:str ='[^a-zA-Z0-9 _]', **kwargs) -> str|List[str]:
func:Callable|None =None, language_reg:str ='[^a-zA-Z ]', **kwargs) -> str:
'''
Function meant to handle cleaning text from common crawl.
@text: String meant to represent the text to clean
@b_remove_html: boolean of whether or not to remove any HTML information
@b_rm_nprintable: boolean of whether or not to remove any non printable characters
@b_split_nlines: boolean of whether or not to split by '\n'
@b_split_nlines: boolean of whether to preserve '\n' characters in the cleaned string (so a caller can split on newlines afterwards)
@func: Extra preprocessing function pointer that you may wish to add.
Add any extra kwargs for the func as necessary. They will be passed in the kwargs dictionary.
I do assume that the text is the first argument to the function.
NOTE: Any alterations in the return type of the function you give will ALTER the return type of this function.
@language_reg: String for a regular expression pattern of what constitutes the language (ie what characters
to keep)
@return: Either return one long string with the text, or a list separated by newlines.
@return: Return one long string.
'''
if b_split_nlines:
language_reg = language_reg[:-1]
language_reg += '\n]'

if b_remove_html:
text = remove_html(text)
@@ -44,9 +44,50 @@ def clean_text(text:str, b_remove_html:bool =True, b_rm_nprintable:bool =True, b

if func is not None:
text = func(text, **kwargs)

if b_split_nlines:
return text.split('\n')

return text

def process_wet(unzipped_path:str, standard_encoding:str ='utf-8', error_handling:str ='ignore',
should_capture:bool =False, should_split:bool =False,
filter_func:Callable|None =None, **kwargs) -> str|List[str]:
'''
This function processes an unzipped .wet file and cleans the underlying text.
@unzipped_path: Where is the unzipped file?
@standard_encoding: The encoding method to use for decoding the byte string.
@error_handling: 'ignore', 'replace', or 'strict'. Specifies how to handle decoding errors.
@should_split: Flag for splitting text by newlines.
@should_capture: Flag for capturing keyboard interrupts. Adding text can take a long
time, so if should_capture is true then a keyboard interrupt will just return
the text received so far.
@filter_func: Callable that takes in a single string. When should_split is True
it is used as a per-line predicate (return the line to keep it, or a falsy
value to drop it); when should_split is False it is applied once to the whole
text and its result is returned.
@return: Returns a cleaned string, or a list of cleaned lines if should_split is True.
'''
text = ""
count = 0
try:
with warc.open(unzipped_path) as f:
for i, record in enumerate(tqdm(f)):
raw = record.payload.read().decode(standard_encoding, error_handling)
cleaned = clean_text(raw, b_split_nlines=should_split, **kwargs)
text += cleaned
count += 1
except KeyboardInterrupt:
if not should_capture:
raise KeyboardInterrupt()

print(f"Processed {count} blocks")

if should_split:
if filter_func is not None:
ret = list(filter(filter_func, text.split('\n')))
else:
ret = list(filter(None, text.split('\n')))
return ret
else:
if filter_func is not None:
return filter_func(text)
else:
return text
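To show the new text_utils pieces working together, a hedged sketch: example.wet is a placeholder for an already-downloaded, unzipped WET file, and lowercase() is a made-up hook illustrating the func/**kwargs pass-through.

```python
from Lib.text_utils import clean_text, default_filter, process_wet

def lowercase(text, fold=True):
    # Toy example of the optional `func` hook; `fold` arrives via **kwargs.
    return text.lower() if fold else text

# HTML tags are stripped, characters outside language_reg ('[^a-zA-Z ]' by
# default) are removed, then the custom hook runs:
print(clean_text("<p>Hello, Crawl!</p>", func=lowercase, fold=True))
# -> "hello crawl"

# List-of-strings path: with should_split=True the text is split on newlines
# at the end, and default_filter drops whitespace-only lines.
lines = process_wet("example.wet",
                    should_split=True,
                    should_capture=True,
                    filter_func=default_filter)
```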
10 changes: 10 additions & 0 deletions README.md
@@ -10,6 +10,16 @@ Data from Common Crawl comes in .WARC, .WET, and .WAT formats, as described here

**PLEASE NOTE THAT THE REPOSITORY IS IN ACTIVE DEVELOPMENT AS A SIDE PROJECT; development is slow.**

## Planned Improvements

- [ ] Clean up the parameters; allow easy access to all depth parameters from the top level.
- [ ] Randomized index selection and filtering by language.
- [ ] Add automatic filtering of headers and extra code fragments captured by the crawler.
- [ ] Improve README / docs.
- [ ] Improve the crawl_example notebook.
- [ ] Add grammar-based filtering for 'sensical' outputs; useful for LLM training.
- [ ] Add text saving.

# Install

# Usage
