Initial steps towards file unpacking
georgew79 committed Jun 6, 2023
1 parent 24d5937 commit 23b906a
Showing 7 changed files with 133 additions and 2 deletions.
Empty file added Lib/__init__.py
Binary file added Lib/__pycache__/utils.cpython-310.pyc
90 changes: 90 additions & 0 deletions Lib/net_utils.py
@@ -0,0 +1,90 @@
'''
By: George Witt || May 2023
Utility functions meant to handle pulling files down
with particular parameters.
'''
import requests
import sys
import gzip
import warc
import shutil
import os

from tqdm import tqdm
from .text_utils import clean_text

class ResponseException(Exception):
    ''' Raised when there is some issue in the response. '''
    def __init__(self, res_message: str):
        self.res_message = res_message
        super().__init__(f'Response failed with message {res_message}')

def download_file(path:str, save_file_path:str ='NONAME', unzipped_path:str = 'UNZIPPED', should_print:bool = False) -> None:
    '''
    This function downloads a general file from the given URL, saves the gz version, then unzips it for processing.
    NOTE: No assumptions are made about the save_file type and the unzipped_file type, so it is absolutely crucial
    that these variables are provided with the correct file extensions.

    @path: URL of the file to download, MUST be a valid URL.
    @save_file_path: Choose where to save the file as text for processing. This is only the intermediary save.
    @unzipped_path: Choose where to save the unzipped file.
    @should_print: Flag for extra printing, if it's available.
    '''

    # Get file with request
    r = requests.get(path, stream=True)
    if not r.ok:
        raise ResponseException(f"REQUEST FOR {path} FAILED WITH CODE {r.status_code}, MEANING {r.reason}")

    # Got file, now write it to disk.
    total = int(r.headers.get('content-length', 0))
    try:
        with tqdm(total=total, unit='iB', unit_scale=True, unit_divisor=1024) as bar:
            with open(save_file_path, 'wb') as f:
                for data in r.iter_content(chunk_size=1024):
                    f.write(data)
                    # Advance by the number of bytes written; sys.getsizeof
                    # would overcount by including Python object overhead.
                    bar.update(len(data))
    except KeyboardInterrupt:
        print(f"QUITTING DOWNLOAD for file {path}")

    # Unzip the downloaded .gz archive, then discard the compressed copy.
    with gzip.open(save_file_path, 'rb') as f_in:
        with open(unzipped_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(save_file_path)

def download_wet_file(path:str, standard_encoding:str ='utf-8', error_handling:str ='ignore',
                      save_file_path:str ='NONAME.warc.wet.gz', unzipped_path:str = 'UNZIPPED.warc', should_print:bool =False) -> str:
    '''
    This function pulls down the .wet gz file with the given filepath. The filepath
    provided **must** be a valid path to a common crawl .wet file.

    @path: Path for .wet file, MUST be a valid path.
    @standard_encoding: The encoding method to use for decoding the byte string.
    @error_handling: 'ignore', 'replace', or 'strict'. Specifies how to handle decoding errors.
    @save_file_path: Choose where to save the file as text for processing. This is only the intermediary save.
    @unzipped_path: Choose where to save the unzipped file.
    @should_print: Flag for extra printing, if it's available.
    @return: Returns a cleaned string.
    '''
    #if save_file_path == 'NONAME.warc.wet.gz' or unzipped_path == 'UNZIPPED.warc':
    #    raise UserWarning(f"WARNING, did not specify save location for file, using {save_file_path} and {unzipped_path} in local directory")

    download_file(path=path, save_file_path=save_file_path, unzipped_path=unzipped_path, should_print=should_print)

    # Written to disk, now get raw text.
    text = ""
    with warc.open(unzipped_path) as f:
        for i, record in enumerate(tqdm(f)):
            text += clean_text(record.payload.read().decode(standard_encoding, error_handling))

    return text

def pull_from_paths(index_path: str, save_path: str, should_print=False):
    '''
    This function downloads randomly selected files from within an index slice that was
    previously selected.
    '''

    pass
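
# A minimal usage sketch, assuming network access (run from the repo root as
# `python -m Lib.net_utils` so the package import resolves): download_file can
# fetch a crawl's wet.paths.gz index, which lists every WET shard in that
# crawl. The crawl name below is an example, not part of the original module.
if __name__ == '__main__':
    index_url = 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-14/wet.paths.gz'
    download_file(index_url, save_file_path='wet.paths.gz', unzipped_path='wet.paths')
    with open('wet.paths') as f:
        print(f.readline().strip())  # first WET shard path in the crawl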
10 changes: 10 additions & 0 deletions Lib/text_utils.py
@@ -0,0 +1,10 @@
'''
By: George Witt || June 2023
Utility functions meant to help with cleaning text
up for model processing
'''

def clean_text(text: str) -> str:
    # Placeholder: return the text unchanged so callers get a str, not None.
    # TODO: real cleaning logic.
    return text
15 changes: 13 additions & 2 deletions README.md
@@ -1,2 +1,13 @@
# CommonCrawler
Lightweight python utility for pulling data from common crawl.
# Common Crawler

Common Crawl (https://commoncrawl.org/) is an open repository of webpages crawled from the internet over many years. The data can be difficult to work with when all that is desired is plaintext.

We provide a lightweight, simple Python utility for collecting and batching plaintext data from Common Crawl using multiprocessing. The data is cleaned, if desired, before being returned.

Data from Common Crawl comes in .WARC, .WET, and .WAT formats, as described here: https://commoncrawl.org/the-data/get-started/. The .WARC files store the crawl itself (responses, request information, etc.), the .WET files store the extracted plaintext, and the .WAT files store metadata about the .WARC files. Note that data from before 2018 does NOT store language information, so its language must be detected manually.

**This repository specifically focuses on converting Common Crawl plaintext to a usable dataset format.**

# Install

# Usage
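
A minimal sketch of intended usage, assuming the helpers from `Lib/net_utils.py`; the WET URL below is a hypothetical placeholder (real shard paths are listed in each crawl's `wet.paths.gz` index):

```python
from Lib.net_utils import download_wet_file

# Any valid WET shard path from a crawl's wet.paths listing works here.
url = 'https://data.commoncrawl.org/crawl-data/.../wet/....warc.wet.gz'
text = download_wet_file(url, save_file_path='shard.warc.wet.gz',
                         unzipped_path='shard.warc')
print(text[:500])
```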
Empty file added crawl_example.ipynb
20 changes: 20 additions & 0 deletions webcrawl.py
@@ -0,0 +1,20 @@
'''
By: George Witt || May 2023
Main file for pulling down data as desired. This is the starting point
for the script. Separation of tasks occurs from here.
Overall the process can be visualized as follows:
1) Sorting information is provided, including:
a) URL regular expressions
b) Date ranges
c) Languages
NOTE: search information may be left unspecified.
2) Parameters for the algorithm are provided, including:
a) Linear crawl or Random crawl T/F
b) Amount of data in GB
3) Data is pulled randomly (or linearly) by searching the index.
4) Once the correct indices are found, the .wet files are downloaded.
5) The .wet files are cleaned, processed, and returned in the desired format.
'''
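
# A rough sketch of how the pipeline described above might be wired together
# once the helpers exist: `wet_paths` would come from the index search in
# steps 3-4, and `crawl` is a hypothetical name, not part of this commit.
import random

from Lib.net_utils import download_wet_file

def crawl(wet_paths, num_files: int, linear: bool = True) -> str:
    '''Download and clean num_files WET shards, linearly or at random.'''
    chosen = wet_paths[:num_files] if linear else random.sample(wet_paths, num_files)
    text = ""
    for i, path in enumerate(chosen):
        text += download_wet_file(path, save_file_path=f'{i}.warc.wet.gz',
                                  unzipped_path=f'{i}.warc')
    return text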
