-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial steps towards file unpacking
- Loading branch information
Showing
7 changed files
with
133 additions
and
2 deletions.
There are no files selected for viewing
Empty file.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
''' | ||
By: George Witt || May 2023 | ||
Utility functions meant to handle pulling files down | ||
with particular parameters. | ||
''' | ||
import requests | ||
import sys | ||
import gzip | ||
import warc | ||
import shutil | ||
import os | ||
|
||
from tqdm import tqdm | ||
from text_utils import clean_text | ||
|
||
class ResponseException(Exception):
    '''Raised when there is some issue in the response.'''

    def __init__(self, res_message: str):
        # Build the human-readable message first, then hand it to Exception.
        detail = f'Response failed with message {res_message}'
        super().__init__(detail)
        # Keep the raw message so callers can inspect it without parsing str(e).
        self.res_message = res_message
|
||
def download_file(path:str, save_file_path:str ='NONAME', unzipped_path:str = 'UNZIPPED', should_print:bool = False) -> None:
    '''
    This function pulls down a general file with the given path, saves the gz version, then unzips it for processing.
    NOTE: No assumptions are made about the save_file type and the unzipped_file type, so it is absolutely crucial
    that these variables are provided with the correct file extensions.
    @path: Path for .wet file, MUST be a valid path.
    @save_file_path: Choose where to save the file as text for processing. This is only the intermediary save.
    @unzipped_path: Choose where to save the unzipped file.
    @should_print: Flag for extra printing; controls whether the progress bar is shown.
    @raises ResponseException: If the HTTP request does not come back OK.
    '''

    # Get file with request; stream=True so large crawl files are not
    # buffered entirely in memory.
    r = requests.get(path, stream=True)
    if not r.ok:
        raise ResponseException(f"REQUEST FOR {path} FAILED WITH CODE {r.status_code}, MEANING {r.reason}")

    # Got file, now write it to disk.
    total = int(r.headers.get('content-length', 0))
    try:
        with tqdm(total=total, unit='iB', unit_scale=True, unit_divisor=1024, disable=not should_print) as bar:
            with open(save_file_path, 'wb') as f:
                for data in r.iter_content(chunk_size=1024):
                    f.write(data)
                    # len(data) is the actual number of bytes received;
                    # sys.getsizeof(data) would add the Python object header
                    # and desync the bar from the content-length total.
                    bar.update(len(data))
    except KeyboardInterrupt:
        # Remove the partial archive so a truncated .gz is never mistaken
        # for a complete download, then propagate the interrupt instead of
        # falling through to decompress garbage.
        print(f"QUITTING DOWNLOAD for file {path}")
        if os.path.exists(save_file_path):
            os.remove(save_file_path)
        raise

    # Decompress the saved archive to unzipped_path, then drop the archive.
    with gzip.open(save_file_path, 'rb') as f_in:
        with open(unzipped_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(save_file_path)
|
||
def download_wet_file(path:str, standard_encoding:str ='utf-8', error_handling:str ='ignore',
    save_file_path:str ='NONAME.warc.wet.gz', unzipped_path:str = 'UNZIPPED.warc', should_print:bool =False) -> str:
    '''
    This function pulls down the .wet gz file with the given filepath. The filepath
    provided **must** be a valid path to a common crawl .wet file.
    @path: Path for .wet file, MUST be a valid path.
    @standard_encoding: The encoding method to use for decoding the byte string.
    @error_handling: 'ignore', 'replace', or 'strict'. Specifies how to handle decoding errors.
    @save_file_path: Choose where to save the file as text for processing. This is only the intermediary save.
    @unzipped_path: Choose where to save the unzipped file.
    @should_print: Flag for extra printing; controls whether the progress bar is shown.
    @return: Returns a cleaned string.
    @raises ResponseException: Propagated from download_file on a bad HTTP response.
    '''
    download_file(path=path, save_file_path=save_file_path, unzipped_path=unzipped_path, should_print=should_print)

    # Written to disk, now get raw text. Collect per-record chunks and join
    # once at the end -- repeated `text +=` is quadratic over the thousands
    # of records in a typical .wet file.
    chunks = []
    with warc.open(unzipped_path) as f:
        for record in tqdm(f, disable=not should_print):
            chunks.append(clean_text(record.payload.read().decode(standard_encoding, error_handling)))

    return "".join(chunks)
|
||
def pull_from_paths(index_path: str, save_path: str, should_print=False):
    '''
    This function downloads randomly selected files from within an index slice that was
    previously selected.
    @index_path: Path to the previously selected index slice.
    @save_path: Where the downloaded files should be stored.
    @should_print: Flag for extra printing, if it's available.
    NOTE(review): stub -- not yet implemented; currently a no-op that returns None.
    '''

    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
''' | ||
By: George Witt || June 2023 | ||
Utility functions meant to help with cleaning text | ||
up for model processing | ||
''' | ||
|
||
def clean_text(text:str) -> str:
    '''
    Clean raw crawl text up for model processing.
    @text: Raw decoded text from a crawl record.
    @return: The cleaned string.
    NOTE(review): stub -- currently a bare `pass`, so it returns None rather
    than a str; callers that concatenate the result will fail. TODO confirm.
    '''
    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,13 @@ | ||
# CommonCrawler | ||
Lightweight python utility for pulling data from common crawl. | ||
# Common Crawler | ||
|
||
'Common Crawl' (https://commoncrawl.org/) is an open repository of web crawl data collected over many years, freely available for anyone to use. The data can be difficult to work with when all that is desired is plaintext.
|
||
We provide a lightweight and simple python utility for collecting and batching plaintext data from common crawl in a multiprocessing manner. The data is cleaned, if desired, before being returned. | ||
|
||
Data from Common Crawl comes in .WARC, .WET, and .WAT formats, as described here (https://commoncrawl.org/the-data/get-started/). The .WARC files store the information of the crawl itself (responses, request information, etc.). The .WET files store the plaintext. The .WAT files store metadata about the .WARC files. Note that data before 2018 does NOT store language information. Data before 2018 will need language detected manually. | ||
|
||
**This repository specifically focuses on converting common crawl plaintext to a usable dataset format.** | ||
|
||
# Install | ||
|
||
# Usage |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
''' | ||
By: George Witt || May 2023 | ||
Main file for handling pulling down data as desired. This is the starting point | ||
for the script. Separation of tasks occurs from here. | ||
Overall the process can be visualized as follows: | ||
1) Sorting information is provided, including: | ||
a) URL regular expressions | ||
b) Date ranges | ||
c) Languages | ||
NOTE: search information may be left to none. | ||
2) Parameters for algorithm are provided, including: | ||
a) Linear crawl or Random crawl T/F | ||
b) Amount of data in GB | ||
3) Data is pulled randomly (or linearly), searched by index. | ||
4) Once the correct indices are found, the .wet files are downloaded. | ||
5) The .wet files are cleaned, processed, and returned in the desired format. | ||
''' |