-
Notifications
You must be signed in to change notification settings - Fork 88
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Configurable HTML Parser Wrappers for BeautifulSoup and Resiliparse #47
base: main
Are you sure you want to change the base?
Changes from 5 commits
f3e3cd7
c6b4ac9
864241a
2a0fd51
5ae6b86
9d0b263
5ddbf5d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from bs4 import BeautifulSoup | ||
from bs4.dammit import EncodingDetector | ||
|
||
|
||
class HTMLParser(object):
    """
    Thin wrapper around BeautifulSoup4: builds an HTML tree from raw
    bytes and renders such a tree as plain text.
    """

    def html_to_text(self, html_tree: BeautifulSoup) -> str:
        """
        Render a parsed BeautifulSoup4 tree as plain text.

        Note: all <script> and <style> elements are removed from
        ``html_tree`` itself (in place) before the text is collected.

        Args:
            html_tree (BeautifulSoup): Parsed HTML tree, e.g. as returned by get_html_tree()

        Returns:
            str: Extracted plain text with scripts and styles removed
        """
        # Calling a BeautifulSoup object is shorthand for find_all()
        for unwanted in html_tree.find_all(['script', 'style']):
            unwanted.extract()
        return html_tree.get_text(' ', strip=True)

    def get_html_tree(self, page: bytes, encoding: str = None, features='lxml', **kwargs) -> BeautifulSoup:
        """
        Parse raw HTML and return the BeautifulSoup tree

        Args:
            page (bytes): Raw HTML content as bytes
            encoding (str, optional): Character encoding to use. If not given (or empty),
                the first candidate proposed by bs4's EncodingDetector is used
            features: Parser to be used (default='lxml'). Refer https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for supported parsers.
            **kwargs: Additional arguments passed to the BeautifulSoup constructor.
                Refer here https://www.crummy.com/software/BeautifulSoup/bs4/doc/#bs4.BeautifulSoup for accepted arguments.

        Returns:
            BeautifulSoup: HTML tree object
        """
        if not encoding:
            candidates = EncodingDetector(page, is_html=True).encodings
            # keep only the first detected encoding; if the detector yields
            # nothing, the caller-supplied (falsy) value is left untouched
            encoding = next(iter(candidates), encoding)
        return BeautifulSoup(page, features, from_encoding=encoding, **kwargs)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
from resiliparse.extract.html2text import extract_plain_text | ||
from resiliparse.parse import detect_encoding | ||
from resiliparse.parse.html import HTMLTree | ||
|
||
|
||
class HTMLParser(object):
    """
    HTML parser using Resiliparse
    """

    def html_to_text(self, tree, **kwargs) -> str:
        """
        Convert a parsed HTML tree to plain text using Resiliparse.

        Args:
            tree: HTML tree to extract text from, e.g. the HTMLTree returned by get_html_tree()
            **kwargs: Additional arguments passed to extract_plain_text.
                Refer here https://resiliparse.chatnoir.eu/en/latest/api/extract/html2text.html#resiliparse.extract.html2text.extract_plain_text for accepted arguments.

        Returns:
            str: Extracted plain text content
        """
        return extract_plain_text(tree, **kwargs)

    def get_html_tree(self, page: bytes, encoding: str = None, **kwargs) -> HTMLTree:
        """
        Get the HTML tree object

        Args:
            page (bytes): Raw HTML content as bytes
            encoding (str, optional): Specific character encoding to use. If not given (or empty),
                it is auto-detected with resiliparse's detect_encoding
            **kwargs: Additional arguments passed to HTMLTree.parse_from_bytes

        Returns:
            HTMLTree: HTML tree object
        """
        if not encoding:
            encoding = detect_encoding(page)
        return HTMLTree.parse_from_bytes(page, encoding, **kwargs)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,7 +71,8 @@ def parse_arguments(self): | |
|
||
arg_parser.add_argument("input", help=self.input_descr) | ||
arg_parser.add_argument("output", help=self.output_descr) | ||
|
||
arg_parser.add_argument("--html_parser", default="beautifulsoup", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. There's only one class which uses an HTML parser: CCIndexWordCountJob. Shouldn't the option be moved to the class where it is used? If it's a global option, there may be confusion, such as "I thought WET files contain plain text from parsed HTML. Why should I specify an HTML parser?" There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more.
Yes that's correct. I will move it to that particular job. |
||
help="HTML parser: beautifulsoup or resiliparse") | ||
arg_parser.add_argument("--input_base_url", | ||
help="Base URL (prefix) used if paths to WARC/WAT/WET " | ||
"files are relative paths. Used to select the " | ||
|
@@ -396,6 +397,22 @@ def get_warc_header(record: ArcWarcRecord, header: str, default: str=None): | |
def get_http_headers(record: ArcWarcRecord): | ||
return record.http_headers.headers | ||
|
||
def get_html_parser(self):
    """
    Return an instance of the HTML parser wrapper selected by
    ``self.args.html_parser``.

    The wrapper modules are imported lazily so that only the selected
    parser's module needs to be importable.

    Returns:
        An ``HTMLParser`` instance from ``bs4_parser`` ('beautifulsoup')
        or ``resiliparse_parser`` ('resiliparse')

    Raises:
        ValueError: if ``self.args.html_parser`` is not a known parser name
        ImportError: if the module of the selected parser cannot be imported
    """
    parser_name = self.args.html_parser
    if parser_name not in ('beautifulsoup', 'resiliparse'):
        raise ValueError(
            "Unknown HTML parser: {}".format(parser_name)
        )
    # Keep the try body limited to the statements that can raise ImportError
    try:
        if parser_name == 'beautifulsoup':
            from bs4_parser import HTMLParser
        else:
            from resiliparse_parser import HTMLParser
    except ImportError as e:
        # Chain explicitly so the original import failure is kept as the cause
        raise ImportError(f"Failed to import HTML parser module '{parser_name}'."
                          f" Please ensure the module is correctly added to PySpark cluster: {str(e)}") from e
    return HTMLParser()
|
||
@staticmethod | ||
def is_response_record(record: ArcWarcRecord): | ||
"""Return true if WARC record is a WARC response record""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have you verified that it is possible to install and use FastWARC and Resiliparse with different versions?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @sebastian-nagel, thanks for the catch! Resiliparse has a strict version dependency on fastwarc and will throw an error when installing incompatible versions. I will fix the tested version and add a comment in requirements.txt to highlight this.