Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Configurable HTML Parser Wrappers for BeautifulSoup and Resiliparse #47

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ Some differences between the warcio and FastWARC APIs are hidden from the user i

However, it's recommended that you carefully verify that your custom job implementation works in combination with FastWARC. There are subtle differences between the warcio and FastWARC APIs, including the underlying classes (WARC/HTTP headers and stream implementations). In addition, FastWARC does not support legacy ARC files and does not automatically decode HTTP content and transfer encodings (see [Resiliparse HTTP Tools](https://resiliparse.chatnoir.eu/en/latest/man/parse/http.html#read-chunked-http-payloads)). While content and transfer encodings are already decoded in Common Crawl WARC files, this may not be the case for WARC files from other sources. See also [WARC 1.1 specification, http/https response records](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#http-and-https-schemes).


## Credits

Examples are originally ported from Stephen Merity's [cc-mrjob](https://github.com/commoncrawl/cc-mrjob/) with the following changes and upgrades:
Expand Down
41 changes: 41 additions & 0 deletions bs4_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector


class HTMLParser(object):
    """
    HTML parser using BeautifulSoup4
    """

    def html_to_text(self, html_tree: BeautifulSoup) -> str:
        """
        Convert HTML content to plain text using BeautifulSoup4.

        Args:
            html_tree (BeautifulSoup): parsed HTML tree object

        Returns:
            str: Extracted plain text with scripts and styles removed
        """
        # drop <script> and <style> subtrees so their content does not
        # leak into the extracted text
        for element in html_tree(['script', 'style']):
            element.extract()
        return html_tree.get_text(' ', strip=True)

    def get_html_tree(self, page: bytes, encoding: str=None, features='lxml', **kwargs) -> BeautifulSoup:
        """
        Return the HTML tree object

        Args:
            page (bytes): Raw HTML content as bytes
            encoding (str, optional): Specific character encoding to use. If None, auto-detection is attempted
            features: Parser to be used (default='lxml'). Refer https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for supported parsers.
            **kwargs: Additional arguments passed to BeautifulSoup constructor.
                      Refer here https://www.crummy.com/software/BeautifulSoup/bs4/doc/#bs4.BeautifulSoup for accepted arguments.

        Returns:
            BeautifulSoup: HTML tree object
        """
        if not encoding:
            # take the first detected encoding, if any; otherwise keep the
            # caller-supplied value unchanged
            detected = EncodingDetector(page, is_html=True).encodings
            encoding = next(iter(detected), encoding)
        return BeautifulSoup(page, features, from_encoding=encoding, **kwargs)
58 changes: 39 additions & 19 deletions cc_index_word_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,33 @@ class CCIndexWordCountJob(WordCountJob, CCIndexWarcSparkJob):
records_parsing_failed = None
records_non_html = None

def add_arguments(self, parser):
    """Register the --html_parser option in addition to inherited arguments."""
    super(CCIndexWordCountJob, self).add_arguments(parser)
    help_text = (
        "HTML parser: beautifulsoup or resiliparse."
        " Make sure to install the correct dependencies for the parser and "
        "include the correct parser module (bs4_parser.py for beautifulsoup or resiliparse_parser.py for resiliparse) to the cluster"
    )
    parser.add_argument("--html_parser", default="beautifulsoup", help=help_text)

def get_html_parser(self):
    """
    Instantiate the HTML parser selected via the --html_parser argument.

    Returns:
        HTMLParser: parser object imported from bs4_parser (beautifulsoup)
                    or resiliparse_parser (resiliparse)

    Raises:
        ValueError: if --html_parser names an unknown parser
        ImportError: if the selected parser module cannot be imported
                     (e.g. not shipped to the cluster via `--py-files`)
    """
    try:
        if self.args.html_parser == 'beautifulsoup':
            from bs4_parser import HTMLParser
            return HTMLParser()
        elif self.args.html_parser == 'resiliparse':
            from resiliparse_parser import HTMLParser
            return HTMLParser()
        else:
            raise ValueError(
                "Unknown HTML parser: {}".format(self.args.html_parser)
            )
    except ImportError as e:
        # chain the original ImportError (`from e`) so the root cause
        # (missing module, missing dependency) stays in the traceback
        raise ImportError(
            f"Failed to import HTML parser module '{self.args.html_parser}'."
            f" Please ensure the module is correctly added to PySpark cluster via `--py-files`: {str(e)}"
        ) from e

def init_accumulators(self, session):
super(CCIndexWordCountJob, self).init_accumulators(session)

Expand All @@ -36,32 +63,25 @@ def reduce_by_key_func(a, b):
# sum values of tuple <term_frequency, document_frequency>
return ((a[0] + b[0]), (a[1] + b[1]))

def html_to_text(self, page, record):
    """
    Convert the raw HTML payload of a WARC record to plain text.

    Args:
        page (bytes): raw HTML content
        record: WARC record the payload was read from (used for the
                charset header and error reporting)

    Returns:
        str: extracted plain text, or '' if parsing failed
    """
    try:
        # prefer the charset identified at crawl time, fall back to detection
        encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset')
        if not encoding:
            # take the first detected encoding, if any
            detected = EncodingDetector(page, is_html=True).encodings
            encoding = next(iter(detected), encoding)
        tree = BeautifulSoup(page, 'lxml', from_encoding=encoding)
        for tag in tree(['script', 'style']):
            tag.extract()
        return tree.get_text(' ', strip=True)
    except Exception as e:
        self.get_logger().error("Error converting HTML to text for {}: {}",
                                self.get_warc_header(record, 'WARC-Target-URI'), e)
        self.records_parsing_failed.add(1)
        return ''

def process_record(self, record):
if not self.is_response_record(record):
# skip over WARC request or metadata records
return
if not self.is_html(record):
self.records_non_html.add(1)
return
page = self.get_payload_stream(record).read()
text = self.html_to_text(page, record)

text = ""
try:
page = self.get_payload_stream(record).read()
encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset')
parser = self.get_html_parser()
html_tree = parser.get_html_tree(page, encoding=encoding)
text = parser.html_to_text(html_tree)
except Exception as e:
self.get_logger().error("Error converting HTML to text for {}: {}",
self.get_warc_header(record, 'WARC-Target-URI'), e)
self.records_parsing_failed.add(1)
words = map(lambda w: w.lower(),
WordCountJob.word_pattern.findall(text))
for word, count in Counter(words).items():
Expand Down
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,8 @@ lxml
#fastwarc
# (tested with)
#fastwarc==0.14.1

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you verified that it is possible to install and use FastWARC and Resiliparse with different versions?

Copy link
Author

@silentninja silentninja Jan 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @sebastian-nagel, thanks for the catch! Resiliparse has a strict version dependency on fastwarc and will throw an error when installing incompatible versions. I will fix the tested version and add a comment in requirements.txt to highlight this

# to parse HTML (used in cc_index_word_count.py) using Resiliparse (https://pypi.org/project/Resiliparse/). Resiliparse requires compatible fastwarc version.
#Resiliparse
# (tested with)
#Resiliparse==0.14.1
36 changes: 36 additions & 0 deletions resiliparse_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from resiliparse.extract.html2text import extract_plain_text
from resiliparse.parse import detect_encoding
from resiliparse.parse.html import HTMLTree


class HTMLParser(object):
    """
    HTML parser using Resiliparse
    """

    def html_to_text(self, tree, **kwargs) -> str:
        """
        Convert HTML content to plain text using Resiliparse.

        Args:
            tree (HTMLTree): parsed HTML tree object
            **kwargs: Additional arguments passed to extract_plain_text.
                Refer here https://resiliparse.chatnoir.eu/en/latest/api/extract/html2text.html#resiliparse.extract.html2text.extract_plain_text for accepted arguments.

        Returns:
            str: Extracted plain text with scripts and styles removed
        """
        text = extract_plain_text(tree, **kwargs)
        return text

    def get_html_tree(self, page: bytes, encoding: str=None, **kwargs) -> HTMLTree:
        """
        Get the HTML tree object

        Args:
            page (bytes): Raw HTML content as bytes
            encoding (str, optional): Specific character encoding to use. If None, auto-detection is attempted
            **kwargs: Additional arguments passed to HTMLTree.parse_from_bytes
                (note: not to extract_plain_text — pass those to html_to_text instead).

        Returns:
            HTMLTree: parsed HTML tree object
        """
        if not encoding:
            encoding = detect_encoding(page)
        tree = HTMLTree.parse_from_bytes(page, encoding, **kwargs)
        return tree
1 change: 0 additions & 1 deletion sparkcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ def parse_arguments(self):

arg_parser.add_argument("input", help=self.input_descr)
arg_parser.add_argument("output", help=self.output_descr)

arg_parser.add_argument("--input_base_url",
help="Base URL (prefix) used if paths to WARC/WAT/WET "
"files are relative paths. Used to select the "
Expand Down