diff --git a/README.md b/README.md
index 9a4c110..58deb4b 100644
--- a/README.md
+++ b/README.md
@@ -207,6 +207,7 @@ Some differences between the warcio and FastWARC APIs are hidden from the user i
 
 However, it's recommended that you carefully verify that your custom job implementation works in combination with FastWARC. There are subtle differences between the warcio and FastWARC APIs, including the underlying classes (WARC/HTTP headers and stream implementations). In addition, FastWARC does not support legacy ARC files and does not automatically decode HTTP content and transfer encodings (see [Resiliparse HTTP Tools](https://resiliparse.chatnoir.eu/en/latest/man/parse/http.html#read-chunked-http-payloads)). While content and transfer encodings are already decoded in Common Crawl WARC files, this may not be the case for WARC files from other sources. See also [WARC 1.1 specification, http/https response records](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#http-and-https-schemes).
+
 ## Credits
 
 Examples are originally ported from Stephen Merity's [cc-mrjob](https://github.com/commoncrawl/cc-mrjob/) with the following changes and upgrades:
 
diff --git a/bs4_parser.py b/bs4_parser.py
new file mode 100644
index 0000000..799a344
--- /dev/null
+++ b/bs4_parser.py
@@ -0,0 +1,41 @@
+from bs4 import BeautifulSoup
+from bs4.dammit import EncodingDetector
+
+
+class HTMLParser(object):
+    """
+    HTML parser using BeautifulSoup4
+    """
+
+    def html_to_text(self, html_tree: BeautifulSoup) -> str:
+        """
+        Convert HTML content to plain text using BeautifulSoup4.
+
+        Returns:
+            str: Extracted plain text with scripts and styles removed
+        """
+        for script in html_tree(['script', 'style']):
+            script.extract()
+        text = html_tree.get_text(' ', strip=True)
+        return text
+
+    def get_html_tree(self, page: bytes, encoding: str=None, features='lxml', **kwargs) -> BeautifulSoup:
+        """
+        Return the HTML tree object
+
+        Args:
+            page (bytes): Raw HTML content as bytes
+            encoding (str, optional): Specific character encoding to use. If None, auto-detection is attempted.
+            features: Parser to be used (default='lxml'). Refer to https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for supported parsers.
+            **kwargs: Additional arguments passed to the BeautifulSoup constructor.
+                Refer to https://www.crummy.com/software/BeautifulSoup/bs4/doc/#bs4.BeautifulSoup for accepted arguments.
+
+        Returns:
+            BeautifulSoup: HTML tree object
+        """
+        if not encoding:
+            for encoding in EncodingDetector(page, is_html=True).encodings:
+                # take the first detected encoding
+                break
+        soup = BeautifulSoup(page, features, from_encoding=encoding, **kwargs)
+        return soup
\ No newline at end of file
diff --git a/cc_index_word_count.py b/cc_index_word_count.py
index 7ea88ca..204fb08 100644
--- a/cc_index_word_count.py
+++ b/cc_index_word_count.py
@@ -16,6 +16,33 @@ class CCIndexWordCountJob(WordCountJob, CCIndexWarcSparkJob):
     records_parsing_failed = None
     records_non_html = None
 
+    def add_arguments(self, parser):
+        super(CCIndexWordCountJob, self).add_arguments(parser)
+        parser.add_argument(
+            "--html_parser", default="beautifulsoup",
+            help="HTML parser: beautifulsoup or resiliparse."
+ " Make sure to install the correct dependencies for the parser and " + "include the correct parser module (bs4_parser.py for beautifulsoup or resiliparse_parser.py for resiliparse) to the cluster" + ) + + def get_html_parser(self): + try: + if self.args.html_parser == 'beautifulsoup': + from bs4_parser import HTMLParser + return HTMLParser() + elif self.args.html_parser == 'resiliparse': + from resiliparse_parser import HTMLParser + return HTMLParser() + else: + raise ValueError( + "Unknown HTML parser: {}".format(self.args.html_parser) + ) + except ImportError as e: + raise ImportError( + f"Failed to import HTML parser module '{self.args.html_parser}'." + f" Please ensure the module is correctly added to PySpark cluster via `--py-files`: {str(e)}" + ) + def init_accumulators(self, session): super(CCIndexWordCountJob, self).init_accumulators(session) @@ -36,23 +63,6 @@ def reduce_by_key_func(a, b): # sum values of tuple return ((a[0] + b[0]), (a[1] + b[1])) - def html_to_text(self, page, record): - try: - encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset') - if not encoding: - for encoding in EncodingDetector(page, is_html=True).encodings: - # take the first detected encoding - break - soup = BeautifulSoup(page, 'lxml', from_encoding=encoding) - for script in soup(['script', 'style']): - script.extract() - return soup.get_text(' ', strip=True) - except Exception as e: - self.get_logger().error("Error converting HTML to text for {}: {}", - self.get_warc_header(record, 'WARC-Target-URI'), e) - self.records_parsing_failed.add(1) - return '' - def process_record(self, record): if not self.is_response_record(record): # skip over WARC request or metadata records @@ -60,8 +70,18 @@ def process_record(self, record): if not self.is_html(record): self.records_non_html.add(1) return - page = self.get_payload_stream(record).read() - text = self.html_to_text(page, record) + + text = "" + try: + page = self.get_payload_stream(record).read() + encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset') + parser = self.get_html_parser() + html_tree = parser.get_html_tree(page, encoding=encoding) + text = parser.html_to_text(html_tree) + except Exception as e: + self.get_logger().error("Error converting HTML to text for {}: {}", + self.get_warc_header(record, 'WARC-Target-URI'), e) + self.records_parsing_failed.add(1) words = map(lambda w: w.lower(), WordCountJob.word_pattern.findall(text)) for word, count in Counter(words).items(): diff --git a/requirements.txt b/requirements.txt index d693cd2..091743d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,8 @@ lxml #fastwarc # (tested with) #fastwarc==0.14.1 + +# to parse HTML (used in cc_index_word_count.py) using Resiliparse (https://pypi.org/project/Resiliparse/). Resiliparse requires compatible fastwarc version. +#Resiliparse +# (tested with) +#Resiliparse==0.14.1 \ No newline at end of file diff --git a/resiliparse_parser.py b/resiliparse_parser.py new file mode 100644 index 0000000..e70de1d --- /dev/null +++ b/resiliparse_parser.py @@ -0,0 +1,36 @@ +from resiliparse.extract.html2text import extract_plain_text +from resiliparse.parse import detect_encoding +from resiliparse.parse.html import HTMLTree + + +class HTMLParser(object): + """ + HTML parser using Resiliparse + """ + + def html_to_text(self, tree, **kwargs) -> str: + """ + Convert HTML content to plain text using Resiliparse. 
+
+        Args:
+            tree (HTMLTree): HTML tree object as returned by get_html_tree
+            **kwargs: Additional arguments passed to extract_plain_text.
+                Refer to https://resiliparse.chatnoir.eu/en/latest/api/extract/html2text.html#resiliparse.extract.html2text.extract_plain_text for accepted arguments.
+
+        Returns:
+            str: Extracted plain text with scripts and styles removed
+        """
+        text = extract_plain_text(tree, **kwargs)
+        return text
+
+    def get_html_tree(self, page: bytes, encoding: str=None, **kwargs) -> HTMLTree:
+        """
+        Get the HTML tree object
+
+        Args:
+            page (bytes): Raw HTML content as bytes
+            encoding (str, optional): Specific character encoding to use. If None, auto-detection is attempted.
+            **kwargs: Additional arguments passed to HTMLTree.parse_from_bytes.
+
+        Returns:
+            HTMLTree: HTML tree object
+        """
+        if not encoding:
+            encoding = detect_encoding(page)
+        tree = HTMLTree.parse_from_bytes(page, encoding, **kwargs)
+        return tree
\ No newline at end of file
diff --git a/sparkcc.py b/sparkcc.py
index 6a99ec9..15d7a70 100644
--- a/sparkcc.py
+++ b/sparkcc.py
@@ -71,7 +71,6 @@ def parse_arguments(self):
         arg_parser.add_argument("input", help=self.input_descr)
         arg_parser.add_argument("output", help=self.output_descr)
-
         arg_parser.add_argument("--input_base_url",
                                 help="Base URL (prefix) used if paths to WARC/WAT/WET "
                                      "files are relative paths. Used to select the "
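
Both parser modules deliberately expose the same two-method interface (`get_html_tree` and `html_to_text`), which is what lets `process_record` stay parser-agnostic and `get_html_parser` switch implementations by import alone. A minimal sketch of using either module standalone — the sample page bytes are made up for illustration, beautifulsoup4 and lxml must be installed, and on a cluster the module has to be shipped via `--py-files`:

```python
# Minimal sketch: bs4_parser.HTMLParser and resiliparse_parser.HTMLParser are
# interchangeable duck types; only the import line below would change.
from bs4_parser import HTMLParser

# Hypothetical page bytes; in the job they come from get_payload_stream(record).read()
page = b'<html><body><p>Hello world</p><script>alert(1)</script></body></html>'

parser = HTMLParser()
tree = parser.get_html_tree(page, encoding=None)  # None triggers charset auto-detection
text = parser.html_to_text(tree)                  # scripts and styles are stripped
print(text)  # -> Hello world
```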
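The `**kwargs` pass-through in `html_to_text` exists so a job can tune text extraction without modifying the parser classes. With the Resiliparse backend, keyword arguments reach `extract_plain_text` directly; a sketch assuming Resiliparse is installed (`main_content` is one of the options documented in the html2text API linked in the docstring, and the page bytes are again hypothetical):

```python
from resiliparse_parser import HTMLParser

page = b'<html><body><nav>menu</nav><main><p>Article text.</p></main></body></html>'

parser = HTMLParser()
tree = parser.get_html_tree(page)  # encoding auto-detected via detect_encoding
# Forwarded to extract_plain_text: main_content=True applies Resiliparse's
# main-content heuristics instead of extracting all visible text
text = parser.html_to_text(tree, main_content=True)
print(text)
```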