Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Configurable HTML Parser Wrappers for BeautifulSoup and Resiliparse #47

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ Some differences between the warcio and FastWARC APIs are hidden from the user i

However, it's recommended that you carefully verify that your custom job implementation works in combination with FastWARC. There are subtle differences between the warcio and FastWARC APIs, including the underlying classes (WARC/HTTP headers and stream implementations). In addition, FastWARC does not support legacy ARC files and does not automatically decode HTTP content and transfer encodings (see [Resiliparse HTTP Tools](https://resiliparse.chatnoir.eu/en/latest/man/parse/http.html#read-chunked-http-payloads)). While content and transfer encodings are already decoded in Common Crawl WARC files, this may not be the case for WARC files from other sources. See also [WARC 1.1 specification, http/https response records](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#http-and-https-schemes).


## Credits

Examples are originally ported from Stephen Merity's [cc-mrjob](https://github.com/commoncrawl/cc-mrjob/) with the following changes and upgrades:
Expand Down
41 changes: 41 additions & 0 deletions bs4_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector


class HTMLParser(object):
    """
    HTML parser using BeautifulSoup4
    """

    def html_to_text(self, html_tree: BeautifulSoup) -> str:
        """
        Convert HTML content to plain text using BeautifulSoup4.

        Args:
            html_tree (BeautifulSoup): parsed HTML tree object

        Returns:
            str: Extracted plain text with scripts and styles removed
        """
        # drop <script> and <style> subtrees so their content does not
        # leak into the extracted text
        for element in html_tree(['script', 'style']):
            element.extract()
        return html_tree.get_text(' ', strip=True)

    def get_html_tree(self, page: bytes, encoding: str=None, features='lxml', **kwargs) -> BeautifulSoup:
        """
        Return the HTML tree object

        Args:
            page (bytes): Raw HTML content as bytes
            encoding (str, optional): Specific character encoding to use. If None, auto-detection is attempted
            features: Parser to be used (default='lxml'). Refer https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for supported parsers.
            **kwargs: Additional arguments passed to BeautifulSoup constructor.
                      Refer here https://www.crummy.com/software/BeautifulSoup/bs4/doc/#bs4.BeautifulSoup for accepted arguments.

        Returns:
            BeautifulSoup: HTML tree object
        """
        if not encoding:
            # take the first detected encoding, if any; otherwise keep the
            # caller-supplied value unchanged
            detected = EncodingDetector(page, is_html=True).encodings
            encoding = next(iter(detected), encoding)
        return BeautifulSoup(page, features, from_encoding=encoding, **kwargs)
58 changes: 39 additions & 19 deletions cc_index_word_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,33 @@ class CCIndexWordCountJob(WordCountJob, CCIndexWarcSparkJob):
records_parsing_failed = None
records_non_html = None

def add_arguments(self, parser):
    """Register the --html_parser option in addition to inherited arguments."""
    super(CCIndexWordCountJob, self).add_arguments(parser)
    help_text = (
        "HTML parser: beautifulsoup or resiliparse."
        " Make sure to install the correct dependencies for the parser and "
        "include the correct parser module (bs4_parser.py for beautifulsoup or resiliparse_parser.py for resiliparse) to the cluster"
    )
    parser.add_argument("--html_parser", default="beautifulsoup", help=help_text)

def get_html_parser(self):
    """
    Instantiate the HTML parser selected via the --html_parser argument.

    Returns:
        HTMLParser: parser object imported from bs4_parser (beautifulsoup)
                    or resiliparse_parser (resiliparse)

    Raises:
        ValueError: if --html_parser names an unknown parser
        ImportError: if the selected parser module cannot be imported
                     (e.g. not shipped to the cluster via `--py-files`)
    """
    try:
        if self.args.html_parser == 'beautifulsoup':
            from bs4_parser import HTMLParser
            return HTMLParser()
        elif self.args.html_parser == 'resiliparse':
            from resiliparse_parser import HTMLParser
            return HTMLParser()
        else:
            raise ValueError(
                "Unknown HTML parser: {}".format(self.args.html_parser)
            )
    except ImportError as e:
        # chain the original ImportError (`from e`) so the root cause
        # (missing module, missing dependency) stays in the traceback
        raise ImportError(
            f"Failed to import HTML parser module '{self.args.html_parser}'."
            f" Please ensure the module is correctly added to PySpark cluster via `--py-files`: {str(e)}"
        ) from e

def init_accumulators(self, session):
super(CCIndexWordCountJob, self).init_accumulators(session)

Expand All @@ -36,32 +63,25 @@ def reduce_by_key_func(a, b):
# sum values of tuple <term_frequency, document_frequency>
return ((a[0] + b[0]), (a[1] + b[1]))

def html_to_text(self, page, record):
    """
    Convert the raw HTML payload of a WARC record to plain text.

    Args:
        page (bytes): raw HTML content
        record: WARC record the payload was read from (used for the
                charset header and error reporting)

    Returns:
        str: extracted plain text, or '' if parsing failed
    """
    try:
        # prefer the charset identified at crawl time, fall back to detection
        encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset')
        if not encoding:
            # take the first detected encoding, if any
            detected = EncodingDetector(page, is_html=True).encodings
            encoding = next(iter(detected), encoding)
        tree = BeautifulSoup(page, 'lxml', from_encoding=encoding)
        for tag in tree(['script', 'style']):
            tag.extract()
        return tree.get_text(' ', strip=True)
    except Exception as e:
        self.get_logger().error("Error converting HTML to text for {}: {}",
                                self.get_warc_header(record, 'WARC-Target-URI'), e)
        self.records_parsing_failed.add(1)
        return ''

def process_record(self, record):
if not self.is_response_record(record):
# skip over WARC request or metadata records
return
if not self.is_html(record):
self.records_non_html.add(1)
return
page = self.get_payload_stream(record).read()
text = self.html_to_text(page, record)

text = ""
try:
page = self.get_payload_stream(record).read()
encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset')
parser = self.get_html_parser()
html_tree = parser.get_html_tree(page, encoding=encoding)
text = parser.html_to_text(html_tree)
except Exception as e:
self.get_logger().error("Error converting HTML to text for {}: {}",
self.get_warc_header(record, 'WARC-Target-URI'), e)
self.records_parsing_failed.add(1)
words = map(lambda w: w.lower(),
WordCountJob.word_pattern.findall(text))
for word, count in Counter(words).items():
Expand Down
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,8 @@ lxml
#fastwarc
# (tested with)
#fastwarc==0.14.1

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you verified that it is possible to install and use FastWARC and Resiliparse with different versions?

Copy link
Author

@silentninja silentninja Jan 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @sebastian-nagel, thanks for the catch! Resiliparse has a strict version dependency on fastwarc and will throw an error when installing incompatible versions. I will fix the tested version and add a comment in requirements.txt to highlight this

# to parse HTML (used in cc_index_word_count.py) using Resiliparse (https://pypi.org/project/Resiliparse/). Resiliparse requires compatible fastwarc version.
#Resiliparse
# (tested with)
#Resiliparse==0.14.1
36 changes: 36 additions & 0 deletions resiliparse_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from resiliparse.extract.html2text import extract_plain_text
from resiliparse.parse import detect_encoding
from resiliparse.parse.html import HTMLTree


class HTMLParser(object):
    """
    HTML parser using Resiliparse
    """

    def html_to_text(self, tree, **kwargs) -> str:
        """
        Convert HTML content to plain text using Resiliparse.

        Args:
            tree (HTMLTree): parsed HTML tree object
            **kwargs: Additional arguments passed to extract_plain_text.
                Refer here https://resiliparse.chatnoir.eu/en/latest/api/extract/html2text.html#resiliparse.extract.html2text.extract_plain_text for accepted arguments.

        Returns:
            str: Extracted plain text with scripts and styles removed
        """
        text = extract_plain_text(tree, **kwargs)
        return text

    def get_html_tree(self, page: bytes, encoding: str=None, **kwargs) -> HTMLTree:
        """
        Get the HTML tree object

        Args:
            page (bytes): Raw HTML content as bytes
            encoding (str, optional): Specific character encoding to use. If None, auto-detection is attempted
            **kwargs: Additional arguments passed to HTMLTree.parse_from_bytes
                (note: not to extract_plain_text — pass those to html_to_text instead).

        Returns:
            HTMLTree: parsed HTML tree object
        """
        if not encoding:
            encoding = detect_encoding(page)
        tree = HTMLTree.parse_from_bytes(page, encoding, **kwargs)
        return tree
1 change: 0 additions & 1 deletion sparkcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ def parse_arguments(self):

arg_parser.add_argument("input", help=self.input_descr)
arg_parser.add_argument("output", help=self.output_descr)

arg_parser.add_argument("--input_base_url",
help="Base URL (prefix) used if paths to WARC/WAT/WET "
"files are relative paths. Used to select the "
Expand Down