improve html2txt extraction (#483)

* better html2txt extraction
* fix tests
* fix coverage
* add function
adbar · Jan 23, 2024 · 5c2761e · 5c2761e
1 parent 02c8342
commit 5c2761e
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 12 deletions.
diff --git a/tests/cli_tests.py b/tests/cli_tests.py
@@ -449,21 +449,23 @@ def test_crawling():
 def test_probing():
     "Test webpage probing functions."
     url = 'https://example.org/'
-    testargs = ['', '--probe', url, '--target-language', 'de']
+    conf = os.path.join(RESOURCES_DIR, 'zerolength.cfg')
+    testargs = ['', '--probe', url, '--target-language', 'de', '--config-file', conf]
     with patch.object(sys, 'argv', testargs):
         args = cli.parse_args(testargs)
+
     f = io.StringIO()
     with redirect_stdout(f):
         cli.process_args(args)
     if LANGID_FLAG:
         assert f.getvalue().strip() == ''
+        args.target_language = 'en'
+        f2 = io.StringIO()
+        with redirect_stdout(f2):
+            cli.process_args(args)
+        assert f2.getvalue().strip() == url
     else:
         assert f.getvalue().strip() == url
-    args.target_language = 'en'
-    f = io.StringIO()
-    with redirect_stdout(f):
-        cli.process_args(args)
-    assert f.getvalue().strip() == url
 
 
 if __name__ == '__main__':

diff --git a/tests/resources/zerolength.cfg b/tests/resources/zerolength.cfg
@@ -0,0 +1,35 @@
+# Defines settings for trafilatura (https://github.com/adbar/trafilatura)
+
+[DEFAULT]
+
+# Download
+DOWNLOAD_TIMEOUT = 10
+MAX_FILE_SIZE = 20000000
+MIN_FILE_SIZE = 10
+# sleep between requests
+SLEEP_TIME = 0.25
+# List of user-agents. Each user-agent should be put on a new line like so:
+#     "agent1"
+#     "agent2"
+#     ...
+USER_AGENTS =
+    Firefox
+    Chrome
+# cookie for HTTP requests
+COOKIE = yummy_cookie=choco; tasty_cookie=strawberry
+
+# Extraction
+MIN_EXTRACTED_SIZE = 0
+MIN_EXTRACTED_COMM_SIZE = 0
+MIN_OUTPUT_SIZE = 0
+MIN_OUTPUT_COMM_SIZE = 0
+
+# Extraction timeout; set to 0 to disable the timeout signal
+EXTRACTION_TIMEOUT = 0
+
+# Deduplication
+MIN_DUPLCHECK_SIZE = 10
+MAX_REPETITIONS = 3
+
+# Extraction option for Htmldate
+EXTENSIVE_DATE_SEARCH = off
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -368,6 +368,9 @@ def test_html2txt():
     assert html2txt(html.fromstring(mydoc)) == "Here is the body text"
     assert html2txt("") == ""
     assert html2txt("123") == ""
+    assert html2txt("<html></html>") == ""
+    assert html2txt("<html><body/></html>") == ""
+    assert html2txt("<html><body><style>font-size: 8pt</style><p>ABC</p></body></html>") == "ABC"
 
 
 def test_external():

diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -26,7 +26,7 @@
                              handle_textnode, link_density_test_tables,
                              process_node, prune_unwanted_nodes, tree_cleaning)
 from .metadata import Document, extract_metadata
-from .settings import DEFAULT_CONFIG, TAG_CATALOG, use_config
+from .settings import BASIC_CLEAN_XPATH, DEFAULT_CONFIG, TAG_CATALOG, use_config
 from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv, FORMATTING_PROTECTED
 from .xml import (build_json_output, build_tei_output, build_xml_output,
                   control_xml_output, remove_empty_elements, strip_double_tags,
@@ -712,6 +712,13 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, options):
     return body, text, len_text
 
 
+def basic_cleaning(tree):
+    "Remove a few section types from the document."
+    for elem in tree.xpath(BASIC_CLEAN_XPATH):
+        elem.getparent().remove(elem)
+    return tree
+
+
 def baseline(filecontent):
     """Use baseline extraction function targeting text paragraphs and/or JSON metadata.
 
@@ -735,9 +742,9 @@ def baseline(filecontent):
                 elem = SubElement(postbody, 'p')
                 elem.text = trim(mymatch[1].replace('\\"', '"'))
                 return postbody, elem.text, len(elem.text)
-    # basic tree cleaning
-    for elem in tree.xpath('.//aside|.//footer|.//script|.//style'):
-        elem.getparent().remove(elem)
+
+    tree = basic_cleaning(tree)
+
     # scrape from article tag
     article_elem = tree.find('.//article')
     if article_elem is not None:
@@ -787,8 +794,12 @@ def html2txt(content):
     """
     tree = load_html(content)
     if tree is None:
-        return ''
-    return ' '.join(tree.text_content().split()).strip()
+        return ""
+    body = tree.find(".//body")
+    if body is None:
+        return ""
+    tree = basic_cleaning(tree)
+    return " ".join(body.text_content().split()).strip()
 
 
 def determine_returnstring(document, output_format, include_formatting, tei_validation):

diff --git a/trafilatura/settings.py b/trafilatura/settings.py
@@ -72,6 +72,8 @@ def use_config(filename=None, config=None):
 ]
 # 'center', 'rb', 'wbr'
 
+BASIC_CLEAN_XPATH = ".//aside|.//footer|.//script|.//style"
+
 TAG_CATALOG = frozenset(['blockquote', 'code', 'del', 'head', 'hi', 'lb', 'list', 'p', 'pre', 'quote'])
 # + list(CUT_EMPTY_ELEMS)