From 5c2761e9aa9395c20cd98b953b64e278f3a3ae3c Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Tue, 23 Jan 2024 16:35:28 +0100
Subject: [PATCH] improve html2txt extraction (#483)
* better html2txt extraction
* fix tests
* fix coverage
* add function
---
tests/cli_tests.py | 14 ++++++++------
tests/resources/zerolength.cfg | 35 ++++++++++++++++++++++++++++++++++
tests/unit_tests.py | 3 +++
trafilatura/core.py | 23 ++++++++++++++++------
trafilatura/settings.py | 2 ++
5 files changed, 65 insertions(+), 12 deletions(-)
create mode 100644 tests/resources/zerolength.cfg
diff --git a/tests/cli_tests.py b/tests/cli_tests.py
index 5a349465..b83b970a 100644
--- a/tests/cli_tests.py
+++ b/tests/cli_tests.py
@@ -449,21 +449,23 @@ def test_crawling():
def test_probing():
"Test webpage probing functions."
url = 'https://example.org/'
- testargs = ['', '--probe', url, '--target-language', 'de']
+ conf = os.path.join(RESOURCES_DIR, 'zerolength.cfg')
+ testargs = ['', '--probe', url, '--target-language', 'de', '--config-file', conf]
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
+
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
if LANGID_FLAG:
assert f.getvalue().strip() == ''
+ args.target_language = 'en'
+ f2 = io.StringIO()
+ with redirect_stdout(f2):
+ cli.process_args(args)
+ assert f2.getvalue().strip() == url
else:
assert f.getvalue().strip() == url
- args.target_language = 'en'
- f = io.StringIO()
- with redirect_stdout(f):
- cli.process_args(args)
- assert f.getvalue().strip() == url
if __name__ == '__main__':
diff --git a/tests/resources/zerolength.cfg b/tests/resources/zerolength.cfg
new file mode 100644
index 00000000..ed6a0941
--- /dev/null
+++ b/tests/resources/zerolength.cfg
@@ -0,0 +1,35 @@
+# Defines settings for trafilatura (https://github.com/adbar/trafilatura)
+
+[DEFAULT]
+
+# Download
+DOWNLOAD_TIMEOUT = 10
+MAX_FILE_SIZE = 20000000
+MIN_FILE_SIZE = 10
+# sleep between requests
+SLEEP_TIME = 0.25
+# List of user-agents. Each user-agent should be put on a new line like so:
+# "agent1"
+# "agent2"
+# ...
+USER_AGENTS =
+ Firefox
+ Chrome
+# cookie for HTTP requests
+COOKIE = yummy_cookie=choco; tasty_cookie=strawberry
+
+# Extraction
+MIN_EXTRACTED_SIZE = 0
+MIN_EXTRACTED_COMM_SIZE = 0
+MIN_OUTPUT_SIZE = 0
+MIN_OUTPUT_COMM_SIZE = 0
+
+# Set to 0 to disable signal
+EXTRACTION_TIMEOUT = 0
+
+# Deduplication
+MIN_DUPLCHECK_SIZE = 10
+MAX_REPETITIONS = 3
+
+# Extraction option for Htmldate
+EXTENSIVE_DATE_SEARCH = off
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 3bdf1625..9847afc0 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -368,6 +368,9 @@ def test_html2txt():
assert html2txt(html.fromstring(mydoc)) == "Here is the body text"
assert html2txt("") == ""
assert html2txt("123") == ""
+ assert html2txt("<html></html>") == ""
+ assert html2txt("<html><body></body></html>") == ""
+ assert html2txt("<html><body><aside>Sidebar</aside><div>ABC</div></body></html>") == "ABC"
def test_external():
diff --git a/trafilatura/core.py b/trafilatura/core.py
index f27638b2..1904e2ad 100644
--- a/trafilatura/core.py
+++ b/trafilatura/core.py
@@ -26,7 +26,7 @@
handle_textnode, link_density_test_tables,
process_node, prune_unwanted_nodes, tree_cleaning)
from .metadata import Document, extract_metadata
-from .settings import DEFAULT_CONFIG, TAG_CATALOG, use_config
+from .settings import BASIC_CLEAN_XPATH, DEFAULT_CONFIG, TAG_CATALOG, use_config
from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv, FORMATTING_PROTECTED
from .xml import (build_json_output, build_tei_output, build_xml_output,
control_xml_output, remove_empty_elements, strip_double_tags,
@@ -712,6 +712,13 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, options):
return body, text, len_text
+def basic_cleaning(tree):
+ "Remove a few section types from the document."
+ for elem in tree.xpath(BASIC_CLEAN_XPATH):
+ elem.getparent().remove(elem)
+ return tree
+
+
def baseline(filecontent):
"""Use baseline extraction function targeting text paragraphs and/or JSON metadata.
@@ -735,9 +742,9 @@ def baseline(filecontent):
elem = SubElement(postbody, 'p')
elem.text = trim(mymatch[1].replace('\\"', '"'))
return postbody, elem.text, len(elem.text)
- # basic tree cleaning
- for elem in tree.xpath('.//aside|.//footer|.//script|.//style'):
- elem.getparent().remove(elem)
+
+ tree = basic_cleaning(tree)
+
# scrape from article tag
article_elem = tree.find('.//article')
if article_elem is not None:
@@ -787,8 +794,12 @@ def html2txt(content):
"""
tree = load_html(content)
if tree is None:
- return ''
- return ' '.join(tree.text_content().split()).strip()
+ return ""
+ body = tree.find(".//body")
+ if body is None:
+ return ""
+ tree = basic_cleaning(tree)
+ return " ".join(body.text_content().split()).strip()
def determine_returnstring(document, output_format, include_formatting, tei_validation):
diff --git a/trafilatura/settings.py b/trafilatura/settings.py
index bbad0e3e..53f88e23 100644
--- a/trafilatura/settings.py
+++ b/trafilatura/settings.py
@@ -72,6 +72,8 @@ def use_config(filename=None, config=None):
]
# 'center', 'rb', 'wbr'
+BASIC_CLEAN_XPATH = ".//aside|.//footer|.//script|.//style"
+
TAG_CATALOG = frozenset(['blockquote', 'code', 'del', 'head', 'hi', 'lb', 'list', 'p', 'pre', 'quote'])
# + list(CUT_EMPTY_ELEMS)