Skip to content

Commit

Permalink
improve html2txt extraction (#483)
Browse files Browse the repository at this point in the history
* better html2txt extraction

* fix tests

* fix coverage

* add function
  • Loading branch information
adbar authored Jan 23, 2024
1 parent 02c8342 commit 5c2761e
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 12 deletions.
14 changes: 8 additions & 6 deletions tests/cli_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,21 +449,23 @@ def test_crawling():
def test_probing():
"Test webpage probing functions."
# NOTE(review): this span is a rendered diff hunk with indentation stripped;
# removed and added lines seem to be shown together, so some statements below
# may belong to the pre-change version only — confirm against the repository.
url = 'https://example.org/'
# NOTE(review): presumably the pre-change argument list, superseded by the
# assignment two lines below that also passes a config file — confirm.
testargs = ['', '--probe', url, '--target-language', 'de']
# path to the 'zerolength.cfg' test configuration under RESOURCES_DIR
conf = os.path.join(RESOURCES_DIR, 'zerolength.cfg')
testargs = ['', '--probe', url, '--target-language', 'de', '--config-file', conf]
# parse the arguments while sys.argv is patched to the test arguments
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)

# capture everything process_args() writes to stdout
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
# with LANGID_FLAG set, probing with target language 'de' must print nothing
if LANGID_FLAG:
assert f.getvalue().strip() == ''
args.target_language = 'en'
f2 = io.StringIO()
with redirect_stdout(f2):
cli.process_args(args)
assert f2.getvalue().strip() == url
# otherwise the URL itself is the expected output
else:
assert f.getvalue().strip() == url
# run again with target language 'en': the URL is expected on stdout
args.target_language = 'en'
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert f.getvalue().strip() == url


if __name__ == '__main__':
Expand Down
35 changes: 35 additions & 0 deletions tests/resources/zerolength.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Defines settings for trafilatura (https://github.com/adbar/trafilatura)

[DEFAULT]

# Download
DOWNLOAD_TIMEOUT = 10
MAX_FILE_SIZE = 20000000
MIN_FILE_SIZE = 10
# sleep between requests
SLEEP_TIME = 0.25
# List of user-agents. Each user-agent should be put on a new line like so:
# "agent1"
# "agent2"
# ...
USER_AGENTS =
Firefox
Chrome
# cookie for HTTP requests
COOKIE = yummy_cookie=choco; tasty_cookie=strawberry

# Extraction
MIN_EXTRACTED_SIZE = 0
MIN_EXTRACTED_COMM_SIZE = 0
MIN_OUTPUT_SIZE = 0
MIN_OUTPUT_COMM_SIZE = 0

# Set to 0 to disable signal
EXTRACTION_TIMEOUT = 0

# Deduplication
MIN_DUPLCHECK_SIZE = 10
MAX_REPETITIONS = 3

# Extraction option for Htmldate
EXTENSIVE_DATE_SEARCH = off
3 changes: 3 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,9 @@ def test_html2txt():
assert html2txt(html.fromstring(mydoc)) == "Here is the body text"
assert html2txt("") == ""
assert html2txt("123") == ""
assert html2txt("<html></html>") == ""
assert html2txt("<html><body/></html>") == ""
assert html2txt("<html><body><style>font-size: 8pt</style><p>ABC</p></body></html>") == "ABC"


def test_external():
Expand Down
23 changes: 17 additions & 6 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
handle_textnode, link_density_test_tables,
process_node, prune_unwanted_nodes, tree_cleaning)
from .metadata import Document, extract_metadata
from .settings import DEFAULT_CONFIG, TAG_CATALOG, use_config
from .settings import BASIC_CLEAN_XPATH, DEFAULT_CONFIG, TAG_CATALOG, use_config
from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv, FORMATTING_PROTECTED
from .xml import (build_json_output, build_tei_output, build_xml_output,
control_xml_output, remove_empty_elements, strip_double_tags,
Expand Down Expand Up @@ -712,6 +712,13 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, options):
return body, text, len_text


def basic_cleaning(tree):
    """Remove a few section types from the document.

    Every element matched by ``BASIC_CLEAN_XPATH`` is detached from the tree
    in place. Text that directly follows a removed element (its ``tail``)
    belongs to the surrounding content, not to the element, so it is
    re-attached to the preceding sibling or to the parent instead of being
    dropped together with the element.

    Args:
        tree: Parsed lxml element to clean; it is modified in place.

    Returns:
        The same tree object that was passed in.
    """
    for elem in tree.xpath(BASIC_CLEAN_XPATH):
        parent = elem.getparent()
        if parent is None:
            # no parent to detach from: leave the element alone
            continue
        if elem.tail:
            # lxml's remove() discards the tail along with the element,
            # so move it to where it will still be part of the text flow
            previous = elem.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or "") + elem.tail
            else:
                parent.text = (parent.text or "") + elem.tail
        parent.remove(elem)
    return tree


def baseline(filecontent):
"""Use baseline extraction function targeting text paragraphs and/or JSON metadata.
Expand All @@ -735,9 +742,9 @@ def baseline(filecontent):
elem = SubElement(postbody, 'p')
elem.text = trim(mymatch[1].replace('\\"', '"'))
return postbody, elem.text, len(elem.text)
# basic tree cleaning
for elem in tree.xpath('.//aside|.//footer|.//script|.//style'):
elem.getparent().remove(elem)

tree = basic_cleaning(tree)

# scrape from article tag
article_elem = tree.find('.//article')
if article_elem is not None:
Expand Down Expand Up @@ -787,8 +794,12 @@ def html2txt(content):
"""
tree = load_html(content)
if tree is None:
return ''
return ' '.join(tree.text_content().split()).strip()
return ""
body = tree.find(".//body")
if body is None:
return ""
tree = basic_cleaning(tree)
return " ".join(body.text_content().split()).strip()


def determine_returnstring(document, output_format, include_formatting, tei_validation):
Expand Down
2 changes: 2 additions & 0 deletions trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ def use_config(filename=None, config=None):
]
# 'center', 'rb', 'wbr'

# XPath union selecting every descendant <aside>, <footer>, <script> and
# <style> element of the context node
BASIC_CLEAN_XPATH = ".//aside|.//footer|.//script|.//style"

TAG_CATALOG = frozenset(['blockquote', 'code', 'del', 'head', 'hi', 'lb', 'list', 'p', 'pre', 'quote'])
# + list(CUT_EMPTY_ELEMS)

Expand Down

0 comments on commit 5c2761e

Please sign in to comment.