Skip to content

Commit

Permalink
add function
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Jan 22, 2024
1 parent 21f5141 commit e79c8e3
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 7 deletions.
18 changes: 12 additions & 6 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
handle_textnode, link_density_test_tables,
process_node, prune_unwanted_nodes, tree_cleaning)
from .metadata import Document, extract_metadata
from .settings import DEFAULT_CONFIG, HTML2TXT_CLEAN, TAG_CATALOG, use_config
from .settings import BASIC_CLEAN_XPATH, DEFAULT_CONFIG, TAG_CATALOG, use_config
from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv, FORMATTING_PROTECTED
from .xml import (build_json_output, build_tei_output, build_xml_output,
control_xml_output, remove_empty_elements, strip_double_tags,
Expand Down Expand Up @@ -712,6 +712,13 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, options):
return body, text, len_text


def basic_cleaning(tree):
    """Remove a few section types from the document.

    Every element of ``tree`` matched by ``BASIC_CLEAN_XPATH`` is detached
    from its parent. The tree is modified in place.

    Args:
        tree: parsed document element offering ``xpath()``; its matching
            descendants are removed.

    Returns:
        The same ``tree`` object, after the matching elements were removed.
    """
    for elem in tree.xpath(BASIC_CLEAN_XPATH):
        parent = elem.getparent()
        # In lxml the text following an element is stored on that element
        # as its tail, so a plain remove() would also drop document text
        # that merely comes after the unwanted section. Re-attach it first.
        tail = elem.tail
        if tail:
            previous = elem.getprevious()
            if previous is None:
                # first child: the tail continues the parent's leading text
                parent.text = (parent.text or "") + tail
            else:
                previous.tail = (previous.tail or "") + tail
        parent.remove(elem)
    return tree


def baseline(filecontent):
"""Use baseline extraction function targeting text paragraphs and/or JSON metadata.
Expand All @@ -735,9 +742,9 @@ def baseline(filecontent):
elem = SubElement(postbody, 'p')
elem.text = trim(mymatch[1].replace('\\"', '"'))
return postbody, elem.text, len(elem.text)
# basic tree cleaning
for elem in tree.xpath('.//aside|.//footer|.//script|.//style'):
elem.getparent().remove(elem)

tree = basic_cleaning(tree)

# scrape from article tag
article_elem = tree.find('.//article')
if article_elem is not None:
Expand Down Expand Up @@ -791,8 +798,7 @@ def html2txt(content):
body = tree.find(".//body")
if body is None:
return ""
for elem in tree.xpath(HTML2TXT_CLEAN):
elem.getparent().remove(elem)
tree = basic_cleaning(tree)
return " ".join(body.text_content().split()).strip()


Expand Down
2 changes: 1 addition & 1 deletion trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def use_config(filename=None, config=None):
]
# 'center', 'rb', 'wbr'

HTML2TXT_CLEAN = ".//aside|.//footer|.//script|.//style"
BASIC_CLEAN_XPATH = ".//aside|.//footer|.//script|.//style"

TAG_CATALOG = frozenset(['blockquote', 'code', 'del', 'head', 'hi', 'lb', 'list', 'p', 'pre', 'quote'])
# + list(CUT_EMPTY_ELEMS)
Expand Down

0 comments on commit e79c8e3

Please sign in to comment.