diff --git a/tests/cli_tests.py b/tests/cli_tests.py index 5a349465..b83b970a 100644 --- a/tests/cli_tests.py +++ b/tests/cli_tests.py @@ -449,21 +449,23 @@ def test_crawling(): def test_probing(): "Test webpage probing functions." url = 'https://example.org/' - testargs = ['', '--probe', url, '--target-language', 'de'] + conf = os.path.join(RESOURCES_DIR, 'zerolength.cfg') + testargs = ['', '--probe', url, '--target-language', 'de', '--config-file', conf] with patch.object(sys, 'argv', testargs): args = cli.parse_args(testargs) + f = io.StringIO() with redirect_stdout(f): cli.process_args(args) if LANGID_FLAG: assert f.getvalue().strip() == '' + args.target_language = 'en' + f2 = io.StringIO() + with redirect_stdout(f2): + cli.process_args(args) + assert f2.getvalue().strip() == url else: assert f.getvalue().strip() == url - args.target_language = 'en' - f = io.StringIO() - with redirect_stdout(f): - cli.process_args(args) - assert f.getvalue().strip() == url if __name__ == '__main__': diff --git a/tests/resources/zerolength.cfg b/tests/resources/zerolength.cfg new file mode 100644 index 00000000..ed6a0941 --- /dev/null +++ b/tests/resources/zerolength.cfg @@ -0,0 +1,35 @@ +# Defines settings for trafilatura (https://github.com/adbar/trafilatura) + +[DEFAULT] + +# Download +DOWNLOAD_TIMEOUT = 10 +MAX_FILE_SIZE = 20000000 +MIN_FILE_SIZE = 10 +# sleep between requests +SLEEP_TIME = 0.25 +# List of user-agents. Each user-agent should be put on a new line like so: +# "agent1" +# "agent2" +# ... +USER_AGENTS = + Firefox + Chrome +# cookie for HTTP requests +COOKIE = yummy_cookie=choco; tasty_cookie=strawberry + +# Extraction +MIN_EXTRACTED_SIZE = 0 +MIN_EXTRACTED_COMM_SIZE = 0 +MIN_OUTPUT_SIZE = 0 +MIN_OUTPUT_COMM_SIZE = 0 + +# Set to 0 to disable signal +EXTRACTION_TIMEOUT = 0 + +# Deduplication +MIN_DUPLCHECK_SIZE = 10 +MAX_REPETITIONS = 3 + +# Extraction option for Htmldate +EXTENSIVE_DATE_SEARCH = off diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 3bdf1625..9847afc0 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -368,6 +368,9 @@ def test_html2txt(): assert html2txt(html.fromstring(mydoc)) == "Here is the body text" assert html2txt("") == "" assert html2txt("123") == "" + assert html2txt("") == "" + assert html2txt("") == "" + assert html2txt("

ABC

") == "ABC" def test_external(): diff --git a/trafilatura/core.py b/trafilatura/core.py index f27638b2..1904e2ad 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -26,7 +26,7 @@ handle_textnode, link_density_test_tables, process_node, prune_unwanted_nodes, tree_cleaning) from .metadata import Document, extract_metadata -from .settings import DEFAULT_CONFIG, TAG_CATALOG, use_config +from .settings import BASIC_CLEAN_XPATH, DEFAULT_CONFIG, TAG_CATALOG, use_config from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv, FORMATTING_PROTECTED from .xml import (build_json_output, build_tei_output, build_xml_output, control_xml_output, remove_empty_elements, strip_double_tags, @@ -712,6 +712,13 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, options): return body, text, len_text +def basic_cleaning(tree): + "Remove a few section types from the document." + for elem in tree.xpath(BASIC_CLEAN_XPATH): + elem.getparent().remove(elem) + return tree + + def baseline(filecontent): """Use baseline extraction function targeting text paragraphs and/or JSON metadata. @@ -735,9 +742,9 @@ def baseline(filecontent): elem = SubElement(postbody, 'p') elem.text = trim(mymatch[1].replace('\\"', '"')) return postbody, elem.text, len(elem.text) - # basic tree cleaning - for elem in tree.xpath('.//aside|.//footer|.//script|.//style'): - elem.getparent().remove(elem) + + tree = basic_cleaning(tree) + # scrape from article tag article_elem = tree.find('.//article') if article_elem is not None: @@ -787,8 +794,12 @@ def html2txt(content): """ tree = load_html(content) if tree is None: - return '' - return ' '.join(tree.text_content().split()).strip() + return "" + body = tree.find(".//body") + if body is None: + return "" + tree = basic_cleaning(tree) + return " ".join(body.text_content().split()).strip() def determine_returnstring(document, output_format, include_formatting, tei_validation): diff --git a/trafilatura/settings.py b/trafilatura/settings.py index bbad0e3e..53f88e23 100644 --- a/trafilatura/settings.py +++ b/trafilatura/settings.py @@ -72,6 +72,8 @@ def use_config(filename=None, config=None): ] # 'center', 'rb', 'wbr' +BASIC_CLEAN_XPATH = ".//aside|.//footer|.//script|.//style" + TAG_CATALOG = frozenset(['blockquote', 'code', 'del', 'head', 'hi', 'lb', 'list', 'p', 'pre', 'quote']) # + list(CUT_EMPTY_ELEMS)