From 6bc28d4dec3261c1adfba3f656766224a329e12c Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 2 Feb 2024 13:15:37 +0100 Subject: [PATCH] review code --- trafilatura/htmlprocessing.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py index 7f7a043d..8eb4bffb 100644 --- a/trafilatura/htmlprocessing.py +++ b/trafilatura/htmlprocessing.py @@ -67,17 +67,16 @@ def tree_cleaning(tree, options): for element in tree.getiterator(expression): delete_element(element) - return prune_html(tree) + prune_html(tree) + return tree def prune_html(tree): "Delete selected empty elements to save space and processing time." - # //processing-instruction() # //comment() needed for date extraction - for element in tree.xpath(".//*[not(node())]"): + for element in tree.xpath("//processing-instruction()|//*[not(node())]"): if element.tag in CUT_EMPTY_ELEMS: delete_element(element) - return tree def prune_unwanted_nodes(tree, nodelist, with_backup=False):