Skip to content

Commit

Permalink
review code
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Feb 2, 2024
1 parent b2ddcdd commit 6bc28d4
Showing 1 changed file with 3 additions and 4 deletions.
7 changes: 3 additions & 4 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,17 +67,16 @@ def tree_cleaning(tree, options):
for element in tree.getiterator(expression):
delete_element(element)

-    return prune_html(tree)
+    prune_html(tree)
+    return tree


 def prune_html(tree):
     "Delete selected empty elements to save space and processing time."
-    # //processing-instruction()
     # //comment() needed for date extraction
-    for element in tree.xpath(".//*[not(node())]"):
+    for element in tree.xpath("//processing-instruction()|//*[not(node())]"):
         if element.tag in CUT_EMPTY_ELEMS:
             delete_element(element)
-    return tree


def prune_unwanted_nodes(tree, nodelist, with_backup=False):
Expand Down

0 comments on commit 6bc28d4

Please sign in to comment.