diff --git a/trafilatura/xml.py b/trafilatura/xml.py index c1acaa26..a31e70da 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -305,6 +305,11 @@ def process_element(element: _Element, returnlist: List[str], include_formatting # this is the text that comes before the first child returnlist.append(replace_element_text(element, include_formatting)) + if element.tail and element.tag != 'graphic' and is_in_table_cell(element): + # Inside a table cell, append the tail right after the element text. Graphic elements are excluded because their + # tail is handled separately; textless elements such as lb are handled here too. Otherwise the tail is processed at the end. + returnlist.append(element.tail.strip()) + for child in element: process_element(child, returnlist, include_formatting)