Skip to content

Commit

Permalink
Add tweaks to table-finding code
Browse files Browse the repository at this point in the history
Unfortunately, `tagged_text` contains unescaped `&`s, which caused etree to
explode. This patches the logic to account for this scenario.

I've also created eregs#287 to note that we need to replace these fields
altogether.
  • Loading branch information
CM Lubinski committed Jul 29, 2016
1 parent 043375e commit c3fc64e
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 10 deletions.
9 changes: 6 additions & 3 deletions regparser/layer/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,9 +253,12 @@ def node_to_table_xml_els(node):
if node.source_xml is not None:
root_xml_el = node.source_xml
else:
# Wrap tagged_text in a placeholder tag as it might be an XML fragment
root_xml_el = etree.fromstring(u'<ROOT>{}</ROOT>'.format(
getattr(node, 'tagged_text', '')))
# tagged_text isn't quite XML -- it's often a fragment with unescaped
# characters. Clean it up before searching it
tagged_text = getattr(node, 'tagged_text', '')
tagged_text = tagged_text.replace('&', '&amp;')
tagged_text = u'<ROOT>{}</ROOT>'.format(tagged_text)
root_xml_el = etree.fromstring(tagged_text)

return root_xml_el.xpath('self::GPOTABLE|.//GPOTABLE')

Expand Down
17 changes: 10 additions & 7 deletions tests/layer_formatting_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from unittest import TestCase

from lxml import etree

from regparser.layer import formatting
from regparser.test_utils.xml_builder import XMLBuilder
from regparser.tree.struct import Node
Expand Down Expand Up @@ -391,13 +392,15 @@ def mkhd(t, c, r):


def test_node_to_table_xml_els():
"""We should be able to find a GPOTABLE in multiple places"""
xml_str = '<GPOTABLE unique="id">Content</GPOTABLE>'
nested_str = '<P>Stuff <NESTED>{}</NESTED></P>'.format(xml_str)
node1 = Node(source_xml=etree.fromstring(xml_str))
node2 = Node(source_xml=etree.fromstring(nested_str))
node3 = Node(tagged_text=xml_str)
node4 = Node(tagged_text=nested_str)
"""We should be able to find a GPOTABLE in multiple places. Tagged_text
can be an unescaped XML _fragment_"""
escaped_str = '<GPOTABLE unique="id">Content &amp; stuff</GPOTABLE>'
unescaped_str = '<GPOTABLE unique="id">Content & stuff</GPOTABLE>'
nested_str = '<P>Stuff <NESTED>{}</NESTED></P>'
node1 = Node(source_xml=etree.fromstring(escaped_str))
node2 = Node(source_xml=etree.fromstring(nested_str.format(escaped_str)))
node3 = Node(tagged_text=unescaped_str)
node4 = Node(tagged_text=nested_str.format(unescaped_str))
for node in (node1, node2, node3, node4):
result = formatting.node_to_table_xml_els(node)
assert len(result) == 1
Expand Down

0 comments on commit c3fc64e

Please sign in to comment.