Skip to content

Commit

Permalink
Simplify logic for detecting first text of a note
Browse files Browse the repository at this point in the history
Co-authored-by: Laure Thompson <[email protected]>
  • Loading branch information
rlskoeser and laurejt committed Oct 23, 2024
1 parent 5dd991b commit a1815ba
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 47 deletions.
66 changes: 30 additions & 36 deletions ppa/archive/eebo_tcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,8 @@ class Page(xmlmap.XmlObject):
# page beginning tags delimit content instead of containing it;
# use following axis to find all text nodes following this page beginning
text_contents = xmlmap.StringListField("following::text()")

# notes on or after this page, for managing footnote text
notes = xmlmap.NodeListField("following::NOTE", xmlmap.XmlObject)
# count following notes as a quick check to bail out of note detection logic
has_notes = xmlmap.IntegerField("count(following::NOTE)")

def __repr__(self):
return f"<Page {self.number or '-'} ({self.section_type})>"
Expand All @@ -40,20 +39,21 @@ def __repr__(self):

def get_note_mark(self, i):
"""Generate a note marker based on note index and :attr:`note_marks`;
symbols are used in order and then doubled, tripled, etc as needed."""
symbols are used in order and then doubled, tripled, etc as needed.
(Fallback, for use when a note does not have an N attribute.)"""

# use modulo to map to the list of available marks
mark_index = i % self.num_note_marks
# use division to determine how many times to repeat the mark
repeat = int(i / self.num_note_marks) + 1
return self.note_marks[mark_index] * repeat

def text_inside_note(self, text):
def parent_note(self, text):
"""check if a text element occurs within a NOTE element; if so,
return the note element"""

# bail out if there are no notes following this page
if not self.notes:
if not self.has_notes:
return None

# check if this text is directly inside a note tag
Expand All @@ -64,57 +64,51 @@ def text_inside_note(self, text):
within_note = parent
# otherwise, check if text is nested somewhere under a note tag
else:
# get the first/nearest ancestor note, if there is one
note_ancestors = parent.xpath("ancestor::NOTE[1]")
within_note = note_ancestors[0] if note_ancestors else None

return within_note

def note_index(self, note):
# given a note element, determine the 0-based index on this page

# use a for loop so we can bail out once we get a match
for i, n in enumerate(self.notes):
if n.node == note:
return i

# in normal use the note should be found; raise an exception
# if it is not so this will fail loudly
raise ValueError

def page_contents(self):
"""generator of text strings between this page beginning tag and
the next one"""

# strictly speaking we are returning lxml "smart strings"
# (lxml.etree._ElementUnicodeResult)

# collect any notes and include after main page text contents
notes = []
# collect text content for any notes, to be included
# after main page text contents
notes_text = []
# keep track of note count as we encounter them;
# used for locally generated footnote marks
note_index = 0

# iterate over and yield text following the current page,
# stopping once we hit the next page beginning
for i, text in enumerate(self.text_contents):
parent = text.getparent()

# determine if this text falls inside a note tag
within_note = self.text_inside_note(text)
# check if this text falls inside a note tag
within_note = self.parent_note(text)
if within_note is not None:
# if text is inside a note, determine which one
note_index = self.note_index(within_note)
# if this is the first text for this note,
# add a marker inline with the text AND the note
# index equals length, start a new note at the end of the list of notes
if len(notes) == note_index:
# some note tags have an N attribute; use if present
# otherwise, use a note mark from our list of symbols
# is this the first text in this note?
within_note.xpath(".//text()")
# print(f"first note text = {first_text}")
is_first_text = within_note.xpath(".//text()")[0] == str(text)
if is_first_text:
# print(f"hit first note text {text}")
# if this is the first text for this note,
# add a marker inline with the text AND to the note
note_mark = within_note.get("N", self.get_note_mark(note_index))
yield note_mark
notes.append(f"\n{note_mark} ")
notes_text.append(f"{note_mark} ")
note_index += 1

# add text to the appropriate note
notes[note_index] = f"{notes[note_index]}{text}"
# save note text content to be yielded later
notes_text.append(text)

# skip to next loop
# skip to next loop without yielding text in current context
continue

# lxml handles text between elements as "tail" text;
Expand All @@ -134,10 +128,10 @@ def page_contents(self):
yield text

# if this page includes notes, yield notes after main text content
if notes:
if notes_text:
# yield two blank lines to separate main text content from notes
yield "\n\n"
yield from notes
yield from notes_text

divider = "∣"

Expand Down
43 changes: 32 additions & 11 deletions ppa/archive/tests/test_eebo_tcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,32 +39,53 @@ def test_eebo_tcp_page_contents():
)


# ruff: noqa: E501
# sample XML pages with notes (minimal wrapping tags so each can be loaded as a Text object)
PAGE_WITH_NOTE = """<ETS><EEBO><TEXT><PB N="257" REF="198"/><LG>
<L>Whom Monarchs like domestick Slaves obey'd,</L>
<L>On the bleak Shoar now lies th' abandon'd King,</L>
<L N="765"><NOTE N="*" PLACE="foot"><HI>This whole line is taken from Sir</HI> John Derhan.</NOTE> A headless Carcass, and a nameless thing.</L>
</LG></TEXT></EEBO></ETS>"""
PAGE_WITH_MULTIPLE_NOTES = """<ETS><EEBO><TEXT><P><PB N="3" REF="11"/>
many Colonies. But now the several Languages that are used in the world do farre exceed this number.<NOTE PLACE="marg">Nat. Hist. lib. 6. cap. 5. <HI>Strabo,</HI> lib. 11.</NOTE> <HI>Pliny</HI> and <HI>Strabo</HI> do both make mention of a great Mart-Town in <HI>Colchos</HI> named <HI>Dioscuria,</HI> to which men of three hundred Nations, and of so many several Languages, were wont to resort for Trading. Which, considering the narrow compass of Traf∣fick before the invention of the magnetic Needle, must needs be but a small proportion, in comparison to those many of the remoter and un∣known parts of the world.</P>
<P>Some of the <HI>American</HI> Histories relate,<NOTE PLACE="marg">Mr. <HI>Cambden</HI>'s Remains.</NOTE> that in every fourscore miles of that vast Country, and almost in every particular valley of <HI>Peru,</HI> the Inhabitants have a distinct Language. And one who for several years travelled the Northern parts of <HI>America</HI> about <HI>Florida,</HI><NOTE PLACE="marg"><HI>Purchas</HI> Pilg. lib. 8. sect. 4. chap. 1.</NOTE> and could speak six several Languages of those people, doth affirm, that he found, upon his enquiry and converse with them, more than a thousand different Lan∣guages amongst them.</P>
<P>As for those Languages which seem to have no derivation from, or de∣pendance upon, or affinity with one another,<NOTE PLACE="marg">§. III.</NOTE> they are styled <HI>Linguae ma∣trices,</HI> or <HI>Mother-tongues.</HI> Of these <HI>Ioseph Scaliger</HI>
affirms there are ele∣ven, and not more, used in <HI>Europe</HI>;<NOTE PLACE="marg">Diatribe de Europaeorum linguis.</NOTE> whereof four are of more general and large extent, and the other seven of a narrower compass and use. Of the more general Tongues
.</P></TEXT></EEBO></ETS>"""


def test_text_inside_note():
many Colonies. But now the several Languages that are used in the
world do farre exceed this number.<NOTE PLACE="marg">Nat. Hist. lib. 6. cap. 5. <HI>Strabo,</HI> lib. 11.</NOTE>
<HI>Pliny</HI> and <HI>Strabo</HI> do both make mention of a great Mart-Town
in <HI>Colchos</HI> named <HI>Dioscuria,</HI> to which men of three hundred
Nations, and of so many several Languages, were wont to resort for Trading.
Which, considering the narrow compass of Traf∣fick before the invention of
the magnetic Needle, must needs be but a small proportion, in comparison
to those many of the remoter and un∣known parts of the world.</P>
<P>Some of the <HI>American</HI> Histories relate,<NOTE PLACE="marg">Mr. <HI>Cambden</HI>'s Remains.</NOTE> that in every
fourscore miles of that vast Country, and almost in every particular
valley of <HI>Peru,</HI> the Inhabitants have a distinct Language.
And one who for several years travelled the Northern parts of
<HI>America</HI> about <HI>Florida,</HI>
<NOTE PLACE="marg"><HI>Purchas</HI> Pilg. lib. 8. sect. 4. chap. 1.</NOTE>
and could speak six several Languages of those people, doth affirm, that
he found, upon his enquiry and converse with them, more than a thousand
different Lan∣guages amongst them.</P>
<P>As for those Languages which seem to have no derivation from, or
de∣pendance upon, or affinity with one another,<NOTE PLACE="marg">§. III.</NOTE>
they are styled <HI>Linguae ma∣trices,</HI> or <HI>Mother-tongues.</HI>
Of these <HI>Ioseph Scaliger</HI> affirms there are ele∣ven, and not more,
used in <HI>Europe</HI>;<NOTE PLACE="marg">Diatribe de Europaeorum linguis.</NOTE> whereof
four are of more general and large extent, and the
other seven of a narrower compass and use. Of the more general Tongues.</P>
</TEXT></EEBO></ETS>"""


def test_parent_note():
text = load_xmlobject_from_string(PAGE_WITH_MULTIPLE_NOTES, eebo_tcp.Text)
page = text.pages[0]
# text directly inside a note tag (note is parent)
diatribe_note_text = text.node.xpath("//NOTE[contains(., 'Diatribe')]/text()")[0]
assert page.text_inside_note(diatribe_note_text) is not None
assert page.parent_note(diatribe_note_text) is not None
# text nested under a tag within a note tag (note is ancestor)
camden_note_text = text.node.xpath("//NOTE/HI[contains(., 'Cambden')]/text()")[0]
assert page.text_inside_note(camden_note_text) is not None
assert page.parent_note(camden_note_text) is not None
# text in a tag that is NOT inside a note tag (note is not parent/ancestor)
europe_hi_text = text.node.xpath("//HI[contains(., 'Europe')]/text()")[0]
assert page.text_inside_note(europe_hi_text) is None
assert page.parent_note(europe_hi_text) is None


def test_get_note_mark():
Expand Down

0 comments on commit a1815ba

Please sign in to comment.