Skip to content

Commit

Permalink
Merge branch 'back-section-figure-tables-upstream' of https://github.…
Browse files Browse the repository at this point in the history
…com/elifesciences/grobid into elifesciences-back-section-figure-tables-upstream
  • Loading branch information
lfoppiano committed Dec 26, 2024
2 parents f372f99 + 6f45c06 commit 673c983
Show file tree
Hide file tree
Showing 2 changed files with 160 additions and 129 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -191,17 +191,17 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
for(Funding funding : fundings) {
if (funding.getFunder() == null) {
List<Funding> localfundings = fundingRelation.get(Funder.EMPTY);
if (localfundings == null)
if (localfundings == null)
localfundings = new ArrayList<>();
localfundings.add(funding);
fundingRelation.put(Funder.EMPTY, localfundings);
} else {
List<Funding> localfundings = fundingRelation.get(funding.getFunder());
if (localfundings == null)
if (localfundings == null)
localfundings = new ArrayList<>();
localfundings.add(funding);
fundingRelation.put(funding.getFunder(), localfundings);
}
}
}

List<Funder> localFunders = new ArrayList<>();
Expand Down Expand Up @@ -268,8 +268,8 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
// We introduce something more meaningful with TEI customization to encode copyrights information:
// - @resp with value "publisher", "authors", "unknown", we add a comment to clarify that @resp
// should be interpreted as the copyrights owner
// - license related to copyrights exception is encoded via <licence>
// (note: I have no clue what can mean "free" as status for a document - there are always some sort of
// - license related to copyrights exception is encoded via <licence>
// (note: I have no clue what can mean "free" as status for a document - there are always some sort of
// restrictions like moral rights even for public domain documents)
if (copyrightsLicense != null) {
tei.append("\t\t\t\t<availability ");
Expand Down Expand Up @@ -306,7 +306,7 @@ public StringBuilder toTEIHeader(BiblioItem biblio,

tei.append(" status=\"unknown\">\n");
tei.append("\t\t\t\t\t<licence/>\n");

if (defaultPublicationStatement != null) {
tei.append("\t\t\t\t\t<p>" +
TextUtilities.HTMLEncode(defaultPublicationStatement) + "</p>\n");
Expand All @@ -331,7 +331,7 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
} else {
tei.append("\t\t\t\t<date>");
}

if (biblio.getPublicationDate() != null) {
tei.append(TextUtilities.HTMLEncode(biblio.getPublicationDate()));
} else {
Expand Down Expand Up @@ -1079,7 +1079,7 @@ public StringBuilder toTEIBody(StringBuilder buffer,
protected List<Note> getTeiNotes(Document doc) {
// There are two types of structured notes currently supported, foot notes and margin notes.
// We consider that head notes are always only presentation matter and are never references
// in a text body.
// in a text body.

SortedSet<DocumentPiece> documentNoteParts = doc.getDocumentPart(SegmentationLabels.FOOTNOTE);
List<Note> notes = getTeiNotes(doc, documentNoteParts, Note.NoteType.FOOT);
Expand Down Expand Up @@ -1123,7 +1123,7 @@ protected List<Note> getTeiNotes(Document doc, SortedSet<DocumentPiece> document
if (localNotes != null)
notes.addAll(localNotes);
}

notes.stream()
.forEach(n -> n.setText(TextUtilities.dehyphenize(n.getText())));

Expand Down Expand Up @@ -1179,13 +1179,13 @@ protected List<Note> makeNotes(List<LayoutToken> noteTokens, String footText, No
Note localNote = null;
if (currentNumber == -1)
localNote = new Note(null, noteTokens, footText, noteType);
else
else
localNote = new Note(""+currentNumber, noteTokens, footText, noteType);

notes.add(localNote);

// add possible subsequent notes concatenated in the same note sequence (this is a common error,
// which is addressed here by heuristics, it may not be necessary in the future with a better
// which is addressed here by heuristics, it may not be necessary in the future with a better
// segmentation model using more footnotes training data)
if (currentNumber != -1) {
String nextLabel = " " + (currentNumber+1);
Expand All @@ -1195,7 +1195,7 @@ protected List<Note> makeNotes(List<LayoutToken> noteTokens, String footText, No

int nextFootnoteLabelIndex = footText.indexOf(nextLabel);
if (nextFootnoteLabelIndex != -1) {
// optionally we could restrict here to superscript numbers
// optionally we could restrict here to superscript numbers
// review local note
localNote.setText(footText.substring(0, nextFootnoteLabelIndex));
int pos = 0;
Expand Down Expand Up @@ -1237,9 +1237,9 @@ private StringBuilder toTEINote(StringBuilder tei,
List<MarkerType> markerTypes,
GrobidAnalysisConfig config) throws Exception {
// pattern is <note n="1" place="foot" xml:id="foot_1">
// or
// or
// pattern is <note n="1" place="margin" xml:id="margin_1">

// if no note label is found, there is no @n attribute, but we still generate a random xml:id (not used currently)

for (Note note : notes) {
Expand All @@ -1251,20 +1251,20 @@ private StringBuilder toTEINote(StringBuilder tei,

addXmlId(desc, note.getIdentifier());

// this is a paragraph element for storing text content of the note, which is
// this is a paragraph element for storing text content of the note, which is
// better practice than just putting the text under the <note> element
Element pNote = XmlBuilderUtils.teiElement("p");
if (config.isGenerateTeiIds()) {
String pID = KeyGen.getKey().substring(0, 7);
addXmlId(pNote, "_" + pID);
}

if (config.isGenerateTeiCoordinates("p")) {
String coords = LayoutTokensUtil.getCoordsString(note.getTokens());
desc.addAttribute(new Attribute("coords", coords));
}
// for labelling bibliographical references in notes

// for labelling bibliographical references in notes
List<LayoutToken> noteTokens = note.getTokens();

String coords = null;
Expand Down Expand Up @@ -1358,7 +1358,7 @@ public StringBuilder processTEIDivSection(String xmlType,
StringBuilder contentBuffer = new StringBuilder();

contentBuffer = toTEITextPiece(contentBuffer, text, null, biblioData, false,
new LayoutTokenization(tokens), null, null, null,
new LayoutTokenization(tokens), null, null, null,
null, null, doc, config);
String result = contentBuffer.toString();
String[] resultAsArray = result.split("\n");
Expand All @@ -1380,6 +1380,9 @@ public StringBuilder toTEIAnnex(StringBuilder buffer,
BiblioItem biblio,
List<BibDataSet> bds,
List<LayoutToken> tokenizations,
List<Figure> figures,
List<Table> tables,
List<Equation> equations,
List<MarkerType> markerTypes,
Document doc,
GrobidAnalysisConfig config) throws Exception {
Expand All @@ -1389,7 +1392,7 @@ public StringBuilder toTEIAnnex(StringBuilder buffer,

buffer.append("\t\t\t<div type=\"annex\">\n");
buffer = toTEITextPiece(buffer, result, biblio, bds, true,
new LayoutTokenization(tokenizations), null, null, null, null,
new LayoutTokenization(tokenizations), figures, tables, equations, null,
markerTypes, doc, config);
buffer.append("\t\t\t</div>\n");

Expand Down Expand Up @@ -1731,12 +1734,12 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
} else {
throw new IllegalStateException("Unsupported marker type: " + clusterLabel);
}

if (refNodes != null) {
boolean footNoteCallout = false;

if (refNodes.size() == 1 && (refNodes.get(0) instanceof Text)) {
// filtered out superscript reference marker (based on the defined citationMarkerType) might
// filtered out superscript reference marker (based on the defined citationMarkerType) might
// be foot note callout - so we need in this particular case to try to match existing notes
// similarly as within paragraph
if (citationMarkerType == null || citationMarkerType != MarkerType.SUPERSCRIPT_NUMBER) {
Expand Down Expand Up @@ -1768,16 +1771,16 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
}
}
}
}
}
}

if (!footNoteCallout) {
for (Node n : refNodes) {
parent.appendChild(n);
}
}
}
}

if (curParagraph != null)
curParagraphTokens.addAll(cluster.concatTokens());
} else if (clusterLabel.equals(TaggingLabels.FIGURE) || clusterLabel.equals(TaggingLabels.TABLE)) {
Expand Down Expand Up @@ -1879,7 +1882,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
}

public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang, List<PDFAnnotation> annotations) {
// in order to avoid having a sentence boundary in the middle of a ref element
// in order to avoid having a sentence boundary in the middle of a ref element
// (which is frequent given the abbreviation in the reference expression, e.g. Fig.)
// we only consider for sentence segmentation texts under <p> and skip the text under <ref>.
if (curParagraph == null)
Expand Down Expand Up @@ -1919,7 +1922,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
forbiddenPositions.addAll(offsetPositionsUrls);

List<OffsetPosition> theSentences =
List<OffsetPosition> theSentences =
SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));

/*if (theSentences.size() == 0) {
Expand Down Expand Up @@ -1982,7 +1985,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
for (List<LayoutToken> segmentedParagraphToken : segmentedParagraphTokens) {
if (k < theSentences.size())
System.out.println(k + " sentence segmented text-only: " + text.substring(theSentences.get(k).start, theSentences.get(k).end));
else
else
System.out.println("no text-only sentence at index " + k);
System.out.print(k + " layout token segmented sentence: ");
System.out.println(segmentedParagraphToken);
Expand Down Expand Up @@ -2059,7 +2062,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
}
}

}
}

/**
* Return the graphic objects in a given interval position in the document.
Expand Down Expand Up @@ -2154,7 +2157,7 @@ public List<Node> markReferencesTEILuceneBased(List<LayoutToken> refTokens,
}

public List<Node> markReferencesTEILuceneBased(List<LayoutToken> refTokens,
ReferenceMarkerMatcher markerMatcher,
ReferenceMarkerMatcher markerMatcher,
boolean generateCoordinates,
boolean keepUnsolvedCallout,
MarkerType citationMarkerType) throws EntityMatcherException {
Expand All @@ -2179,7 +2182,7 @@ public List<Node> markReferencesTEILuceneBased(List<LayoutToken> refTokens,
if (refToken.isSuperscript()) {
hasSuperScriptNumber = true;
break;
}
}
}

if (citationMarkerType == MarkerType.SUPERSCRIPT_NUMBER) {
Expand Down Expand Up @@ -2234,11 +2237,11 @@ public List<Node> markReferencesTEILuceneBased(List<LayoutToken> refTokens,
}


public List<Node> markReferencesFigureTEI(String refText,
public List<Node> markReferencesFigureTEI(String refText,
List<LayoutToken> allRefTokens,
List<Figure> figures,
boolean generateCoordinates) {
if (refText == null ||
if (refText == null ||
refText.trim().isEmpty()) {
return null;
}
Expand All @@ -2265,7 +2268,7 @@ public List<Node> markReferencesFigureTEI(String refText,
}

if (labels == null || labels.size() <= 1) {
org.grobid.core.utilities.Pair<String, List<LayoutToken>> localLabel =
org.grobid.core.utilities.Pair<String, List<LayoutToken>> localLabel =
new org.grobid.core.utilities.Pair(refText, allRefTokens);
labels = new ArrayList<>();
labels.add(localLabel);
Expand Down Expand Up @@ -2313,7 +2316,7 @@ public List<Node> markReferencesFigureTEI(String refText,

String andWordString = null;
if (text.endsWith("and") || text.endsWith("&")) {
// the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk
// the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk
if (text.endsWith("and")) {
text = text.substring(0, text.length()-3);
andWordString = "and";
Expand Down Expand Up @@ -2362,7 +2365,7 @@ else if (text.endsWith("&")) {
public List<Node> markReferencesTableTEI(String refText, List<LayoutToken> allRefTokens,
List<Table> tables,
boolean generateCoordinates) {
if (refText == null ||
if (refText == null ||
refText.trim().isEmpty()) {
return null;
}
Expand All @@ -2389,7 +2392,7 @@ public List<Node> markReferencesTableTEI(String refText, List<LayoutToken> allRe
}

if (labels == null || labels.size() <= 1) {
org.grobid.core.utilities.Pair<String, List<LayoutToken>> localLabel =
org.grobid.core.utilities.Pair<String, List<LayoutToken>> localLabel =
new org.grobid.core.utilities.Pair(refText, allRefTokens);
labels = new ArrayList<>();
labels.add(localLabel);
Expand Down Expand Up @@ -2437,7 +2440,7 @@ public List<Node> markReferencesTableTEI(String refText, List<LayoutToken> allRe

String andWordString = null;
if (text.endsWith("and") || text.endsWith("&")) {
// the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk
// the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk
if (text.endsWith("and")) {
text = text.substring(0, text.length()-3);
andWordString = "and";
Expand Down Expand Up @@ -2475,7 +2478,7 @@ else if (text.endsWith("&")) {
if (andWordString != null) {
nodes.add(new Text(andWordString));
}

if (spaceEnd)
nodes.add(new Text(" "));
}
Expand Down
Loading

0 comments on commit 673c983

Please sign in to comment.