diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 14d468418c..08fea642fa 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -1,5 +1,6 @@ package org.grobid.core.data; +import org.apache.commons.collections4.CollectionUtils; import org.grobid.core.GrobidModels; import org.apache.commons.lang3.StringUtils; import org.grobid.core.data.table.Cell; @@ -32,6 +33,7 @@ import nu.xom.Node; import nu.xom.Text; +import static org.grobid.core.document.TEIFormatter.isNewParagraph; import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; import static org.grobid.core.document.xml.XmlBuilderUtils.textNode; @@ -169,7 +171,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form } Element noteNode = null; - if (note != null && note.toString().trim().length()>0) { + if (StringUtils.isNotBlank(note)) { noteNode = XmlBuilderUtils.teiElement("note"); if (config.isGenerateTeiIds()) { @@ -177,7 +179,8 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form addXmlId(noteNode, "_" + divID); } - if ( (labeledNote != null) && (labeledNote.length() > 0) ) { + if (StringUtils.isNotEmpty(labeledNote) ) { + Element p = null; TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledNote, noteLayoutTokens); List clusters = clusteror.cluster(); for (TaggingTokenCluster cluster : clusters) { @@ -186,7 +189,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form } MarkerType citationMarkerType = null; - if (markerTypes != null && markerTypes.size()>0) { + if (CollectionUtils.isNotEmpty(markerTypes)) { citationMarkerType = markerTypes.get(0); } @@ -194,6 +197,10 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form //String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens()); String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) { + if (p == null) { + LOGGER.warn("Problem when serializing TEI fragment for table note, there is a reference at the beginning of the sentence. "); + p = teiElement("p"); + } try { List refNodes = formatter.markReferencesTEILuceneBased( cluster.concatTokens(), @@ -203,30 +210,29 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form citationMarkerType); if (refNodes != null) { for (Node n : refNodes) { - noteNode.appendChild(n); + p.appendChild(n); } } } catch(Exception e) { LOGGER.warn("Problem when serializing TEI fragment for table note", e); } } else { - noteNode.appendChild(textNode(clusterContent)); + if (p == null) { + p = teiElement("p"); + } else if (isNewParagraph(clusterLabel, p)) { + noteNode.appendChild(p); + p = teiElement("p"); + } + p.appendChild(textNode(clusterContent)); } - if (noteNode != null && config.isWithSentenceSegmentation()) { + if (config.isWithSentenceSegmentation()) { // we need a sentence segmentation of the figure caption formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); } - - // enclose note content in a

element - if (noteNode != null) { - noteNode.setLocalName("p"); - - Element tabNote = XmlBuilderUtils.teiElement("note"); - tabNote.appendChild(noteNode); - - noteNode = tabNote; - } + } + if (p != null && p.getChildCount() > 0) { + noteNode.appendChild(p); } } else { noteNode = XmlBuilderUtils.teiElement("note", LayoutTokensUtil.normalizeText(note.toString()).trim());