Skip to content

Commit

Permalink
tests and refactoring in smaller pieces
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Dec 17, 2024
1 parent 2f04ccd commit f5eb758
Show file tree
Hide file tree
Showing 4 changed files with 365 additions and 251 deletions.
117 changes: 64 additions & 53 deletions grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

import java.nio.charset.StandardCharsets;

import org.apache.lucene.util.CollectionUtil;
import org.grobid.core.GrobidModels;
import org.grobid.core.data.*;
import org.grobid.core.document.Document;
Expand All @@ -33,7 +32,6 @@
import org.grobid.core.features.FeaturesVectorFulltext;
import org.grobid.core.lang.Language;
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.lexicon.Lexicon.OrganizationRecord;
import org.grobid.core.layout.*;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
Expand All @@ -43,6 +41,7 @@
import org.grobid.core.engines.citations.CalloutAnalyzer;
import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType;

import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -362,69 +361,38 @@ else if (config.getConsolidateCitations() == 2)
}
}

private static String revertResultsForBadItems(List<? extends Figure> badFiguresOrTables, String resultBody, String itemLabel) {
static String revertResultsForBadItems(List<? extends Figure> badFiguresOrTables, String resultBody, String itemLabel) {
//LF: we update the resultBody sequence by reverting these tables as <paragraph> elements
if (CollectionUtils.isNotEmpty(badFiguresOrTables)) {
List<List<String>> splitResult = Arrays.stream(resultBody.split("\n"))
List<List<String>> labelledResultsAsList = Arrays.stream(resultBody.split("\n"))
.map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList()))
.collect(Collectors.toList());

for (Figure badItem : badFiguresOrTables) {
// Find the index of the first layoutToken of the table in the tokenization
List<LayoutToken> rawLayoutTokenTable = badItem.getLayoutTokens();
LayoutToken firstLayoutTokenItem = rawLayoutTokenTable.get(0);

List<Integer> candidateIndexes = IntStream.range(0, splitResult.size())
.filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenItem.getText())
&& Iterables.getLast(splitResult.get(i)).equals("I-"+itemLabel))
.boxed()
.collect(Collectors.toList());

List<LayoutToken> layoutTokenItem = badItem.getLayoutTokens();
List<Integer> candidateIndexes = findCandiateIndex(layoutTokenItem, labelledResultsAsList, itemLabel);
if (candidateIndexes.isEmpty()) {
candidateIndexes = IntStream.range(0, splitResult.size())
.filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenItem.getText())
&& Iterables.getLast(splitResult.get(i)).equals(itemLabel))
.boxed()
.collect(Collectors.toList());
if (candidateIndexes.isEmpty()) {
LOGGER.info("Cannot find the candidate index for fixing the tables.");
continue;
}
LOGGER.info("Cannot find the candidate index for fixing the tables.");
continue;
}

// Need to match with the rest
List<String> tokensNoSpace = rawLayoutTokenTable.stream()
// At this point there may be more than one candidate, which happens when the same first
// token is repeated in the sequence. The next step is to find the matching figure/table
// by comparing a larger sequence of tokens.

List<String> sequenceTokenWithoutSpaces = layoutTokenItem.stream()
.map(LayoutToken::getText)
.map(StringUtils::strip)
.filter(StringUtils::isNotBlank)
.collect(Collectors.toList());

int resultIndexCandidate = -1;
if (candidateIndexes.isEmpty()){
LOGGER.warn("Cannot find the candidate index for fixing the tables.");
} else if (candidateIndexes.size() == 1){
resultIndexCandidate = candidateIndexes.get(0);
} else {
for (int candidateIndex: candidateIndexes) {
List<String> candidateTable = splitResult.subList(candidateIndex, Math.min(candidateIndex + tokensNoSpace.size(), splitResult.size()))
.stream()
.map(i -> i.get(0))
.collect(Collectors.toList());

String candidateTableText = String.join("", candidateTable);
String tokensText = String.join("", tokensNoSpace);

if (candidateTableText.equals(tokensText)) {
resultIndexCandidate = candidateIndex;
break;
}
}
}
int resultIndexCandidate = consolidateResultCandidateThroughSequence(candidateIndexes, labelledResultsAsList, sequenceTokenWithoutSpaces);

if (resultIndexCandidate > -1) {
boolean first = true;
for (int i = resultIndexCandidate;i < Math.min(resultIndexCandidate + tokensNoSpace.size(), splitResult.size()); i++) {
List<String> line = splitResult.get(i);
for (int i = resultIndexCandidate;i < Math.min(resultIndexCandidate + sequenceTokenWithoutSpaces.size(), labelledResultsAsList.size()); i++) {
List<String> line = labelledResultsAsList.get(i);
String label = Iterables.getLast(line);
if (first) {
first = false;
Expand All @@ -440,15 +408,58 @@ private static String revertResultsForBadItems(List<? extends Figure> badFigures
}
}

String resultBody2 = splitResult.stream()
String updatedResultBody = labelledResultsAsList.stream()
.map(l -> String.join("\t", l))
.collect(Collectors.joining("\n"));

resultBody = resultBody2;
resultBody = updatedResultBody;
}
return resultBody;
}

/**
 * Picks, among several candidate start rows, the one whose following rows spell out the
 * expected token sequence.
 *
 * A single candidate is returned directly, without checking its content. Otherwise, for each
 * candidate, the first column of the rows starting at that index (at most as many rows as
 * there are expected tokens, clipped to the end of the results) is concatenated and compared
 * with the concatenation of the expected tokens; the first candidate that matches wins.
 *
 * @param candidateIndexes row indexes in {@code splitResult} where the item may start
 * @param splitResult      the labelled results, one list of columns per row
 * @param tokensNoSpace    the expected token texts, in order
 * @return the selected row index, or -1 when no candidate matches
 */
static int consolidateResultCandidateThroughSequence(List<Integer> candidateIndexes, List<List<String>> splitResult, List<String> tokensNoSpace) {
    if (candidateIndexes.isEmpty()) {
        return -1;
    }
    // A lone candidate is taken as is: there is nothing to disambiguate.
    if (candidateIndexes.size() == 1) {
        return candidateIndexes.get(0);
    }

    final String expectedText = String.join("", tokensNoSpace);
    final int windowLength = tokensNoSpace.size();

    for (int start : candidateIndexes) {
        int end = Math.min(start + windowLength, splitResult.size());

        // Glue together the first column of every row in the window.
        StringBuilder windowText = new StringBuilder();
        for (List<String> row : splitResult.subList(start, end)) {
            windowText.append(row.get(0));
        }

        if (expectedText.contentEquals(windowText)) {
            return start;
        }
    }
    return -1;
}

/**
 * Finds the rows of the labelled results where the given figure/table may start.
 *
 * A row is a candidate when its first column equals the text of the first layout token and
 * its last column equals {@code "I-" + itemLabel}. When no row satisfies this, the search is
 * repeated with the last column compared to {@code itemLabel} itself.
 *
 * @param layoutTokenItem       the layout tokens of the figure/table; only the first one is used
 * @param labelledResultsAsList the labelled results, one list of columns per row
 * @param itemLabel             the label to look for in the last column
 * @return the indexes of the candidate rows, in ascending order; empty when there is no
 *         candidate or when {@code layoutTokenItem} is null or empty
 */
@NotNull
static List<Integer> findCandiateIndex(List<LayoutToken> layoutTokenItem, List<List<String>> labelledResultsAsList, String itemLabel) {
    // An item without layout tokens has no first token to anchor the search on: report
    // "no candidate" instead of failing on get(0).
    if (layoutTokenItem == null || layoutTokenItem.isEmpty()) {
        return new java.util.ArrayList<>();
    }

    // Read once rather than for every row of the results.
    final String firstTokenText = layoutTokenItem.get(0).getText();

    List<Integer> candidateIndexes = IntStream.range(0, labelledResultsAsList.size())
        .filter(i -> labelledResultsAsList.get(i).get(0).equals(firstTokenText)
            && Iterables.getLast(labelledResultsAsList.get(i)).equals("I-" + itemLabel))
        .boxed()
        .collect(Collectors.toList());

    // Fallback: no row carries the "I-" prefixed label, so accept the bare label.
    if (candidateIndexes.isEmpty()) {
        candidateIndexes = IntStream.range(0, labelledResultsAsList.size())
            .filter(i -> labelledResultsAsList.get(i).get(0).equals(firstTokenText)
                && Iterables.getLast(labelledResultsAsList.get(i)).equals(itemLabel))
            .boxed()
            .collect(Collectors.toList());
    }
    return candidateIndexes;
}


/**
* Machine-learning recognition of full text structures limited to header and funding information.
Expand Down Expand Up @@ -2062,10 +2073,10 @@ private static boolean testClosingTag(StringBuilder buffer,
buffer.append("</ref>");

// Make sure that paragraph is closed when markers are at the end of it
if (!currentTag0.equals("<paragraph>") &&
(!currentTag0.equals("<citation_marker>") ||
!currentTag0.equals("<figure_marker>") ||
!currentTag0.equals("<table_marker>") ||
if (!currentTag0.equals("<paragraph>") &&
(!currentTag0.equals("<citation_marker>") ||
!currentTag0.equals("<figure_marker>") ||
!currentTag0.equals("<table_marker>") ||
!currentTag0.equals("<equation_marker>")
)
) {
Expand Down
Loading

0 comments on commit f5eb758

Please sign in to comment.