Skip to content

Commit

Permalink
Merge pull request #1082 from kermitt2/review-patent
Browse files Browse the repository at this point in the history
Review patent process
  • Loading branch information
kermitt2 authored Feb 7, 2024
2 parents 4816a7a + 1ebc6c8 commit 269c897
Show file tree
Hide file tree
Showing 30 changed files with 1,798 additions and 805 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ public interface Analyzer {

/**
 * Tokenizes the given text and returns the result as a list of layout tokens.
 *
 * @param text the text to tokenize
 * @return the tokens produced for {@code text}, as {@link LayoutToken} objects
 */
List<LayoutToken> tokenizeWithLayoutToken(String text);

/**
 * Re-tokenizes an already tokenized sequence of layout tokens.
 * NOTE(review): how offsets and coordinates of the produced tokens are derived
 * from the input tokens is implementation-specific - confirm against the
 * implementing analyzer.
 *
 * @param tokens the existing layout tokens to re-tokenize
 * @return the list of layout tokens resulting from the re-tokenization
 */
List<LayoutToken> retokenizeFromLayoutToken(List<LayoutToken> tokens);

/**
 * Re-tokenizes a list of string chunks.
 * NOTE(review): presumably splits chunks mixing alphabetical and numerical
 * characters into separate subsequences - confirm against the implementation.
 *
 * @param chunks the string chunks to re-tokenize
 * @return the resulting list of string tokens
 */
List<String> retokenizeSubdigits(List<String> chunks);

/**
 * Same input as {@link #retokenizeSubdigits(List)}, but the result is returned
 * as layout tokens rather than plain strings.
 *
 * @param chunks the string chunks to re-tokenize
 * @return the resulting tokens as {@link LayoutToken} objects
 */
List<LayoutToken> retokenizeSubdigitsWithLayoutToken(List<String> chunks);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,10 @@ public List<LayoutToken> tokenizeWithLayoutToken(String text) {
return tokenizeWithLayoutToken(text, null);
}

/**
 * Re-tokenizes an existing list of layout tokens by handing it over to the
 * shared {@link GrobidDefaultAnalyzer} instance.
 *
 * @param tokens the layout tokens to re-tokenize
 * @return whatever list the default analyzer produces for {@code tokens}
 */
public List<LayoutToken> retokenizeFromLayoutToken(List<LayoutToken> tokens) {
    GrobidDefaultAnalyzer defaultAnalyzer = GrobidDefaultAnalyzer.getInstance();
    return defaultAnalyzer.retokenizeFromLayoutToken(tokens);
}

public List<LayoutToken> tokenizeWithLayoutToken(String text, Language lang) {
text = UnicodeUtil.normaliseText(text);
List<String> tokens = tokenize(text, lang);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,44 @@ public List<LayoutToken> tokenizeWithLayoutToken(String text, Language language)
return result;
}

/**
 * Re-tokenizes an already tokenized sequence with this analyzer's
 * {@code tokenize} method. This is only useful when the input tokens were
 * produced by a tokenizer other than the default Grobid one.
 * <p>
 * Example: a single token "1/74" becomes the three tokens "1", "/", "74".
 * <p>
 * Tokens whose text is null or blank are passed through unchanged (same
 * instance). Every other token is replaced by one new token per subtoken:
 * <ul>
 *   <li>the offset starts at the original token's offset and advances by the
 *       length of each subtoken;</li>
 *   <li>x, y, width, height and page are copied as-is from the original
 *       token, i.e. the coordinates are not recomputed per subtoken (for
 *       the moment).</li>
 * </ul>
 *
 * @param tokens the layout tokens to re-tokenize
 * @return a new list holding the pass-through tokens and the created subtokens,
 *         in input order
 */
public List<LayoutToken> retokenizeFromLayoutToken(List<LayoutToken> tokens) {
    List<LayoutToken> retokenized = new ArrayList<>();
    for (LayoutToken original : tokens) {
        String originalText = original.getText();

        // nothing to split: keep the very same token object
        if (originalText == null || originalText.trim().isEmpty()) {
            retokenized.add(original);
            continue;
        }

        int currentOffset = original.getOffset();
        for (String piece : tokenize(originalText)) {
            LayoutToken subToken = new LayoutToken();
            subToken.setText(piece);
            subToken.setOffset(currentOffset);

            // coordinates - TODO: refine the width/X for the sub token
            subToken.setX(original.getX());
            subToken.setY(original.getY());
            subToken.setHeight(original.getHeight());
            subToken.setWidth(original.getWidth());
            subToken.setPage(original.getPage());

            // the next subtoken starts right after this one
            currentOffset += piece.length();
            retokenized.add(subToken);
        }
    }
    return retokenized;
}

/**
* To tokenize a mixture of alphabetical and numerical characters by splitting
* it into separate alphabetical and numerical character subsequences. To be used
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public class PatentItem implements Comparable<PatentItem> {

// scores
private double conf = 1.0;
private String confidence = null;
//private String confidence = null;

// position in document
private int offset_begin = 0;
Expand Down Expand Up @@ -84,9 +84,9 @@ public double getConf() {
return conf;
}

public String getConfidence() {
/*public String getConfidence() {
return confidence;
}
}*/

public int getOffsetBegin() {
return offset_begin;
Expand Down Expand Up @@ -240,7 +240,7 @@ public String toString() {
+ ", kindCode=" + kindCode + ", application=" + application
+ ", provisional=" + provisional + ", reissued=" + reissued
+ ", plant=" + plant + ", design=" + design + ", conf=" + conf
+ ", confidence=" + confidence + ", offset_begin="
+ ", offset_begin="
+ offset_begin + ", offset_end=" + offset_end + ", offset_raw="
+ offset_raw + ", context=" + context + "]";
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -874,7 +874,9 @@ public String processAllCitationsInPatent(String text,
}
// we initialize the attribute individually for readability...
boolean filterDuplicate = false;
return parsers.getReferenceExtractor().extractAllReferencesString(text, filterDuplicate,
List<String> texts = new ArrayList<>();
texts.add(text);
return parsers.getReferenceExtractor().extractAllReferencesString(texts, filterDuplicate,
consolidateCitations, includeRawCitations, patentResults, nplResults);
}

Expand Down

Large diffs are not rendered by default.

Loading

0 comments on commit 269c897

Please sign in to comment.