Skip to content

Commit

Permalink
Don't add SpaceAfter=No annotations on words which are at the start or…
Browse files Browse the repository at this point in the history
… middle of an MWT
  • Loading branch information
AngledLuffa committed Nov 28, 2024
1 parent 6f6eb93 commit 2341d33
Showing 1 changed file with 30 additions and 10 deletions.
40 changes: 30 additions & 10 deletions src/edu/stanford/nlp/trees/ud/CoNLLUDocumentWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,20 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg

// don't use after() directly; it returns a default of ""
if (token.get(CoreAnnotations.AfterAnnotation.class) != null && token.after().equals("")) {
if (misc.equals("_")) {
misc = "SpaceAfter=No";
} else {
misc = misc + "|SpaceAfter=No";
IndexedWord nextVertex = tokenSg.getNodeByIndex(token.index() + 1);
// the next word needs to exist and be part of the same MWT
// and either this word is the start of the MWT
// or this word is the middle of the same MWT as the next word
// if that is true, we will skip the SpaceAfter annotation
boolean inMWT = ((nextVertex != null && isMWTbutNotStart(nextVertex)) &&
((token.containsKey(CoreAnnotations.IsFirstWordOfMWTAnnotation.class) && token.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class)) ||
(isMWTbutNotStart(token))));
if (!inMWT) {
if (misc.equals("_")) {
misc = "SpaceAfter=No";
} else {
misc = misc + "|SpaceAfter=No";
}
}
}

Expand Down Expand Up @@ -151,19 +161,29 @@ public static void printSpan(StringBuilder sb, AbstractCoreLabel token) {
}
}

/**
 * Reports whether a word belongs to a multi-word token (MWT) without being its first word.
 *
 * @param nextVertex the word to inspect
 * @return {@code false} if the word has IsFirstWordOfMWTAnnotation set to true, or if
 *         IsMultiWordTokenAnnotation is absent or false; {@code true} otherwise
 */
public static boolean isMWTbutNotStart(IndexedWord nextVertex) {
  // A word explicitly flagged as the first word of an MWT never qualifies.
  boolean startsMWT = nextVertex.containsKey(CoreAnnotations.IsFirstWordOfMWTAnnotation.class)
      && nextVertex.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class);
  if (startsMWT) {
    return false;
  }
  // Otherwise the word qualifies exactly when it is marked as part of an MWT.
  return nextVertex.containsKey(CoreAnnotations.IsMultiWordTokenAnnotation.class)
      && nextVertex.get(CoreAnnotations.IsMultiWordTokenAnnotation.class);
}

public static void printMWT(StringBuilder sb, SemanticGraph graph, IndexedWord token) {
int startIndex = token.index();
int endIndex = startIndex;
// advance endIndex until we reach the end of the sentence, the start of the next MWT,
// or a word which isn't part of any MWT
IndexedWord nextVertex;
while ((nextVertex = graph.getNodeByIndex(endIndex+1)) != null) {
if (nextVertex.containsKey(CoreAnnotations.IsFirstWordOfMWTAnnotation.class) &&
nextVertex.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class)) {
break;
}
if (!nextVertex.containsKey(CoreAnnotations.IsMultiWordTokenAnnotation.class) ||
!nextVertex.get(CoreAnnotations.IsMultiWordTokenAnnotation.class)) {
if (!isMWTbutNotStart(nextVertex)) {
break;
}
++endIndex;
Expand Down

0 comments on commit 2341d33

Please sign in to comment.