diff --git a/elit-ddr/README.md b/elit-ddr/README.md
new file mode 100644
index 0000000..c7f7ce1
--- /dev/null
+++ b/elit-ddr/README.md
@@ -0,0 +1,134 @@
+# DDR Conversion
+
+DDR conversion generates the [deep dependency graphs](https://github.com/emorynlp/ddr) from the Penn Treebank style constituency trees.
+The conversion tool is written in Java and developed by [Emory NLP](http://nlp.mathcs.emory.edu) as a part of the [ELIT](https://elit.cloud) project.
+
+## Installation
+
+Add the following dependency to your maven project:
+
+```xml
+<dependency>
+    <groupId>cloud.elit</groupId>
+    <artifactId>elit-ddr</artifactId>
+    <version>0.0.4</version>
+</dependency>
+```
+
+* Download the conversion script: [nlp4j-ddr.jar](http://nlp.mathcs.emory.edu/nlp4j/nlp4j-ddr.jar).
+* Make sure [Java 8 or above](http://www.oracle.com/technetwork/java/javase/downloads) is installed on your machine:
+
+ ```
+$ java -version
+java version "1.8.x"
+Java(TM) SE Runtime Environment (build 1.8.x)
+...
+ ```
+
+* Run the following command:
+
+ ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGConvert -i <filepath> [ -r -n -pe <string> -oe <string> ]
+ ```
+
+ * `-i`: the path to the parse file or a directory containing the parse files to convert.
+  * `-r`: if set, recursively process all files with the parse file extension in the subdirectories of the input directory.
+ * `-n`: if set, normalize the parse trees before the conversion.
+ * `-pe`: the extension of the parse files; required if the input path indicates a directory (default: `parse`).
+ * `-oe`: the extension of the output files (default: `ddg`).
+
+## Corpora
+
+DDG conversion has been tested on the following corpora. Some of these corpora require you to be a member of the [Linguistic Data Consortium](https://www.ldc.upenn.edu) (LDC). Retrieve the corpora from LDC and run the following command for each corpus to generate DDG.
+
+* [OntoNotes Release 5.0](https://catalog.ldc.upenn.edu/LDC2013T19):
+
+ ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGConvert -r -i ontonotes-release-5.0/data/files/data/english/annotations
+ ```
+
+* [English Web Treebank](https://catalog.ldc.upenn.edu/LDC2012T13):
+
+ ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGConvert -r -i eng_web_tbk/data -pe tree
+ ```
+
+* [QuestionBank with Manually Revised Treebank Annotation 1.0](https://catalog.ldc.upenn.edu/LDC2012R121):
+
+ ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGConvert -i QB-revised.tree
+ ```
+
+## Merge
+
+We have internally updated these corpora to reduce annotation errors and produce a richer representation. If you want to take advantage of our latest updates, merge the original annotation with our annotation. You still need to retrieve the original corpora from LDC.
+
+* Clone this repository:
+
+ ```
+git clone https://github.com/emorynlp/ddr.git
+ ```
+
+* Run the following command:
+
+ ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGMerge <original> <annotation> <extension>
+ ```
+
+  * `<original>`: the path to the original corpus.
+  * `<annotation>`: the path to our annotation.
+  * `<extension>`: the extension of the parse files.
+
+
+* [OntoNotes Release 5.0](https://catalog.ldc.upenn.edu/LDC2013T19):
+
+ ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGMerge ontonotes-release-5.0/data/files/data/english/annotations ddr/english/ontonotes parse
+ ```
+
+* [English Web Treebank](https://catalog.ldc.upenn.edu/LDC2012T13):
+
+ ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGMerge eng_web_tbk/data ddr/english/google/ewt tree
+ ```
+
+* [QuestionBank with Manually Revised Treebank Annotation 1.0](https://catalog.ldc.upenn.edu/LDC2012R121):
+
+ ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGMerge QB-revised.tree ddr/english/google/qb/QB-revised.tree.skel tree
+ ```
+
+
+## Format
+
+DDG is represented in the tab separated values format (TSV), where each column represents a different field. The semantic roles are indicated in the `feats` column with the key, `sem`.
+
+```
+1 You you PRP _ 3 nsbj 7:nsbj O
+2 can can MD _ 3 modal _ O
+3 ascend ascend VB _ 0 root _ O
+4 Victoria victoria NNP _ 5 com _ B-LOC
+5 Peak peak NNP _ 3 obj _ L-LOC
+6 to to TO _ 7 aux _ O
+7 get get VB sem=prp 3 advcl _ O
+8 a a DT _ 10 det _ O
+9 panoramic panoramic JJ _ 10 attr _ O
+10 view view NN _ 7 obj _ O
+11 of of IN _ 16 case _ O
+12 Victoria victoria NNP _ 13 com _ B-LOC
+13 Harbor harbor NNP _ 16 poss _ I-LOC
+14 's 's POS _ 13 case _ L-LOC
+15 beautiful beautiful JJ _ 16 attr _ O
+16 scenery scenery NN _ 10 ppmod _ O
+17 . . . _ 3 p _ O
+```
+
+* `id`: current token ID (starting at 1).
+* `form`: word form.
+* `lemma`: lemma.
+* `pos`: part-of-speech tag.
+* `feats`: extra features; different features are delimited by `|`, keys and values are delimited by `=` (`_` indicates no feature).
+* `headId`: head token ID.
+* `deprel`: dependency label.
+* `sheads`: secondary heads (`_` indicates no secondary head).
+* `nament`: named entity tags in the `BILOU` notation if the annotation is available.
diff --git a/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDGConvert.java b/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvert.java
similarity index 74%
rename from elit-ddr/src/main/java/cloud/elit/ddr/bin/DDGConvert.java
rename to elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvert.java
index acc4489..50f77e0 100644
--- a/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDGConvert.java
+++ b/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvert.java
@@ -15,7 +15,6 @@
*/
package cloud.elit.ddr.bin;
-import cloud.elit.ddr.constituency.CTNode;
import cloud.elit.ddr.util.*;
import cloud.elit.ddr.constituency.CTReader;
import cloud.elit.ddr.constituency.CTTree;
@@ -23,7 +22,6 @@
import cloud.elit.ddr.conversion.EnglishC2DConverter;
import cloud.elit.sdk.collection.tuple.ObjectIntIntTuple;
import cloud.elit.sdk.structure.Chunk;
-import cloud.elit.sdk.structure.Document;
import cloud.elit.sdk.structure.Sentence;
import cloud.elit.sdk.structure.node.NLPNode;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
@@ -34,7 +32,7 @@
import java.util.ArrayList;
import java.util.List;
-public class DDGConvert {
+public class DDRConvert {
@Option(name = "-d", usage = "input path (required)", required = true, metaVar = "")
private String input_path;
@Option(name = "-pe", usage = "parse file extension (default: parse)", metaVar = "")
@@ -46,11 +44,11 @@ public class DDGConvert {
@Option(name = "-r", usage = "if set, traverse parse files recursively", metaVar = "")
private boolean recursive = false;
- public DDGConvert() {
+ public DDRConvert() {
}
- public DDGConvert(String[] args) {
+ public DDRConvert(String[] args) {
BinUtils.initArgs(args, this);
Language language = Language.ENGLISH;
@@ -58,12 +56,37 @@ public DDGConvert(String[] args) {
C2DConverter converter = new EnglishC2DConverter();
for (String parseFile : parseFiles) {
- int n = convert(converter, language, parseFile, parse_ext, output_ext, normalize);
+ int n = convert(converter, language, parseFile, parseFile + "." + output_ext, normalize);
System.out.printf("%s: %d trees\n", parseFile, n);
}
}
- Int2ObjectMap>> getNER(String parseFile) {
+ int convert(C2DConverter converter, Language language, String parseFile, String outputFile, boolean normalize) {
+ Int2ObjectMap>> ner_map = getNamedEntities(parseFile);
+ CTReader reader = new CTReader(IOUtils.createFileInputStream(parseFile), language);
+ PrintStream fout = IOUtils.createBufferedPrintStream(outputFile);
+ Sentence dTree;
+ CTTree cTree;
+ int n;
+
+ for (n = 0; (cTree = reader.next()) != null; n++) {
+ if (normalize) cTree.normalizeIndices();
+ dTree = converter.toDependencyGraph(cTree);
+
+ if (dTree == null) {
+ System.err.println("No token in the tree " + (n + 1) + "\n" + cTree.toStringLine());
+ } else {
+ processNamedEntities(ner_map, cTree, dTree, n);
+ fout.println(dTree.toTSV() + "\n");
+ }
+ }
+
+ reader.close();
+ fout.close();
+ return n;
+ }
+
+ Int2ObjectMap>> getNamedEntities(String parseFile) {
final String nameFile = parseFile.substring(0, parseFile.length() - 5) + "name";
Int2ObjectMap>> map = new Int2ObjectOpenHashMap<>();
File file = new File(nameFile);
@@ -85,57 +108,31 @@ Int2ObjectMap>> getNER(String parseFile) {
}
}
} catch (Exception e) {
+ map = null;
e.printStackTrace();
}
return map;
}
- int convert(C2DConverter converter, Language language, String parseFile, String parseExt, String outputExt, boolean normalize) {
- CTReader reader = new CTReader(IOUtils.createFileInputStream(parseFile), language);
- Int2ObjectMap>> ner_map = getNER(parseFile);
- Document doc = new Document();
- Sentence dTree;
- CTTree cTree;
-
- for (int n = 0; (cTree = reader.next()) != null; n++) {
- for (CTNode nn : cTree.getTokens()) {
- if (nn.isSyntacticTag("EMO")) nn.setSyntacticTag(PTBLib.P_NFP);
- }
-
- if (normalize) cTree.normalizeIndices();
- dTree = converter.toDependencyGraph(cTree);
-
- if (dTree == null)
- System.err.println("No token in the tree " + (n + 1) + "\n" + cTree.toStringLine());
- else {
- doc.add(dTree);
-
- if (ner_map == null)
- dTree.setNamedEntities(null);
- else if (ner_map.containsKey(n)) {
- List chunks = new ArrayList<>();
+ void processNamedEntities(Int2ObjectMap>> ner_map, CTTree cTree, Sentence dTree, int sen_id) {
+ if (ner_map == null) {
+ dTree.setNamedEntities(null);
+ return;
+ }
- for (ObjectIntIntTuple t : ner_map.get(n)) {
- List nodes = new ArrayList<>();
+ List> list = ner_map.get(sen_id);
- for (int tok_id = cTree.getTerminal(t.i1).getTokenID(); tok_id < cTree.getTerminal(t.i2).getTokenID() + 1; tok_id++)
- nodes.add(dTree.get(tok_id));
+ if (list != null) {
+ for (ObjectIntIntTuple t : list) {
+ List nodes = new ArrayList<>();
- chunks.add(new Chunk(nodes, t.o));
- }
+ for (int tok_id = cTree.getTerminal(t.i1).getTokenID(); tok_id < cTree.getTerminal(t.i2).getTokenID() + 1; tok_id++)
+ nodes.add(dTree.get(tok_id));
- dTree.setNamedEntities(chunks);
- }
+ dTree.addNamedEntity(new Chunk(nodes, t.o));
}
}
-
- reader.close();
-
- PrintStream fout = IOUtils.createBufferedPrintStream(parseFile + "." + outputExt);
- fout.println(outputExt.equalsIgnoreCase("tsv") ? doc.toTSV() : doc.toString());
- fout.close();
- return doc.size();
}
void convertEnglish() {
@@ -148,7 +145,7 @@ void convertEnglish() {
boolean norm = dir.equals("bionlp") || dir.equals("bolt");
for (String parseFile : parseFiles) {
- int n = convert(converter, Language.ENGLISH, parseFile, "parse", "tsv", norm);
+ int n = convert(converter, Language.ENGLISH, parseFile, "tsv", norm);
System.out.printf("%s: %d trees\n", parseFile, n);
}
}
@@ -156,8 +153,7 @@ void convertEnglish() {
public static void main(String[] args) {
try {
- new DDGConvert(args);
-// new DDGConvert().convertEnglish();
+ new DDRConvert(args);
} catch (Exception e) {
e.printStackTrace();
}
diff --git a/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvertDemo.java b/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvertDemo.java
new file mode 100644
index 0000000..c4da264
--- /dev/null
+++ b/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvertDemo.java
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2018 Emory University
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package cloud.elit.ddr.bin;
+
+import cloud.elit.ddr.constituency.CTTree;
+import cloud.elit.ddr.conversion.C2DConverter;
+import cloud.elit.ddr.conversion.EnglishC2DConverter;
+import cloud.elit.ddr.util.Language;
+import cloud.elit.sdk.structure.Document;
+import cloud.elit.sdk.structure.Sentence;
+
+public class DDRConvertDemo {
+ public static void main(String[] args) {
+ final String parseFile = "/Users/jdchoi/workspace/elit-java/relcl.parse";
+ final String tsvFile = "/Users/jdchoi/workspace/elit-java/relcl.tsv";
+ C2DConverter converter = new EnglishC2DConverter();
+ DDRConvert ddr = new DDRConvert();
+ ddr.convert(converter, Language.ENGLISH, parseFile, tsvFile, false);
+ }
+}
diff --git a/elit-ddr/src/main/java/cloud/elit/ddr/conversion/C2DConverter.java b/elit-ddr/src/main/java/cloud/elit/ddr/conversion/C2DConverter.java
index 87bfd57..a6c3d56 100644
--- a/elit-ddr/src/main/java/cloud/elit/ddr/conversion/C2DConverter.java
+++ b/elit-ddr/src/main/java/cloud/elit/ddr/conversion/C2DConverter.java
@@ -219,7 +219,7 @@ private CTNode getTerminalHead(CTNode node) {
protected Sentence createDependencyGraph(CTTree tree) {
List tokens = tree.getTokens();
Sentence graph = new Sentence();
- String form, pos, lemma, nament;
+ String form, pos, lemma;
NLPNode node, head;
int id;
diff --git a/elit-sdk/src/main/java/cloud/elit/sdk/structure/Sentence.java b/elit-sdk/src/main/java/cloud/elit/sdk/structure/Sentence.java
index 60b4b7d..5262bd7 100644
--- a/elit-sdk/src/main/java/cloud/elit/sdk/structure/Sentence.java
+++ b/elit-sdk/src/main/java/cloud/elit/sdk/structure/Sentence.java
@@ -15,7 +15,6 @@
*/
package cloud.elit.sdk.structure;
-import cloud.elit.sdk.structure.node.NLPArc;
import cloud.elit.sdk.structure.node.NLPNode;
import cloud.elit.sdk.structure.util.ELITUtils;
import cloud.elit.sdk.structure.util.Fields;
@@ -256,8 +255,7 @@ private void toTSVNamedEntities(List> conll) {
}
for (List c : conll) {
- if (c.size() < 9)
- c.add("O");
+ if (c.size() < 9) c.add("O");
}
}
}
diff --git a/elit-sdk/src/main/java/cloud/elit/sdk/structure/node/Node.java b/elit-sdk/src/main/java/cloud/elit/sdk/structure/node/Node.java
index 1ebb5c9..2ec1b06 100644
--- a/elit-sdk/src/main/java/cloud/elit/sdk/structure/node/Node.java
+++ b/elit-sdk/src/main/java/cloud/elit/sdk/structure/node/Node.java
@@ -152,6 +152,7 @@ public boolean addChild(int index, N node) {
* @param node the node.
* @return the previously index'th node if added; otherwise, {@code null}.
*/
+ @SuppressWarnings("UnusedReturnValue")
public N setChild(int index, N node) {
if (!isParentOf(node)) {
if (node.hasParent())
@@ -173,6 +174,7 @@ public N setChild(int index, N node) {
* @param node the node.
* @return the removed child if exists; otherwise, {@code null}.
*/
+ @SuppressWarnings("UnusedReturnValue")
public N removeChild(N node) {
return removeChild(indexOf(node));
}