Added DDRConvertDemo.
Jinho Choi committed May 9, 2018
1 parent 2dda92c commit 9126ece
Showing 6 changed files with 217 additions and 53 deletions.
134 changes: 134 additions & 0 deletions elit-ddr/README.md
@@ -0,0 +1,134 @@
# DDR Conversion

DDR conversion generates [deep dependency graphs](https://github.com/emorynlp/ddr) (DDG) from Penn Treebank-style constituency trees.
The conversion tool is written in Java and developed by [Emory NLP](http://nlp.mathcs.emory.edu) as part of the [ELIT](https://elit.cloud) project.

## Installation

Add the following dependency to your maven project:

```
<dependency>
    <groupId>cloud.elit</groupId>
    <artifactId>elit-ddr</artifactId>
    <version>0.0.4</version>
</dependency>
```

* Download the conversion script: [nlp4j-ddr.jar](http://nlp.mathcs.emory.edu/nlp4j/nlp4j-ddr.jar).
* Make sure [Java 8 or above](http://www.oracle.com/technetwork/java/javase/downloads) is installed on your machine:

```
$ java -version
java version "1.8.x"
Java(TM) SE Runtime Environment (build 1.8.x)
...
```

* Run the following command:

```
java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGConvert -i <filepath> [-r -n -pe <string> -oe <string>]
```

* `-i`: the path to the parse file or a directory containing the parse files to convert.
* `-r`: if set, recursively process all files with the parse extension in the subdirectories of the input directory.
* `-n`: if set, normalize the parse trees before the conversion.
* `-pe`: the extension of the parse files; required if the input path indicates a directory (default: `parse`).
* `-oe`: the extension of the output files (default: `ddg`).

## Corpora

DDG conversion has been tested on the following corpora. Some of these corpora require you to be a member of the [Linguistic Data Consortium](https://www.ldc.upenn.edu) (LDC). Retrieve the corpora from LDC and run the following command for each corpus to generate DDG.

* [OntoNotes Release 5.0](https://catalog.ldc.upenn.edu/LDC2013T19):

```
java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGConvert -r -i ontonotes-release-5.0/data/files/data/english/annotations
```

* [English Web Treebank](https://catalog.ldc.upenn.edu/LDC2012T13):

```
java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGConvert -r -i eng_web_tbk/data -pe tree
```

* [QuestionBank with Manually Revised Treebank Annotation 1.0](https://catalog.ldc.upenn.edu/LDC2012R121):

```
java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGConvert -i QB-revised.tree
```

## Merge

We have internally updated these corpora to reduce annotation errors and produce a richer representation. To take advantage of our latest updates, merge the original annotation with ours. You still need to retrieve the original corpora from LDC.

* Clone this repository:

```
git clone https://github.com/emorynlp/ddr.git
```

* Run the following command:

```
java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGMerge <source path> <target path> <parse ext>
```

* `<source path>`: the path to the original corpus.
* `<target path>`: the path to our annotation.
* `<parse ext>`: the extension of the parse files.


* [OntoNotes Release 5.0](https://catalog.ldc.upenn.edu/LDC2013T19):

```
java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGMerge ontonotes-release-5.0/data/files/data/english/annotations ddr/english/ontonotes parse
```

* [English Web Treebank](https://catalog.ldc.upenn.edu/LDC2012T13):

```
java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGMerge eng_web_tbk/data ddr/english/google/ewt tree
```

* [QuestionBank with Manually Revised Treebank Annotation 1.0](https://catalog.ldc.upenn.edu/LDC2012R121):

```
java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGMerge QB-revised.tree ddr/english/google/qb/QB-revised.tree.skel tree
```


## Format

DDG is represented in the tab-separated values (TSV) format, where each column represents a different field. Semantic roles are indicated in the `feats` column with the key `sem`.

```
1 You you PRP _ 3 nsbj 7:nsbj O
2 can can MD _ 3 modal _ O
3 ascend ascend VB _ 0 root _ O
4 Victoria victoria NNP _ 5 com _ B-LOC
5 Peak peak NNP _ 3 obj _ L-LOC
6 to to TO _ 7 aux _ O
7 get get VB sem=prp 3 advcl _ O
8 a a DT _ 10 det _ O
9 panoramic panoramic JJ _ 10 attr _ O
10 view view NN _ 7 obj _ O
11 of of IN _ 16 case _ O
12 Victoria victoria NNP _ 13 com _ B-LOC
13 Harbor harbor NNP _ 16 poss _ I-LOC
14 's 's POS _ 13 case _ L-LOC
15 beautiful beautiful JJ _ 16 attr _ O
16 scenery scenery NN _ 10 ppmod _ O
17 . . . _ 3 p _ O
```

* `id`: current token ID (starting at 1).
* `form`: word form.
* `lemma`: lemma.
* `pos`: part-of-speech tag.
* `feats`: extra features; different features are delimited by `|`, keys and values are delimited by `=` (`_` indicates no feature).
* `headId`: head token ID.
* `deprel`: dependency label.
* `sheads`: secondary heads (`_` indicates no secondary head).
* `nament`: named entity tags in the `BILOU` notation if the annotation is available.
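A row in this format can be parsed with plain string splitting. The sketch below mirrors the column list above; the `DDGRow` class and its field names are illustrative only, not part of the elit-ddr API:

```java
import java.util.HashMap;
import java.util.Map;

// One token row of a DDG TSV file. Field order follows the column list above.
// This class is a hypothetical sketch, not part of the elit-ddr library.
class DDGRow {
    final int id, headId;
    final String form, lemma, pos, deprel, sheads, nament;
    final Map<String, String> feats = new HashMap<>();

    DDGRow(String line) {
        String[] f = line.split("\t");
        id = Integer.parseInt(f[0]);
        form = f[1];
        lemma = f[2];
        pos = f[3];
        // feats: '|'-delimited key=value pairs; '_' means no feature
        if (!f[4].equals("_")) {
            for (String kv : f[4].split("\\|")) {
                String[] p = kv.split("=", 2);
                feats.put(p[0], p[1]);
            }
        }
        headId = Integer.parseInt(f[5]); // 0 points to the artificial root
        deprel = f[6];
        sheads = f[7];                   // e.g., "7:nsbj"; '_' if none
        nament = f[8];                   // BILOU tag, e.g., "B-LOC"
    }
}
```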
@@ -15,15 +15,13 @@
*/
package cloud.elit.ddr.bin;

import cloud.elit.ddr.constituency.CTNode;
import cloud.elit.ddr.util.*;
import cloud.elit.ddr.constituency.CTReader;
import cloud.elit.ddr.constituency.CTTree;
import cloud.elit.ddr.conversion.C2DConverter;
import cloud.elit.ddr.conversion.EnglishC2DConverter;
import cloud.elit.sdk.collection.tuple.ObjectIntIntTuple;
import cloud.elit.sdk.structure.Chunk;
import cloud.elit.sdk.structure.Document;
import cloud.elit.sdk.structure.Sentence;
import cloud.elit.sdk.structure.node.NLPNode;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
@@ -34,7 +32,7 @@
import java.util.ArrayList;
import java.util.List;

-public class DDGConvert {
+public class DDRConvert {
@Option(name = "-d", usage = "input path (required)", required = true, metaVar = "<filepath>")
private String input_path;
@Option(name = "-pe", usage = "parse file extension (default: parse)", metaVar = "<string>")
@@ -46,24 +44,49 @@ public class DDGConvert {
@Option(name = "-r", usage = "if set, traverse parse files recursively", metaVar = "<boolean>")
private boolean recursive = false;

-public DDGConvert() {
+public DDRConvert() {

}

-public DDGConvert(String[] args) {
+public DDRConvert(String[] args) {
BinUtils.initArgs(args, this);
Language language = Language.ENGLISH;

List<String> parseFiles = FileUtils.getFileList(input_path, parse_ext, recursive);
C2DConverter converter = new EnglishC2DConverter();

for (String parseFile : parseFiles) {
-int n = convert(converter, language, parseFile, parse_ext, output_ext, normalize);
+int n = convert(converter, language, parseFile, parseFile + "." + output_ext, normalize);
System.out.printf("%s: %d trees\n", parseFile, n);
}
}

Int2ObjectMap<List<ObjectIntIntTuple<String>>> getNER(String parseFile) {
int convert(C2DConverter converter, Language language, String parseFile, String outputFile, boolean normalize) {
Int2ObjectMap<List<ObjectIntIntTuple<String>>> ner_map = getNamedEntities(parseFile);
CTReader reader = new CTReader(IOUtils.createFileInputStream(parseFile), language);
PrintStream fout = IOUtils.createBufferedPrintStream(outputFile);
Sentence dTree;
CTTree cTree;
int n;

for (n = 0; (cTree = reader.next()) != null; n++) {
if (normalize) cTree.normalizeIndices();
dTree = converter.toDependencyGraph(cTree);

if (dTree == null) {
System.err.println("No token in the tree " + (n + 1) + "\n" + cTree.toStringLine());
} else {
processNamedEntities(ner_map, cTree, dTree, n);
fout.println(dTree.toTSV() + "\n");
}
}

reader.close();
fout.close();
return n;
}

Int2ObjectMap<List<ObjectIntIntTuple<String>>> getNamedEntities(String parseFile) {
final String nameFile = parseFile.substring(0, parseFile.length() - 5) + "name";
Int2ObjectMap<List<ObjectIntIntTuple<String>>> map = new Int2ObjectOpenHashMap<>();
File file = new File(nameFile);
@@ -85,57 +108,31 @@ Int2ObjectMap<List<ObjectIntIntTuple<String>>> getNER(String parseFile) {
}
}
} catch (Exception e) {
map = null;
e.printStackTrace();
}

return map;
}

int convert(C2DConverter converter, Language language, String parseFile, String parseExt, String outputExt, boolean normalize) {
CTReader reader = new CTReader(IOUtils.createFileInputStream(parseFile), language);
Int2ObjectMap<List<ObjectIntIntTuple<String>>> ner_map = getNER(parseFile);
Document doc = new Document();
Sentence dTree;
CTTree cTree;

for (int n = 0; (cTree = reader.next()) != null; n++) {
for (CTNode nn : cTree.getTokens()) {
if (nn.isSyntacticTag("EMO")) nn.setSyntacticTag(PTBLib.P_NFP);
}

if (normalize) cTree.normalizeIndices();
dTree = converter.toDependencyGraph(cTree);

if (dTree == null)
System.err.println("No token in the tree " + (n + 1) + "\n" + cTree.toStringLine());
else {
doc.add(dTree);

if (ner_map == null)
dTree.setNamedEntities(null);
else if (ner_map.containsKey(n)) {
List<Chunk> chunks = new ArrayList<>();
void processNamedEntities(Int2ObjectMap<List<ObjectIntIntTuple<String>>> ner_map, CTTree cTree, Sentence dTree, int sen_id) {
if (ner_map == null) {
dTree.setNamedEntities(null);
return;
}

for (ObjectIntIntTuple<String> t : ner_map.get(n)) {
List<NLPNode> nodes = new ArrayList<>();
List<ObjectIntIntTuple<String>> list = ner_map.get(sen_id);

for (int tok_id = cTree.getTerminal(t.i1).getTokenID(); tok_id < cTree.getTerminal(t.i2).getTokenID() + 1; tok_id++)
nodes.add(dTree.get(tok_id));
if (list != null) {
for (ObjectIntIntTuple<String> t : list) {
List<NLPNode> nodes = new ArrayList<>();

chunks.add(new Chunk(nodes, t.o));
}
for (int tok_id = cTree.getTerminal(t.i1).getTokenID(); tok_id < cTree.getTerminal(t.i2).getTokenID() + 1; tok_id++)
nodes.add(dTree.get(tok_id));

dTree.setNamedEntities(chunks);
}
dTree.addNamedEntity(new Chunk(nodes, t.o));
}
}

reader.close();

PrintStream fout = IOUtils.createBufferedPrintStream(parseFile + "." + outputExt);
fout.println(outputExt.equalsIgnoreCase("tsv") ? doc.toTSV() : doc.toString());
fout.close();
return doc.size();
}

void convertEnglish() {
Expand All @@ -148,16 +145,15 @@ void convertEnglish() {
boolean norm = dir.equals("bionlp") || dir.equals("bolt");

for (String parseFile : parseFiles) {
-int n = convert(converter, Language.ENGLISH, parseFile, "parse", "tsv", norm);
+int n = convert(converter, Language.ENGLISH, parseFile, "tsv", norm);
System.out.printf("%s: %d trees\n", parseFile, n);
}
}
}

public static void main(String[] args) {
try {
-new DDGConvert(args);
-// new DDGConvert().convertEnglish();
+new DDRConvert(args);
} catch (Exception e) {
e.printStackTrace();
}
34 changes: 34 additions & 0 deletions elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvertDemo.java
@@ -0,0 +1,34 @@
/*
* Copyright 2018 Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cloud.elit.ddr.bin;

import cloud.elit.ddr.constituency.CTTree;
import cloud.elit.ddr.conversion.C2DConverter;
import cloud.elit.ddr.conversion.EnglishC2DConverter;
import cloud.elit.ddr.util.Language;
import cloud.elit.sdk.structure.Document;
import cloud.elit.sdk.structure.Sentence;

public class DDRConvertDemo {
public static void main(String[] args) {
final String parseFile = "/Users/jdchoi/workspace/elit-java/relcl.parse";
final String tsvFile = "/Users/jdchoi/workspace/elit-java/relcl.tsv";
C2DConverter converter = new EnglishC2DConverter();
DDRConvert ddr = new DDRConvert();
ddr.convert(converter, Language.ENGLISH, parseFile, tsvFile, false);
}
}
@@ -219,7 +219,7 @@ private CTNode getTerminalHead(CTNode node) {
protected Sentence createDependencyGraph(CTTree tree) {
List<CTNode> tokens = tree.getTokens();
Sentence graph = new Sentence();
-String form, pos, lemma, nament;
+String form, pos, lemma;
NLPNode node, head;
int id;

@@ -15,7 +15,6 @@
*/
package cloud.elit.sdk.structure;

import cloud.elit.sdk.structure.node.NLPArc;
import cloud.elit.sdk.structure.node.NLPNode;
import cloud.elit.sdk.structure.util.ELITUtils;
import cloud.elit.sdk.structure.util.Fields;
@@ -256,8 +255,7 @@ private void toTSVNamedEntities(List<List<String>> conll) {
}

for (List<String> c : conll) {
-if (c.size() < 9)
-    c.add("O");
+if (c.size() < 9) c.add("O");
}
}
}
@@ -152,6 +152,7 @@ public boolean addChild(int index, N node) {
* @param node the node.
* @return the previously index'th node if added; otherwise, {@code null}.
*/
@SuppressWarnings("UnusedReturnValue")
public N setChild(int index, N node) {
if (!isParentOf(node)) {
if (node.hasParent())
@@ -173,6 +174,7 @@ public N setChild(int index, N node) {
* @param node the node.
* @return the removed child if exists; otherwise, {@code null}.
*/
@SuppressWarnings("UnusedReturnValue")
public N removeChild(N node) {
return removeChild(indexOf(node));
}
