diff --git a/elit-ddr/README.md b/elit-ddr/README.md
new file mode 100644
index 0000000..c7f7ce1
--- /dev/null
+++ b/elit-ddr/README.md
@@ -0,0 +1,134 @@
+# DDR Conversion
+
+DDR conversion generates the [deep dependency graphs](https://github.com/emorynlp/ddr) from the Penn Treebank style constituency trees.
+The conversion tool is written in Java and developed by [Emory NLP](http://nlp.mathcs.emory.edu) as a part of the [ELIT](https://elit.cloud) project.
+
+## Installation
+
+Add the following dependency to your maven project:
+
+```xml
+<dependency>
+    <groupId>cloud.elit</groupId>
+    <artifactId>elit-ddr</artifactId>
+    <version>0.0.4</version>
+</dependency>
+```
+
+* Download the conversion script: [nlp4j-ddr.jar](http://nlp.mathcs.emory.edu/nlp4j/nlp4j-ddr.jar).
+* Make sure [Java 8 or above](http://www.oracle.com/technetwork/java/javase/downloads) is installed on your machine:
+
+  ```
+$ java -version
+java version "1.8.x"
+Java(TM) SE Runtime Environment (build 1.8.x)
+...
+  ```
+
+* Run the following command:
+
+  ```
+java edu.emory.mathcs.nlp.bin.DDGConvert -i <filepath> [-r -n -pe <string> -oe <string>]
+  ```
+
+  * `-i`: the path to the parse file or a directory containing the parse files to convert.
+  * `-r`: if set, process all files with the extension in the subdirectories of the input directory recursively.
+  * `-n`: if set, normalize the parse trees before the conversion.
+  * `-pe`: the extension of the parse files; required if the input path indicates a directory (default: `parse`).
+  * `-oe`: the extension of the output files (default: `ddg`).
+
+## Corpora
+
+DDG conversion has been tested on the following corpora. Some of these corpora require you to be a member of the [Linguistic Data Consortium](https://www.ldc.upenn.edu) (LDC). Retrieve the corpora from LDC and run the following command for each corpus to generate DDG.
+
+* [OntoNotes Release 5.0](https://catalog.ldc.upenn.edu/LDC2013T19):
+
+  ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGConvert -r -i ontonotes-release-5.0/data/files/data/english/annotations
+  ```
+
+* [English Web Treebank](https://catalog.ldc.upenn.edu/LDC2012T13):
+
+  ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGConvert -r -i eng_web_tbk/data -pe tree
+  ```
+
+* [QuestionBank with Manually Revised Treebank Annotation 1.0](https://catalog.ldc.upenn.edu/LDC2012R121):
+
+  ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGConvert -i QB-revised.tree
+  ```
+
+## Merge
+
+We have internally updated these corpora to reduce annotation errors and produce a richer representation. If you want to take advantage of our latest updates, merge the original annotation with our annotation. You still need to retrieve the original corpora from LDC.
+
+* Clone this repository:
+
+  ```
+git clone https://github.com/emorynlp/ddr.git
+  ```
+
+* Run the following command:
+
+  ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGMerge <source_path> <elit_path> <parse_ext>
+  ```
+
+  * `<source_path>`: the path to the original corpus.
+  * `<elit_path>`: the path to our annotation.
+  * `<parse_ext>`: the extension of the parse files.
+
+
+* [OntoNotes Release 5.0](https://catalog.ldc.upenn.edu/LDC2013T19):
+
+  ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGMerge ontonotes-release-5.0/data/files/data/english/annotations ddr/english/ontonotes parse
+  ```
+
+* [English Web Treebank](https://catalog.ldc.upenn.edu/LDC2012T13):
+
+  ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGMerge eng_web_tbk/data ddr/english/google/ewt tree
+  ```
+
+* [QuestionBank with Manually Revised Treebank Annotation 1.0](https://catalog.ldc.upenn.edu/LDC2012R121):
+
+  ```
+java -cp nlp4j-ddr.jar edu.emory.mathcs.nlp.bin.DDGMerge QB-revised.tree ddr/english/google/qb/QB-revised.tree.skel tree
+  ```
+
+
+## Format
+
+DDG is represented in the tab separated values format (TSV), where each column represents a different field.
The semantic roles are indicated in the `feats` column with the key, `sem`. + +``` +1 You you PRP _ 3 nsbj 7:nsbj O +2 can can MD _ 3 modal _ O +3 ascend ascend VB _ 0 root _ O +4 Victoria victoria NNP _ 5 com _ B-LOC +5 Peak peak NNP _ 3 obj _ L-LOC +6 to to TO _ 7 aux _ O +7 get get VB sem=prp 3 advcl _ O +8 a a DT _ 10 det _ O +9 panoramic panoramic JJ _ 10 attr _ O +10 view view NN _ 7 obj _ O +11 of of IN _ 16 case _ O +12 Victoria victoria NNP _ 13 com _ B-LOC +13 Harbor harbor NNP _ 16 poss _ I-LOC +14 's 's POS _ 13 case _ L-LOC +15 beautiful beautiful JJ _ 16 attr _ O +16 scenery scenery NN _ 10 ppmod _ O +17 . . . _ 3 p _ O +``` + +* `id`: current token ID (starting at 1). +* `form`: word form. +* `lemma`: lemma. +* `pos`: part-of-speech tag. +* `feats`: extra features; different features are delimited by `|`, keys and values are delimited by `=` (`_` indicates no feature). +* `headId`: head token ID. +* `deprel`: dependency label. +* `sheads`: secondary heads (`_` indicates no secondary head). +* `nament`: named entity tags in the `BILOU` notation if the annotation is available. 
diff --git a/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDGConvert.java b/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvert.java similarity index 74% rename from elit-ddr/src/main/java/cloud/elit/ddr/bin/DDGConvert.java rename to elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvert.java index acc4489..50f77e0 100644 --- a/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDGConvert.java +++ b/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvert.java @@ -15,7 +15,6 @@ */ package cloud.elit.ddr.bin; -import cloud.elit.ddr.constituency.CTNode; import cloud.elit.ddr.util.*; import cloud.elit.ddr.constituency.CTReader; import cloud.elit.ddr.constituency.CTTree; @@ -23,7 +22,6 @@ import cloud.elit.ddr.conversion.EnglishC2DConverter; import cloud.elit.sdk.collection.tuple.ObjectIntIntTuple; import cloud.elit.sdk.structure.Chunk; -import cloud.elit.sdk.structure.Document; import cloud.elit.sdk.structure.Sentence; import cloud.elit.sdk.structure.node.NLPNode; import it.unimi.dsi.fastutil.ints.Int2ObjectMap; @@ -34,7 +32,7 @@ import java.util.ArrayList; import java.util.List; -public class DDGConvert { +public class DDRConvert { @Option(name = "-d", usage = "input path (required)", required = true, metaVar = "") private String input_path; @Option(name = "-pe", usage = "parse file extension (default: parse)", metaVar = "") @@ -46,11 +44,11 @@ public class DDGConvert { @Option(name = "-r", usage = "if set, traverse parse files recursively", metaVar = "") private boolean recursive = false; - public DDGConvert() { + public DDRConvert() { } - public DDGConvert(String[] args) { + public DDRConvert(String[] args) { BinUtils.initArgs(args, this); Language language = Language.ENGLISH; @@ -58,12 +56,37 @@ public DDGConvert(String[] args) { C2DConverter converter = new EnglishC2DConverter(); for (String parseFile : parseFiles) { - int n = convert(converter, language, parseFile, parse_ext, output_ext, normalize); + int n = convert(converter, language, parseFile, parseFile + "." 
+ output_ext, normalize); System.out.printf("%s: %d trees\n", parseFile, n); } } - Int2ObjectMap>> getNER(String parseFile) { + int convert(C2DConverter converter, Language language, String parseFile, String outputFile, boolean normalize) { + Int2ObjectMap>> ner_map = getNamedEntities(parseFile); + CTReader reader = new CTReader(IOUtils.createFileInputStream(parseFile), language); + PrintStream fout = IOUtils.createBufferedPrintStream(outputFile); + Sentence dTree; + CTTree cTree; + int n; + + for (n = 0; (cTree = reader.next()) != null; n++) { + if (normalize) cTree.normalizeIndices(); + dTree = converter.toDependencyGraph(cTree); + + if (dTree == null) { + System.err.println("No token in the tree " + (n + 1) + "\n" + cTree.toStringLine()); + } else { + processNamedEntities(ner_map, cTree, dTree, n); + fout.println(dTree.toTSV() + "\n"); + } + } + + reader.close(); + fout.close(); + return n; + } + + Int2ObjectMap>> getNamedEntities(String parseFile) { final String nameFile = parseFile.substring(0, parseFile.length() - 5) + "name"; Int2ObjectMap>> map = new Int2ObjectOpenHashMap<>(); File file = new File(nameFile); @@ -85,57 +108,31 @@ Int2ObjectMap>> getNER(String parseFile) { } } } catch (Exception e) { + map = null; e.printStackTrace(); } return map; } - int convert(C2DConverter converter, Language language, String parseFile, String parseExt, String outputExt, boolean normalize) { - CTReader reader = new CTReader(IOUtils.createFileInputStream(parseFile), language); - Int2ObjectMap>> ner_map = getNER(parseFile); - Document doc = new Document(); - Sentence dTree; - CTTree cTree; - - for (int n = 0; (cTree = reader.next()) != null; n++) { - for (CTNode nn : cTree.getTokens()) { - if (nn.isSyntacticTag("EMO")) nn.setSyntacticTag(PTBLib.P_NFP); - } - - if (normalize) cTree.normalizeIndices(); - dTree = converter.toDependencyGraph(cTree); - - if (dTree == null) - System.err.println("No token in the tree " + (n + 1) + "\n" + cTree.toStringLine()); - else { - 
doc.add(dTree); - - if (ner_map == null) - dTree.setNamedEntities(null); - else if (ner_map.containsKey(n)) { - List chunks = new ArrayList<>(); + void processNamedEntities(Int2ObjectMap>> ner_map, CTTree cTree, Sentence dTree, int sen_id) { + if (ner_map == null) { + dTree.setNamedEntities(null); + return; + } - for (ObjectIntIntTuple t : ner_map.get(n)) { - List nodes = new ArrayList<>(); + List> list = ner_map.get(sen_id); - for (int tok_id = cTree.getTerminal(t.i1).getTokenID(); tok_id < cTree.getTerminal(t.i2).getTokenID() + 1; tok_id++) - nodes.add(dTree.get(tok_id)); + if (list != null) { + for (ObjectIntIntTuple t : list) { + List nodes = new ArrayList<>(); - chunks.add(new Chunk(nodes, t.o)); - } + for (int tok_id = cTree.getTerminal(t.i1).getTokenID(); tok_id < cTree.getTerminal(t.i2).getTokenID() + 1; tok_id++) + nodes.add(dTree.get(tok_id)); - dTree.setNamedEntities(chunks); - } + dTree.addNamedEntity(new Chunk(nodes, t.o)); } } - - reader.close(); - - PrintStream fout = IOUtils.createBufferedPrintStream(parseFile + "." + outputExt); - fout.println(outputExt.equalsIgnoreCase("tsv") ? 
doc.toTSV() : doc.toString()); - fout.close(); - return doc.size(); } void convertEnglish() { @@ -148,7 +145,7 @@ void convertEnglish() { boolean norm = dir.equals("bionlp") || dir.equals("bolt"); for (String parseFile : parseFiles) { - int n = convert(converter, Language.ENGLISH, parseFile, "parse", "tsv", norm); + int n = convert(converter, Language.ENGLISH, parseFile, "tsv", norm); System.out.printf("%s: %d trees\n", parseFile, n); } } @@ -156,8 +153,7 @@ void convertEnglish() { public static void main(String[] args) { try { - new DDGConvert(args); -// new DDGConvert().convertEnglish(); + new DDRConvert(args); } catch (Exception e) { e.printStackTrace(); } diff --git a/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvertDemo.java b/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvertDemo.java new file mode 100644 index 0000000..c4da264 --- /dev/null +++ b/elit-ddr/src/main/java/cloud/elit/ddr/bin/DDRConvertDemo.java @@ -0,0 +1,34 @@ +/* + * Copyright 2018 Emory University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package cloud.elit.ddr.bin; + +import cloud.elit.ddr.constituency.CTTree; +import cloud.elit.ddr.conversion.C2DConverter; +import cloud.elit.ddr.conversion.EnglishC2DConverter; +import cloud.elit.ddr.util.Language; +import cloud.elit.sdk.structure.Document; +import cloud.elit.sdk.structure.Sentence; + +public class DDRConvertDemo { + public static void main(String[] args) { + final String parseFile = "/Users/jdchoi/workspace/elit-java/relcl.parse"; + final String tsvFile = "/Users/jdchoi/workspace/elit-java/relcl.tsv"; + C2DConverter converter = new EnglishC2DConverter(); + DDRConvert ddr = new DDRConvert(); + ddr.convert(converter, Language.ENGLISH, parseFile, tsvFile, false); + } +} diff --git a/elit-ddr/src/main/java/cloud/elit/ddr/conversion/C2DConverter.java b/elit-ddr/src/main/java/cloud/elit/ddr/conversion/C2DConverter.java index 87bfd57..a6c3d56 100644 --- a/elit-ddr/src/main/java/cloud/elit/ddr/conversion/C2DConverter.java +++ b/elit-ddr/src/main/java/cloud/elit/ddr/conversion/C2DConverter.java @@ -219,7 +219,7 @@ private CTNode getTerminalHead(CTNode node) { protected Sentence createDependencyGraph(CTTree tree) { List tokens = tree.getTokens(); Sentence graph = new Sentence(); - String form, pos, lemma, nament; + String form, pos, lemma; NLPNode node, head; int id; diff --git a/elit-sdk/src/main/java/cloud/elit/sdk/structure/Sentence.java b/elit-sdk/src/main/java/cloud/elit/sdk/structure/Sentence.java index 60b4b7d..5262bd7 100644 --- a/elit-sdk/src/main/java/cloud/elit/sdk/structure/Sentence.java +++ b/elit-sdk/src/main/java/cloud/elit/sdk/structure/Sentence.java @@ -15,7 +15,6 @@ */ package cloud.elit.sdk.structure; -import cloud.elit.sdk.structure.node.NLPArc; import cloud.elit.sdk.structure.node.NLPNode; import cloud.elit.sdk.structure.util.ELITUtils; import cloud.elit.sdk.structure.util.Fields; @@ -256,8 +255,7 @@ private void toTSVNamedEntities(List> conll) { } for (List c : conll) { - if (c.size() < 9) - c.add("O"); + if (c.size() < 9) 
c.add("O"); } } } diff --git a/elit-sdk/src/main/java/cloud/elit/sdk/structure/node/Node.java b/elit-sdk/src/main/java/cloud/elit/sdk/structure/node/Node.java index 1ebb5c9..2ec1b06 100644 --- a/elit-sdk/src/main/java/cloud/elit/sdk/structure/node/Node.java +++ b/elit-sdk/src/main/java/cloud/elit/sdk/structure/node/Node.java @@ -152,6 +152,7 @@ public boolean addChild(int index, N node) { * @param node the node. * @return the previously index'th node if added; otherwise, {@code null}. */ + @SuppressWarnings("UnusedReturnValue") public N setChild(int index, N node) { if (!isParentOf(node)) { if (node.hasParent()) @@ -173,6 +174,7 @@ public N setChild(int index, N node) { * @param node the node. * @return the removed child if exists; otherwise, {@code null}. */ + @SuppressWarnings("UnusedReturnValue") public N removeChild(N node) { return removeChild(indexOf(node)); }