V3.2.0

amir-zeldes · Aug 12, 2021 · d417064 · d417064
1 parent baaa95b
commit d417064
Show file tree

Hide file tree

Showing 6 changed files with 135 additions and 52 deletions.
diff --git a/README.md b/README.md
@@ -58,4 +58,20 @@ Batch mode options:
                         .depedit)
 ```
 
-For more information see https://corpling.uis.georgetown.edu/depedit/ and read the included User Guide PDF in doc/.
+For more information see https://corpling.uis.georgetown.edu/depedit/ and read the included User Guide PDF in doc/.
+
+## Citing
+
+If you are using DepEdit in a scholarly paper, please cite the following reference:
+
+```
+ @InProceedings{PengZeldes2018,
+   author    = {Siyao Peng and Amir Zeldes},
+   title     = {All Roads Lead to {UD}: Converting {S}tanford and {P}enn Parses to {E}nglish {U}niversal {D}ependencies with Multilayer Annotations},
+   booktitle = {Proceedings of the Joint Workshop on Linguistic Annotation, Multiword Expressions and Constructions ({LAW}-{MWE}-{C}x{G}-2018)},
+   year      = {2018},
+   pages     = {167--177},
+   address   = {Santa Fe, NM},
+   url       = {https://www.aclweb.org/anthology/W18-4918}
+ }
+ ```
diff --git a/depedit/depedit.py b/depedit/depedit.py
@@ -22,7 +22,7 @@
 import io
 from six import iteritems, iterkeys
 
-__version__ = "3.1.0.0"
+__version__ = "3.2.0.0"
 
 ALIASES = {"form":"text","upostag":"pos","xpostag":"cpos","feats":"morph","deprel":"func","deps":"head2","misc":"func2",
            "xpos": "cpos","upos":"pos"}

diff --git a/docs/DepEdit_user_guide.pdf b/docs/DepEdit_user_guide.pdf
diff --git a/examples/eng_enhance.ini b/examples/eng_enhance.ini
@@ -5,30 +5,35 @@
 # set a variable for transitive verbal lemmas that trigger subject control for `xcomp`
 {$subject_control}=/promise|threaten|offer|propose/
 
-# erase any existing enhanced dependencies in the input
-head2=/.*/	none	#1:head2=;#1:edep=
+# erase any existing enhanced dependencies in the input except for ellipsis nodes
+num!=/.*\..*/&head2=/.*/	none	#1:head2=;#1:edep=
 
 # store desired lemma form for `conj`, `case` and `mark` augmentation
 # use lowercase of lemma by default
-lemma=/(.*)/	none	#1:head2=$1L
+lemma=/(.*)/	none	#1:storage2=$1L
 # for participles used as mark/case, use the lowercased word form, not the verbal lemma
-text=/(.*)/&xpos=/VB[GN]/	none	#1:head2=$1L
+text=/(.*)/&xpos=/VB[GN]/	none	#1:storage2=$1L
 # for non-alphabetic words set to empty to avoid validation errors
-text=/.*[^A-Za-z]/	none	#1:head2=
+text=/.*[^A-Za-z]/	none	#1:storage2=
 # try known mappings for specific non-alphabetic cases
-text=/[–-]+/	none	#1:head2=to
-text=/\u002F+/	none	#1:head2=slash
-text=/\+/	none	#1:head2=plus
-text=/-/	none	#1:head2=minus
-text=/@/	none	#1:head2=at
-text=/vs/	none	#1:head2=versus
-text=/:/	none	#1:head2=colon
-text=/±/	none	#1:head2=plus_minus
-text=/à/	none	#1:head2=a
-text=/ca?\./	none	#1:head2=circa
+text=/[–-]+/	none	#1:storage2=to
+text=/\u002F+/	none	#1:storage2=slash
+text=/\+/	none	#1:storage2=plus
+text=/-/	none	#1:storage2=minus
+text=/@/	none	#1:storage2=at
+text=/[Vv][Ss]\.?/	none	#1:storage2=versus
+text=/:/	none	#1:storage2=colon
+text=/±/	none	#1:storage2=plus_minus
+text=/à/	none	#1:storage2=a
+text=/ca?\./	none	#1:storage2=circa
+text=/&/	none	#1:storage2=and
+text=/n'/	none	#1:storage2=and
+text=/cuz/	none	#1:storage2=because
+text=/x/&func=/case/	none	#1:storage2=by
 
 # default edep - duplicate the regular dependency
 text=/.*/;func=/(.*)/	#1>#2	#1~#2;#2:edep=$1
+func=/root/	none	#1:ehead=0;#1:edep=root
 
 # annotate tokens internally to mark whether they have certain dependents
 xpos=/[VN].*|JJ.?|RB.?/;func=/aux.*/	#1>#2	#1:storage+=hasaux
@@ -38,29 +43,35 @@ xpos=/VBN/;func=/aux:pass/	#1>#2	#1:storage+=haspass
 xpos=/[NP].*/;func=/case/	#1>#2	#1:storage+=hascase
 
 # handle augmented `case`
-# two and three word `fixed` expressions
-text=/.*/;func=/^(obl|nmod)$/;head2=/(.*)/&func=/(case)/;func=/fixed/&head2=/(.*)/;func=/fixed/&head2=/(.*)/	#1>#2>#3>#4;#3>#5;#4.#5	#1~#2;#2:edep=$1:$2_$4_$5;#3:storage=hasdblfixed
-text=/.*/;func=/^(obl|nmod)$/;head2=/(.*)/&func=/(case)/&storage!=/hasdblfixed/;func=/fixed/&head2=/(.*)/	#1>#2>#3>#4	#1~#2;#2:edep=$1:$2_$4;#3:storage=hasfixed
+# combo case + mark
+func=/advcl/;func=/case/&storage2=/(.*)/;func=/mark/&xpos=/W.*/&storage2=/(.*)/;text=/.*/	#1>#2;#1>#3;#2.#3;#4>#1	#4~#1;#1:edep=advcl:$1_$2;#2:storage=hasdblfixed
+# combo mark-fixed + mark
+func=/advcl/;func=/mark/&storage2=/(.*)/;func=/fixed/&storage2=/(.*)/;func=/mark/&storage2=/(.*)/;text=/.*/	#1>#2>#3;#1>#4;#2.#3.#4;#5>#1	#5~#1;#1:edep=advcl:$1_$2_$3;#2:storage=hasdblfixed
+# two and three word `fixed` and `goeswith` expressions
+text=/.*/;func=/^(obl|nmod)$/;storage2=/(.*)/&func=/(case)/;func=/fixed/&storage2=/(.*)/;func=/fixed/&storage2=/(.*)/	#1>#2>#3>#4;#3>#5;#4.#5	#1~#2;#2:edep=$1:$2_$4_$5;#3:storage=hasdblfixed
+text=/.*/;func=/^(obl|nmod)$/;storage2=/(.*)/&func=/(case)/&storage!=/hasdblfixed/;func=/fixed/&storage2=/(.*)/	#1>#2>#3>#4	#1~#2;#2:edep=$1:$2_$4;#3:storage=hasfixed
+text=/.*/;func=/^(obl|nmod)$/;storage2=/(.*)/&func=/(case)/&storage!=/hasdblfixed/;func=/goeswith/&storage2=/(.*)/	#1>#2>#3>#4	#1~#2;#2:edep=$1:$2$4;#3:storage=hasgoeswith
 # all other regular cases - and ruling out genitive 's as the augmentation (xpos=POS)
-text=/.*/;func=/^(obl|nmod)$/;head2=/(.*)/&func=/(case)/&xpos!=/POS/&storage!=/hasfixed/	#1>#2>#3	#1~#2;#2:edep=$1:$2;#3:storage=
-text=/.*/;func=/^(obl|nmod)$/;head2=/(.*)/&func=/(case)/&xpos!=/POS/&storage!=/hasfixed/;func=/conj/&storage!=/.*hascase.*/	#1>#2>#3;#2>#4	#1~#4;#4:edep=$1:$2
+text=/.*/;func=/^(obl|nmod)$/;storage2=/(.*)/&func=/(case)/&xpos!=/POS/&storage!=/hasfixed/	#1>#2>#3	#1~#2;#2:edep=$1:$2;#3:storage=
+text=/.*/;func=/^(obl|nmod)$/;storage2=/(.*)/&func=/(case)/&xpos!=/POS/&storage!=/hasfixed/;func=/conj/&storage!=/.*hascase.*/	#1>#2>#3;#2>#4	#1~#4;#4:edep=$1:$2
 # handle double case, e.g. obl:out_of for "out of X", rather than obl:out
-text=/.*/;func=/^(obl|nmod)$/;func=/case/&xpos!=/POS/&head2=/(.*)/;func=/case/&xpos!=/POS/&head2=/(.*)/	#3.*#4;#1>#2;#2>#3;#2>#4	#1~#2;#2:edep=$1:$2_$3;#3:storage=;#4:storage=
+text=/.*/;func=/^(obl|nmod)$/;func=/case/&xpos!=/POS/&storage2=/(.*)/;func=/case/&xpos!=/POS/&storage2=/(.*)/	#3.*#4;#1>#2;#2>#3;#2>#4	#1~#2;#2:edep=$1:$2_$3;#3:storage=;#4:storage=
 # handle augmented `mark`
 # two and three word `fixed` expressions
-text=/.*/;func=/^(advcl|acl)$/;head2=/(.*)/&func=/(mark)/;func=/fixed/&head2=/(.*)/;func=/fixed/&head2=/(.*)/	#1>#2>#3>#4;#3>#5;#4.#5	#1~#2;#2:edep=$1:$2_$4_$5;#3:storage=hasdblfixed
-text=/.*/;func=/^(advcl|acl)$/;head2=/(.*)/&func=/(mark)/&storage!=/hasdblfixed/;func=/fixed/&head2=/(.*)/	#1>#2>#3>#4	#1~#2;#2:edep=$1:$2_$4;#3:storage=hasfixed
+text=/.*/;func=/^(advcl|acl)$/;storage2=/(.*)/&func=/(mark)/;func=/fixed/&storage2=/(.*)/;func=/fixed/&storage2=/(.*)/	#1>#2>#3>#4;#3>#5;#4.#5	#1~#2;#2:edep=$1:$2_$4_$5;#3:storage=hasdblfixed
+text=/.*/;func=/^(advcl|acl)$/;storage2=/(.*)/&func=/(mark)/&storage!=/hasdblfixed/;func=/fixed/&storage2=/(.*)/	#1>#2>#3>#4	#1~#2;#2:edep=$1:$2_$4;#3:storage=hasfixed
+text=/.*/;func=/^(advcl|acl)$/;storage2=/(.*)/&func=/(mark)/&storage!=/hasdblfixed/;func=/goeswith/&storage2=/(.*)/	#1>#2>#3>#4	#1~#2;#2:edep=$1:$2$4;#3:storage=hasgoeswith
 # all other regular cases - and ruling out genitive 's as the augmentation (xpos=POS)
-text=/.*/;func=/^(advcl|acl)$/;head2=/(.*)/&func=/(mark)/&xpos!=/POS/&storage!=/hasfixed/	#1>#2>#3	#1~#2;#2:edep=$1:$2;#3:storage=
-text=/.*/;func=/^(advcl|acl)$/;head2=/(.*)/&func=/(mark)/&xpos!=/POS/&storage!=/hasfixed/;func=/conj/&storage!=/.*hascase.*/	#1>#2>#3;#2>#4	#1~#4;#4:edep=$1:$2
+text=/.*/;func=/^(advcl|acl)$/;storage2=/(.*)/&func=/(mark)/&xpos!=/POS/&storage!=/hasfixed/	#1>#2>#3	#1~#2;#2:edep=$1:$2;#3:storage=
+text=/.*/;func=/^(advcl|acl)$/;storage2=/(.*)/&func=/(mark)/&xpos!=/POS/&storage!=/hasfixed/;func=/conj/&storage!=/.*hascase.*/	#1>#2>#3;#2>#4	#1~#4;#4:edep=$1:$2
 # handle double mark, e.g. acl:for_to in "for X to Y", rather than acl:for
-text=/.*/;func=/^(advcl|acl)$/;func=/mark/&xpos!=/POS/&head2=/(.*)/;func=/mark/&xpos!=/POS/&head2=/(.*)/	#3.*#4;#1>#2;#2>#3;#2>#4	#1~#2;#2:edep=$1:$2_$3;#3:storage=;#4:storage=
+text=/.*/;func=/^(advcl|acl)$/;func=/mark/&xpos!=/POS/&storage2=/(.*)/;func=/mark/&xpos!=/POS/&storage2=/(.*)/	#3.*#4;#1>#2;#2>#3;#2>#4	#1~#2;#2:edep=$1:$2_$3;#3:storage=;#4:storage=
 # augment `conj` with `cc` lemma
 # two and three word `fixed` expressions
-text=/.*/;func=/^(conj)$/;head2=/(.*)/&func=/(cc)/;func=/fixed/&head2=/(.*)/;func=/fixed/&head2=/(.*)/	#1>#2>#3>#4;#3>#5;#4.#5	#1~#2;#2:edep=$1:$2_$4_$5;#3:storage=hasdblfixed
-text=/.*/;func=/^(conj)$/;head2=/(.*)/&func=/(cc)/&storage!=/hasdblfixed/;func=/fixed/&head2=/(.*)/	#1>#2>#3>#4	#1~#2;#2:edep=$1:$2_$4;#3:storage=hasfixed
+text=/.*/;func=/^(conj)$/;storage2=/(.*)/&func=/(cc)/;func=/fixed/&storage2=/(.*)/;func=/fixed/&storage2=/(.*)/	#1>#2>#3>#4;#3>#5;#4.#5	#1~#2;#2:edep=$1:$2_$4_$5;#3:storage=hasdblfixed
+text=/.*/;func=/^(conj)$/;storage2=/(.*)/&func=/(cc)/&storage!=/hasdblfixed/;func=/fixed/&storage2=/(.*)/	#1>#2>#3>#4	#1~#2;#2:edep=$1:$2_$4;#3:storage=hasfixed
 # all other regular cases
-text=/.*/;func=/^(conj)$/;head2=/(.*)/&func=/(cc)/&storage!=/hasfixed/	#1>#2>#3	#1~#2;#2:edep=$1:$2;#3:storage=
+text=/.*/;func=/^(conj)$/;storage2=/(.*)/&func=/(cc)/&storage!=/hasfixed/	#1>#2>#3	#1~#2;#2:edep=$1:$2;#3:storage=
 # multiple conj
 text=/.*/;func=/conj/&edep=/.*conj:([^|]+).*/;func=/conj/&edep!=/.*conj:.*/	#1>#2;#1>#3;#3.*#2	#1~#3;#3:edep=conj:$1
 
@@ -91,7 +102,7 @@ func=/(.subj).*/;text=/.*/;func=/conj/&storage!=/.*hassubj.*/;func=/xcomp/&stora
 
 # coord general - duplicate all resulting deps and edeps on `conj`; note that `parataxis` is not carried over
 text=/.*/;func=/(.*)/&func!=/parataxis/;func=/conj/	#1>#2>#3	#1~#3;#3:edep=$1
-text=/.*/;edep=/(.*)/&edep!=/conj.*/&func!=/parataxis/;func=/conj/	#1~#2;#2>#3	#1~#3;#3:edep=$1
+text=/.*/;edep=/(.*)/&edep!=/conj.*/&func!=/parataxis/;func=/conj|root/	#1~#2;#2>#3	#1~#3;#3:edep=$1
 
 # coord subjects
 text=/.*/;func=/(.subj.*)/;func=/conj/	#1>#2>#3	#1~#3;#3:edep=$1
@@ -120,14 +131,32 @@ text=/.*/;func=/acl:relcl/;func=/(.*)/&xpos=/^W(DT|P.?)$/	#1>#2>#3	#1~#3;#3:edep
 # relative in embedded PP ("to which")
 text=/.*/;func=/acl:relcl/;func=/(nmod|obl)/&xpos=/^W(DT|P.?)$/;func=/case/&lemma=/(.*)/	#1>#2>#3>#4	#2~#1;#1:edep=$1:$3
 # coordinate matrix NP with embedded PP ([X and Y] to which Z)
-text=/.*/;func=/acl:relcl/;func=/(nmod|obl)/&xpos=/^W(DT|P.?)$/;func=/case/&lemma=/(.*)/;func=/conj/&storage!=/.*hascase.*/	#1>#2>#3>#4;#1>#5	#2~#5;#5:edep=$1:$3
+text=/.*/;func=/acl:relcl/;func=/(nmod|obl)/&xpos=/^W(DT|P.?)$/;func=/case/&lemma=/(.*)/;func=/conj/&storage!=/.*hascase.*/	#1>#2>#3>#4;#1>#5;#5.*#3	#2~#5;#5:edep=$1:$3
 # relative pronoun in PP embedded in NP ("... most of whom")
 text=/.*/;func=/acl:relcl/;func=/.*/;func=/(nmod)/&xpos=/^W(DT|P.?)$/;func=/case/&lemma=/(.*)/	#1>#2>#3>#4>#5	#4:edep=
 text=/.*/;func=/acl:relcl/;func=/.*/;func=/(nmod)/&xpos=/^W(DT|P.?)$/;func=/case/&lemma=/(.*)/	#1>#2>#3>#4>#5	#1~#4;#4:edep=ref;#3~#1;#1:edep=$1:$3
 # exception to previous: prevent cycle when acl:relcl head has a coordinate predicate
 func=/(.*)/;func=/acl:relcl/;func=/.*/;func=/(nmod)/&xpos=/^W(DT|P.?)$/;func=/case/&lemma=/(.*)/;func=/conj/;text=/.*/	#1>#2>#3>#4>#5;#2>#6;#7>#1	#1:edep=;#7~#1;#1:edep=$1
 
 # coord general - duplicate all resulting deps and edeps on `conj`; note that `parataxis` is not carried over
-text=/.*/;func=/(.*)/&func!=/parataxis/;func=/conj/	#1>#2>#3	#1~#3;#3:edep=$1
-text=/.*/;edep=/(.*)/&edep!=/conj.*/&func!=/parataxis/;func=/conj/	#1~#2;#2>#3	#1~#3;#3:edep=$1
+text=/.*/;func=/(.*)/&func!=/parataxis|root/;func=/conj/	#1>#2>#3	#1~#3;#3:edep=$1
+#text=/.*/;edep=/(.*)/&edep!=/conj.*/&func!=/parataxis|root/;func=/conj/	#1~#2;#2>#3	#1~#3;#3:edep=$1
 edom=/.*?([0-9.]+\|\|(nmod|obl|conj):[a-z]+).*/&edep!=/conj.*/;func=/conj/	#1>#2	#2:edom=$1
+
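+# coord unlike - coordination of unlike categories: relabel the relation copied to a conjunct whose xpos differs from the first conjunct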
+# compound + amod
+text=/.*/;func=/compound/;func=/conj/&xpos=/J.*|V.N/&edom=/(.*?[0-9.]+\|\|)compound(.*)/	#1>#2>#3;#1~#3	#3:edom=$1amod$2
+# amod + compound
+text=/.*/;func=/amod/;func=/conj/&xpos=/NN.*/&edom=/(.*?[0-9.]+\|\|)amod(.*)/	#1>#2>#3;#1~#3	#3:edom=$1compound$2
+# nsubj + csubj
+text=/.*/;func=/nsubj/;func=/conj/&xpos=/V.G/&edom=/(.*?[0-9.]+\|\|)nsubj(.*)/	#1>#2>#3;#1~#3	#3:edom=$1csubj$2
+
+# supertokens (=multiword tokens, MWTs)
+# uncomment the following lines to introduce MWTs for words like "don't" to data which lacks them
+#text=/^(?i)gon|wan/;text=/^(?i)na/	#1.#2	#1><#2
+#text=/^(?i)dun/;text=/^(?i)no/	#1.#2	#1><#2
+#text=/^(?i)out|got/;text=/^(?i)ta/	#1.#2	#1><#2
+#text=/^(?i)c'?m/&misc=/.*SpaceAfter.No.*/;text=/^(?i)on/	#1.#2	#1><#2
+#misc=/.*SpaceAfter.No.*/;text=/^(?i)[^A-Za-z]?(ll|d|m|ve|s)/&xpos=/VBP|MD|VHP|VBZ|VHZ/	#1.#2	#1><#2
+#misc=/.*SpaceAfter.No.*/;xpos=/POS/	#1.#2	#1><#2
+#misc=/.*SpaceAfter.No.*/;lemma=/n[o']?t/	#1.#2	#1><#2