Commit 7499e05 (initial commit, 0 parents): 6 changed files with 974 additions and 0 deletions.
@@ -0,0 +1,78 @@
import string

import numpy as np
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine


def cos_similarity(source_words, target_words, model, stops):
    """Cosine similarity between the embeddings of two token lists.

    A multi-word phrase is first looked up as a single underscore-joined
    vocabulary entry; if that fails, stopwords are removed and the lookup
    is retried; as a last resort, the mean of the individual word vectors
    is used. Returns 0.0 if no vector can be found for either phrase.
    """
    vectors_to_compare = list()

    for words in [source_words, target_words]:
        if len(words) > 1:
            # Try the full phrase as a single vocabulary entry, e.g. "New_York".
            joined_with_underscores = "_".join(words)
            if joined_with_underscores in model:
                vector = model[joined_with_underscores]
            else:
                # Retry the phrase lookup with stopwords removed.
                words_without_stops = [word for word in words if word not in stops]
                joined_nostops = "_".join(words_without_stops)
                if joined_nostops in model:
                    vector = model[joined_nostops]
                else:
                    # Fall back to the mean of the individual word vectors.
                    vectors = [model[word] for word in words_without_stops if word in model]
                    if vectors:
                        vector = np.mean(vectors, axis=0)
                    else:
                        return 0.0
        else:
            try:
                vector = model[words[0]]
            except (KeyError, IndexError):
                # Unknown word or empty phrase.
                return 0.0

        vectors_to_compare.append(vector)

    return 1 - cosine(vectors_to_compare[0], vectors_to_compare[1])


if __name__ == "__main__":
    embeddings_path = 'GoogleNews-vectors-negative300.bin'
    #embeddings_path = "numberbatch-en-17.06.txt"
    # The GoogleNews vectors ship in binary word2vec format; use binary=False
    # for text-format embeddings such as ConceptNet Numberbatch.
    model = KeyedVectors.load_word2vec_format(embeddings_path, binary=True)

    with open('stopwords.txt', 'r') as f:
        stops = set(line.strip() for line in f)
    # string.punctuation is a string, so union() adds each character to the set.
    stops = stops.union(string.punctuation)

    print(cos_similarity(["make", "you", "sneeze"], ["separate"], model, stops))
@@ -0,0 +1,55 @@
### COMMENT: Source of mapping is: https://universaldependencies.org/tagset-conversion/en-penn-uposf.html, accessed 2020/08/06
# SYM
$ SYM
" PUNCT
, PUNCT
-LRB- PUNCT
-RRB- PUNCT
. PUNCT
: PUNCT
AFX ADJ
CC CCONJ
CD NUM
DT DET
EX PRON
FW X
HYPH PUNCT
IN ADP
JJ ADJ
JJR ADJ
JJS ADJ
LS X
MD VERB
NIL X
NN NOUN
NNP PROPN
NNPS PROPN
NNS NOUN
PDT DET
POS PART
PRP PRON
PRP$ DET
RB ADV
RBR ADV
RBS ADV
RP ADP
SYM SYM
TO PART
UH INTJ
VB VERB
VBD VERB
VBG VERB
VBN VERB
VBP VERB
VBZ VERB
WDT DET
WP PRON
WP$ DET
WRB ADV
'' PUNCT
`` PUNCT
NP NOUN
VP VERB
ADJP ADJ
VP-reduced VERB
NP-compound NOUN
@@ -0,0 +1,16 @@
NP
VP
ADJP
JJ
JJR
JJS
NN
NNS
NNP
NNPS
VB
VBG
VBD
VBN
VBP
VBZ
@@ -0,0 +1,18 @@
NP NP
VP VP
ADJP ADJP
JJ ADJP
JJR ADJP
JJS ADJP
NN NP
NNS NP
NNP NP
NNPS NP
VB VP
VBG VP
VBD VP
VBN VP
VBP VP
VBZ VP
NP-compound NP
VP-reduced VP
@@ -0,0 +1,37 @@
CoCo-Ex is written in Python and requires the following software components:

- Python 3.6/3.7
- spacy 2.3.5
- nltk 3.5
- gensim 3.8.3
- pandas 1.2
- Stanford Parser 3.9.2
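Assuming a pip-based environment, the Python dependencies can be installed as follows (the Stanford Parser is a Java tool distributed separately and is not installable via pip):

pip install spacy==2.3.5 nltk==3.5 gensim==3.8.3 pandas==1.2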
To extract entities with CoCo-Ex, run the following command:

python CoCo-Ex_entity_extraction.py "path/to/inputfile.csv" "path/to/outputfile.tsv"

The system expects a .csv input file in the following format:

text_id;sent_1;sent_2;...;sent_n

Each text is one line: the first column is the text_id, and every other column holds one sentence. If your input file has a different format, you will need to change the code snippet where it is parsed, at the bottom of CoCo-Ex_entity_extraction.py (an illustrative input file is shown below).
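For illustration, a hypothetical input file with two texts (the IDs and sentences are invented) could look like this:

text1;Pollen can make you sneeze.;Some people are allergic to it.
text2;He separated the seeds from the husks.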
Note that you might also need to set some further variables (such as the Stanford Parser path, the Java path, or the embeddings path), depending on your file structure. These variables are currently hardcoded in the "main" section of the source code and can be changed there.
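For orientation, that block might look roughly as follows; embeddings_path appears in the similarity script above, while the other variable names are purely illustrative, not the actual identifiers:

if __name__ == "__main__":
    # Illustrative only; check the real "main" section for the actual names.
    embeddings_path = "GoogleNews-vectors-negative300.bin"   # from the similarity script
    stanford_parser_path = "path/to/stanford-parser-3.9.2"   # hypothetical
    java_path = "path/to/java"                                # hypothetical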
The output will be written to your specified output path as a .tsv file. It contains all the similarities calculated between each candidate node and each sentence's phrases. Note that this file can become very large for big inputs.

By default, the system only calculates the length difference and the Dice coefficient for each pair (the metrics we use in our paper) and fills the other possible similarity metrics with "None". This is done for performance reasons; if you wish to calculate the other metrics as well, this can be changed via a flag in the source code. A reference sketch of the default metrics follows below.
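For reference, here is a minimal sketch of the default metrics (the exact definitions in CoCo-Ex may differ; the Dice coefficient here is computed over token sets):

def len_diff_tokenlevel(a_tokens, b_tokens):
    # Absolute difference in token counts.
    return abs(len(a_tokens) - len(b_tokens))

def len_diff_charlevel(a_tokens, b_tokens):
    # Absolute difference in character length of the joined phrases.
    return abs(len(" ".join(a_tokens)) - len(" ".join(b_tokens)))

def dice_coefficient(a_tokens, b_tokens):
    # Dice coefficient over token sets: 2 * |A & B| / (|A| + |B|).
    a, b = set(a_tokens), set(b_tokens)
    if not a and not b:
        return 1.0
    return 2 * len(a & b) / (len(a) + len(b))

print(dice_coefficient(["common", "cold"], ["common", "cold", "virus"]))  # 0.8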
To filter out overhead from the candidate nodes based on their similarities, execute the following command:

python CoCo-Ex_overhead_filter.py --inputfile "path/to/outputfile_of_first_step.tsv" --outputfile "path/to/new_outputfile.tsv" --len_diff_tokenlevel 1 --len_diff_charlevel 10 --dice_coefficient 0.85

The thresholds for the individual similarity metrics can be set as command-line parameters as shown above (1/10/0.85 is the configuration used in our paper). The overhead filter currently implements only the three filters used in our paper; we will keep adding filters for the other similarity metrics in the future.
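As a rough mental model of the filtering step (the actual logic lives in CoCo-Ex_overhead_filter.py; the directions of the comparisons below are assumptions), a candidate node would be kept under the paper configuration roughly when:

# Hypothetical predicate; the comparison directions are assumptions.
def passes_filter(len_diff_tok, len_diff_char, dice,
                  max_tok_diff=1, max_char_diff=10, min_dice=0.85):
    return (len_diff_tok <= max_tok_diff
            and len_diff_char <= max_char_diff
            and dice >= min_dice)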
*Note that this is a preliminary release of our system. We will revamp this README and the code for better readability/usability in future commits. Feel free to contact us with any questions or issues!*