Commit 7499e05 (initial commit, 0 parents): 6 changed files with 974 additions and 0 deletions.
@@ -0,0 +1,78 @@
import string

import numpy as np
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine


def cos_similarity(source_words, target_words, model, stops):
    """Cosine similarity between the embeddings of two token lists.

    A multi-word phrase is first looked up as a single underscore-joined
    vocabulary entry; if that fails, stopwords are removed and the lookup
    is retried; as a last resort, the mean of the individual word vectors
    is used. Returns 0.0 if no vector can be found for either phrase.
    """
    vectors_to_compare = list()

    for words in [source_words, target_words]:
        if len(words) > 1:
            # Try the full phrase as a single vocabulary entry, e.g. "New_York".
            joined_with_underscores = "_".join(words)
            if joined_with_underscores in model:
                vector = model[joined_with_underscores]
            else:
                # Retry the phrase lookup with stopwords removed.
                words_without_stops = [word for word in words if word not in stops]
                joined_nostops = "_".join(words_without_stops)
                if joined_nostops in model:
                    vector = model[joined_nostops]
                else:
                    # Fall back to the mean of the individual word vectors.
                    vectors = [model[word] for word in words_without_stops if word in model]
                    if vectors:
                        vector = np.mean(vectors, axis=0)
                    else:
                        return 0.0
        else:
            try:
                vector = model[words[0]]
            except (KeyError, IndexError):
                # Unknown word or empty phrase.
                return 0.0

        vectors_to_compare.append(vector)

    return 1 - cosine(vectors_to_compare[0], vectors_to_compare[1])


if __name__ == "__main__":
    embeddings_path = 'GoogleNews-vectors-negative300.bin'
    #embeddings_path = "numberbatch-en-17.06.txt"
    # The GoogleNews vectors ship in binary word2vec format; use binary=False
    # for text-format embeddings such as ConceptNet Numberbatch.
    model = KeyedVectors.load_word2vec_format(embeddings_path, binary=True)

    with open('stopwords.txt', 'r') as f:
        stops = set(line.strip() for line in f)
    # string.punctuation is a string, so union() adds each character to the set.
    stops = stops.union(string.punctuation)

    print(cos_similarity(["make", "you", "sneeze"], ["separate"], model, stops))
@@ -0,0 +1,55 @@
### COMMENT: Source of mapping is: https://universaldependencies.org/tagset-conversion/en-penn-uposf.html, accessed 2020/08/06
# SYM
$ SYM
" PUNCT
, PUNCT
-LRB- PUNCT
-RRB- PUNCT
. PUNCT
: PUNCT
AFX ADJ
CC CCONJ
CD NUM
DT DET
EX PRON
FW X
HYPH PUNCT
IN ADP
JJ ADJ
JJR ADJ
JJS ADJ
LS X
MD VERB
NIL X
NN NOUN
NNP PROPN
NNPS PROPN
NNS NOUN
PDT DET
POS PART
PRP PRON
PRP$ DET
RB ADV
RBR ADV
RBS ADV
RP ADP
SYM SYM
TO PART
UH INTJ
VB VERB
VBD VERB
VBG VERB
VBN VERB
VBP VERB
VBZ VERB
WDT DET
WP PRON
WP$ DET
WRB ADV
'' PUNCT
`` PUNCT
NP NOUN
VP VERB
ADJP ADJ
VP-reduced VERB
NP-compound NOUN
@@ -0,0 +1,16 @@
NP
VP
ADJP
JJ
JJR
JJS
NN
NNS
NNP
NNPS
VB
VBG
VBD
VBN
VBP
VBZ
@@ -0,0 +1,18 @@
NP NP
VP VP
ADJP ADJP
JJ ADJP
JJR ADJP
JJS ADJP
NN NP
NNS NP
NNP NP
NNPS NP
VB VP
VBG VP
VBD VP
VBN VP
VBP VP
VBZ VP
NP-compound NP
VP-reduced VP
@@ -0,0 +1,37 @@
CoCo-Ex is written in Python and requires the following software components:

- Python 3.6/3.7
- spacy 2.3.5
- nltk 3.5
- gensim 3.8.3
- pandas 1.2
- Stanford Parser 3.9.2
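Assuming a pip-based environment, the Python dependencies can be installed as follows (the Stanford Parser is a Java tool distributed separately and is not installable via pip):

pip install spacy==2.3.5 nltk==3.5 gensim==3.8.3 pandas==1.2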
To extract entities with CoCo-Ex, run the following command:

python CoCo-Ex_entity_extraction.py "path/to/inputfile.csv" "path/to/outputfile.tsv"

The system expects a .csv input file in the following format:

text_id;sent_1;sent_2;...;sent_n

Each text is one line: the first column is the text_id, and every other column holds one sentence. If your input file has a different format, you will need to change the code snippet where it is parsed, at the bottom of CoCo-Ex_entity_extraction.py (an illustrative input file is shown below).
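For illustration, a hypothetical input file with two texts (the IDs and sentences are invented) could look like this:

text1;Pollen can make you sneeze.;Some people are allergic to it.
text2;He separated the seeds from the husks.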
Note that you might also need to set some further variables (such as the Stanford Parser path, the Java path, or the embeddings path), depending on your file structure. These variables are currently hardcoded in the "main" section of the source code and can be changed there.
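For orientation, that block might look roughly as follows; embeddings_path appears in the similarity script above, while the other variable names are purely illustrative, not the actual identifiers:

if __name__ == "__main__":
    # Illustrative only; check the real "main" section for the actual names.
    embeddings_path = "GoogleNews-vectors-negative300.bin"   # from the similarity script
    stanford_parser_path = "path/to/stanford-parser-3.9.2"   # hypothetical
    java_path = "path/to/java"                                # hypothetical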
The output will be written to your specified output path as a .tsv file. It contains all the similarities calculated between each candidate node and each sentence's phrases. Note that this file can become very large for big inputs.

By default, the system only calculates the length difference and the Dice coefficient for each pair (the metrics we use in our paper) and fills the other possible similarity metrics with "None". This is done for performance reasons; if you wish to calculate the other metrics as well, this can be changed via a flag in the source code. A reference sketch of the default metrics follows below.
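For reference, here is a minimal sketch of the default metrics (the exact definitions in CoCo-Ex may differ; the Dice coefficient here is computed over token sets):

def len_diff_tokenlevel(a_tokens, b_tokens):
    # Absolute difference in token counts.
    return abs(len(a_tokens) - len(b_tokens))

def len_diff_charlevel(a_tokens, b_tokens):
    # Absolute difference in character length of the joined phrases.
    return abs(len(" ".join(a_tokens)) - len(" ".join(b_tokens)))

def dice_coefficient(a_tokens, b_tokens):
    # Dice coefficient over token sets: 2 * |A & B| / (|A| + |B|).
    a, b = set(a_tokens), set(b_tokens)
    if not a and not b:
        return 1.0
    return 2 * len(a & b) / (len(a) + len(b))

print(dice_coefficient(["common", "cold"], ["common", "cold", "virus"]))  # 0.8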
To filter out overhead from the candidate nodes based on their similarities, execute the following command:

python CoCo-Ex_overhead_filter.py --inputfile "path/to/outputfile_of_first_step.tsv" --outputfile "path/to/new_outputfile.tsv" --len_diff_tokenlevel 1 --len_diff_charlevel 10 --dice_coefficient 0.85

The thresholds for the individual similarity metrics can be set as command-line parameters as shown above (1/10/0.85 is the configuration used in our paper). The overhead filter currently implements only the three filters used in our paper; we will keep adding filters for the other similarity metrics in the future.
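As a rough mental model of the filtering step (the actual logic lives in CoCo-Ex_overhead_filter.py; the directions of the comparisons below are assumptions), a candidate node would be kept under the paper configuration roughly when:

# Hypothetical predicate; the comparison directions are assumptions.
def passes_filter(len_diff_tok, len_diff_char, dice,
                  max_tok_diff=1, max_char_diff=10, min_dice=0.85):
    return (len_diff_tok <= max_tok_diff
            and len_diff_char <= max_char_diff
            and dice >= min_dice)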
*Note that this is a preliminary release of our system. We will revamp this README and the code for better readability/usability in future commits. Feel free to contact us with any questions or issues!*