-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalculate.py
66 lines (57 loc) · 2.24 KB
/
calculate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# Module-level NLP resources shared by process_content().
# English stop-word list from NLTK, stored as a set for fast membership tests.
stop_words = set(stopwords.words("english"))
# Single WordNet lemmatizer instance, reused for every lemmatize() call.
lemmatizer = WordNetLemmatizer()
def process_content(text):
    """Return the unique content-word lemmas found in *text*.

    Processing steps:
      1. Tokenize *text* with ``word_tokenize`` and drop stop words.
      2. POS-tag the remaining tokens and keep only those whose tag starts
         with J, N, R, V or I.
      3. Lemmatize the lower-cased token using the WordNet POS returned by
         ``get_wordnet_pos`` for its tag.
      4. Drop duplicate lemmas, keeping first-seen order.

    Args:
        text: The raw string to process.

    Returns:
        A list of unique lemmas in order of first appearance, or ``None``
        if any step raised an exception (the error message is printed).
    """
    try:
        # Compare the lower-cased token: NLTK's English stop-word list is
        # lower-case, so a case-sensitive test lets capitalised stop words
        # ("The", "In", ...) slip through.
        content_tokens = [
            token for token in word_tokenize(text)
            if token.lower() not in stop_words
        ]

        lemmas = []
        seen = set()  # O(1) duplicate check instead of scanning the list
        for token, tag in pos_tag(content_tokens):
            # Keep only the word classes that get_wordnet_pos can map.
            if tag[0] not in ('J', 'N', 'R', 'V', 'I'):
                continue
            lemma = lemmatizer.lemmatize(token.lower(), get_wordnet_pos(tag))
            if lemma not in seen:
                seen.add(lemma)
                lemmas.append(lemma)
        return lemmas
    except Exception as e:
        # Best-effort contract kept from the original: report the failure
        # and hand back None rather than propagating the exception.
        print(str(e))
        return None
def get_wordnet_pos(treebank_tag):
    """Map a POS tag string to the WordNet POS constant used for lemmatizing.

    Tags starting with 'J' or 'I' map to ``wordnet.ADJ``, 'V' to
    ``wordnet.VERB``, 'N' to ``wordnet.NOUN`` and 'R' to ``wordnet.ADV``.
    Any other tag yields an empty string.
    """
    if treebank_tag.startswith(('J', 'I')):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    # No WordNet equivalent for this tag.
    return ''
def compare_lists(list1, list2):
    """Compute comparison statistics for two word lists.

    Returns:
        A 6-tuple ``(jaccard_score, word_count_1, word_count_2, list1,
        list2, word_overlap)`` where the counts are ``len()`` of each input,
        ``jaccard_score`` is ``jaccard_sim(list1, list2)`` and
        ``word_overlap`` is the number of distinct items present in both.
    """
    size_1, size_2 = len(list1), len(list2)
    similarity = jaccard_sim(list1, list2)
    shared_words = set(list1) & set(list2)
    return similarity, size_1, size_2, list1, list2, len(shared_words)
def jaccard_sim(x, y):
    """Return the Jaccard similarity of the items in *x* and *y*.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The score is ``|x & y| / |x | y|`` as a float in [0.0, 1.0].

    Args:
        x: First iterable of hashable items.
        y: Second iterable of hashable items.

    Returns:
        The Jaccard similarity as a float. When both inputs are empty the
        union is empty and the ratio is undefined; 0.0 is returned in that
        case instead of raising ZeroDivisionError.
    """
    # Build each set once and reuse it for both intersection and union.
    set_x, set_y = set(x), set(y)
    union_size = len(set_x | set_y)
    if union_size == 0:
        # Nothing to compare: report "no similarity" rather than crash.
        return 0.0
    return len(set_x & set_y) / float(union_size)