functions.py
import pickle

import nltk
import pandas as pd

# Look for the NLTK corpora (stopwords, wordnet, words) in a local directory.
nltk.data.path.append('./nltk_data/')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
# English vocabulary used to drop non-words after lemmatization.
english = set(nltk.corpus.words.words())


def preprocess(data):
    """Lemmatize each document and project it onto the pickled TF-IDF features."""
    stop_words = set(stopwords.words('english'))

    def lemmadata(doc):
        # Tokenize on alphabetic runs, allowing contractions such as "don't".
        pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
        raw_tokens = nltk.regexp_tokenize(doc, pattern)
        tokens = [t.lower() for t in raw_tokens]
        filtered = [w for w in tokens if w not in stop_words]
        lemmatized = [wordnet_lemmatizer.lemmatize(w, pos="v") for w in filtered]
        lemmatized = [w for w in lemmatized if w != 'lb']
        words = [w for w in lemmatized if w in english]
        return " ".join(words)

    lemmatized = [lemmadata(post) for post in data]

    # Load the pickled TF-IDF vectorizer and transform the lemmatized documents.
    with open("pickles/tfidf.pkl", "rb") as f:
        tfidf = pickle.load(f)
    transformed = tfidf.transform(lemmatized)
    # Note: get_feature_names() was removed in scikit-learn 1.2;
    # newer versions use get_feature_names_out().
    tfidf_df = pd.DataFrame(transformed.toarray(), columns=tfidf.get_feature_names())

    # Keep only the columns for the pickled list of relevant words.
    with open("pickles/relevantwords.pkl", "rb") as f:
        relevant = pickle.load(f)
    testset = [tfidf_df[word] for word in relevant if word in tfidf_df.columns]
    return pd.DataFrame(testset).transpose()
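

# An illustrative note (not from the original file): preprocess() returns a
# DataFrame with one row per input document and one column per relevant word
# found in the vectorizer's vocabulary, so a call such as
# preprocess(["first post", "second post"]) yields a frame of shape
# (2, n_relevant_words) that can be fed directly to the classifier.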
def classify_text(text):
# the model
mnb = pickle.load(open("pickles/mnb.pkl", "rb"))
listtext = [text]
processed = preprocess(listtext)
result = mnb.predict(processed)[0]
return result
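

# A minimal usage sketch (an assumption, not part of the original file): it
# expects the pickles/ directory and a local nltk_data/ directory containing
# the stopwords, wordnet, and words corpora; the sample text is a placeholder.
if __name__ == "__main__":
    sample = "This is a short example post to classify."
    print(classify_text(sample))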