build_model.py
'''
This module uses the NLTK Twitter samples corpus to build a sentiment analysis
model with a Naive Bayes classifier. Accuracy on the held-out test data is 99.5%.
Based on: https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
And: https://www.analyticsvidhya.com/blog/2018/07/hands-on-sentiment-analysis-dataset-python/
'''
import random

from nltk import FreqDist, classify, NaiveBayesClassifier
from nltk.corpus import twitter_samples, stopwords
from nltk.tokenize import casual_tokenize

import helpers
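
# NOTE: `helpers` (helpers.py, not shown here) is assumed to provide
# remove_noise(tokens, stop_words), get_all_words(token_lists),
# get_tweets_for_model(token_lists), and import_csv(filepath), as used below.
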
def main():
    print('Building model...')
    print('Gathering training data...')

    # set nltk twitter samples as lists of strings
    pos_sample_tweets = twitter_samples.strings('positive_tweets.json')
    neg_sample_tweets = twitter_samples.strings('negative_tweets.json')

    #### UPDATE HERE: Option to add your own tweet samples
    #### Remove the empty list, uncomment and update the filepaths below
    pos_custom_tweets = []  ## helpers.import_csv('positive_tweets.csv')
    neg_custom_tweets = []  ## helpers.import_csv('negative_tweets.csv')

    # combine nltk twitter samples and custom tweets
    positive_tweets = pos_sample_tweets + pos_custom_tweets
    negative_tweets = neg_sample_tweets + neg_custom_tweets

    # tokenize tweets
    positive_tweet_tokens = [casual_tokenize(tweet) for tweet in positive_tweets]
    negative_tweet_tokens = [casual_tokenize(tweet) for tweet in negative_tweets]

    # set cleaned tokens lists
    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    stop_words = stopwords.words('english')

    # get cleaned positive tokens
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(helpers.remove_noise(tokens, stop_words))

    # get cleaned negative tokens
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(helpers.remove_noise(tokens, stop_words))

    # convert tokens into iterable word lists
    all_pos_words = helpers.get_all_words(positive_cleaned_tokens_list)
    all_neg_words = helpers.get_all_words(negative_cleaned_tokens_list)

    # get frequency distribution of word lists
    freq_dist_pos = FreqDist(all_pos_words)
    freq_dist_neg = FreqDist(all_neg_words)

    # print top 10 positive and negative words
    print('Top 10 positive and negative words:')
    print(freq_dist_pos.most_common(10))
    print(freq_dist_neg.most_common(10))

    # convert tokens to a dictionary for modelling
    positive_tokens_for_model = helpers.get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = helpers.get_tweets_for_model(negative_cleaned_tokens_list)
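
    # NOTE: each item from get_tweets_for_model is assumed to be a feature dict
    # of the form {token: True, ...}, the format NaiveBayesClassifier consumes.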
    # assign a label to positive tokens
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    # assign a label to negative tokens
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    # combine and shuffle the labelled data before splitting
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # split the 10K nltk tweets roughly 70:30 into train and test data
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # train a Naive Bayes model
    classifier = NaiveBayesClassifier.train(train_data)

    # print model accuracy and the most informative features
    print("Model accuracy is:", classify.accuracy(classifier, test_data))
    classifier.show_most_informative_features(10)
    print('Model complete!\n')
    return classifier
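

# Illustrative usage sketch: build the model, then score a new tweet with the
# trained classifier, assuming the {token: True} feature format matches what
# get_tweets_for_model produces for training.
if __name__ == '__main__':
    classifier = main()
    sample = "I love this, it works great!"
    sample_tokens = helpers.remove_noise(casual_tokenize(sample),
                                         stopwords.words('english'))
    sample_features = {token: True for token in sample_tokens}
    print(sample, '->', classifier.classify(sample_features))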