-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathtrump_twitter_anger.R
176 lines (152 loc) · 6.98 KB
/
trump_twitter_anger.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
library(tidyverse)
library(tidytext)
library(jsonlite)
library(lubridate)
# to get the full archive:
# 1. open http://www.trumptwitterarchive.com/archive
# 2. click on 'export' and select 'JSON'
# 3. if you like, unselect columns we won't use, like RTs, favorites, source
# 4. position the cursor in the text box, select all, and copy
# 5. open a new plain text document
# 6. paste the copied text into the document
# 7. save it to your desired directory
# 8. close the tab
#
# NOTE: I have repeatedly had problems with some ghost operation happening in R after I do this,
# and the only way I've found to fix it is to reboot the whole computer
# location of the JSON export saved by following the steps above
mypath <- "~/documents/blog_posts/trump_tweets_20180804.txt"
X <- fromJSON(mypath)
# the date stamp in the json is a pain, so:
# 1) split it into its space-delimited parts; and then
# 2) create a col of clean dates based on the key bits of those parts.
date_fields <- strsplit(X$created_at, " ")
# vapply (rather than sapply) guarantees a character vector with one element
# per tweet, even when the input is empty, so date() always gets the same type.
# Matching the month abbreviation against month.abb turns it into a month
# number without depending on the session locale.
X$date <- date(vapply(date_fields, function(parts) {
  sprintf("%s-%s-%s",
          parts[6],                      # year
          match(parts[2], month.abb),    # month
          parts[3])                      # day
}, character(1)))
# reduce the data to only the cols we want: tweet id, date, and content
X <- X[, c("id_str", "date", "text")]
# prep the tweets for analysis by...
X2 <- X %>%
  # removing RTs (tweets whose text starts with "RT")
  filter(!grepl("^RT", text)) %>%
  # removing urls and converting ampersands to ands
  mutate(
    # match only the url itself (scheme plus the run of non-whitespace that
    # follows it); a greedy ".*" here would also delete every word that comes
    # after the link in the tweet
    text = gsub(" ?(f|ht)tps?://\\S+", "", text),
    # NOTE(review): the export presumably carries HTML-escaped ampersands
    # ("&amp;"); consume the whole entity so it becomes "and" rather than the
    # junk token "andamp". A bare "&" is still converted as before.
    text = gsub("&(amp;)?", "and", text)
  ) %>%
  # creating tidy text version with one word per row (other cols preserved by default)
  unnest_tokens(word, text) %>%
  # removing stop words (explicit key avoids relying on the guessed join column)
  anti_join(stop_words, by = "word") %>%
  # removing any token that contains a digit
  filter(!grepl("[0-9]", word))
# calendar with one row for every day between the first and last tweet date
date_master <- data.frame(date = seq(min(X2$date), max(X2$date), by = "day"))
# number of (cleaned) words tweeted on each day, attached to the full calendar
# so that days without any tweets still get a row
trump_daily_word_count <- left_join(
  date_master,
  summarize(group_by(X2, date), total_words = n())
)
# days with no tweets come out of the join as NA; turn those into 0s
trump_daily_word_count[is.na(trump_daily_word_count)] <- 0
# daily anger word counts: start from the NRC lexicon, keep only the entries
# tagged as anger, match them against the tweeted words, and tally per day
trump_daily_word_count_anger <- get_sentiments("nrc") %>%
  filter(sentiment == "anger") %>%
  inner_join(X2, .) %>%
  group_by(date) %>%
  tally() %>%
  rename(anger_words = n) %>%
  # attach to the full daily series so every calendar day is present
  left_join(trump_daily_word_count, .) %>%
  # days without anger words come out of the join as NA; count them as 0
  mutate(anger_words = ifelse(is.na(anger_words), 0, anger_words)) %>%
  # share of the day's words that are anger words (0 on days with no words)
  mutate(anger_share = ifelse(total_words == 0, 0, anger_words / total_words))
# bar chart of daily total word counts from 2012 on, with a loess trend line,
# written to a 7x5 inch, 300 dpi png
png("~/documents/blog_posts/trump_tweets_words_raw_20180804.png",
    width = 7, height = 5, units = "in", res = 300)
# print() explicitly: a ggplot object is only drawn when it is printed, and
# top-level auto-printing does not happen when this script is run via source(),
# which would otherwise leave an empty png
print(
  ggplot(filter(trump_daily_word_count, date >= "2012-01-01"), aes(date, total_words)) +
    geom_col() +
    geom_smooth(method = "loess", size = 1, span = 1/4, colour = "blue") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank()) +
    ggtitle("Daily counts of words tweeted by @realDonaldTrump")
)
dev.off()
# bar chart of daily anger word counts from 2012 on, with a loess trend line,
# written to a 7x5 inch, 300 dpi png
png("~/documents/blog_posts/trump_tweets_anger_raw_20180804.png",
    width = 7, height = 5, units = "in", res = 300)
# print() explicitly: a ggplot object is only drawn when it is printed, and
# top-level auto-printing does not happen when this script is run via source(),
# which would otherwise leave an empty png
print(
  ggplot(filter(trump_daily_word_count_anger, date >= "2012-01-01"), aes(date, anger_words)) +
    geom_col() +
    geom_smooth(method = "loess", size = 1, span = 1/4, colour = "red") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank()) +
    ggtitle("Daily count of anger words tweeted by @realDonaldTrump")
)
dev.off()
# bar chart of the daily share of anger words from 2012 on, with a loess trend
# line, written to a 7x5 inch, 300 dpi png
png("~/documents/blog_posts/trump_tweets_anger_share_20180804.png",
    width = 7, height = 5, units = "in", res = 300)
# print() explicitly: a ggplot object is only drawn when it is printed, and
# top-level auto-printing does not happen when this script is run via source(),
# which would otherwise leave an empty png
print(
  ggplot(filter(trump_daily_word_count_anger, date >= "2012-01-01"), aes(date, anger_share)) +
    geom_col() +
    geom_smooth(method = "loess", size = 1, span = 1/3, colour = "red") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank()) +
    ggtitle("Anger words as a share of daily words tweeted by @realDonaldTrump")
)
dev.off()
# tally how often each anger word was tweeted, most frequent first
trump_anger_vocabulary <- get_sentiments("nrc") %>%
  filter(sentiment == "anger") %>%
  inner_join(X2, .) %>%
  count(word, sort = TRUE)
# look at the 15 most common anger words
trump_anger_vocabulary[seq_len(15), ]
# 'vote' turns out to be the most common anger word, which most of us probably
# would not think of as angry. as a sensitivity analysis, rebuild the daily anger
# tallies without 'vote' and see whether the trends change (tl;dr they don't)
trump_daily_word_count_anger_sans_vote <- get_sentiments("nrc") %>%
  # anger entries of the lexicon, with 'vote' left out of the set
  filter(sentiment == "anger", word != "vote") %>%
  inner_join(X2, .) %>%
  group_by(date) %>%
  tally() %>%
  rename(anger_words = n) %>%
  # attach to the full daily series so every calendar day is present
  left_join(trump_daily_word_count, .) %>%
  # days without anger words come out of the join as NA; count them as 0
  mutate(anger_words = ifelse(is.na(anger_words), 0, anger_words)) %>%
  # share of the day's words that are anger words (0 on days with no words)
  mutate(anger_share = ifelse(total_words == 0, 0, anger_words / total_words))
# bar chart of daily anger word counts (excluding 'vote') from 2012 on, with a
# loess trend line, written to a 7x5 inch, 300 dpi png
png("~/documents/blog_posts/trump_tweets_anger_raw_sans_vote_20180804.png",
    width = 7, height = 5, units = "in", res = 300)
# print() explicitly: a ggplot object is only drawn when it is printed, and
# top-level auto-printing does not happen when this script is run via source(),
# which would otherwise leave an empty png
print(
  ggplot(filter(trump_daily_word_count_anger_sans_vote, date >= "2012-01-01"), aes(date, anger_words)) +
    geom_col() +
    geom_smooth(method = "loess", size = 1, span = 1/4, colour = "red") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank()) +
    ggtitle("Daily count of anger words (minus 'vote') tweeted by @realDonaldTrump")
)
dev.off()
# bar chart of the daily share of anger words (excluding 'vote') from 2012 on,
# with a loess trend line, written to a 7x5 inch, 300 dpi png
png("~/documents/blog_posts/trump_tweets_anger_share_sans_vote_20180804.png",
    width = 7, height = 5, units = "in", res = 300)
# print() explicitly: a ggplot object is only drawn when it is printed, and
# top-level auto-printing does not happen when this script is run via source(),
# which would otherwise leave an empty png
print(
  ggplot(filter(trump_daily_word_count_anger_sans_vote, date >= "2012-01-01"), aes(date, anger_share)) +
    geom_col() +
    geom_smooth(method = "loess", size = 1, span = 1/3, colour = "red") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank()) +
    ggtitle("Anger words (minus 'vote') as a share of daily words tweeted by @realDonaldTrump")
)
dev.off()
# on Twitter, @jepusto suggested I try exponential smoothing to see if the apparent increase
# in the recent past was an artifact of loess' propensity for misbehaving near the edges of
# the series. Here is code to do that. TL;DR no, the finding holds.
# reduce the data to the same 2012- period used above, and make a time series object with proper
# indexing from the normalized series
x <- filter(trump_daily_word_count_anger, date >= "2012-01-01")
# there is no library(zoo) call anywhere in this script, so reach the constructor
# through its namespace instead of relying on zoo happening to be attached
x <- zoo::zoo(x$anger_share, order.by = x$date)
# use the es function from the smooth package to fit a model to that series, with the default
# forecast period (which doesn't really interest us much anyway)
library(smooth)
esm <- es(x)
# plot the results using smooth's built-in graphing function, which includes a nice legend
png("~/documents/blog_posts/trump_tweets_anger_share_exponential_smoothing_20180804.png",
    width = 7, height = 5, units = "in", res = 300)
graphmaker(x, esm$forecast, esm$fitted,
           main = "exponential smoothing estimates of daily anger word shares")
dev.off()