forked from UoMResearchIT/r-tidyverse-digital-humanities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepareData.R
139 lines (108 loc) · 5.07 KB
/
prepareData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Convert the twitter data to tidy format for the
# course
# David Mawdsley 7 Nov 18
library(tidyverse)
library(lubridate)
# Directory containing the raw per-word count matrices (matrix_<word>.csv)
datadir <- "rawdata/"
# Output directory read by the lesson's RMarkdown episodes
outdir <- "_episodes_rmd/data/"
# All per-word matrices to load, e.g. rawdata/matrix_bae.csv
toload <- list.files(datadir, "matrix_", full.names = TRUE)
readData <- function(infiles){
  # Read each matrix_<word>.csv file, reshape it to long ("tidy") format,
  # and tag every row with the word extracted from its filename.
  #
  # infiles: character vector of paths of the form .../matrix_<word>.csv
  # Returns: a single tibble with one row per (metropolitan area, date),
  #          columns METROPOLITAN_NAME, METROPOLITAN_CODE, date, cases,
  #          stateCode, county, word.
  allData <- list()
  for(f in infiles){
    thisdata <- read_csv(f) # Note trailing , causes warnings. FIXME
    # The word is encoded in the filename: matrix_<word>.csv
    currentword <- str_match(f, "matrix_(.+)\\.csv")[,2]
    thisdata <- thisdata %>%
      # Drop the empty column produced by the trailing comma in the raw CSV
      select(-"__") %>%
      filter(!is.na(METROPOLITAN_CODE)) %>%
      # pivot_longer() supersedes gather(): one row per (area, date)
      pivot_longer(cols = -c(METROPOLITAN_NAME, METROPOLITAN_CODE),
                   names_to = "date", values_to = "cases") %>%
      mutate(date = ymd(date)) %>%
      # METROPOLITAN_NAME looks like "<county>; <2-letter state code>"
      mutate(stateCode = str_match(METROPOLITAN_NAME, "; (.+)$")[,2]) %>%
      mutate(county = str_match(METROPOLITAN_NAME, "^(.+);")[,2]) %>%
      filter(str_length(stateCode) == 2) %>% # How to handle counties a/c state boundaries?? Deleting for now.
      filter(date >= ymd("2013-10-07")) %>% # Data collection appears to start here
      # filter(!between(date, dmy("20-10-2014"), dmy("28-10-2014"))) %>% # Tweets weren't properly collected in this period
      # filter(!(date %in% c(dmy("26-03-2014"), dmy("29-03-2014")))) %>% # Tweets weren't properly collected on these dates
      # Mask (rather than drop) the badly-collected period/dates so the
      # missingness is visible downstream as NA
      mutate(cases = ifelse(between(date, dmy("20-10-2014"), dmy("28-10-2014")) |
                              (date %in% c(dmy("26-03-2014"), dmy("29-03-2014"))),
                            NA, cases)) %>%
      mutate(word = currentword) # Extract word from filename
    allData[[currentword]] <- thisdata
  }
  return(bind_rows(allData))
}
# Read every per-word matrix into one long, county-level tibble
twitterDataCounty <- readData(toload)
# Aggregate county counts up to state level.
# NOTE(review): sum() without na.rm = TRUE, so the NAs inserted in
# readData() for the badly-collected periods propagate to the state
# totals — presumably intentional; confirm.
twitterData <- twitterDataCounty %>%
group_by(date, stateCode, word) %>%
summarise(cases = sum(cases))%>%
ungroup() %>%
# dataDay: days since the start of the data, starting at 1
# (difftime + 1 stays a difftime object)
mutate(dataDay = difftime(date, min(date), units = "days") + 1)
# The "tokens_cbsa" rows hold total token counts rather than a word of
# interest; split them out as a denominator table ...
tokenData <- twitterData %>%
filter(word == "tokens_cbsa") %>%
select(-word) %>%
rename(totalTokens = cases)
# ... and drop them from the word-level data
twitterData <- twitterData %>%
filter(word != "tokens_cbsa")
# State lookup table: name, two-letter code and (per the joins below)
# census Region/Division columns.
# datadir already ends in "/", so don't add another one (the original
# paste0(datadir, "/states.csv") produced "rawdata//states.csv").
stateCodes <- read_csv(paste0(datadir, "states.csv")) %>%
  rename(stateCode = `State Code`)
# Census urban/rural population by state, with the percentage rural and
# a majority-urban/rural label for grouping.
stateRural <- read_csv(paste0(datadir, "DEC_00_SF1_P002.csv")) %>%
  rename(state = `GEO.display-label`, urban = VD03, rural = VD05, totalPop = VD01) %>%
  mutate(ruralpct = rural / totalPop * 100) %>%
  mutate(majorityUrbanRural = ifelse(ruralpct > 50, "Majority rural pop", "Majority urban pop")) %>%
  inner_join(stateCodes, by=c("state" = "State")) %>%
  select(state, stateCode, ruralpct, majorityUrbanRural, totalPop)
# Join the region and division data to the twitter data.
# NOTE(review): these are natural joins (no by=), so dplyr picks the
# shared columns as keys — stateCodes should join on stateCode alone;
# tokenData presumably joins on date/stateCode/dataDay. Check the join
# messages when running to confirm.
twitterData <- twitterData %>%
inner_join(stateCodes %>% select(-State)) %>%
# And the total number of tokens
inner_join(tokenData) %>%
# And each state's total population and percent-rural figures
inner_join(stateRural %>%
select(stateCode, totalPop, ruralpct))
# Add extra data from AN
newData <- read_csv("rawdata/LI_DEMO.txt")
# Collapse the demographics to one row per STATE:
# BLACK_2010 as a population-weighted mean, TOTPOP_2000 as a plain sum.
# NOTE(review): the weights are TOTPOP_2000 (year-2000 populations)
# applied to a 2010 quantity — confirm this mismatch is intended.
newDataState <- newData %>%
group_by(STATE) %>%
summarise(BLACK_2010 = weighted.mean(BLACK_2010, TOTPOP_2000),
TOTPOP_2000 = sum(TOTPOP_2000))
# Sense check the total populations
# newDataState %>%
# summarise(totpop = sum(TOTPOP_2000))
#
# newDataState %>%
# arrange(desc(TOTPOP_2000))
# newDataState %>%
# arrange((TOTPOP_2000))
# Compare total population estimates
# twitterData %>%
# filter(word == "anime") %>%
# filter(date == ymd("2013-10-07")) %>%
# inner_join(stateCodes %>% select(State, stateCode)) %>%
# mutate(stateLower = tolower(State)) %>%
# inner_join(newDataState %>% select(STATE, TOTPOP_2000), by=c("stateLower" = "STATE")) %>%
# filter(!between(TOTPOP_2000 / totalPop, 0.9, 1.1)) %>% # just look at states far from approximate equality
# ggplot(aes(x = totalPop, y = TOTPOP_2000, label = State)) + geom_point() + geom_text() +
# geom_abline(slope = 1, intercept = 0)
# Attach BLACK_2010 to the twitter data. newDataState is keyed by
# lower-case state name (STATE), so build a matching lower-case key
# from the full state name, join, then drop the helper column.
twitterData <- twitterData %>%
inner_join(stateCodes %>% select(State, stateCode)) %>%
mutate(stateLower = tolower(State)) %>%
inner_join(newDataState %>% select(STATE, BLACK_2010), by=c("stateLower" = "STATE")) %>%
select(-stateLower)
# Monthly sums and proportions for graphing section.
# cases/totalTokens are summed with na.rm = TRUE, so the NA-masked
# collection gaps are simply skipped here; tokenProp is the monthly
# word count as a fraction of all tokens.
monthlyDataAll <- twitterData %>%
  # filter(word == "bae") %>%
  # filter(Region == "West") %>%
  # floor_date() truncates each date to the first of its month — the
  # idiomatic form of dmy(paste0("1-", month(date), "-", year(date)))
  mutate(monthyear = floor_date(date, unit = "month")) %>%
  group_by(word, monthyear, stateCode, Region) %>%
  summarise(cases = sum(cases, na.rm = TRUE), totalTokens = sum(totalTokens, na.rm = TRUE),
            tokenProp = sum(cases, na.rm = TRUE) / sum(totalTokens, na.rm = TRUE)) %>%
  ungroup()  # don't leak grouping into downstream use
# Single-word subset used in the plotting episode
monthlyData <-
  monthlyDataAll %>%
  filter(word == "bae")
# I ended up not using all of the data I created. Here we remove unused variables and don't save files
# we don't need in the lesson. (I've left the code to create them so that the course can be extended)
twitterData <- twitterData %>%
select(-Division, -ruralpct, -BLACK_2010, -totalPop)
# Main dataset used throughout the lesson
write_csv(twitterData, paste0(outdir, "twitterData.csv"))
#write_csv(tokenData, paste0(outdir, "tokenData.csv"))
#write_csv(stateRural, paste0(outdir, "stateData.csv"))
# Monthly aggregates: "bae" only, and all words
write_csv(monthlyData, paste0(outdir, "monthlyBae.csv"))
write_csv(monthlyDataAll, paste0(outdir, "monthlyAll.csv"))