-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtopics by year.R
executable file
·465 lines (392 loc) · 18.7 KB
/
topics by year.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
#############################################################################
# topics by year.R
#
# GOAL: To graph the rising and falling contributions to the corpus of each
# (or specified) topic over time.
#
# Provides two functions:
# * topics.by.year(dataset_name, ntopics, ..., per.plot): draws line
# graphs showing the proportions of the full corpus accounted for by
# each topic. Allows up to `per.plot` topics to be graphed in the same
# figure.
# * topic.variation(dataset_name, ntopics, ...): plots adjacent
# box-and-whisker plots showing the range of year-to-year proportions
# of the full corpus for each topic. In a sense, then, compresses all
# the graphs produced by topics.by.year into a single graph, allowing
# for easier comparison of the variability of topic contributions.
#####
topics.by.year <- function(dataset_name = "noexcludes2001_2015",
ntopics = 50,
subset_name = "knownprograms2001_2015",
iter_index = 1,
newnames=F, # where in the MALLET output filename does iter_index appear?
# set T if it's with the model, F if last in filename.
# Gets passed into get.doctopic.grid.
to.plot = NULL, # any pre-set topics to plot?
do.plot = TRUE, # should we draw it, or just
# return the dataframe?
per.plot = 5, # maximum how many lines per plot?
bad.topics = c(3, 8, 12, 15, 30, 34, 36, 47, 50),
smoothing = 1/3, # f span for LOWESS smoothing, as a proportion of points
# in the plot to incorporate. Must be > 0, but set close to 0
# for unsmoothed plot.
legendloc_init = NULL, # optionally specify where the legend should be
filetype = c(".pdf", ".tiff") # how should we output when remake_figs = TRUE?
)
{
require(data.table)
# require(RColorBrewer)
require(viridisLite)
filetype <- match.arg(filetype)
# Get topic weights for every document we have
if(!exists("get.doctopic.grid", mode="function")) {
source("get doctopic grid.R")
}
grid <- get.doctopic.grid(dataset_name=dataset_name, ntopics=ntopics,
subset_name=subset_name, iter_index=iter_index,
newnames=newnames)$outputfile.dt
# Get ready to merge
grid$Pub.number <- as.factor(grid$Pub.number)
setkey(grid, Pub.number)
# Merge with noexcludes to add Year data to the topic data
grid <- merge(grid, noexcludes.dt[, list(Pub.number, Year)], all.x=T)
# Re-key by year
setkey(grid, Year)
# Drop Pub.number and any bad.topics
grid <- grid[, setdiff(names(grid), c("Pub.number", bad.topics)), with=F]
# Get some stats for topics within each year
topic.year.avg <- grid[, lapply(.SD, mean), by=Year]
## why did I think I needed this to be a data.frame and not a data.table?
# df <- as.data.frame(topic.year.avg)
# Get topic labels, which you've composed elsewhere using `top docs per
# topic.R`, to use in figure legends
if(!exists("get_topic_labels", mode="function")) {
source(file="get topic labels.R")
}
topic.labels.dt <- get_topic_labels(dataset_name=dataset_name, ntopics=ntopics,
subset_name=subset_name, iter_index=iter_index,
bad.topics=bad.topics)
head(topic.labels.dt)
# Exclude non-content-bearing topics
if (is.null(bad.topics) && dataset_name=="consorts" && ntopics==55) {
bad.topics <- c("4", "47", "22", "2", "24", "13", "50")
}
if(!is.null(bad.topics)) {
topic.labels.dt <- topic.labels.dt[!(Topic %in% bad.topics)]
}
setkey(topic.labels.dt, Rank)
head(topic.labels.dt)
# We'll use plot.me in the other function, so figure it out even if we're
# not actually plotting.
if (!is.null(to.plot)) { # any pre-set topics to plot?
plot.me <- to.plot
plot.me <- plot.me[order(plot.me)] # data.tables strongly prefer ordered input,
# and this is kludgy but at least the legend should match
} else {
# plot.me <- c(51, 26, 46, 27, 43) # A range of topics to plot
# plot.me <- 2:ncol(df)-1 # gives 1:ntopics by topic number
# gives all topics in order of rank
plot.me <- topic.labels.dt[order(Rank), Topic]
}
# Set graphing parameters; see
# http://www.statmethods.net/graphs/line.html.
if(do.plot) {
## why did I think I needed this to be a data.frame and not a data.table?
df <- as.data.frame(topic.year.avg)
# X axis will be years
xrange <- range(df$Year)
# Y axis will be % of the corpus contributed by topic
yrange <- c(0, max(df[, !names(df) %in% "Year"]))
# Use different colors for each plot
# require(RColorBrewer)
# mycol <- brewer.pal(n=per.plot, name="Dark2")
require(viridisLite)
mycol <- viridis(n=per.plot, end=0.9)
# Use different symbols for each plot?
# plotchar <- seq(18, 18+length(plot.me), 1)
# Nah, use same symbols for each plot
plotchar <- rep(20, length(plot.me))
# but vary line types if possible
if(length(per.plot) < 6) {
linetype <- c(1, 2, 3, 4, 5) # skip dots: too faint
} else {
linetype <- rep(1, length(per.plot)) # not enough types, just go all solid :\
}
linewidth <- 2
maintitle <- "Average Topic Proportions over Time"
## Draw `per.plot` (by default, 5) lines on the same plot, then start
## a new plot and repeat.
# start recording to file if desired
if(remake_figs) {
if(is.null(to.plot)) {
filename <- file.path(imageloc, paste0(maintitle, ", ", dataset_name, "k", ntopics, subset_name, ", Topics ranked ",
i, "-", (i+per.plot-1), filetype))
} else {
filename <- file.path(imageloc, paste0(maintitle, ", ", dataset_name, "k", ntopics, subset_name, ", Topics ",
plot.me[i], "-", plot.me[i+per.plot-1], filetype))
}
switch(filetype,
".pdf" = pdf(filename),
".tiff" = tiff(filename, height=2400, width=2400, units="px", res=400, compression="lzw")
)
}
# set a target for when to repeat
enough <- seq(1, length(plot.me), by=per.plot)
for (i in 1:length(plot.me)) {
# Get a per-plot index to rotate through colors
j <- ((i-1) %% per.plot) + 1
# After each set of `per.plot` topics, start a new plot
if (i %in% enough) {
# make sure we don't print extra nulls at the end
if((length(plot.me) - i) > per.plot) {
legend.offset <- (per.plot-i) %% per.plot
} else {
legend.offset <- length(plot.me) - i
}
if(remake_figs) {
# close an open file connection from the previous loop,
# if there is one
dev.off()
# update filename
if(is.null(to.plot)) {
filename <- file.path(imageloc, paste0(maintitle, ", ", dataset_name, ntopics, subset_name,
", Topics ranked ", i, "-",
(i+legend.offset), filetype))
} else {
filename <- file.path(imageloc, paste0(maintitle, ", ", dataset_name, ntopics, subset_name,
", Topic", if(length(plot.me)>1) "s", " ", paste(plot.me[i:(i+legend.offset)], collapse="-"), filetype))
}
# start writing a new file
switch(filetype,
".pdf" = pdf(filename),
".tiff" = tiff(filename, height=2400, width=2400, units="px", res=400, compression="lzw")
)
}
# set up a blank plot in a standard size
plot(x = df$Year,
y = c(yrange, rep(mean(yrange), length(df$Year)-2)),
type = "n",
xaxs = "r",
ylab = "Portion of corpus (scaled to 1)",
xlab = "Year",
bty = "n",
main = maintitle)
# label the smoothing
mtext(paste("Lowess smoothing span:", round(smoothing, 2)), side=1, line=4, cex=0.7)
# add a legend for up to five values
if(is.null(legendloc_init)) {
if (mean(df[,as.character(plot.me[i])]) > mean(yrange)) {
legendloc <- "bottomleft"
} else {
legendloc <- "topleft"
}
} else {
legendloc <- legendloc_init
}
if(is.null(to.plot)) {
legend(legendloc, title = paste0("Topics, ranked ", i,
"-",(i+legend.offset), " of ",
nrow(topic.labels.dt)),
legend = paste0(plot.me[seq(i, (i+legend.offset),
1)], ": ", topic.labels.dt[Topic %in%
plot.me[i:(i+legend.offset)], Label, keyby=Topic]$Label),
fill=mycol[((j:(j+legend.offset) -1) %% length(mycol) + 1)],
border=mycol[((j:(j+legend.offset) -1) %% length(mycol) + 1)],
lty = linetype[((j:(j+legend.offset) -1) %% length(linetype) + 1)],
lwd = linewidth,
bty="n",
cex=0.8
)
} else {
legend(legendloc, title="Topics",
legend = paste0(plot.me[seq(i, (i+legend.offset),
1)], ": ", topic.labels.dt[Topic %in%
plot.me[i:(i+legend.offset)], Label, keyby=Topic]$Label),
fill=mycol[((j:(j+legend.offset) -1) %% length(mycol) + 1)],
border=mycol[((j:(j+legend.offset) -1) %% length(mycol) + 1)],
lty = linetype[((j:(j+legend.offset) -1) %% length(linetype) + 1)],
lwd = linewidth,
bty = "n",
cex = 0.8
)
}
} # end new plot + legend
# draw the line and loop back
lines(lowess(x = df$Year,
y = df[,as.character(plot.me[i])],
f = smoothing),
type = "l",
pch = plotchar[(j - 1) %% length(plotchar) + 1],
lty = linetype[(j - 1) %% length(linetype) + 1],
lwd = linewidth,
col = mycol[(j - 1) %% length(mycol) + 1]
)
} # end of for loop
# now that we're done looping, close the final file connection
if(remake_figs) {dev.off()}
} # end if(do.plot)
invisible(list("df"=topic.year.avg, "rank.order"=plot.me))
} # end of wrapper function topics.by.year()
## Second function: find year-to-year peak variation for each topic
topic.variation <- function(dataset_name = "consorts",
ntopics = 55,
subset_name = NULL,
iter_index = "",
to.plot = NULL, # any pre-set topics to plot?
notch = FALSE,
bad.topics = NULL,
use.labels = F,
show.outliers = T,
bigchange = NULL # set to a value between 0 and 1 to highlight big changes
) {
# okay, this is interesting
df <- topics.by.year(dataset_name=dataset_name, ntopics=ntopics,
subset_name=subset_name, iter_index=iter_index,
bad.topics=bad.topics, to.plot=to.plot, do.plot=FALSE)
rank.order <- df$rank.order
df <- df$df
if(!exists("build_plot_title")) {
source(file="build_plot_title.R")
}
maintitle <- build_plot_title(dataset_name=dataset_name,
ntopics=ntopics,
iter_index=iter_index,
subset_name=subset_name,
bad.topics=bad.topics,
whatitis="Yearly Variation of Topic Proportions")
# maintitle <- paste("Yearly Variation of Topic Proportions",
# "Generally Preserves Topic Rank")
# if(dataset_name == "consorts" && is.null(subset_name)) {
# subtitle <- paste0("Consortium School dissertations, N", nrow(grid),
# ", years 2001-2010")
# } else if(dataset_name == "consorts" && subset_name == "realconsorts") {
# subtitle <- paste0("Consortium Program dissertations, N", nrow(grid),
# ", years 2001-2010")
# } else {
# subtitle <- paste0(dataset_name, " dissertations, N", nrow(grid),
# ", years 2001-2010")
# }
# Get topic labels, which you've composed elsewhere using 'top docs per
# topic.R', to use in figure legends
if(!exists("get_topic_labels", mode="function")) {
source(file="get topic labels.R")
}
topic.labels.dt <- get_topic_labels(dataset_name=dataset_name, ntopics=ntopics,
subset_name=subset_name, iter_index=iter_index)
head(topic.labels.dt)
nrow(topic.labels.dt)
# Exclude non-content-bearing topics
if (is.null(bad.topics) && dataset_name=="consorts" && ntopics==55) {
bad.topics <- c("4", "47", "22", "2", "24", "13", "50")
}
if(!is.null(bad.topics)) {
topic.labels.dt <- topic.labels.dt[!(Topic %in% bad.topics)]
}
nrow(topic.labels.dt)
nrow(df)
setkey(topic.labels.dt, Rank)
head(topic.labels.dt)
# get colors based on some threshold range
if(!is.null(bigchange)) {
highlights <- df[, lapply(.SD, FUN=function(x) {
d <- as.numeric(summary(x)[5] - summary(x)[2])
if(d > bigchange) {
return("#FDE725FF") # dark yellow
} else {
return("white")
}} )
, .SDcols=!c("Year")]
}
# draw the plot
if(remake_figs) {
filename <- file.path(imageloc, paste0(maintitle, filetype))
switch(filetype,
".pdf" = pdf(filename),
".tiff" = tiff(filename, height=2400, width=2400, units="px", res=400, compression="lzw")
)
}
# sort columns by overall topic rank
setcolorder(df, c("Year", as.character(rank.order)))
if(!is.null(bigchange)) {
setcolorder(highlights, as.character(rank.order))
}
# remove Year column from plot
if(use.labels) {
par(mar=c(13, 4, 4, 2))
}
boxplot(df[, .SD, .SDcols=!c("Year")],
main = maintitle,
# xlab="Topic Number, Arranged by Overall Rank within Corpus",
# cex.axis=0.6, las=2,
ylab = "Portion of Corpus (scaled to 1)",
xaxt = "n",
notch = notch,
outline= show.outliers,
col = if(!is.null(bigchange)) { as.character(highlights) }
)
if(use.labels) {
# par(mar = c(10, 4, 4, 2))
axis(1,
at = seq_along(df[, .SD, .SDcols=!c("Year")]),
labels = topic.labels.dt[Topic %in% rank.order, Label],
las = 2,
ps = 6,
lheight=0.5,
cex.axis = 0.7
)
} else {
axis(1,
at = seq_along(df[, .SD, .SDcols=!c("Year")]),
labels = topic.labels.dt[Topic %in% rank.order, Topic],
las = 2,
ps = 6,
lheight=0.5
)
}
if(!is.null(bigchange)) {
mtext(paste0("Highlighted cells have IQR > ", bigchange), side=3, cex=0.7)
}
# mtext(subtitle, side=3)
# abline(v=(0.5+seq(from=5,to=length(plot.me), by=5)), lty="dotdash")
if(remake_figs) {
dev.off()
}
}
if(FALSE) {
dataset_name <- "noexcludes2001_2015"
ntopics <- 50
iter_index <- 1
subset_name <- "knownprograms2001_2015"
bad.topics <- c("3", "8", "12", "15", "30", "34", "36", "47", "50")
remake_figs=T
topics.by.year(dataset_name=dataset_name, ntopics=ntopics,
subset_name=subset_name, iter_index=iter_index,
bad.topics=bad.topics, smoothing=1/2)
topic.variation(dataset_name=dataset_name, ntopics=ntopics,
subset_name=subset_name, iter_index=iter_index,
bad.topics=bad.topics,
show.outliers=T,
use.labels=T,
bigchange = 0.015)
remake_figs=F
topics_of_interest <- c(44, # online circulation and social media (increasing)
2, # image, body, materiality (increasing)
28, # literacy practices (increasing)
32, # disciplinary formations (decreasing)
35, # institutional supports, barriers, constraints (stable)
26, # personal narrative and oral history (cosine wave??)
49, # scenes of teaching (decreasing)
46 # medical discourse (last minute increase)
)
remake_figs=T
topics.by.year(smoothing = 1/2,
to.plot = c(32, 35, 44),
# per.plot = 6,
legendloc_init="topright")
topics.by.year(smoothing = 1/2,
to.plot = c(2, 28, 26, 49),
# per.plot = 6,
legendloc_init="bottom")
remake_figs=F
# topic.variation(dataset_name="noexcludes", ntopics=55, iter_index="", subset_name = "realconsorts")
}