-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathTianjin_incubation_intermediates_updateddata.Rmd
396 lines (281 loc) · 16.9 KB
/
Tianjin_incubation_intermediates_updateddata.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
---
title: "Testing incubation with intermediate"
author: "Caroline Colijn, Jessica Stockdale"
date: "`r Sys.Date()`"
output:
  html_document:
    keep_md: TRUE
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(survminer)
library(survival)
library(tidyverse)
library(lubridate)
library(icenReg)
library(igraph)
library(visNetwork)
library(mvtnorm)
library(ggplot2)
library(hrbrthemes)
library(viridis)
library(gridExtra)
options(digits=3)
set.seed(3456)
```
## Data
Thanks to Dongxuan Chen and Louxin Zhang. These data are from three main sources:
* source1: http://wsjk.tj.gov.cn/col/col87/index.html#!uid=259&pageNum=1 (Tianjin health commission official website, for daily announcements)
* source2: https://www.weibo.com/u/2967529507?is_all (Jinyun News, Tianjin official local media weibo account, for patient symptom onset reference)
* source3: https://m.weibo.cn/status/IrrHI1FHm?jumpfrom=weibocom (another Tianjin local media weibo link, for mall cluster reference)
```{r}
# Load the Tianjin line list. Empty cells are read as NA and text columns are
# kept as character vectors rather than factors.
tdata <- read.csv("data/Tianjin135cases_revised.csv",
                  na.strings = "", stringsAsFactors = FALSE)

# Parse the four date columns, which are written as day/month/year.
date_cols <- c("symptom_onset", "start_source", "end_source", "confirm_date")
tdata[date_cols] <- lapply(tdata[date_cols], as.Date, format = "%d/%m/%Y")

# Give the first column a fixed name, then inspect the structure.
names(tdata)[1] <- "case_id"
str(tdata)
```
## Incubation period
The incubation period is the time between exposure and the onset of symptoms. We estimate this directly from the stated start and end times for cases' exposure windows. Because it is explicitly about the symptom onset, we remove those who don't have symptom onset defined. These are a small minority of cases (10) and the alternative would be to impute their symptom onset time using the others' delay to confirmation time. For now, we remove them. Then, if no other end time for the exposure is given or if the end of the exposure time is after the time of symptom onset, set the last exposure time to the symptom onset time. This is because they must have been exposed before symptom onset.
If no other start time is given, we assume that they must have been exposed within the 20 days previous to their symptom onset. We set this as an upper bound on the incubation period given prior knowledge.
These give us the maximum and minimum incubation times.
```{r}
# Keep only the cases that report a symptom onset date (10 cases are dropped).
goodii <- which(!is.na(tdata$symptom_onset))
tdata_orig <- tdata # untouched copy of the full dataset, in case it is needed
tdata <- tdata[goodii, ]

# No recorded end of exposure: fall back to the symptom onset date.
no_end <- is.na(tdata$end_source)
tdata$end_source[no_end] <- tdata$symptom_onset[no_end]
# Exposure cannot end after symptoms start, so cap it at the onset date.
tdata$end_source <- pmin(tdata$end_source, tdata$symptom_onset)

# No recorded start of exposure: assume it began 20 days before onset.
no_start <- is.na(tdata$start_source)
tdata$start_source[no_start] <- tdata$symptom_onset[no_start] - 20

# Sanity check: number of cases whose exposure ends after onset (expected: 0).
sum(tdata$end_source > tdata$symptom_onset)

# Longest and shortest incubation time consistent with each exposure window.
tdata$maxIncTimes <- tdata$symptom_onset - tdata$start_source
tdata$minIncTimes <- tdata$symptom_onset - tdata$end_source
tdata$maxIncTimes
tdata$minIncTimes
```
Define the maximum and minimum exposure times based on these assumptions. These are the times $t_{min}^i$ and $t_{max}^i$ in the notation.
We assume that incubation times have to be at least 1 day, based on prior knowledge. We set the maximum incubation times as at least 3 days, to take into account some uncertainty on symptom onset reporting.
```{r}
# Disabled alternative: drop cases with a short maximum incubation time rather
# than flooring it. It refers to `spdata`, which this file does not define.
#spdata = filter(spdata, maxIncTimes > 2)
# Floor the maximum incubation time at 3 days and the minimum at 1 day.
# NOTE(review): pmax() takes its attributes from its first argument, so the
# argument order here may matter for the class of the result -- keep as is.
tdata$maxIncTimes = pmax(3, tdata$maxIncTimes)
tdata$minIncTimes = pmax(1, tdata$minIncTimes)
```
From here this file diverges from the ..wtables Rmd file.
First define the relevant times for truncation $T_i$
```{r}
# Ti: number of days from each case's start of exposure to the fixed reference
# date 2020-02-27, stored as a plain number. Used below as the `allrtTimes`
# (right-truncation time) input to the likelihood.
tdata$Ti = as.numeric(ymd("2020-02-27")-tdata$start_source)
# last Tianjin confirm date is 22/02
# NOTE(review): the reference date used above (27/02) differs from the last
# confirm date quoted here (22/02) -- confirm which cut-off is intended.
```
Specify some fixed and initial parameters.
In the paper on medrxiv our estimates for the incubation period were shape: 4.06 (2.69, 5.11) and scale: 2.218 (1.405, 2.53).
Here we will have a shape $a_g$ for the generation time and $a_i$ for the incubation period, and the same scale $b$ for both.
```{r}
# Fixed values; n, r and b are the defaults for `maxinters`, `rate` and
# `comscale` in the likelihood functions defined in this file.
# NOTE(review): the code visible in this file never refers to `ai` or `ag`
# by name; the optimisations pass their starting values as literals.
b=2.2 # common scale parameter, shared by generation time and incubation period
ai= 4.1 # shape for incubation period, as estimated in first round. true value less than this?
ag = 3 # starting point for shape for generation time.
n=3 # max number of intermediate cases
r = 0.1 # add on average 1 intermediate per 10 days? who knows. must look at sensitivity to this parameter
```
Functions
```{r}
source("incufuncs.R")
```
These functions seem to work. Yay! Now we have to set up the relevant data inputs, and maximize the likelihood to solve for several parameters.
We have `tdata`'s minIncTimes and maxIncTimes, which are the 'mintime' and 'maxtime' inputs. We already computed Ti, which is the 'rtTime' input.
```{r}
# negative log likelihood function for optim
# Objective function for optim(): minus the sum of the per-case values
# returned by lirt() (defined in incufuncs.R).
# NOTE(review): lirt() is presumed to return each case's log-likelihood
# contribution, which makes this the negative log likelihood -- confirm in
# incufuncs.R.
#
# Arguments:
#   twopars      numeric(2): generation time SHAPE, then incubation period
#                SHAPE (passed to lirt() as `genshape` and `incshape`).
#   allmaxtimes, allmintimes, allrtTimes
#                per-case vectors, passed one element at a time to lirt() as
#                `maxtime`, `mintime` and `rtTime`.
#   maxinters, rate, comscale
#                passed unchanged to lirt(); they default to the globals
#                n, r and b.
#
# Returns a single number; 0 when there are no cases.
l_optim <- function(twopars, allmaxtimes, allmintimes, allrtTimes,
                    maxinters = n, rate = r, comscale = b) {
  gs <- twopars[1] # generation time shape parameter
  is <- twopars[2] # incubation period shape parameter

  # seq_along() yields an empty index set for empty input, whereas
  # 1:length(x) would iterate over c(1, 0).
  indelikes <- vapply(
    seq_along(allmaxtimes),
    function(ind) {
      lirt(
        maxtime = allmaxtimes[ind],
        mintime = allmintimes[ind],
        rtTime = allrtTimes[ind],
        maxinters = maxinters, rate = rate,
        genshape = gs, incshape = is,
        comscale = comscale
      )
    },
    FUN.VALUE = numeric(1)
  )

  -sum(indelikes)
}
```
Testing: this seems to work *except* if the max and min times are the same, so we correct for this by adding a small noise term in those cases. Then we evaluate the negative log likelihood at a trial parameter value:
```{r}
# Evaluate the objective once at shapes (1, 2) to check that it runs.
l_optim(
  c(1, 2),
  allmaxtimes = tdata$maxIncTimes,
  allmintimes = tdata$minIncTimes,
  allrtTimes = tdata$Ti,
  maxinters = n,
  rate = r,
  comscale = b
)
```
So now let's optimize!
```{r}
# Minimise l_optim over the two shape parameters, starting from (1, 2) and
# box-constrained to [0, 20] x [0, 20]. `allmintimes` is spelled out in full
# so the call does not depend on partial argument matching.
optim(
  c(1, 2), l_optim,
  allmaxtimes = tdata$maxIncTimes,
  allmintimes = tdata$minIncTimes,
  allrtTimes = tdata$Ti,
  maxinters = n, rate = r, comscale = b,
  lower = c(0, 0), upper = c(20, 20), method = "L-BFGS-B"
)
```
Take a look at a heatmap:
```{r}
# Grid of likelihood values: 50 x 50 combinations of the two shape parameters
# (X = generation time shape, Y = incubation period shape), each in [0.1, 5].
x <- seq(0.1, 5, length.out = 50)
y <- seq(0.1, 5, length.out = 50)
data <- expand.grid(X = x, Y = y)

# Z is minus the objective at each grid point. Computed in one pass
# instead of filling the column element by element.
data$Z <- vapply(
  seq_len(nrow(data)),
  function(i) {
    -l_optim(
      c(data$X[i], data$Y[i]),
      allmaxtimes = tdata$maxIncTimes,
      allmintimes = tdata$minIncTimes,
      allrtTimes = tdata$Ti,
      maxinters = n, rate = r, comscale = b
    )
  },
  numeric(1)
)

# Plot them
ggplot(data, aes(X, Y, fill = Z)) +
  geom_tile() +
  scale_fill_viridis(discrete = FALSE)
```
We can test for sensitivity to rate r - the number of intermediates 'arriving' per day
```{r}
# current MLEs: gen time shape 1.13, incubation period shape 3.12, for r=0.1
# Re-fit both shape parameters for each rate on a grid of 20 values of r.
r_cur <- seq(0.02, 0.25, length.out = 20)
rec <- matrix(NA_real_, nrow = length(r_cur), ncol = 2) # columns: ag, ai
for (i in seq_along(r_cur)) {
  ans <- optim(
    c(1, 2), l_optim,
    allmaxtimes = tdata$maxIncTimes,
    allmintimes = tdata$minIncTimes,
    allrtTimes = tdata$Ti,
    maxinters = n, rate = r_cur[i], comscale = b,
    lower = c(0, 0), upper = c(20, 20), method = "L-BFGS-B"
  )
  rec[i, ] <- ans$par
}

# Fitted shape parameters against r
df1 <- data.frame(r = r_cur, ag = rec[, 1])
df2 <- data.frame(r = r_cur, ai = rec[, 2])
plot1 <- ggplot(df1, aes(x = r, y = ag)) +
  geom_line(color = "maroon4") +
  geom_point(color = "maroon4") +
  theme_minimal()
plot2 <- ggplot(df2, aes(x = r, y = ai)) +
  geom_line(color = "royalblue4") +
  geom_point(color = "royalblue4") +
  theme_minimal()
grid.arrange(plot1, plot2, ncol = 2)

# Plot mean estimate instead: mean = shape * scale, using the common scale b
# (2.2) rather than a repeated literal.
df3 <- data.frame(r = r_cur, Mean.generation.time = rec[, 1] * b)
df4 <- data.frame(r = r_cur, Mean.incubation.period = rec[, 2] * b)
plot1 <- ggplot(df3, aes(x = r, y = Mean.generation.time)) +
  geom_line(color = "maroon4") +
  geom_point(color = "maroon4") +
  theme_minimal()
plot2 <- ggplot(df4, aes(x = r, y = Mean.incubation.period)) +
  geom_line(color = "royalblue4") +
  geom_point(color = "royalblue4") +
  theme_minimal()
grid.arrange(plot1, plot2, ncol = 2)

# on the same plot
df5 <- data.frame(
  r = r_cur,
  Mean.generation.time = rec[, 1] * b,
  Mean.incubation.period = rec[, 2] * b
)
ggplot(df5) +
  geom_line(aes(x = r, y = Mean.generation.time,
                color = "Mean generation time")) +
  geom_point(color = "maroon4", aes(x = r, y = Mean.generation.time)) +
  theme_minimal() +
  geom_line(aes(x = r, y = Mean.incubation.period,
                color = "Mean incubation period")) +
  geom_point(color = "royalblue4", aes(x = r, y = Mean.incubation.period)) +
  ylab("Time (days)") +
  scale_color_manual(values = c("Mean generation time" = "maroon4",
                                "Mean incubation period" = "royalblue4")) +
  labs(color = " ")
#ggsave(filename = "final_figures/incubation_generation_tianjin_woboot.pdf", width = 8, height = 6)
```
Some kind of uncertainty estimate around the parameters would be helpful.
We explore this at $r=0.05$, $r=0.1$, $r=0.15$ and $r=0.2$ to start; we resample the data using bootstrapping, and get empirical 95% CIs (the 2.5% and 97.5% quantiles), for example around the $a_i$ and $a_g$ parameters.
```{r eval=FALSE}
nboot = 200
# Bootstrap the two fitted shape parameters.
#
# For each of `nboot` replicates, resample the rows of `dataset` with
# replacement and minimise l_optim from the start point (1, 2) with optim()'s
# default method, using the globals n (maxinters) and b (comscale).
#
# Arguments:
#   nboot    number of bootstrap replicates.
#   dataset  data frame with columns maxIncTimes, minIncTimes and Ti.
#            NOTE(review): the default `spdata` is not defined in this file;
#            every call here passes `dataset` explicitly.
#   therate  value passed to l_optim as `rate`.
#
# Returns a data frame with exactly `nboot` rows and columns gsboots (first
# fitted parameter), isboots (second fitted parameter) and rate.
getBootstraps <- function(nboot, dataset = spdata, therate = 0.1) {
  n_cases <- nrow(dataset)

  bootresults <- lapply(seq_len(nboot), function(x) {
    bootind <- sample(seq_len(n_cases), n_cases, replace = TRUE)
    optim(
      c(1, 2), l_optim,
      allmaxtimes = dataset$maxIncTimes[bootind],
      allmintimes = dataset$minIncTimes[bootind],
      allrtTimes = dataset$Ti[bootind],
      maxinters = n, rate = therate, comscale = b
    )
  })

  # Size the outputs from the results themselves. The earlier fixed length of
  # 100 padded the output with zero rows whenever nboot < 100, and the loop
  # index `n` shadowed the global n used above.
  gsboots <- vapply(bootresults, function(fit) fit$par[1], numeric(1))
  isboots <- vapply(bootresults, function(fit) fit$par[2], numeric(1))

  data.frame(gsboots = gsboots, isboots = isboots,
             rate = rep(therate, length(bootresults)))
}
# Bootstrap the shape estimates on tdata at four values of the rate r.
boot1 <- getBootstraps(nboot, dataset = tdata, therate = 0.05)
boot2 <- getBootstraps(nboot, dataset = tdata, therate = 0.1)
boot3 <- getBootstraps(nboot, dataset = tdata, therate = 0.15)
boot4 <- getBootstraps(nboot, dataset = tdata, therate = 0.2)
#save(boot1, boot2, boot3, boot4, file = "data/interbooty2_tianjin.Rdata")
```
**Table 1 incubation period estimates (accounting for intermediates):** Table of quantile information
```{r}
# Load the saved bootstrap results (boot1..boot4 are used below).
load("data/interbooty2_tianjin.Rdata")
# Median and 95% interval of the bootstrapped incubation period mean
# (shape * common scale b), one line per rate. `probs` is spelled out in full
# instead of relying on partial matching of `p`.
quantile(boot1$isboots * b, probs = c(0.025, 0.5, 0.975))
quantile(boot2$isboots * b, probs = c(0.025, 0.5, 0.975))
quantile(boot3$isboots * b, probs = c(0.025, 0.5, 0.975))
quantile(boot4$isboots * b, probs = c(0.025, 0.5, 0.975))
```
And quantile information for the generation time
```{r}
# Median and 95% interval of the bootstrapped generation time mean
# (shape * common scale b), one line per rate. `probs` is spelled out in full
# instead of relying on partial matching of `p`.
quantile(boot1$gsboots * b, probs = c(0.025, 0.5, 0.975))
quantile(boot2$gsboots * b, probs = c(0.025, 0.5, 0.975))
quantile(boot3$gsboots * b, probs = c(0.025, 0.5, 0.975))
quantile(boot4$gsboots * b, probs = c(0.025, 0.5, 0.975))
```
We overlay the previous plots with boxplots for the bootstraps (**Figure 5 lower panel**)
```{r}
# Stack the four bootstrap tables into long format: column `col` (times
# `mult`) becomes `value` and column 3 becomes `r`. Column order is the one
# getBootstraps() writes: 1 = gsboots, 2 = isboots, 3 = rate.
boots <- list(boot1, boot2, boot3, boot4)
stack_boot <- function(col, mult = 1) {
  data.frame(
    value = unlist(lapply(boots, function(bt) bt[, col] * mult)),
    r = unlist(lapply(boots, function(bt) bt[, 3]))
  )
}
x_label <- "Number of intermediate cases per day, r"

gen_bootdata <- stack_boot(1)
inc_bootdata <- stack_boot(2)

# Plot a_g and a_i
df1 <- data.frame(r = r_cur, ag = rec[, 1])
df2 <- data.frame(r = r_cur, ai = rec[, 2])
plot1 <- ggplot(df1, aes(x = r, y = ag)) +
  geom_boxplot(data = gen_bootdata, aes(group = r, y = value),
               fill = "lavender", colour = "plum4", alpha = 0.7) +
  geom_line(color = "maroon4") +
  geom_point(color = "maroon4") +
  theme_minimal() +
  scale_y_continuous(name = "Generation time shape parameter") +
  scale_x_continuous(name = x_label)
plot2 <- ggplot(df2, aes(x = r, y = ai)) +
  geom_boxplot(data = inc_bootdata, aes(group = r, y = value),
               fill = "lightblue2", colour = "skyblue4", alpha = 0.7) +
  geom_line(color = "royalblue4") +
  geom_point(color = "royalblue4") +
  theme_minimal() +
  scale_y_continuous(name = "Incubation period shape parameter") +
  scale_x_continuous(name = x_label)
grid.arrange(plot1, plot2, ncol = 2)
#save
g <- arrangeGrob(plot1, plot2, ncol = 2) #generates g
#ggsave(filename = "final_figures/incgen_tianjin_shapes.pdf", g, width = 10, height = 6)

# Plot mean estimates instead (scale b=2.2)
gen_bootdata <- stack_boot(1, mult = b)
inc_bootdata <- stack_boot(2, mult = b)
df5 <- data.frame(
  r = r_cur,
  Mean.generation.time = rec[, 1] * b,
  Mean.incubation.period = rec[, 2] * b
)
ggplot(df5, aes(x = r, y = Mean.generation.time)) +
  geom_boxplot(data = gen_bootdata, aes(group = r, y = value),
               fill = "lavender", colour = "plum4", alpha = 0.7) +
  geom_boxplot(data = inc_bootdata, aes(group = r, y = value),
               fill = "lightblue2", colour = "skyblue4", alpha = 0.7) +
  geom_line(aes(x = r, y = Mean.generation.time,
                color = "Mean generation time")) +
  geom_point(color = "maroon4", aes(x = r, y = Mean.generation.time)) +
  theme_minimal() +
  geom_line(aes(x = r, y = Mean.incubation.period,
                color = "Mean incubation period")) +
  geom_point(color = "royalblue4", aes(x = r, y = Mean.incubation.period)) +
  ylab("Time (days)") +
  scale_color_manual(values = c("Mean generation time" = "maroon4",
                                "Mean incubation period" = "royalblue4")) +
  labs(color = " ") +
  scale_x_continuous(name = x_label)
#ggsave(filename = "final_figures/Fig5lower_incgen_tianjin_means.pdf", width = 10, height = 6)
```
Then the remaining question is to see if we want to handle right truncation without intermediate cases. And the uncertainty and so on there. There, we could continue to do the 3 models (gamma, Weibull, lognormal) because without intermediate cases, it's just the CDFs (which we have access to in R).
We can also optimize the shared scale parameter b
```{r}
# negative log likelihood function for optim
# Three-parameter version of the optim() objective: the common scale is
# optimised along with the two shapes instead of being held fixed.
# NOTE(review): lirt() (from incufuncs.R) is presumed to return each case's
# log-likelihood contribution -- confirm there.
#
# Arguments:
#   threepars    numeric(3): generation time SHAPE, incubation period SHAPE,
#                then the common scale (passed to lirt() as `genshape`,
#                `incshape` and `comscale`).
#   allmaxtimes, allmintimes, allrtTimes
#                per-case vectors, passed one element at a time to lirt() as
#                `maxtime`, `mintime` and `rtTime`.
#   maxinters, rate
#                passed unchanged to lirt(); they default to the globals n, r.
#
# Returns a single number; 0 when there are no cases.
l_optim_3 <- function(threepars, allmaxtimes, allmintimes, allrtTimes,
                      maxinters = n, rate = r) {
  gs <- threepars[1] # generation time shape parameter
  is <- threepars[2] # incubation period shape parameter
  comscale <- threepars[3] # common scale parameter

  # seq_along() yields an empty index set for empty input, whereas
  # 1:length(x) would iterate over c(1, 0).
  indelikes <- vapply(
    seq_along(allmaxtimes),
    function(ind) {
      lirt(
        maxtime = allmaxtimes[ind],
        mintime = allmintimes[ind],
        rtTime = allrtTimes[ind],
        maxinters = maxinters, rate = rate,
        genshape = gs, incshape = is,
        comscale = comscale
      )
    },
    FUN.VALUE = numeric(1)
  )

  -sum(indelikes)
}
# Evaluate the three-parameter objective at shapes (3, 4) and scale b.
l_optim_3(
  c(3, 4, b),
  allmaxtimes = tdata$maxIncTimes,
  allmintimes = tdata$minIncTimes,
  allrtTimes = tdata$Ti,
  maxinters = n, rate = r
) # matches the 2 par version

# Optimise all three parameters with optim()'s default method. `allmintimes`
# is spelled out in full so the call does not depend on partial matching.
optim(
  c(3, 4, b), l_optim_3,
  allmaxtimes = tdata$maxIncTimes,
  allmintimes = tdata$minIncTimes,
  allrtTimes = tdata$Ti,
  maxinters = n, rate = r
)
```
It wants to make the scale b really small and the shape parameters really big - but actually if you look at the resulting distribution it has a very similar mean to the one with b=2.2, so that's good. Let's try running the 2-parameter optim for a reasonable range of b values instead.
```{r}
# Re-fit the two shape parameters with the common scale fixed at i / 20 for
# i = 10..100, i.e. 91 values from 0.5 to 5. Each fit starts both shapes
# at 40 / i.
res <- matrix(NA_real_, nrow = 91, ncol = 2) # columns: ag, ai
for (i in 10:100) {
  ans <- optim(
    c(40 / i, 40 / i), l_optim,
    allmaxtimes = tdata$maxIncTimes,
    allmintimes = tdata$minIncTimes,
    allrtTimes = tdata$Ti,
    maxinters = n, rate = r, comscale = i / 20
  )
  res[i - 9, ] <- ans$par
}

# Scale values for the plots, built once instead of at every use.
b_grid <- seq(10 / 20, 100 / 20, length.out = 91)

# Fitted shapes, and the implied means (shape * scale), against the scale b
dfbg <- data.frame(b = b_grid, ag = res[, 1])
dfbi <- data.frame(b = b_grid, ai = res[, 2])
dfbg2 <- data.frame(b = b_grid, Mean.generation.time = res[, 1] * b_grid)
dfbi2 <- data.frame(b = b_grid, Mean.incubation.period = res[, 2] * b_grid)

plot1 <- ggplot(dfbg, aes(x = b, y = ag)) +
  geom_line(color = "maroon4") +
  geom_point(color = "maroon4") +
  theme_minimal()
plot2 <- ggplot(dfbi, aes(x = b, y = ai)) +
  geom_line(color = "royalblue4") +
  geom_point(color = "royalblue4") +
  theme_minimal()
grid.arrange(plot1, plot2, ncol = 2)

plot1 <- ggplot(dfbg2, aes(x = b, y = Mean.generation.time)) +
  geom_line(color = "maroon4") +
  geom_point(color = "maroon4") +
  theme_minimal()
plot2 <- ggplot(dfbi2, aes(x = b, y = Mean.incubation.period)) +
  geom_line(color = "royalblue4") +
  geom_point(color = "royalblue4") +
  theme_minimal()
grid.arrange(plot1, plot2, ncol = 2)

# on the same plot
df_together <- data.frame(
  b = b_grid,
  Mean.generation.time = res[, 1] * b_grid,
  Mean.incubation.period = res[, 2] * b_grid
)
ggplot(df_together) +
  geom_line(color = "maroon4", aes(x = b, y = Mean.generation.time)) +
  geom_point(color = "maroon4", aes(x = b, y = Mean.generation.time)) +
  theme_minimal() +
  geom_line(color = "royalblue4", aes(x = b, y = Mean.incubation.period)) +
  geom_point(color = "royalblue4", aes(x = b, y = Mean.incubation.period)) +
  ylab("Time (days)")
```