---
title: "Cov(inc period, serial interval) in Tianjin"
author: "Jessica Stockdale, Caroline Colijn"
date: "May 19, 2020"
updated: "22/05/2020"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(viridis)
library(tidyverse)
library(lubridate)
options(digits=3)
set.seed(3456)
```
## Tianjin
First, load in the data
```{r}
tdata=read.csv("data/Tianjin135cases_revised.csv",na.strings = "", stringsAsFactors = F)
tdata$symptom_onset=as.Date(tdata$symptom_onset, format = "%d/%m/%Y")
tdata$start_source=as.Date(tdata$start_source, format = "%d/%m/%Y")
tdata$end_source=as.Date(tdata$end_source,format = "%d/%m/%Y" )
tdata$confirm_date=as.Date(tdata$confirm_date,format = "%d/%m/%Y" )
tdata <- tdata[which(!is.na(tdata$symptom_onset)),]
names(tdata)[1] = "case_id"
tdata$Infection_source <- str_replace(tdata$Infection_source, pattern = "JN", replacement = "TJ")
```
We need to do some preprocessing. NOTE: This step involves assuming that, in each pair, the one who showed symptoms first was the infector.
```{r}
tdata$Infection_source <- str_to_lower(tdata$Infection_source)
tdata$Infection_source <- str_trim(tdata$Infection_source)
tdata$Infection_source_dup <- tdata$Infection_source
tdata$Infection_source_dup <- str_replace_all(tdata$Infection_source_dup, pattern = "person", replacement = "individual")
tdata$Infection_source_dup <- str_replace(tdata$Infection_source_dup,
pattern = "coworker of a individual from wuhan",
replacement = "coworker")
tdata <- mutate(tdata, source_group = case_when(!is.na(str_match(Infection_source_dup, "wuhan|hubei")) ~ "Wuhan and Hubei", #Priority 1
!is.na(str_match(Infection_source_dup, "mall|store|shopper|shopping")) ~ "Mall", #Priority 1
!is.na(str_match(Infection_source_dup, "family|relative|wife|mother|son|sister|daughter|brother|husband|duaghtor|whife|hunsband")) ~ "Relative", #Priority 2
!is.na(str_match(Infection_source_dup, "coworker|business|workplace|colleague|colleage")) ~ "Coworker", #Priority 2
!is.na(str_match(Infection_source_dup, "tj|patient")) ~ "Other relationship", #Priority 2
!is.na(str_match(Infection_source_dup, "train|travel|trip|hebei|dalian")) ~ "Other travel", #Priority 3
!is.na(str_match(Infection_source_dup, "unknown|unclear")) ~ "Unknown", #Priority 5
is.na(Infection_source_dup) ~ "Unknown", #Priority 5
T ~ "other")) #there should be none of these, so this is just a sanity check!
#Remove the duplicated infection source column which is no longer necessary
tdata <- select(tdata, -Infection_source_dup)
#What is distribution of probably source of infection (grouped)?
table(tdata$source_group)
mynodes <- tdata$case_id
mynodes <- str_to_lower(mynodes)
tdata$case_id <- str_to_lower(tdata$case_id)
edges = data.frame(from=mynodes[9],to=mynodes[21],stringsAsFactors = F ) # I read this one manually
for (id in 1:nrow(tdata)) {
tonode=tdata$case_id[id]
fromnodes=str_extract_all(tdata$Infection_source[id], "tj\\d+", simplify = T) # in lower case to match the lower-cased Infection_source above
if (length(fromnodes)>0) {
for (k in 1:length(fromnodes)) {
edges=rbind(edges, c(fromnodes[k], tonode))
}
}
}
head(edges)
edges=edges[-1,] #Remove the initial relationship we gave so it isn't duplicated
edges = edges[!is.na(edges[,1]),] # remove NAs that arose from a few empty entries for Infection_source
tdata_sympt <- select(tdata, case_id, symptom_onset)
names(tdata_sympt) <- str_replace(names(tdata_sympt), "case_id", "from")
undir_tdates <- left_join(edges, tdata_sympt, by = "from")
names(undir_tdates) <- str_replace(names(undir_tdates), "symptom_onset", "from_sympt_date")
# Repeat, but add the date of symptom onset for the caseID of the 'to' case
names(tdata_sympt) <- str_replace(names(tdata_sympt), "from", "to")
undir_tdates <- left_join(undir_tdates, tdata_sympt, by = "to")
names(undir_tdates) <- str_replace(names(undir_tdates), "symptom_onset", "to_sympt_date")
undir_tdates <- mutate(undir_tdates, earliest_sympt_onset = pmin(to_sympt_date, from_sympt_date, na.rm = T),
raw_serial_interval = to_sympt_date - from_sympt_date,
abs_serial_interval = abs(raw_serial_interval))
pos <- filter(undir_tdates, raw_serial_interval >= 0)
neg <- filter(undir_tdates, raw_serial_interval < 0)
onlyone <- filter(undir_tdates, is.na(raw_serial_interval))
names(neg)
names(neg)[1] <- "to"
names(neg)[2] <- "from"
names(neg)[3] <- "to_sympt_date"
names(neg)[4] <- "from_sympt_date"
names(neg)
undir_tdates <- bind_rows(pos, neg, onlyone)
undir_tdates$pto <- str_replace(undir_tdates$to, pattern = "tj", replacement = "")
undir_tdates$pto <- str_pad(undir_tdates$pto, width = 3, side = "left", pad = "0")
undir_tdates$pfrom <- str_replace(undir_tdates$from, pattern = "tj", replacement = "")
undir_tdates$pfrom <- str_pad(undir_tdates$pfrom, width = 3, side = "left", pad = "0")
undir_tdates <- mutate(undir_tdates, pairID = factor(paste("tj", pfrom, "-", "tj", pto, sep = "")))
rm(pos, neg, onlyone)
```
We also need to get the incubation periods
```{r}
tdata$end_source[which(is.na(tdata$end_source))]=tdata$symptom_onset[which(is.na(tdata$end_source))] # if no end exposure: set to symptom onset
tdata$end_source = pmin(tdata$end_source, tdata$symptom_onset) # if end exposure after onset, set to onset
tdata$start_source[which(is.na(tdata$start_source))]= tdata$symptom_onset[which(is.na(tdata$start_source))] - 20 # if no start, set to symptom onset - 20
tdata$maxIncTimes = tdata$symptom_onset - tdata$start_source # longest possible incubation time
tdata$minIncTimes = tdata$symptom_onset - tdata$end_source # shortest possible incubation time
tdata$maxIncTimes = pmax(3, tdata$maxIncTimes) # floor the maximum at 3 days
tdata$minIncTimes = pmax(1, tdata$minIncTimes) # floor the minimum at 1 day
```
We want to make a data frame with a row for every suspected infector-infectee pair - and including the serial interval for this pair, and the incubation period of both infector and infectee.
```{r}
tianjin.data <- data.frame(infector = undir_tdates$from,
                           infectee = undir_tdates$to,
                           serial.interval = undir_tdates$abs_serial_interval,
                           inc.infector.min = tdata$minIncTimes[match(undir_tdates$from, tdata$case_id)],
                           inc.infector.max = tdata$maxIncTimes[match(undir_tdates$from, tdata$case_id)],
                           inc.infectee.min = tdata$minIncTimes[match(undir_tdates$to, tdata$case_id)],
                           inc.infectee.max = tdata$maxIncTimes[match(undir_tdates$to, tdata$case_id)])
```
There are some NAs at the end of this, which we filter out
```{r}
tianjin.data = tianjin.data[!is.na(tianjin.data$serial.interval),]
```
```{r}
tianjin.data$serial.interval = as.numeric(tianjin.data$serial.interval)
tianjin.data$Amean = 0.5*(tianjin.data$inc.infector.min + tianjin.data$inc.infector.max) # midpoint incubation period of the infector (A)
tianjin.data$Bmean = 0.5*(tianjin.data$inc.infectee.min + tianjin.data$inc.infectee.max) # midpoint incubation period of the infectee (B)
library(ggplot2)
ggplot(data=tianjin.data, aes(x= serial.interval, y=Bmean, col=Amean) ) + geom_point() + geom_smooth(method='lm')
```
Interesting: it looks like there is a mild correlation, positive between the serial interval and the B (infectee) incubation period, and negative with the A (infector) incubation period.
```{r}
fit1 = lm(formula = serial.interval ~ Bmean, data = tianjin.data)
summary(fit1)
plot(fit1)
cov(tianjin.data$serial.interval,tianjin.data$Bmean) # 2.63
hpear = cor.test(tianjin.data$serial.interval,tianjin.data$Bmean); hpear #0.289
hspear = cor.test(tianjin.data$serial.interval,tianjin.data$Bmean,method="spearman"); hspear # 0.285
hkend= cor.test(tianjin.data$serial.interval,tianjin.data$Bmean,method = "kendall"); hkend # 0.206
```
The portion of SI minus incubation period that is negative is the proportion of pre-symptomatic transmission, and in this raw data it is high. However, we probably want to exclude really long incubation periods and SIs because they are unlikely to be direct samples from the distributions, so let's try this. I find that the correlation is not sensitive to removing long SIs, but it is sensitive to removing long B incubation periods. That is a bit unfair, though: naturally, removing the largest mean values will make a difference.
What we could do instead is remove rows where we don't have a good estimate of the min and max, and hence where our mean incubation period for the infectee is not a good one. This removes quite a few rows, but it is justified, I think.
```{r}
tjd = filter(tianjin.data, inc.infectee.min > 1 & inc.infectee.max < 20 ) # only 16 rows left
tjd = filter(tianjin.data, inc.infectee.min > 1 | inc.infectee.max < 20 ) # NOTE: OR works really well
tjd = filter(tianjin.data, inc.infectee.min > 1 ) # this last, least restrictive filter is the one used below
h = cor.test(tjd$serial.interval, tjd$Bmean, method = "spearman"); h
h = cor.test(tjd$serial.interval, tjd$Bmean); h
h = cor.test(tjd$serial.interval, tjd$Bmean, method="kendall"); h
```
The correlation is preserved, but significance is lost (p is around 0.1 now).
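As a quick check of the claim above that the correlation is not sensitive to removing long SIs, here is a sketch; the 16-day cutoff is an arbitrary illustrative choice of mine, not a value from the paper.
```{r}
# Sensitivity sketch (arbitrary 16-day cutoff): drop unusually long serial
# intervals and re-check the correlation with the infectee's mean incubation period.
tjd_shortSI = filter(tianjin.data, serial.interval <= 16)
cor.test(tjd_shortSI$serial.interval, tjd_shortSI$Bmean)
```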
Now, having estimated the SI and the incubation period gamma parameters in our various ways, we wish to sample from the joint distribution. Or from some joint distributions that can reflect the various estimates we have made.
```{r}
library(lcmix) # for rmvgamma, which draws correlated multivariate gamma samples
# install.packages("lcmix", repos="http://R-Forge.R-project.org")
mysamps = rmvgamma(n=500, shape=c(2.5,3.75), rate=c(0.5,0.5), corr=matrix(c(1, 0.3, 0.3, 1), nrow = 2))
plot(mysamps[,1],mysamps[,2])
hist(mysamps[,1]-mysamps[,2],breaks = 30)
length(which(mysamps[,1]-mysamps[,2] <0))/nrow(mysamps)
```
Those are high numbers. We could use the shape and scale info for the incubation period, and the mean and variance for the serial intervals, with resampling and so on, to estimate this. I would like to know how this number depends on the covariance. It will also depend on the difference in means: if the means are too different, then no matter what the covariance is, the fraction is high.
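A quick illustration of that last point, as a sketch with arbitrary parameter values (the real estimates are used below): holding the correlation at 0.3, the fraction of negative differences is driven mainly by how far apart the two marginal means are.
```{r}
# Vary the mean of the second margin (5, 7.5, 10) while keeping the first
# margin's mean at 5 and the correlation at 0.3; the fraction of negative
# differences grows quickly as the means separate. Illustrative values only.
sapply(c(5, 7.5, 10), function(m2) {
  s = rmvgamma(n = 5000, shape = c(2.5, m2 * 0.5), rate = c(0.5, 0.5),
               corr = matrix(c(1, 0.3, 0.3, 1), nrow = 2))
  mean(s[, 1] - s[, 2] < 0)
})
```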
In this file I'll look at the estimates from the main text: (1) the early and (2) the late incubation period without intermediates, plus the serial intervals from the ICC, and then (3), roughly as above, the generation time and the serial interval.
The first issue is that I have estimated covariance of inc period and si, but in (3) we estimated the generation time (not incubation period) and SI. The next issue is that I did not estimate a gamma distributed SI from the ICC method; it was normally distributed. I could use a gamma with the same mean and variance, or find a way to sample a joint distribution where there is a gamma on one margin and normal on another. Weird. However, allowing the SI to have slightly higher variance than a normal distribution and positive support seems completely reasonable so I will proceed with a gamma assumption.
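For the record, one way to sample such a mixed-margin joint distribution is via a Gaussian copula. The sketch below is only the alternative I am not pursuing, using the early point estimates listed in Table S2 below; note that the copula correlation parameter is an approximation to, not exactly, the Pearson correlation of the margins.
```{r}
# Gaussian-copula sketch (not used below): gamma margin for the incubation
# period, normal margin for the SI, with a correlation of roughly 0.26.
library(mvtnorm)
rho = 0.263
z = rmvnorm(5000, mean = c(0, 0), sigma = matrix(c(1, rho, rho, 1), nrow = 2))
u = pnorm(z)                                      # uniform margins
inc = qgamma(u[, 1], shape = 6.01, scale = 1.14)  # gamma incubation period
si  = qnorm(u[, 2], mean = 4.31, sd = 0.935)      # normal serial interval
hist(si - inc, breaks = 30)
```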
TABLE S2 in the paper:

Incubation period:

- Early: median 6.48, shape 6.01 (3.61, 7.26), scale 1.140 (0.66, 1.276)
- Late: median 12.11, shape 17.78 (9.52, 21.47), scale 0.695 (0.379, 0.778)

Serial interval: mean 4.31 with sigma = 0.935 (with 4 cases per cluster)
To get an approximately similar (same mean and variance) gamma distribution, I need the mean 4.31 to be $a\theta$ and the variance to be $a\theta^2$, where $a$ is the shape and $\theta$ is the scale. This gives two equations and two unknowns: $a\theta = 4.31$ and $a\theta^2 = \sigma^2 = 0.935^2 = 0.874$. This gives $\theta = a\theta^2 / (a\theta) = 0.874/4.31 = 0.203$ and $a = 4.31/0.203 = 21.2$.
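As a small sketch, this moment matching can be wrapped in a helper (the function name is mine, not part of the analysis):
```{r}
# Moment matching: convert a mean and standard deviation into gamma shape and
# scale, using theta = s^2 / m and a = m / theta.
gamma_from_moments = function(m, s) {
  theta = s^2 / m
  a = m / theta
  c(shape = a, scale = theta)
}
gamma_from_moments(4.31, 0.935) # approximately shape 21.2, scale 0.203
```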
```{r}
mean(rgamma(n=10000, shape = 21.2, scale=0.203))
var(rgamma(n=10000, shape = 21.2, scale=0.203))
```
Good enough.
Now, I don't have certainty about these estimates, but for the sake of this analysis I'll just use the central point estimates, for simplicity.
EARLY: histogram and one sample
```{r}
mysamps = rmvgamma(n=5000, shape=c(6.01,21.2), rate=c(1/1.14,1/0.203), corr=matrix(c(1, 0.263, 0.263, 1), nrow = 2)) # column 1: incubation period, column 2: SI; rate = 1/scale
plot(mysamps[,1],mysamps[,2])
hist(mysamps[,1]-mysamps[,2],breaks = 30)
length(which(mysamps[,1]-mysamps[,2] <0))/nrow(mysamps)
```
EARLY: resample from the gamma fit for the incubation period, sample the SI gamma shape around 21 (scale fixed at 0.203) with a little uncertainty, and create a histogram for pre-symptomatic transmission. This could go in the paper.
```{r}
load("data/tianjin_inc_fits.Rdata")
incparsamps = exp(rmvnorm(n=100,
mean = Eallthree$myfit_gamma$coefficients,
sigma=Eallthree$myfit$var))
simeansamps = rnorm(100, mean=21, sd=1)
bigsamps = lapply(1:100, function(x)
rmvgamma(n=500, shape=c(incparsamps[x,1], simeansamps[x]),
rate=c(incparsamps[x,2], 1/0.203),
corr = matrix(c(1, 0.263, 0.263, 1), nrow = 2)))
bigsamps = do.call(rbind, bigsamps)
hist( bigsamps[,1] - bigsamps[,2])
```
Functional version
```{r}
getMyDiffs = function(statfit, cormean=0.29) {
# choose shape and scale according to our fits
incparsamps = exp(rmvnorm(n=100,
mean = statfit$coefficients,
sigma=statfit$var))
# choose the SI gamma shape (around 21, i.e. mean SI about 4.3) with a bit of uncertainty too, but not too much
simeansamps = rnorm(100, mean=21, sd=1)
# we don't really even know the correlation between inc and si
corvals = rnorm(100, mean=cormean, sd = 0.04)
# sample:
bigsamps = lapply(1:100, function(x)
rmvgamma(n=500, shape=c(incparsamps[x,1], simeansamps[x]),
rate=c(1/incparsamps[x,2], 1/0.203),
corr = matrix(c(1, corvals[x], corvals[x], 1), nrow = 2)))
bigsamps = do.call(rbind, bigsamps)
return(data.frame(incs=bigsamps[,1], sis=bigsamps[,2],diffs= bigsamps[,2] - bigsamps[,1]))
}
earlydiffs = getMyDiffs(Eallthree$myfit_gamma)
anydiffs=getMyDiffs(allthree$myfit_gamma)
latediffs= getMyDiffs(Lallthree$myfit_gamma)
d1=data.frame(TimeDiff=earlydiffs$diffs, group="Early")
d2=data.frame(TimeDiff=latediffs$diffs, group="Late")
d3=data.frame(TimeDiff=anydiffs$diffs, group="Unstratified")
df=rbind(d1,d2, d3)
ggplot(data=df, aes(x=TimeDiff, fill=group))+geom_histogram(position="dodge", bins=30)+theme_bw()+ggtitle("Tianjin")
```
EARLY: dependence of the pre-symptomatic fraction on the estimated correlation
```{r}
fracpresym <- function(n, shape, rate, corval) {
  # draw n correlated gamma samples with the given marginal parameters and
  # return the fraction of negative incubation-minus-SI differences
  mysamps = rmvgamma(n=n, shape=shape, rate=rate, corr=matrix(c(1, corval, corval, 1), nrow = 2))
  return(length(which(mysamps[,1]-mysamps[,2] < 0))/n)
}
corvec = seq(0.001, 0.4, by=0.01)
fpre = vapply(corvec,function(x)
fracpresym(5000, shape=c(6.01,21.2), rate=c(1/1.14,1/0.203),
corval = x),
FUN.VALUE = 1)
plot(corvec, fpre) # in this region it is not as sensitive to the correlation as to the difference in means
```
EARLY: dependence on the shape of the gamma for the incubation period
```{r}
shape1vec = seq(4, 8, by=0.1)
fpre = vapply(shape1vec,function(x)
fracpresym(5000, shape=c(x,21.2), rate=c(1/1.14,1/0.203),
corval = 0.263),
FUN.VALUE = 1)
plot(shape1vec, fpre) # in this region it is not as sensitive to the correlation as to the difference in means
```
This is great: if I do the naive thing and subtract independent samples of the SI from the incubation period, I get about 20% pre-symp transmission
```{r}
mean(rgamma(1000, shape = 6, scale = 1.14) - rnorm(1000, mean = 4.31, sd = 0.93) < 0) # proportion of negative differences with independent samples
```
which is consistent and shows that the correlation decreases this a little.
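To see that side by side, here is a sketch comparing the fraction of negative incubation-minus-SI differences with the estimated correlation of 0.263 versus with independent margins, using the same early point estimates as above.
```{r}
# Same early parameters as above; positive correlation tightens the difference
# distribution, so the fraction of negative differences drops slightly.
with_cor = rmvgamma(n = 5000, shape = c(6.01, 21.2), rate = c(1/1.14, 1/0.203),
                    corr = matrix(c(1, 0.263, 0.263, 1), nrow = 2))
indep = rmvgamma(n = 5000, shape = c(6.01, 21.2), rate = c(1/1.14, 1/0.203),
                 corr = diag(2))
c(correlated = mean(with_cor[, 1] - with_cor[, 2] < 0),
  independent = mean(indep[, 1] - indep[, 2] < 0))
```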
For the late Tianjin estimate with a mean of 12, pretty much all of the transmission will have to be pre-symptomatic, but then, we don't really believe a median incubation period of 12 any more.
For the combined Tianjin clusters we have median 8.06, shape 4.74 (3.35, 5.72), scale 1.827 (1.285, 2.045); for the late data we have median 12.11, shape 17.78 (9.52, 21.47), scale 0.695 (0.379, 0.778).
```{r}
# given the statfit (from Eallthree or Lallthree) I could get samples of the shape and scale parameters like this, but right now I am not going to:
# x = exp(rmvnorm(n=10000, mean = statfit$coefficients, sigma = statfit$var))
```