-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcov_sing_cc.Rmd
209 lines (147 loc) · 8.33 KB
/
cov_sing_cc.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
---
title: "Cov(inc period, serial interval) in Singapore"
author: "Jessica Stockdale, Caroline Colijn"
date: "May 19, 2020"
updated: "22/05/2020"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(survminer)
library(survival)
library(tidyverse)
library(lubridate)
library(icenReg)
library(igraph)
library(visNetwork)
library(mvtnorm)
library(viridis)
options(digits=3)
set.seed(3456)
```
## Singapore
First, load in the data
```{r}
spdata <-read_csv("data/COVID-19_Singapore_data_revised.csv", col_types = list(presumed_infected_date = col_datetime()))
#table(spdata$`Related cases`) # There is one cell with "\n", needs to be changed to 'NA'
spdata$`Related cases`[which(spdata$`Related cases` == "\n")] <- NA
# Rename columns 2, 3 and 4 so no spaces
spdata <- rename(spdata, related_cases = starts_with("Related"),
cluster_links = "Cluster links",
relationship_notes = starts_with("Relation"))
# Remove all the cases that do not have info on date of symptom onset
spdata <- filter(spdata, !is.na(date_onset_symptoms))
```
We need to do some preprocessing. NOTE: This step involves assuming that, in each pair, the one who showed symptoms first was the infector.
```{r}
spnodes <- spdata$CaseID
# Split into separate columns
spdata <- separate(spdata,col = related_cases,into = paste("contactID", 1:7, sep = "_"),
fill = "right")
# Turn into numeric values
spdata <- mutate(spdata,
contactID_1 = as.numeric(contactID_1),
contactID_2 = as.numeric(contactID_2),
contactID_3 = as.numeric(contactID_3),
contactID_4 = as.numeric(contactID_4),
contactID_5 = as.numeric(contactID_5),
contactID_6 = as.numeric(contactID_6),
contactID_7 = as.numeric(contactID_7))
# Select down to columns of interest
spedges <- select(spdata, CaseID, starts_with("contactID"))
# Remove rows with NAs for at least one contact
spedges <- filter(spedges, !is.na(spedges$contactID_1))
singedges = data.frame(from=2,to=1)
for (n in 1:nrow(spedges)) {
for (k in 2:ncol(spedges)) {
if (!is.na(spedges[n,k])) {
singedges=rbind(singedges, c(spedges[[n,k]],spedges[[n,1]]))
}
}
}
singedges=singedges[-1,]
# create undirected graph by removing duplicates
undir=data.frame(from = pmin(singedges[,1],singedges[,2]),
to = pmax(singedges[,1], singedges[,2]))
undir = unique(undir)
undir = undir[-which(undir[,1]==undir[,2]),]
spdata_sympt <- select(spdata, CaseID, date_onset_symptoms)
names(spdata_sympt) <- str_replace(names(spdata_sympt), "CaseID", "from")
undir_dates <- left_join(undir, spdata_sympt, by = "from")
names(undir_dates) <- str_replace(names(undir_dates), "date_onset_symptoms", "from_sympt_date")
names(spdata_sympt) <- str_replace(names(spdata_sympt), "from", "to")
undir_dates <- left_join(undir_dates, spdata_sympt, by = "to")
names(undir_dates) <- str_replace(names(undir_dates), "date_onset_symptoms", "to_sympt_date")
undir_dates <- mutate(undir_dates, earliest_sympt_onset = pmin(to_sympt_date, from_sympt_date, na.rm = T),
raw_serial_interval = to_sympt_date - from_sympt_date, #5 NAs because only 1 case in the pair has a date of symptom onset
abs_serial_interval = abs(raw_serial_interval))
pos <- filter(undir_dates, raw_serial_interval >= 0)
neg <- filter(undir_dates, raw_serial_interval < 0)
onlyone <- filter(undir_dates, is.na(raw_serial_interval))
names(neg)
names(neg)[1] <- "to"
names(neg)[2] <- "from"
names(neg)[3] <- "to_sympt_date"
names(neg)[4] <- "from_sympt_date"
names(neg)
undir_dates <- bind_rows(pos, neg, onlyone)
undir_dates$pto <- str_pad(undir_dates$to, width = 2, side = "left", pad = "0")
undir_dates$pfrom <- str_pad(undir_dates$from, width = 2, side = "left", pad = "0")
undir_dates <- mutate(undir_dates, pairID = factor(paste("case", pfrom, "-", "case", pto, sep = "")))
rm(pos, neg, onlyone)
```
We also need to get the incubation periods
```{r}
spdata$minIncTimes <- spdata$date_onset_symptoms - spdata$end_source
spdata$maxIncTimes <- spdata$date_onset_symptoms - spdata$start_source
spdata$maxIncTimes = pmax(3, spdata$maxIncTimes)
spdata$minIncTimes = pmax(1, spdata$minIncTimes)
```
We want to make a data frame with a row for every suspected infector-infectee pair - and including the serial interval for this pair, and the incubation period of both infector and infectee.
```{r}
sing.data <- data.frame(infector = undir_dates$from, infectee = undir_dates$to, serial.interval = undir_dates$abs_serial_interval, inc.infector.min = spdata$minIncTimes[match(undir_dates$from,spdata$CaseID)], inc.infector.max = spdata$maxIncTimes[match(undir_dates$from,spdata$CaseID)], inc.infectee.min = spdata$minIncTimes[match(undir_dates$to,spdata$CaseID)], inc.infectee.max = spdata$maxIncTimes[match(undir_dates$to,spdata$CaseID)])
```
There are some NAs at the end of this (people without symptom onsets), which we filter out
```{r}
sing.data = sing.data[!is.na(sing.data$serial.interval),]
```
```{r}
sing.data$serial.interval = as.numeric(sing.data$serial.interval)
sing.data$Amean = 0.5*(sing.data$inc.infector.min + sing.data$inc.infector.max)
sing.data$Bmean = 0.5*(sing.data$inc.infectee.min + sing.data$inc.infectee.max)
library(ggplot2)
ggplot(data=sing.data, aes(x= serial.interval, y=Bmean, col=Amean) ) + geom_point() + geom_smooth(method='lm')
```
```{r}
fit1 = lm(formula = serial.interval ~ Bmean, data = sing.data)
summary(fit1)
plot(fit1)
cov(sing.data$serial.interval,sing.data$Bmean) # 5.88
cor(sing.data$serial.interval,sing.data$Bmean) # 0.429
hpear = cor.test(sing.data$serial.interval,sing.data$Bmean); hpear
hspear = cor.test(sing.data$serial.interval,sing.data$Bmean,method="spearman"); hspear # 0.174
hkend= cor.test(sing.data$serial.interval,sing.data$Bmean,method = "kendall"); hkend # 0.134
```
The portion of SI - incubation period that is negative is the portion asymp transmission and in this raw data it is high. However, we probably want to exclude really long IPs and SIs because they are unlikely to be direct samples from the distributions. let's try this. I find that the correlation is not sensitive to removing long SIs but it is sensitive to removing long B incubation periods. But this is a bit unfair -- naturally removing the largest mean values will make a difference.
What we could do is remove rows where we don't have a good estimate of min and max, so our mean incubation period for the infectee is not a good one. This removes quite a few rows, but it is justified, I think.
```{r}
singd = filter(sing.data, inc.infectee.min > 1 & inc.infectee.max < 20 ) # only 16 rows left
singd = filter(sing.data, inc.infectee.min > 1 | inc.infectee.max < 20 ) # NOTE OR works really well
singd = filter(sing.data, inc.infectee.min > 1 ) #
h = cor.test(singd$serial.interval, singd$Bmean, method = "spearman"); h
h = cor.test(singd$serial.interval, singd$Bmean); h
h = cor.test(singd$serial.interval, singd$Bmean, method="kendall"); h
```
The correlation is preserved but significance is lost (around 0.1 now)
Now, having estimated the SI and the incubation period gamma parameters in our various ways, we wish to sample from the joint distribution. Or from some joint distributions that can reflect the various estimates we have made.
```{r}
library(lcmix)
# install.packages("lcmix", repos="http://R-Forge.R-project.org")
mysamps = rmvgamma(n=500, shape=c(2.5,3.75), rate=c(0.5,0.5), corr=matrix(c(1, 0.3, 0.3, 1), nrow = 2))
plot(mysamps[,1],mysamps[,2])
hist(mysamps[,1]-mysamps[,2],breaks = 30)
length(which(mysamps[,1]-mysamps[,2] <0))/nrow(mysamps)
```
Those are high numbers. We could use the shape, scale info for the incubation period and the mean, variance for the serial intervals, resampling and so on, to estimate this. I would like to know how this number depends on the covariance. It will also depend on the difference in means - if the means are too different, no matter what the covariance the fraction is high ..
I will pick this up later.
We move this to the file presymptomatic .. revised.Rmd to complete. The Singapore correlation estimates is higher than Tianjin - it is about 0.429, but that is likely driven by a few outliers. The spearman correlation is 0.448 but p is 0.08. Some indication of correlation, for sure.