---
title: "Singapore Serial Intervals - Revisions"
author: "Caroline Colijn, Michelle Coombe, Manu Saraswat and Jessica Stockdale"
date: "`r Sys.Date()`"
output:
html_document:
keep_md: TRUE
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(survminer)
library(survival)
library(tidyverse)
library(lubridate)
library(icenReg)
library(igraph)
library(visNetwork)
library(mvtnorm)
library(viridis)
options(digits=3)
set.seed(3456)
```
## Singapore data
Thanks to the EpiCoronaHack Cluster team. These data were manually entered from postings on the Government of Singapore website: [website](https://www.moh.gov.sg/covid-19). The data include the 93 cases confirmed between Jan 19 and Feb 26, 2020. Date of symptom onset, close contacts and a few transmission clusters were identified for many of the Singapore cases.
Here we will load and examine the data prior to analysis.
```{r}
#spdata <- read_csv("data/COVID-19_Singapore_formated_dates.csv")
spdata <-read_csv("data/COVID-19_Singapore_data_revised.csv", col_types = list(presumed_infected_date = col_datetime()))
# Ensure properly imported
glimpse(spdata)
#table(spdata$`Related cases`) # There is one cell with "\n", needs to be changed to 'NA'
spdata$`Related cases`[which(spdata$`Related cases` == "\n")] <- NA
# Rename columns 2, 3 and 4 so no spaces
spdata <- rename(spdata, related_cases = starts_with("Related"),
cluster_links = "Cluster links",
relationship_notes = starts_with("Relation"))
# Make sure we are seeing the number of missing data we are expecting
colSums(is.na(spdata))
# make sure dates parsed properly
range(spdata$presumed_infected_date, na.rm = T)
range(spdata$last_poss_exposure, na.rm = T)
range(spdata$contact_based_exposure, na.rm = T)
range(spdata$date_onset_symptoms, na.rm = T)
range(spdata$date_quarantine, na.rm = T)
range(spdata$date_hospital, na.rm = T)
range(spdata$date_confirmation, na.rm = T)
range(spdata$date_discharge, na.rm = T)
range(spdata$start_source, na.rm = T)
range(spdata$end_source, na.rm = T)
```
**Missing data and imputation**: We have 10 cases that are missing date of symptom onset. We will start the analysis by just removing these cases from the dataset; however, we will then repeat the analysis when imputing the data to determine the effect of removing these cases.
```{r}
# Keep a copy of the original dataset with missing data for imputation later
spdata_org <- spdata
# Remove all the cases that do not have info on date of symptom onset
spdata <- filter(spdata, !is.na(date_onset_symptoms))
# This removes 10 cases; we will examine the effect of imputing these values later on
```
**End_source and start_source columns**:
In order to determine the ICC intervals later, we need columns that define a window for the possible start and end of exposure to the virus. These are now explicitly listed for both the Tianjin and Singapore datasets in the 'start_source' and 'end_source' columns.
The rules for defining these start and end dates are as follows:
- For Wuhan travel cases, their end_source is equal to the time they travelled from Wuhan. In the absence of any other contact info, their start_source is equal to their symptom onset - 20 days, to account for wide uncertainty.
- For cluster cases thought to originate from an index case (but with no further known dates of contact), the start_source is set to the 1st symptom onset in the cluster - 7 days. The end_source is set to the minimum of the earliest quarantine or hospitalization date in the cluster and the symptom onset date of the case in question. (We assume that once a case in a cluster was identified, people were well aware of this and stopped mixing.)
- For cluster cases thought to originate from a specific meeting/event (e.g. company meeting at Grand Hyatt hotel), the start_source is set to the 1st known meeting day. The end_source is set to that day + 4. (4 to account for some error/uncertainty)
- For cases with no known contact or travel info, their start_source is their symptom onset - 20 and their end_source is their symptom onset date (essentially, we have no information on these cases)
If no other end time for the exposure is given (by a known epidemiological route) or if the end of the exposure time is after the time of symptom onset, we set the last exposure time to the symptom onset time. This is because they must have been exposed before symptom onset.
Because this is explicitly about the symptom onset, we removed those who don't have symptom onset defined. (But see effect of imputing those values later).
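As an illustration only (the 'end_source' column in the distributed data already encodes this rule), a minimal sketch of capping the end of the exposure window at symptom onset might look like this; the chunk is not evaluated:
```{r, eval=FALSE}
# Sketch only: cap end_source at the date of symptom onset,
# since exposure must have occurred before symptoms began.
spdata <- mutate(spdata, end_source = pmin(end_source, date_onset_symptoms))
```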
```{r}
# Let's confirm that the end_source is always before or equal to the symptom onset date
sum(spdata$end_source>spdata$date_onset_symptoms) # =0. Good
```
## Data description for manuscript
Here we will determine some of the summary statistics about the dataset useful for our introduction.
```{r}
#Range of confirmed dates
(srng <- range(spdata_org$date_confirmation))
#Number of recovered
(snotsick <- sum(!is.na(spdata_org$date_discharge)))
#Number died
(sdead <- sum(!is.na(spdata_org$death)))
#Average (symptom onset - end exposure window)
(s.avgendexp <- mean(spdata_org$date_onset_symptoms - spdata_org$end_source, na.rm = T))
(s.sdendexp <- sd(spdata_org$date_onset_symptoms - spdata_org$end_source, na.rm = T))
#Average time between symptom onset and date of confirmation
(s.avgtest <- mean(spdata_org$date_confirmation - spdata_org$date_onset_symptoms, na.rm = T))
(s.sdtest <- sd(spdata_org$date_confirmation - spdata_org$date_onset_symptoms, na.rm = T))
#Average duration of hospitalization (date_discharged - date_hospitalization)
(s.avghosp <- mean(spdata_org$date_discharge - spdata_org$date_hospital, na.rm = T))
(s.sdhosp <- sd(spdata_org$date_discharge - spdata_org$date_hospital, na.rm = T))
```
New confirmed cases in this dataset occurred between `r srng[1]` and `r srng[2]`. The number of recovered patients is `r snotsick`/`r nrow(spdata_org)` (`r snotsick/nrow(spdata_org)*100`%), while the number of patients who had died is `r sdead` (`r sdead/nrow(spdata_org)*100`%). The average time between symptom onset and the end of the possible exposure window is `r s.avgendexp` (sd `r s.sdendexp`). The average time between symptom onset and case confirmation is `r s.avgtest` (sd `r s.sdtest`). The duration of hospitalization is on average `r s.avghosp` (sd `r s.sdhosp`).
## Estimates of serial interval from Singapore data (without imputation)
We will estimate the serial interval using the 'interval case to case' approach given in Vink et al (https://academic.oup.com/aje/article/180/9/865/2739204).
The dataset has several instances where a putative infector or contact is known. These are listed in the 'related_cases' column. We first make a graph in which nodes are individual cases and edges run from each case listed as a possible source to the case for whom it is a possible source.
```{r}
spnodes <- spdata$CaseID
## How to extract caseIDs from related_cases column - there are multiple values in some cells, separated by commas
#The +1 is because the comma splits between 2 related caseIDs (i.e. related cases with only 1 entry will have no comma!)
max(str_count(spdata$related_cases, pattern = ",") + 1,
na.rm = T)
#7 max within one cell
# Split into separate columns
spdata <- separate(spdata,
col = related_cases,
into = paste("contactID", 1:7, sep = "_"),
fill = "right")
# Turn into numeric values
spdata <- mutate(spdata,
contactID_1 = as.numeric(contactID_1),
contactID_2 = as.numeric(contactID_2),
contactID_3 = as.numeric(contactID_3),
contactID_4 = as.numeric(contactID_4),
contactID_5 = as.numeric(contactID_5),
contactID_6 = as.numeric(contactID_6),
contactID_7 = as.numeric(contactID_7))
# Select down to columns of interest
spedges <- select(spdata, c(CaseID, starts_with("contactID")))
# Remove rows with NAs for at least one contact
spedges <- filter(spedges, !is.na(spedges$contactID_1)) #43 CaseIDs with 1 or more possible contacts
```
Both visNetwork and igraph require an edge list with "from" and "to" nodes, so for each row of spedges we create one edge per non-missing contact.
```{r}
singedges = data.frame(from=2,to=1)
for (n in 1:nrow(spedges)) {
for (k in 2:ncol(spedges)) {
if (!is.na(spedges[n,k])) {
singedges=rbind(singedges, c(spedges[[n,k]],spedges[[n,1]]))
}
}
}
singedges=singedges[-1,]
# create undirected graph by removing duplicates
undir=data.frame(from = pmin(singedges[,1],singedges[,2]),
to = pmax(singedges[,1], singedges[,2]))
undir = unique(undir)
undir = undir[undir$from != undir$to, ] # remove self-pairs (robust even if there are none)
```
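For reference, the same undirected pair list ('undir') could also be built without the explicit loops using tidyr/dplyr; this is a sketch only (not evaluated), assuming the 'spedges' object created above:
```{r, eval=FALSE}
# Sketch of a loop-free construction of the undirected pair list:
# pivot the contactID_* columns to long format, drop missing contacts,
# orient each pair so 'from' is the smaller CaseID, then remove self-pairs and duplicates.
undir_alt <- spedges %>%
  pivot_longer(cols = starts_with("contactID"),
               values_to = "contact", values_drop_na = TRUE) %>%
  transmute(from = pmin(contact, CaseID), to = pmax(contact, CaseID)) %>%
  filter(from != to) %>%
  distinct()
```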
As relationships in the 'related_cases' column were not directional, we need to make sure the cases labelled as "from" (aka the infectors) actually got the virus prior to those in the "to" column (aka the infectees). It is reasonable to assume that cases that were infected first will show signs of infection first, so within case-pairs we will assign the case with the earliest date of symptom onset as the "from" (infector) case and the case with the later date of symptom onset as "to".
To do this, let's start by making a new dataset from our case pairs ('undir') that contains date of symptom onset for each case. We will make a few new columns, both for determining which case has the earliest date of symptom onset, as well as to plot serial intervals over time later on.
```{r}
# Make a smaller dataset of original spdata that contains only the CaseID and date of symptom onset
spdata_sympt <- select(spdata_org, CaseID, date_onset_symptoms)
# Add the date of symptom onset -for the caseID of the 'from' case - to the case pairs dataset (undir)
#Do some renaming so the join is based on the caseID in the from column and that name of date column reflects this
names(spdata_sympt) <- str_replace(names(spdata_sympt), "CaseID", "from")
undir_dates <- left_join(undir, spdata_sympt, by = "from")
names(undir_dates) <- str_replace(names(undir_dates), "date_onset_symptoms", "from_sympt_date")
# Repeat, but add the date of symptom onset for the caseID of the 'to' case
names(spdata_sympt) <- str_replace(names(spdata_sympt), "from", "to")
undir_dates <- left_join(undir_dates, spdata_sympt, by = "to")
names(undir_dates) <- str_replace(names(undir_dates), "date_onset_symptoms", "to_sympt_date")
# Now add some extra columns which give us the raw serial interval (i.e. number of days between symptom onset in infector-infectee pairs)
#As well as the absolute value of the serial interval (as some cases in the "from" and "to" columns should be switched around!)
#And finally a 'direction' column in case we need to sort out which directions the arrows should be going in for a network graph and where we have missing dates
undir_dates <- mutate(undir_dates, earliest_sympt_onset = pmin(to_sympt_date, from_sympt_date, na.rm = T),
raw_serial_interval = to_sympt_date - from_sympt_date, #5 NAs because only 1 case in the pair has a date of symptom onset
abs_serial_interval = abs(raw_serial_interval))
```
Now we need to split the dataset apart so that we can switch around the directionality of presumed transmission for case-pairs where the serial interval is negative. Easiest way to do this is to rename columns and then join back to the other parts of the dataset, based on the column names.
```{r}
# Split dataset into positive (or 0) serial interval vs. negative vs. NA
#A negative serial interval means our "to" and "from" cases are mixed up
pos <- filter(undir_dates, raw_serial_interval >= 0)
neg <- filter(undir_dates, raw_serial_interval < 0)
onlyone <- filter(undir_dates, is.na(raw_serial_interval)) #Keep to make columns for graphs later
# Negative dataset needs the column headers changed to reflect that the 'from' and 'to' columns are backwards
#as we are assuming that the 'case with the earliest onset of symptoms would have been infected first,
#and passed on the infection to the other case in the pair
names(neg)
names(neg)[1] <- "to"
names(neg)[2] <- "from"
names(neg)[3] <- "to_sympt_date"
names(neg)[4] <- "from_sympt_date"
names(neg)
# Now bind the rows of the separated datasets back together based on column names
#Must use dplyr::bind_rows to bind based on column name rather than position
undir_dates <- bind_rows(pos, neg, onlyone)
# For plotting - Add a column with padded to and from caseID numbers so they print in numerical order
#Add a zero on the left of the number so all numbers have two digits
undir_dates$pto <- str_pad(undir_dates$to, width = 2, side = "left", pad = "0")
undir_dates$pfrom <- str_pad(undir_dates$from, width = 2, side = "left", pad = "0")
# For plotting - Make a new column with case pair ID
undir_dates <- mutate(undir_dates, pairID = factor(paste("case", pfrom, "-", "case", pto, sep = "")))
rm(pos, neg, onlyone)
```
From this edge list we can use visNetwork to visualise the graph. Make 'group' based on the source of probable infection. Colours are from the infection source column (but we could have a better colour scheme, like date of symptom onset).
```{r}
# Turn 'presumed_reason' into lower case and trim any whitespace so we don't have issues with case sensitivity, etc
spdata$presumed_reason <- str_to_lower(spdata$presumed_reason)
spdata$presumed_reason <- str_trim(spdata$presumed_reason)
#table(spdata$presumed_reason)
sum(is.na(spdata$presumed_reason)) #16 NAs
# Make a new column where we group the 'presumed_reason' under a label (known relationship, gathering, wuhan travel) for each of the above three groups
spdata <- mutate(spdata, presumed_reason_group = case_when(!is.na(str_match(presumed_reason, "wuhan|airport")) ~ "Wuhan travel",
#'airport' case (CaseID 17) does not have 'wuhan' in the reason, but the 'Case' column notes that they are from Wuhan
!is.na(str_match(presumed_reason, "symptom onset|via")) ~ "Known relationship",
!is.na(str_match(presumed_reason, "grace")) ~ "Grace Assembly of God",
!is.na(str_match(presumed_reason, "grand")) ~ "Grand Hyatt Singapore",
!is.na(str_match(presumed_reason, "life")) ~ "Life Church",
!is.na(str_match(presumed_reason, "seletar")) ~ "Seletar Aerospace Heights",
!is.na(str_match(presumed_reason, "yong")) ~ "Yong Thai Hang",
is.na(presumed_reason) ~ "Unknown",
TRUE ~ "other")) #should not be any other, so is just a double check this has run correctly, especially as dataset grows
table(spdata$presumed_reason_group)
# Save this object for plotting incidence curve by infection source group (figure 1b)
#save(spdata, file = "data/Singapore_cleaned_infection_source_groups.rdata")
```
We also need to make a data frame of the edges (indicating direction of probable transmission) and nodes (the cases).
```{r}
# Make data frame of edges, where the cases with the earliest date of symptom onset are labeled as the "from" cases
#First need to remove any cases missing date of symptom onset that were added back in from the 'related_cases' column
fedges <- filter(undir_dates, !is.na(raw_serial_interval))
fedges <- select(fedges, from, to)
fedges = data.frame(from = paste("case", fedges[ ,1], sep=""),
to = paste("case", fedges[ ,2], sep=""))
fedges$arrows <- "to"
# Save this object so we can use it to plot Fig4 Network diagram
#save(fedges, file = "data/singapore_edges_for_network_plot.rdata")
# Make data frame of nodes
nodes.df <- data.frame(id=paste("case",spdata$CaseID,sep=""), label=spdata$CaseID, group=spdata$presumed_reason_group)
# Save this object so in case we can use it to plot Fig4 Network diagram
#save(nodes.df, file = "data/singapore_nodes_for_network_plot.rdata")
glimpse(nodes.df)
spdata$graphID = paste("case",spdata$CaseID,sep="")
visNetwork(nodes.df, fedges) %>% visLegend()
```
Now we estimate the serial interval using the ICC method; for this we first construct a graph. The "interval case to case" data come from identifying a putative first infector in each small cluster in the graph, and finding the times between symptom onset in the first observed case and in the others. See Vink et al.
```{r}
sgraph = graph_from_edgelist(as.matrix(fedges[,1:2]), directed = FALSE)
ccs = components(sgraph)
ccs
spdata$component=vapply(spdata$graphID, function(x)
{ if (x %in% names(ccs$membership)) { return(ccs$membership[match(x, names(ccs$membership))])
} else {
return(NA)}}, FUN.VALUE = 3)
```
Now knowing the components of the graph I can extract the ICC intervals.
I did this in a few ways (see the commented-out lines): taking the first case in each cluster to be the one with the first reported symptom onset (I get a 5 day serial interval); the one with the first start-of-exposure time (now there are negative ICCs, so I get a 4.5 day serial interval); and the one with the latest end-of-exposure time.
Extract ICC interval data: a function
```{r}
getICCs <- function(thisdata, ccs, K, orderby= "onset" ) {
# For each cluster (graph component), take up to the first K cases, ordered either by
# symptom onset or by the end of the exposure window, and return the intervals between
# the first case's onset and each of the others (the ICC intervals).
iccs=1
for (n in 1:max(ccs$membership)) {
mycases = which(thisdata$component==n)
if (orderby == "onset")
{ myonsets = sort(thisdata$date_onset_symptoms[mycases])[1:min(K, length(mycases))]}
if (orderby == "exposure") {
myonsets =thisdata$date_onset_symptoms[mycases][order(thisdata$end_source[mycases])][1:min(K,length(mycases))]
# myonsets = spdata$date_onset_symptoms[mycases[order(spdata$start_source[mycases])]] # alternative also ORDERS by earliest exposure
}
iccs =c(iccs, myonsets[-1]-myonsets[1])
}
return(iccs[-1])
}
```
```{r}
icc3 = getICCs(spdata,ccs,3)
icc4 = getICCs(spdata,ccs,4)
icc5 = getICCs(spdata,ccs,5)
icc6 = getICCs(spdata,ccs,6)
icc_expose = getICCs(spdata, ccs, 4, orderby ="exposure")
```
#### Serial interval estimates (Table 1 accounting for intermediates and Appendix Table 3)
Note that the first 4 rows use the first 3 to 6 cases per cluster, ordered by date of symptom onset, while the last row uses 4 cases per cluster, ordered by the date in the 'end_source' (end of presumed exposure window) column instead.
```{r}
source("TianjinSI_VinkWallinga_CC.R")
myest3 = serial_mix_est(data=icc3, N=100, startmu=10, startsig =4)
myest4 = serial_mix_est(data=icc4, N=100, startmu=10, startsig =4)
myest5 = serial_mix_est(data=icc5, N=100, startmu=10, startsig =4)
myest6 = serial_mix_est(data=icc6, N=100, startmu=10, startsig =4)
myest_exp= serial_mix_est(data=icc_expose, N=100, startmu=10, startsig =4)
mm=rbind(myest3, myest4, myest5,myest6, myest_exp)
colnames(mm)=c("mu","sig")
mm=as.data.frame(mm)
mm$NumCasesPerCluster=c( 3,4, 5, 6, 4)
mm$ordering = c("Onset","Onset","Onset","Onset","LastExposure")
print(mm[,c(4,3,1,2)])
```
```{r,eval=FALSE}
days = seq(from=0, to=10, by=0.1)
sp.density= dnorm(days, mean = myest4[1], sd = myest4[2])
ggplot(data=data.frame(days=days, density=sp.density), aes(x=days,y=density)) + geom_line() + ggtitle("ICC estimate of the Singapore cluster serial interval")
#ggsave(file="final_figures/sing_serialint.pdf", height = 4, width = 6)
```
Notice that the serial interval gets longer if we include more cases per cluster. This is because the mixture of four pathways in Vink et al does not include longer transmission chains; this forces the assumption that everyone in the cluster was infected by the initial case, which in turn lengthens the estimated serial interval. We do not know the true infection pathways, but it is reasonable not to constrain the model to enforce that most cases are infected by the first few cases.
**The mean SI (using first 4 cases per cluster, accounting for intermediates) is `r myest4[1]`.** The standard deviation of the serial intervals is `r myest4[2]`.
#### Determining the confidence interval for the mean serial interval
We need CIs for the mean. For this we use bootstrapping. The bootstrapping is done on the serial interval estimate with 4 cases per cluster and ordered by date of symptom onset.
```{r, eval=FALSE}
# bootstrap analysis
Nboot=100
bestimates=myest4
# NOTE this loop had errors a few times; I just restarted it.
for (kk in 1:Nboot) {
bdata = sample(x=icc4, size = length(icc4), replace = T)
bestimates = rbind(bestimates, serial_mix_est(data=bdata, N=100, startmu=10, startsig =4))
print(paste("loop iteration #", kk, sep = ": "))
}
#bestimates <- bestimates[-1, ] #Remove the non-bootstrapped row (i.e. the myest4 object)
#save(bestimates, file = "data/sing_boots_100.Rdata")
```
**Appendix Figure 1b**
```{r, eval=TRUE}
load("data/sing_boots_100.Rdata") # in case in Rmd with above evals set to FALSE
hist(bestimates[,1],breaks = 30)
bootdf=data.frame(mu=bestimates[,1], sig=bestimates[,2])
ggplot(bootdf, aes(x=mu, y=sig))+geom_point()
ggplot(bootdf, aes(x=mu))+geom_histogram()
# ggsave(file = "final_figures/FigS1_bootst_SI_sing.pdf", width = 6, height = 4)
```
```{r,eval=TRUE}
#load("data/sing_boots_100.Rdata") # in case in Rmd with above evals set to FALSE
mean(bestimates[,1]) # mean of the mean serial intervals
median(bestimates[,1])
sd(bestimates[,1]) # sd of the MEAN serial intervals
mean(bestimates[,2]) # mean of the sd serial intervals
sd(bestimates[,2]) #sd of the sd serial intervals
```
The mean of the mean serial intervals is `r mean(bestimates[,1])` days and the standard deviation of these means is `r sd(bestimates[,1])`.
**The bootstrapped 95% range for the mean serial interval (accounting for intermediates) is (`r myest4[1]-1.96*sd(bestimates[,1])`, `r myest4[1]+1.96*sd(bestimates[,1])`).**
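As a check on this normal-approximation interval, percentile bootstrap quantiles can be read directly off the bootstrap replicates; a sketch, assuming 'bestimates' is loaded as above (not evaluated here):
```{r, eval=FALSE}
# Percentile bootstrap interval (sketch only):
# 2.5% and 97.5% quantiles of the bootstrapped mean serial intervals
quantile(bestimates[, 1], probs = c(0.025, 0.975))
# and of the bootstrapped standard deviations
quantile(bestimates[, 2], probs = c(0.025, 0.975))
```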
## Effect of time on serial interval estimates
To see the effects of the passage of time on the raw serial intervals, we will plot all possible infector-infectee pairs and the difference in their dates of symptom onset. To do this, we need a dataframe that has (1) case pairs (= 'undir'), (2) dates of symptom onset for both individuals in that pair, and (3) difference in days between those pairs.
```{r}
# Add an extra column that groups serial interval based on length
#We will do every two days, as otherwise there are too many bars next to each other!
undir_dates <- mutate(undir_dates, si_groups = factor(case_when(abs_serial_interval == 0 ~ "0",
abs_serial_interval == 1 | abs_serial_interval == 2 ~ "1 to 2",
abs_serial_interval == 3 | abs_serial_interval == 4 ~ "3 to 4",
abs_serial_interval == 5 | abs_serial_interval == 6 ~ "5 to 6",
abs_serial_interval == 7 | abs_serial_interval == 8 ~ "7 to 8",
abs_serial_interval == 9 | abs_serial_interval == 10 ~ "9 to 10",
abs_serial_interval > 10 ~ "over 10",
is.na(abs_serial_interval) ~ NA_character_,
T ~ "other")))
```
CC exploring right truncation
```{r}
library(SurvTrunc)
mytest = data.frame(stee=undir_dates$to_sympt_date,
stor =undir_dates$from_sympt_date,
time_diff = 0)
mytest$time_diff= mytest$stee-mytest$stor
mytest$serial = as.numeric(mytest$time_diff)
mytest$rtrunc = lubridate::ymd("2020-02-27") - mytest$stee
mytest$ltrunc = mytest$rtrunc- 54 # not sensitive to this but we need a value
#mytest = dplyr::filter(mytest, serial>0)
mytest$group = factor( mytest$stor < lubridate::ymd("2020-01-30"))
```
The raw means and sd, followed by means for early and late:
```{r}
mean(mytest$serial, na.rm = TRUE)
sd(mytest$serial, na.rm = TRUE)
mean(mytest$serial[which(mytest$group==TRUE)],na.rm = TRUE)
mean(mytest$serial[which(mytest$group==FALSE)],na.rm = TRUE)
```
Now, is this difference due to right truncation, or something else going on?
```{r}
mynpsurv = cdfDT(y=mytest$serial,
l=as.numeric(mytest$ltrunc),
r=as.numeric(mytest$rtrunc),boot = TRUE)
# Median (and its CI) by linear interpolation of the nonparametric CDF:
# find the last time point with F < 0.5 and interpolate to F = 0.5
ll = max(which(mynpsurv$F < 0.5))
medapp = with(mynpsurv,
time[ll] + ( time[ll+1]-time[ll])*(0.5-F[ll])/(F[ll+1]-F[ll]))
# lower bound for the median: interpolate on the upper confidence limit of F
ll = max(which(mynpsurv$CI.upper.F < 0.5))
medL = with(mynpsurv,
time[ll] + ( time[ll+1]-time[ll])*(0.5-CI.upper.F[ll])/(CI.upper.F[ll+1]-CI.upper.F[ll]))
# upper bound for the median: interpolate on the lower confidence limit of F
ll = max(which(mynpsurv$CI.lower.F < 0.5))
medU = with(mynpsurv,
time[ll] + ( time[ll+1]-time[ll])*(0.5-CI.lower.F[ll])/(CI.lower.F[ll+1]-CI.lower.F[ll]))
```
**Table 1 (serial interval estimates, without intermediates)**
```{r}
#
# mean surv time is area under survival curve , approx by
mynpsurv$Survival[1] + sum( diff( mynpsurv$time)*mynpsurv$Survival[-1]) # does depend on trunc time , but if feb 20, then 3.73. not far off of raw mean
# another numerical approx int is
sum(diff(mynpsurv$time)*mynpsurv$Survival[-length(mynpsurv$Survival)])
# i want a range: note that I have CI.lower.F and CI.upper.F so i can get
# a range on Survival and compute these again, yay.
with(mynpsurv,
sum(diff(time)*(1-CI.upper.F[-length(CI.upper.F)])))
with(mynpsurv,
sum(diff(time)*(1-CI.lower.F[-length(CI.lower.F)])))
mycox = coxDT(Surv(serial)~group, data = mytest, L=ltrunc, R = rtrunc)
# no signif diff early to late here
mycox
```
According to the Cox regression there is no significant difference between early and late.
The fact that the raw SIs contract over time, but the difference is not significant in the Cox regression, indicates that the SI shortening is likely due to right truncation.
The mean SI here is approximately `r mynpsurv$Survival[1] + sum( diff( mynpsurv$time)*mynpsurv$Survival[-1])` days and the median is around `r medapp`, assuming the right truncation time is Feb 27. If it is assumed earlier both go up a little bit.
#### Graphs of raw serial intervals
Now let's turn this into a dot plot and a bar chart so we can see if and how the serial interval changes over time. The dates on the x-axis are the earliest date of symptom onset from each infected pair.
```{r}
### A) Histogram - length of serial interval on the y-axis
#Transform 'undir_dates' into a better format for making a bar chart...need counts of serial intervals as a factor
g_dates <- undir_dates %>%
group_by(earliest_sympt_onset, as.integer(abs_serial_interval)) %>%
summarise(count_si = n()) #Still seems to want this to be continuous for some reason...
names(g_dates)[2] <- "serial_interval"
g_dates <- filter(g_dates, !is.na(serial_interval))
g_dates$count_si <- as.factor(g_dates$count_si)
#Define an object for fill colors - based on viridis colors?
count.cols <- c(#"1" = "#21908CFF", #Nicest teal color
"1" = "#22A884FF",
"2" = "#2A788EFF",
"3" = "#414487FF")
#Plot
#Note that the width of the columns are uneven; to fix this need a dataframe with zeros
#for the dates/serial_interval combinations that don't have a count
#But that takes quite a bit of coding so I'm just going to ignore it for now as its not going in the manuscript
ggplot(g_dates, aes(x = earliest_sympt_onset, y = serial_interval)) +
#The columns really need to be BESIDE each other, or it is really hard to see the actual length of the serial intervals
geom_col(aes(fill = count_si), position = "dodge") +
scale_fill_manual(name = "Number of serial intevals \n of each length", values = count.cols) +
scale_x_date(date_breaks = "1 day") +
scale_y_continuous(breaks = seq(from = 0, to = 20, by = 2)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1),
panel.background = element_rect(fill = "white")) +
labs(title = "Length of serial intervals in Singapore, over time",
y = "Length of serial intervals",
x = "Date of first onset of symptoms within case pairs")
### B) Histogram - count of serial intervals on the y-axis
#Define count by serial interval group
sig_dates <- filter(undir_dates, !is.na(si_groups))
sig_dates <- sig_dates %>%
group_by(earliest_sympt_onset, si_groups) %>%
summarise(count_sig = n()) #Still seems to want this to be continuous for some reason...
names(sig_dates)[2] <- "serial_interval_group"
#Define colors
sig_col <- c("0" = "#440154FF",
"1 to 2" = "#443A83FF",
"3 to 4" = "#31688EFF",
"5 to 6" = "#21908CFF",
"7 to 8" = "#35B779FF",
"9 to 10" = "#8FD744FF",
"over 10" = "#FDE725FF")
#Plot
#Note that the width of the columns are uneven; to fix this need a dataframe with zeros
#for the dates/serial_interval combinations that don't have a count
#But that takes quite a bit of coding so I'm just going to ignore it for now as its not going in the manuscript
ggplot(sig_dates, aes(x = earliest_sympt_onset, y = count_sig, fill = serial_interval_group)) +
#The columns really need to be BESIDE each other, or it is really hard to see the actual length of the serial intervals
geom_col(position = "dodge") +
scale_fill_manual(name = "Length of serial intervals", values = sig_col) +
scale_x_date(date_breaks = "1 day") +
scale_y_continuous(breaks = seq(from = 0, to = 4, by = 1)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1),
panel.background = element_rect(fill = "white")) +
labs(title = "Length of serial intervals in Singapore, over time",
y = "Count of case-pairs",
x = "Date of first onset of symptoms within case pairs")
###~~~~~~~~~~~ C) Cleveland dotplot of raw serial intervals per possible case pair ~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
# This is the plot that has the easiest visual interpretation, so we will use it in our final manuscript
# We want to exclude any case-pairs that have NA for length of serial interval
#i.e. one of the pair does not have a date of symptom onset
undir_dates_org <- undir_dates #Just in case....
undir_dates <- filter(undir_dates, !is.na(raw_serial_interval)) #Removes 5 NAs
#Pivot the to/from dates of symptom onset column to a long format, so that can make a legend based on this variable
undir_dotplot <- pivot_longer(undir_dates,
cols = contains("sympt_date"),
names_to = "pair_member",
values_to = "onset_date")
#Let's rename the values so it makes more sense in the legend
undir_dotplot$pair_member <- str_replace(undir_dotplot$pair_member, pattern = "from_sympt_date", replacement = "Presumed infector")
undir_dotplot$pair_member <- str_replace(undir_dotplot$pair_member, pattern = "to_sympt_date", replacement = "Presumed infectee")
#Make the Cleveland dotplot
p <- ggplot(undir_dotplot, aes(y = reorder(pairID, earliest_sympt_onset))) +
geom_segment(aes(x = earliest_sympt_onset, xend = earliest_sympt_onset + abs_serial_interval, yend = pairID),
color = "#404788FF") +
#geom_point(aes(x = from_sympt_date), shape = 21, fill = "#FAC127FF", color = "#FAC127FF") +
#geom_point(aes(x = to_sympt_date), shape = 23, fill = "#D44842FF", color = "#D44842FF") +
geom_point(aes(x = onset_date, color = pair_member, fill = pair_member, shape = pair_member)) +
scale_x_date(date_breaks = "1 day") +
scale_color_manual(name = "Pair member for \ndate of symptom onset", values = c("#D44842FF", "#FAC127FF")) +
scale_fill_manual(name = "Pair member for \ndate of symptom onset", values = c("#D44842FF", "#FAC127FF")) +
scale_shape_manual(name = "Pair member for \ndate of symptom onset", values = c(23, 21)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1),
axis.ticks.y = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_line(colour = "grey80", linetype = "dashed"),
panel.background = element_rect(fill = "white")) +
labs(title = "Serial intervals of possible case pairs from Singapore, over time",
x = "Date of symptom onset",
y = "Case pairs")
p
# Write to PDF
# pdf("final_figures/Dotplot_raw_serial_intervals_Singapore.pdf",
#family = "Times",
# width = 8.5, height = 11)
# p
# dev.off()
```
#### Direct serial interval estimates: all possible case-pairs
To determine an estimate of the raw serial intervals, we will find the mean, median, and standard deviation of all possible case-pairs. Note that this is the raw data where cases without date of symptom onset have been removed (i.e. NOT imputed!).
```{r, }
#OLD WAY OF DIRECTLY ESTIMATING S.I. from Singapore data (without imputation)
#The simplest serial interval estimate we can make with these data is a direct estimate based on the time of symptoms of the presumed infector, and the time of symptoms of the case. However, this does not account for the fact that the presumed infector is not necessarily the infector. There are missing intermediate cases (with reasonable probability), or two cases could both be infected by a third unknown case.
#directSI = spdata$date_onset_symptoms - spdata$symp_presumed_infector
#directSI = as.numeric(directSI[!is.na(directSI)])
#mean(directSI)
#sd(directSI)
```
**Table 1 serial interval estimates (without intermediates):**
```{r}
### Arrange the dataset by the date of earliest symptom onset for each case pair
undir_dates <- arrange(undir_dates, earliest_sympt_onset)
### Calculate the mean and median for all possible case-pairs
all_scp <- undir_dates %>%
summarize(mean_direct_si = mean(abs_serial_interval),
median_direct_si = median(abs_serial_interval),
sd_direct_si = sd(abs_serial_interval))
```
The mean direct serial interval of the possible case-pairs is `r all_scp$mean_direct_si`, with a standard deviation of `r all_scp$sd_direct_si`, giving a confidence interval around the mean of (`r all_scp$mean_direct_si - 1.96*all_scp$sd_direct_si`, `r all_scp$mean_direct_si + 1.96*all_scp$sd_direct_si`). The median of the direct serial interval is `r all_scp$median_direct_si`.
#### Effect of time on direct serial interval estimates: splitting possible case-pairs in half by earliest date of symptom onset
To determine the effect of time on the raw serial intervals, we will split the case-pairs into early and late, and find the median, mean, and standard deviation. Early cases are those case-pairs where the earliest date of symptom onset for the pair ('earliest_sympt_onset') is on or before Jan 31, 2020, and late cases are those with earliest_sympt_onset after Jan 31, 2020.
Remember that this is the raw data where cases without date of symptom onset have been removed (i.e. NOT imputed data).
```{r}
### Add a column that specifies if case-pairs are part of "early onset" or "late onset"
#based on which half of the case-pairs they fall into
# Need to arrange the dataset by the date of earliest symptom onset for each case pair
undir_dates <- arrange(undir_dates, earliest_sympt_onset)
# Define half of the dataset...if just want to split dataset 50:50 by # of case-pairs, rather than by date
#half <- nrow(undir_dates) / 2
# Define which pairID fall into the early and later half of the dataset...if splitting in half, rather than by date
#***NOTE THAT THIS DOES ARBITRARILY SPLIT SOME PAIRS ON THE SAME DAY OF EARLIEST SYMPT ONSET INTO DIFFERENT HALVES***
#early_half <- undir_dates$pairID[1:half]
#early_half <- droplevels(early_half)
#late_half <- undir_dates$pairID[half+1:nrow(undir_dates)] #as half is even, need to add 1
#late_half <- droplevels(late_half)
# Define which pairID fall into the early and later half of the dataset; based on dates prior to or on Jan 31st = early
early_half <- filter(undir_dates, earliest_sympt_onset <= "2020-01-31")
early_half <- early_half$pairID
early_half <- droplevels(early_half)
late_half <- filter(undir_dates, earliest_sympt_onset > "2020-01-31")
late_half <- late_half$pairID
late_half <- droplevels(late_half)
# Make new column indicating which case-pair belongs to which half of the dataset
undir_dates <- mutate(undir_dates, portion_of_pairs = case_when(pairID %in% early_half ~ "early half",
pairID %in% late_half ~"late half",
T ~ "other")) #Sanity check
undir_dates$portion_of_pairs <- factor(undir_dates$portion_of_pairs,
levels = c("late half", "early half"))
### Calculate the mean and median for each of the two halves
e <- undir_dates %>%
filter(portion_of_pairs == "early half") %>%
summarize(mean_early = mean(abs_serial_interval),
median_early = median(abs_serial_interval),
sd_early = sd(abs_serial_interval))
l <- undir_dates %>%
filter(portion_of_pairs == "late half") %>%
summarize(mean_late = mean(abs_serial_interval),
median_late = median(abs_serial_interval),
sd_late = sd(abs_serial_interval))
```
The mean serial interval for the **early** half of the case-pairs is `r e$mean_early`, with a standard deviation of `r e$sd_early`, and a median of `r e$median_early`. The mean serial interval for the **late** half of the case-pairs is `r l$mean_late`, with a standard deviation of `r l$sd_late`, and a median of `r l$median_late`.
**Fig6a:** This version of the Cleveland dotplot shows which half of the case-pairs was used to calculate each summary statistic.
```{r Cleveland dotplot facetted by half for mean calculations}
### Cleveland dotplot of raw serial intervals, facetted by portion of dataset used to do mean, median and sd calculations
#Pivot the to/from dates of symptom onset column to a long format, so that can make a legend based on this variable
undir_dotplot2 <- pivot_longer(undir_dates,
cols = contains("sympt_date"),
names_to = "pair_member",
values_to = "onset_date")
#Let's rename the values so it makes more sense in the legend
undir_dotplot2$pair_member <- str_replace(undir_dotplot2$pair_member, pattern = "from_sympt_date", replacement = "Presumed infector")
undir_dotplot2$pair_member <- str_replace(undir_dotplot2$pair_member, pattern = "to_sympt_date", replacement = "Presumed infectee")
#Make the Cleveland dotplot; this time FACET by portion_of_pairs
p2 <- ggplot(undir_dotplot2, aes(y = reorder(pairID, earliest_sympt_onset))) +
geom_segment(aes(x = earliest_sympt_onset, xend = earliest_sympt_onset + abs_serial_interval, yend = pairID),
color = "#404788FF") +
geom_point(aes(x = onset_date, color = pair_member, fill = pair_member, shape = pair_member)) +
facet_grid(portion_of_pairs ~ .,
scales = "free_y", space = "free_y") +
scale_x_date(date_breaks = "1 day") +
scale_color_manual(name = "Pair member for \ndate of symptom onset", values = c("#D44842FF", "#FAC127FF")) +
scale_fill_manual(name = "Pair member for \ndate of symptom onset", values = c("#D44842FF", "#FAC127FF")) +
scale_shape_manual(name = "Pair member for \ndate of symptom onset", values = c(23, 21)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1),
axis.ticks.y = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_line(colour = "grey80", linetype = "dashed"),
panel.background = element_rect(fill = "white")) +
labs(title = "Serial intervals of possible case pairs from Singapore, over time",
x = "Date of symptom onset",
y = "Case pairs")
p2
# Write to PDF
# pdf("final_figures/Fig6a_Dotplot_raw_serial_intervals_Singapore_facetted_in_half.pdf",
#family = "Times",
# width = 8.5, height = 11)
# p2
# dev.off()
```
## Imputation of missing data (Appendix and Appendix Table 5)
We want to see what effect the cases with missing dates of symptom onset have on our estimates of serial intervals. To do this, we will impute the missing data as:
missing date of symptom onset = confirmation date - mean(confirmation date - symptom onset date), where the mean is taken over the cases that do have a symptom onset date. We will use the copy made of the original data so that we can repeat the full estimation process using the data with imputed dates of symptom onset.
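For example (toy numbers only, not taken from the dataset): if the mean onset-to-confirmation delay among cases with a known onset were 7 days, a case confirmed on Feb 10 with no recorded onset would be assigned an imputed onset of Feb 3.
```{r, eval=FALSE}
# Toy illustration of the imputation rule (hypothetical values, not the real data)
toy_confirmation <- ymd("2020-02-10")
toy_mean_delay   <- 7                     # assumed mean of (confirmation - onset), in days
toy_confirmation - toy_mean_delay         # imputed date of symptom onset: "2020-02-03"
```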
```{r imputing missing date of symptom onset}
# Figure out which cases are missing date of symptom onset
no_date <- which(is.na(spdata_org$date_onset_symptoms))
spdata_org$CaseID[no_date] # 10 of them
#Do all of the cases missing date of symptom onset have a date of confirmation?
sum(is.na(spdata_org$date_confirmation[no_date])) #Yes!
# Figure out the mean(confirmation date - symptom onset date) for cases that HAVE a symptom onset date
avg_date_diff <- spdata_org %>%
filter(!is.na(date_onset_symptoms)) %>%
select(CaseID, date_confirmation, date_onset_symptoms) %>%
summarise(mean(date_confirmation - date_onset_symptoms)) %>%
pluck(1)
avg_date_diff #Notice that this is a 'difftime' class variable; may become important later
# Distribution of dates of confirmation for imputed cases
missing_rng <- range(spdata_org$date_confirmation[no_date])
table(spdata_org$date_confirmation[no_date])
# Range of dates of confirmation for original analysis
org_rng <- range(spdata$date_confirmation)
# Impute the missing date of symptom onset values by each cases' date of confirmation - avg_date_diff
imp_data <- spdata_org
imp_data$dso_imputed = if_else(is.na(imp_data$date_onset_symptoms),
imp_data$date_confirmation - avg_date_diff,
imp_data$date_onset_symptoms)
```
**The average difference between date of confirmation and date of symptom onset (for cases in the original analysis aka those cases WITH date of symptom onset) is `r avg_date_diff`. The range of date of confirmation for cases in the original analysis is `r org_rng`, while range of date of confirmation for cases missing date of symptom onset (aka imputed cases) is `r missing_rng`.**
Now we can re-run the serial estimates based on the imputed date of symptom onset column (dso_imputed).
Step 1: The start and end source columns have already been calculated and added to the dataset, so no need to re-calculate.
Step 2: split related cases column so that we can obtain nodes and edges for clusters.
```{r}
imp_nodes <- imp_data$CaseID
## How to extract caseIDs from related_cases column - there are multiple values in some cells, separated by commas
#The +1 is because the comma splits between 2 related caseIDs (i.e. related cases with only 1 entry will have no comma!)
max(str_count(imp_data$related_cases, pattern = ",") + 1,
na.rm = T)
#7 max within one cell
# Split into separate columns
imp_data <- separate(imp_data,
col = related_cases,
into = paste("contactID", 1:7, sep = "_"),
fill = "right")
# Turn into numeric values
imp_data <- mutate(imp_data,
contactID_1 = as.numeric(contactID_1),
contactID_2 = as.numeric(contactID_2),
contactID_3 = as.numeric(contactID_3),
contactID_4 = as.numeric(contactID_4),
contactID_5 = as.numeric(contactID_5),
contactID_6 = as.numeric(contactID_6),
contactID_7 = as.numeric(contactID_7))
# Select down to columns of interest
imp_edges <- select(imp_data, c(CaseID, starts_with("contactID")))
# Remove rows with NAs for at least one contact
imp_edges <- filter(imp_edges, !is.na(imp_edges$contactID_1))
# Create a data frame with case pairs to indicate 'from' and 'to' cases for the definition of our nodes and edges later on
singedges_idf = data.frame(from=2,to=1)
for (n in 1:nrow(imp_edges)) {
for (k in 2:ncol(imp_edges)) {
if (!is.na(imp_edges[n,k])) {
singedges_idf=rbind(singedges_idf, c(imp_edges[[n,k]],imp_edges[[n,1]]))
}
}
}
# Remove the first row we manually added
singedges_idf=singedges_idf[-1,]
# create undirected graph by removing duplicates
undir_i=data.frame(from = pmin(singedges_idf[,1],singedges_idf[,2]),
to = pmax(singedges_idf[,1], singedges_idf[,2]))
undir_i = unique(undir_i)
undir_i = undir_i[undir_i$from != undir_i$to, ] # remove self-pairs (robust even if there are none)
```
Step 3: ensure that the "from" cases have the earliest date of symptom onset and the "to" cases have the later date of symptom onset
```{r}
# Make a smaller dataset of original imp_data that contains only the CaseID and imputed date of symptom onset
imp_data_sympt <- select(imp_data, CaseID, dso_imputed)
# Add the date of symptom onset -for the caseID of the 'from' case - to the case pairs dataset (undir_i)
#Do some renaming so the join is based on the caseID in the from column and that name of date column reflects this
names(imp_data_sympt) <- str_replace(names(imp_data_sympt), "CaseID", "from")
undir_i_dates <- left_join(undir_i, imp_data_sympt, by = "from")
names(undir_i_dates) <- str_replace(names(undir_i_dates), "dso_imputed", "from_sympt_date")
# Repeat, but add the date of symptom onset for the caseID of the 'to' case
names(imp_data_sympt) <- str_replace(names(imp_data_sympt), "from", "to")
undir_i_dates <- left_join(undir_i_dates, imp_data_sympt, by = "to")
names(undir_i_dates) <- str_replace(names(undir_i_dates), "dso_imputed", "to_sympt_date")
# Now add some extra columns which give us the raw serial interval (i.e. number of days between symptom onset in infector-infectee pairs)
#As well as the absolute value of the serial interval (as some cases in the "from" and "to" columns should be switched around!)
#And finally a 'direction' column in case we need to sort out which directions the arrows should be going in for a network graph and where we have missing dates
undir_i_dates <- mutate(undir_i_dates, earliest_sympt_onset = pmin(to_sympt_date, from_sympt_date, na.rm = T),
raw_serial_interval = to_sympt_date - from_sympt_date, #no NAs expected this time, as dates of symptom onset have been imputed for all cases
abs_serial_interval = abs(raw_serial_interval))
```
Now we need to split the dataset apart so that we can switch around the directionality of presumed transmission for case-pairs where the serial interval is negative. Easiest way to do this is to rename columns and then join back to the other parts of the dataset, based on the column names.
```{r}
# Split dataset into positive (or 0) serial interval vs. negative vs. NA
#A negative serial interval means our "to" and "from" cases are mixed up
pos <- filter(undir_i_dates, raw_serial_interval >= 0)
neg <- filter(undir_i_dates, raw_serial_interval < 0)
# Negative dataset needs the column headers changed to reflect that the 'from' and 'to' columns are backwards
#as we are assuming that the 'case with the earliest onset of symptoms would have been infected first,
#and passed on the infection to the other case in the pair
names(neg)
names(neg)[1] <- "to"
names(neg)[2] <- "from"
names(neg)[3] <- "to_sympt_date"
names(neg)[4] <- "from_sympt_date"
names(neg)
# Now bind the rows of the separated datasets back together based on column names
#Must use dplyr::bind_rows to bind based on column name rather than position
undir_i_dates <- bind_rows(pos, neg)
# For plotting - Add a column with padded to and from caseID numbers so they print in numerical order
#Add a zero on the left of the number so all numbers have two digits
undir_i_dates$pto <- str_pad(undir_i_dates$to, width = 2, side = "left", pad = "0")
undir_i_dates$pfrom <- str_pad(undir_i_dates$from, width = 2, side = "left", pad = "0")
# For plotting - Make a new column with case pair ID
undir_i_dates <- mutate(undir_i_dates, pairID = factor(paste("case", pfrom, "-", "case", pto, sep = "")))
```
Step 4: make a network diagram. This won't be our manuscript figure (it's not as polished), but it gives us the same picture without copying that plotting script here. This involves making a column that gives a cluster group based on the presumed reason of infection.
```{r}
# Turn 'presumed_reason' into lower case and trim any whitespace so we don't have issues with case sensitivity, etc
imp_data$presumed_reason <- str_to_lower(imp_data$presumed_reason)
imp_data$presumed_reason <- str_trim(imp_data$presumed_reason)
#table(imp_data$presumed_reason)
sum(is.na(imp_data$presumed_reason))
# Make a new column where we group the 'presumed_reason' under a label (known relationship, gathering, wuhan travel) for each of the above three groups
imp_data <- mutate(imp_data, presumed_reason_group = case_when(!is.na(str_match(presumed_reason, "wuhan|airport")) ~ "Wuhan travel",
#'airport' case (CaseID 17) does not have 'wuhan' in the reason, but the 'Case' column notes that they are from Wuhan
!is.na(str_match(presumed_reason, "symptom onset|via")) ~ "Known relationship",
!is.na(str_match(presumed_reason, "grace")) ~ "Grace Assembly of God",
!is.na(str_match(presumed_reason, "grand")) ~ "Grand Hyatt Singapore",
!is.na(str_match(presumed_reason, "life")) ~ "Life Church",
!is.na(str_match(presumed_reason, "seletar")) ~ "Seletar Aerospace Heights",
!is.na(str_match(presumed_reason, "yong")) ~ "Yong Thai Hang",
is.na(presumed_reason) ~ "Unknown",
TRUE ~ "other")) #should not be any other, so is just a double check this has run correctly, especially as dataset grows
table(imp_data$presumed_reason_group)
# Make data frame of edges; no missing dates (all imputed) so no NAs to filter out first
fedges_i <- select(undir_i_dates, from, to)
fedges_i = data.frame(from = paste("case", fedges_i[ ,1], sep=""),
to = paste("case", fedges_i[ ,2], sep=""))
fedges_i$arrows <- "to"
# Make data frame of nodes
nodes.df.i <- data.frame(id=paste("case",imp_data$CaseID,sep=""), label=imp_data$CaseID, group=imp_data$presumed_reason_group)
glimpse(nodes.df.i)
imp_data$graphID = paste("case", imp_data$CaseID, sep="")
visNetwork(nodes.df.i, fedges_i) %>% visLegend()
```
Step 5: determine the ICC intervals by extracting the components of the network graph. We need to update the 'getICCs' function defined earlier so it uses the column with imputed dates of symptom onset (or else we get NAs!).
```{r}
sgraph_i = graph_from_edgelist(as.matrix(fedges_i[,1:2]), directed = FALSE)
ccs_imp = components(sgraph_i)
ccs_imp
imp_data$component=vapply(imp_data$graphID, function(x)
{ if (x %in% names(ccs_imp$membership)) { return(ccs_imp$membership[match(x, names(ccs_imp$membership))])
} else {
return(NA)}}, FUN.VALUE = 3)
#Need to have a new function that uses the column with imputed dates of symptom onset
getICCs_imputed <- function(thisdata, ccs, K, orderby= "onset" ) {
iccs=1
for (n in 1:max(ccs$membership)) {
mycases = which(thisdata$component==n)
if (orderby == "onset")
{ myonsets = sort(thisdata$dso_imputed[mycases])[1:min(K, length(mycases))]}
if (orderby == "exposure") {
myonsets =thisdata$dso_imputed[mycases][order(thisdata$end_source[mycases])][1:min(K,length(mycases))]
# myonsets = spdata$dso_imputed[mycases[order(spdata$start_source[mycases])]] # alternative also ORDERS by earliest exposure
}
iccs =c(iccs, myonsets[-1]-myonsets[1])
}
return(iccs[-1])
}
icc3_i = getICCs_imputed(imp_data,ccs_imp,3)
icc4_i = getICCs_imputed(imp_data,ccs_imp,4)
icc5_i = getICCs_imputed(imp_data,ccs_imp,5)
icc6_i = getICCs_imputed(imp_data,ccs_imp,6)
icc_expose_i = getICCs_imputed(imp_data, ccs_imp, 4, orderby ="exposure")
```
Step 6: determine the serial interval estimates by using the method from Vink et al. Use the 'serial_mix_est' function sourced earlier.
```{r}
myest3_i = serial_mix_est(data=icc3_i, N=100, startmu=10, startsig =4)
myest4_i = serial_mix_est(data=icc4_i, N=100, startmu=10, startsig =4)
myest5_i = serial_mix_est(data=icc5_i, N=100, startmu=10, startsig =4)
myest6_i = serial_mix_est(data=icc6_i, N=100, startmu=10, startsig =4)
myest_exp_i = serial_mix_est(data=icc_expose_i, N=100, startmu=10, startsig =4)
mm=rbind(myest3_i, myest4_i, myest5_i, myest6_i, myest_exp_i)
colnames(mm)=c("mu","sig")
mm=as.data.frame(mm)
mm$NumCasesPerCluster=c( 3,4, 5, 6, 4)
mm$ordering = c("Onset","Onset","Onset","Onset","LastExposure")
print(mm[,c(4,3,1,2)])
```
```{r}
### Make a density plot of the ICC estimate
days = seq(from=0, to=10, by=0.1)
sp.density= dnorm(days, mean = myest4_i[1], sd = myest4_i[2])