-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathFig7_network_diagram_revised.Rmd
167 lines (135 loc) · 6.88 KB
/
Fig7_network_diagram_revised.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
---
title: "Cluster Network Data Preprocessing & Plotting"
author: "Emma Garlock; Yen-Hsiang (Brian) Lee"
date: "18/02/2020"
updated: "23/02/2020"
output: html_document
---
Load all the libraries we will need
```{r}
# getwd()
library(ggplot2)
library(tidyverse)
library(ggnetwork)
library(OneR)
library(igraph)
library(here)
library(viridis)
library(paletteer)
```
## Singapore dataset: making Figure 7a (network plot)
Load data
```{r}
# Read the Singapore case table from a path relative to the working directory.
ncov_sing1 <- read.csv(file = "data/COVID-19_Singapore_formated_dates.csv")
```
You can use the script below to create a weighted network plot. This allows us to have everything arranged spatially but only maintain the links in the `Related Cases` column when rendering the visualization.
Trying to make weighted edges
```{r}
# Build the edge list used for the Singapore network plot.
#
# Two sets of edges are combined into `plot_edges`:
#   * weight "1": links parsed from the comma-separated relation columns of
#     `ncov_sing1`; these only help the layout and are not drawn in the figure.
#   * weight 2:   the `fedges` object loaded from the .rdata file below; these
#     are the edges that are drawn.

# Only the first three columns of the main dataset are needed.
# NOTE(review): columns are selected by position (case id, then the two
# relation columns) -- confirm against the CSV header.
w_edge <- ncov_sing1[, 1:3]
# The two relation columns are named after the weight they will carry.
names(w_edge) <- c("CaseID", "2", "1")
# Turn blanks into NA so they can be dropped uniformly later.
w_edge[w_edge == ""] <- NA

w_edge_long <- w_edge %>%
  # One row per (case, relation column); the column name becomes `weight`.
  pivot_longer(cols = c("2", "1"), names_to = "weight") %>%
  # Split the comma-separated contacts into up to 22 columns; anything past
  # the 22nd contact is dropped (extra = "drop").
  separate(
    "value",
    paste0("relations", 1:22),
    sep = ",",
    extra = "drop"
  ) %>%
  # Back to long form: one row per (case, contact).
  pivot_longer(cols = relations1:relations22, names_to = "relation") %>%
  # Keep and rename just the columns needed so they line up with `fedges`.
  select(from = CaseID, to = value, weight) %>%
  # Strip stray whitespace left over from splitting on ",".
  mutate(to = str_squish(to)) %>%
  # Drop missing and blank contacts. The conditions must ALL hold; joining
  # them with `|` lets every non-NA value through, including blanks.
  filter(!is.na(to), to != "") %>%
  # Keep only the weight-1 links (layout helpers, not drawn in the figure).
  filter(weight == "1")

# Using fedges from singapore_serial_intervals_revised.Rmd
load("data/singapore_edges_for_network_plot.rdata")
# The `arrows` column is not needed for plotting.
fedges_weighted <- fedges %>% select(-arrows)
# Assign the more important weight.
fedges_weighted$weight <- 2
# Keep just the numeric part of each endpoint so it can match the other edge
# table and the node table. str_extract() returns one string per element (NA
# when there is no match), whereas as.character(str_extract_all()) yields
# "character(0)" or a deparsed c(...) string for zero or multiple matches.
fedges_weighted$from <- str_extract(
  as.character(fedges_weighted$from),
  "\\(?[0-9,.]+\\)?"
)
fedges_weighted$to <- str_extract(
  as.character(fedges_weighted$to),
  "\\(?[0-9,.]+\\)?"
)
# Attach the drawn edges to the layout-only edges.
plot_edges <- rbind(w_edge_long, fedges_weighted)
#write.csv(eg_edges2,"data/sing_edges_eg.csv")
```
```{r}
# Load the Singapore node table (the .rdata file is expected to supply
# `nodes.df`).
load("data/singapore_nodes_for_network_plot.rdata")
nodes_sing <- nodes.df

# Keep only edges whose two endpoints both have node data. Some cases are
# still mentioned in the relationship columns even though they were removed
# from the node table because of missing data.
# This test runs BEFORE the columns are renamed below, so it uses the column
# that is called `label` in the loaded object.
plot_edges <- plot_edges %>%
  filter(to %in% nodes_sing$label, from %in% nodes_sing$label) %>%
  select(from, to, weight)

# Rename the columns to reflect how they are used from here on, then put `id`
# first: graph_from_data_frame() takes the first vertex column as the name.
nodes_sing <- nodes_sing %>%
  setNames(c("label", "id", "group")) %>%
  select(id, label, group)

# Undirected graph -> data frame of node/edge coordinates for ggplot2.
# NOTE(review): the layout may differ between runs unless a seed is set --
# confirm whether the figure needs to be reproducible.
cov_weighted <- graph_from_data_frame(
  plot_edges,
  vertices = nodes_sing,
  directed = FALSE
)
cov_net_weight <- ggnetwork(cov_weighted)
#write.csv(cov_net_weight,("data/singapore_weighted_network_0504.csv"))
```
```{r}
#cov_net_weight=read.csv(("data/singapore_weighted_network_0504.csv"))
# Figure 7a: Singapore cluster network.
# Only weight-2 edges are drawn; every node is coloured by its cluster
# (`group`), with NA groups shown in light grey.
cov_net_w <- ggplot(
  data = cov_net_weight,
  mapping = aes(x = x, y = y, xend = xend, yend = yend)
) +
  geom_edges(
    mapping = aes(),
    data = subset(cov_net_weight, weight == 2),
    linetype = "solid"
  ) +
  geom_nodes(aes(color = group), size = 8) +
  # Default labels for every node ...
  geom_text(aes(label = name), check_overlap = TRUE) +
  # ... then white labels drawn on top for this one cluster.
  geom_text(
    mapping = aes(label = name),
    data = subset(cov_net_weight, group == "Grace Assembly of God"),
    colour = "white",
    check_overlap = TRUE
  ) +
  scale_colour_viridis_d(na.value = "lightgrey") +
  labs(color = "Cluster") +
  guides(
    colour = guide_legend(
      title.position = "top",
      title.hjust = 0.5,
      override.aes = list(size = 3)
    )
  ) +
  theme_bw() +
  # Centre the main title, strip all axis/panel decoration, and place a
  # horizontal legend below the plot.
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.line = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_blank(),
    legend.position = "bottom",
    legend.direction = "horizontal"
  )
cov_net_w
#ggsave("final_figures/Fig7_singapore_cov_network_revised_0517.pdf",plot=cov_net_w,device="pdf",height=8,width=11,units="in")
```
## Tianjin dataset: making Figure 7b (network plot)
There is no edge weighting for the Tianjin network, so we can reuse the edge list and node table that were made in tianjin_serial_intervals_revised.Rmd:
- tedges
- nodes
But we still have to filter out edges that refer to nodes which were removed because of missing data.
```{r}
# Load the Tianjin edge and node objects made in the
# "tianjin_serial_intervals_revised.rmd" file, so that the same data is used
# for analysis and plotting. The files are expected to supply `tedges` and
# `nodes`.
load("data/tianjin_edges_for_network_plot.rdata")
load("data/tianjin_nodes_for_network_plot.rdata")

# Drop edges that touch a case removed from `nodes` due to missing data.
# BOTH endpoints are checked: graph_from_data_frame() fails when any edge
# endpoint is absent from the vertex table, and the Singapore edge list is
# filtered on both `from` and `to` as well.
# NOTE(review): assumes `nodes$id` holds the same identifiers used in
# `tedges$from` / `tedges$to` -- confirm.
tedges <- tedges %>%
  filter(from %in% nodes$id, to %in% nodes$id)
```
Make the dataset for plotting, and add a new column containing just the case number without the "TJ" prefix to make a better node label.
```{r}
# Build the undirected Tianjin graph and convert it to a data frame of
# node/edge coordinates for ggplot2.
tj_net <- graph_from_data_frame(tedges, vertices = nodes, directed = FALSE)
ntj <- ggnetwork(tj_net)
# Numeric part of the vertex name, used as the node label. str_extract()
# returns one string per row (NA when nothing matches); wrapping
# str_extract_all() in as.character() would instead give "character(0)" or a
# deparsed c(...) string for names with zero or several matches.
ntj$case_num <- str_extract(as.character(ntj$name), "\\(?[0-9,.]+\\)?")
# NOTE(review): the line below refers to `ntj_clean`, which is not created in
# this file -- confirm before re-enabling.
#write.csv(ntj_clean,here("network_diagram/tianjin_cluster_known_edges.csv"))
```
Make the plot
```{r}
#Upload the tianjin_cluster_known_edges.csv for this plot
#ntj_clean=read.csv(here("network_diagram/tianjin_cluster_known_edges.csv"))
# Figure 7b: Tianjin cluster network.
# All edges are drawn; nodes are coloured by cluster (`group`) and labelled
# with the numeric case number.
tj_cov_net <- ggplot(
  data = ntj,
  mapping = aes(x = x, y = y, xend = xend, yend = yend)
) +
  geom_edges() +
  geom_nodes(aes(color = group), size = 8) +
  geom_text(aes(label = case_num), check_overlap = TRUE) +
  #geom_text(aes(label=case_num),subset(ntj,cluster==" close contact "),colour="white",check_overlap = TRUE)+
  scale_color_viridis_d() +
  guides(
    colour = guide_legend(
      title.position = "top",
      title.hjust = 0.5,
      override.aes = list(size = 3)
    )
  ) +
  labs(color = "Cluster") +
  #ggtitle("Clusters of COVID-19 cases in Tianjin, China")+
  theme_bw() +
  # Centre the title, strip all axis/panel decoration, and place a horizontal
  # legend below the plot.
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.line = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_blank(),
    legend.position = "bottom",
    legend.direction = "horizontal"
  )
tj_cov_net
#ggsave("final_figures/Fig7_tianjin_cov_network_revised.pdf",plot=tj_cov_net,device="pdf",height=8,width=11,units="in")
```