Fig7_network_diagram_revised.Rmd

---
title: "Cluster Network Data Preprocessing & Plotting"
author: "Emma Garlock; Yen-Hsiang (Brian) Lee"
date: "18/02/2020"
updated: "23/02/2020"
output: html_document
---

Load all the libraries we will need 
```{r}
# getwd()
library(ggplot2)
library(tidyverse)
library(ggnetwork)
library(OneR)
library(igraph)
library(here)
library(viridis)
library(paletteer)
```


## Singapore dataset: making Figure 7a (network plot)

Load data
```{r}
# Read the Singapore case line list (path is relative to the working directory).
ncov_sing1 <- read.csv(file = "data/COVID-19_Singapore_formated_dates.csv")
```

You can use the script below to create a weighted network plot. This allows us to have everything arranged spatially but only maintain the links in the `Related Cases` column when rendering the visualization. 
Trying to make weighted edges 

```{r}
# Build the edge list used for the Singapore network figure.
#
# Two kinds of edges are combined into `plot_edges`:
#   * weight "1": links parsed from the third column of the line list; the plot
#     chunk only draws weight-2 edges, so these only influence the layout.
#   * weight 2:   the `fedges` edges loaded below (the ones that get drawn).

# Only the first three columns of the main dataset are needed.
w_edge <- ncov_sing1[, 1:3]
# Name the two relationship columns after the weight they will carry.
names(w_edge) <- c("CaseID", "2", "1")
# Turn blanks into NA so they are easy to drop later.
w_edge[w_edge == ""] <- NA

w_edge_long <- w_edge %>%
  # one row per (case, weight) pair
  pivot_longer(cols = c("2", "1"), names_to = "weight") %>%
  # split the comma-separated contacts into up to 22 columns; rows with fewer
  # contacts are padded with NA on the right (same result as the default,
  # without the warning)
  separate(
    "value",
    paste0("relations", 1:22),
    sep = ",",
    extra = "drop",
    fill = "right"
  ) %>%
  # one row per (case, weight, contact)
  pivot_longer(cols = relations1:relations22, names_to = "relation") %>%
  select(CaseID, value, weight) %>%
  # Drop missing AND blank contacts. The conditions must all hold together;
  # chaining them with `|` only ever removed the NA rows.
  filter(!is.na(value), str_squish(value) != "") %>%
  # Keep the weight-1 links only: they help lay the plot out properly but are
  # not drawn. The edges that are drawn are added further down.
  filter(weight == "1")

# Rename so the columns line up with the second edge table when binding.
names(w_edge_long) <- c("from", "to", "weight")
w_edge_long$to <- str_squish(w_edge_long$to)

# using fedges from the singapore_serial_intervals_revised.Rmd
# (the .rdata file is expected to provide the `fedges` object used below)
load("data/singapore_edges_for_network_plot.rdata")

# The `arrows` column is not needed for this plot.
fedges_weighted <- fedges %>% select(-arrows)
# Assign the more important weight.
fedges_weighted$weight <- 2
# Keep just the numeric part of each endpoint so it can match the other edge
# table and the nodes table. str_extract() returns one string per element (NA
# when nothing matches); as.character(str_extract_all()) deparses a list and
# yields "character(0)" or 'c("1", "2")' whenever there is not exactly one match.
fedges_weighted$from <- str_extract(fedges_weighted$from, "\\(?[0-9,.]+\\)?")
fedges_weighted$to <- str_extract(fedges_weighted$to, "\\(?[0-9,.]+\\)?")

# Attach the drawn edges to the edges we use just to get the layout right.
plot_edges <- rbind(w_edge_long, fedges_weighted)

#write.csv(eg_edges2,"data/sing_edges_eg.csv")
```


```{r}
# Load the Singapore node table; the .rdata file is expected to provide the
# `nodes.df` object referenced on the next line.
load("data/singapore_nodes_for_network_plot.rdata")
nodes_sing <- nodes.df

# Keep only edges whose two endpoints are both present among the node labels.
# Nodes dropped for missing data can still be mentioned in the relationships
# column, which is how edges without node data end up in the edge list.
# NOTE: this lookup deliberately uses the `label` column as it is named
# *before* the rename below.
plot_edges <- plot_edges %>%
  filter(
    to %in% nodes_sing$label,
    from %in% nodes_sing$label
  ) %>%
  select(from, to, weight)

# Rename the columns to reflect how they are used from here on, then put `id`
# (just the number) first because the graph constructor takes vertex names
# from the first column.
colnames(nodes_sing) <- c("label", "id", "group")
nodes_sing <- select(nodes_sing, id, label, group)

# Undirected graph -> data frame that ggplot can draw.
cov_weighted <- graph_from_data_frame(
  d = plot_edges,
  directed = FALSE,
  vertices = nodes_sing
)
cov_net_weight <- ggnetwork(cov_weighted)
#write.csv(cov_net_weight,("data/singapore_weighted_network_0504.csv"))
```


```{r}
#cov_net_weight=read.csv(("data/singapore_weighted_network_0504.csv"))

# Figure 7a: Singapore cluster network.
# Layers are added in drawing order: edges, nodes, then the two label layers.
cov_net_w <- ggplot(
  data = cov_net_weight,
  mapping = aes(x = x, y = y, xend = xend, yend = yend)
) +
  # draw only the weight-2 edges; the other edges only shaped the layout
  geom_edges(
    mapping = aes(),
    data = subset(cov_net_weight, weight == 2),
    linetype = "solid"
  ) +
  geom_nodes(mapping = aes(colour = group), size = 8) +
  # case labels on every node
  geom_text(mapping = aes(label = name), check_overlap = TRUE) +
  # the same labels again in white for the "Grace Assembly of God" group
  geom_text(
    mapping = aes(label = name),
    data = subset(cov_net_weight, group == "Grace Assembly of God"),
    colour = "white",
    check_overlap = TRUE
  ) +
  scale_colour_viridis_d(na.value = "lightgrey") +
  labs(colour = "Cluster") +
  guides(
    colour = guide_legend(
      title.position = "top",
      title.hjust = 0.5,
      override.aes = list(size = 3)
    )
  ) +
  theme_bw() +
  # centred title, no axes/grid/border, horizontal legend underneath
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.line = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_blank(),
    legend.position = "bottom",
    legend.direction = "horizontal"
  )
cov_net_w

#ggsave("final_figures/Fig7_singapore_cov_network_revised_0517.pdf",plot=cov_net_w,device="pdf",height=8,width=11,units="in")
```


## Tianjin dataset: making Figure 7b (network plot)

There won't be any weighting or anything with the Tianjin one so we can just use the same edge list that was made in tianjin_serial_intervals_revised.Rmd 
- tedges 
- nodes 
 But we still have to filter out the edges that refer to nodes removed because of missing data 
```{r}
# Load the Tianjin edges and nodes objects made in the
# "tianjin_serial_intervals_revised.rmd" file, so that the same data is used
# for analysis and plotting. The two .rdata files are expected to provide the
# `tedges` and `nodes` objects used below.
load("data/tianjin_edges_for_network_plot.rdata")
load("data/tianjin_nodes_for_network_plot.rdata")

# Drop edges that refer to nodes removed due to missing data. Both endpoints
# are checked (as in the Singapore section): an edge whose `to` node is absent
# from the vertex table would otherwise make graph_from_data_frame() fail.
tedges <- tedges %>%
  filter(from %in% nodes$id, to %in% nodes$id)
```
 

Make the dataset for plotting, and add a new column that holds just the case number without the "TJ" prefix, to make a better node label 
```{r}
# Undirected Tianjin graph -> data frame that ggplot can draw.
tj_net <- graph_from_data_frame(tedges, vertices = nodes, directed = FALSE)
ntj <- ggnetwork(tj_net)

# Node label: just the numeric part of the vertex name.
# str_extract() returns one string per row (NA when nothing matches);
# as.character(str_extract_all()) deparses a list and would yield
# "character(0)" or 'c("1", "2")' whenever a name does not contain exactly
# one match.
ntj$case_num <- str_extract(ntj$name, "\\(?[0-9,.]+\\)?")
#write.csv(ntj_clean,here("network_diagram/tianjin_cluster_known_edges.csv"))


```
Make the plot 
```{r}
#Upload the tianjin_cluster_known_edges.csv for this plot 
#ntj_clean=read.csv(here("network_diagram/tianjin_cluster_known_edges.csv"))

# Figure 7b: Tianjin cluster network.
# Layers are added in drawing order: edges, nodes, then the case-number labels.
tj_cov_net <- ggplot(
  data = ntj,
  mapping = aes(x = x, y = y, xend = xend, yend = yend)
) +
  geom_edges() +
  geom_nodes(mapping = aes(colour = group), size = 8) +
  geom_text(mapping = aes(label = case_num), check_overlap = TRUE) +
  #geom_text(aes(label=case_num),subset(ntj,cluster==" close contact "),colour="white",check_overlap = TRUE)+
  scale_colour_viridis_d() +
  labs(colour = "Cluster") +
  guides(
    colour = guide_legend(
      title.position = "top",
      title.hjust = 0.5,
      override.aes = list(size = 3)
    )
  ) +
  #ggtitle("Clusters of COVID-19 cases in Tianjin, China")+
  theme_bw() +
  # centred title, no axes/grid/border, horizontal legend underneath
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.line = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_blank(),
    legend.position = "bottom",
    legend.direction = "horizontal"
  )


tj_cov_net

#ggsave("final_figures/Fig7_tianjin_cov_network_revised.pdf",plot=tj_cov_net,device="pdf",height=8,width=11,units="in")
```