-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathenteroHeatmap_ML.R
292 lines (203 loc) · 8.18 KB
/
enteroHeatmap_ML.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
# Installing and loading packages -----------------------------------------
# You can install packages by typing:
# install.packages("gplots")
library(gplots)
library(tidyverse) # We will be using dplyr, purrr, and stringr
library(heatmaply)
library(gplots)
library(here)
# Read data ---------------------------------------------------------------
# You have provided multiple SNP distance matrices
# We want to load all of them into R
# This is how you would load only one file
# You assign it to a variable and then use the read.csv function to parse it and
# load it into R.
groel_gapped <- read.csv(
here('data', 'SNP_table_groel_gapped_jan122018.csv'),
stringsAsFactors = FALSE)
groel_gapped$X <- make.names(groel_gapped$X)
# Nice! We have read in a SNP matrix as a data frame
# The data frame is one of the most important data structures in R
# A data frame is the most common way to store data in R
# A data frame has similarities to an Excel spreadsheet, or a table, but those
# are very different things under the hood
# How do I know that this is a data frame?
# Using the class function, you will get the following information:
# class(groel_gapped)
# [1] "data.frame"
# Let's now read the metadata file
# In this case, we are using a different function called read.table.
# This function reads files that are separated by a user-provided separator character
# In this case, that character is the tab (\t)
metadata <- read_csv('data/sample_metadata.csv')
metadata$`Final ID` <- na_if(metadata$`Final ID`, "")
metadata$Iso <- make.names(metadata$Iso)
# Pre-processing one dataframe -----------------------------------------------
# In this particular format from Geneious, there are points in the data frame
# that have NA values when they are read into R. This means that the values are
# empty, and R assigns them a missing value indicator ('Not available')
# We can replace those NA values by 0 for this particular format
# This is because the NA were generated for identical strains (and thus zero difference)
groel_gapped[is.na(groel_gapped)] <- 0
# Now, let's rename the first column of the groel_ungapped_meta
groel_gapped <- groel_gapped %>%
rename(Iso = X)
# The reason for this is that we are going to use that column to merge it with the
# metadata file using that column
groel_gapped_meta <- left_join(groel_gapped, metadata, by="Iso")
meta_in_groel <- intersect(metadata$Iso,groel_gapped_meta$Iso)
meta_in_groel <-
metadata %>%
filter(Iso %in% meta_in_groel)
# Let's select certain columns to keep for the metadata
groel_gapped_meta <-
groel_gapped_meta %>%
dplyr::select(-Iso,-Location) %>%
dplyr::select(`Final ID`, everything())
# Generate one interactive heatmap ----------------------------------------
groel_gapped_heatmap <- heatmaply(groel_gapped_meta)
# Preliminary steps for generating heatmaps in batch ----------------------
# How about processing all the SNP distance files?
# Let's use some wizardry
# First, let's create a list of the names of the CSV files in this folder
allSNPfiles <- Sys.glob(file.path("./data/SNP*.csv"))
# Now, let's create a list with all the filename prefixes only (e.g. 16SatpApheS_gapped)
allSNPnames <- map(allSNPfiles, function(x) str_split(string = x, pattern = "_")) %>%
flatten() %>%
map(function(x) paste0(x[3],"_",x[4]))
# The code below will read all the CSV files in the allSNPfiles list and load them
# into a list in the R Global environment
allSNPdf <- map(allSNPfiles, function(x){
df <- read.csv(x)
df
}) %>%
set_names(nm = allSNPnames)
# Extract isolate names ---------------------------------------------------
# Let's fix the names so that we only have the isolate names
allSNPdf <- map(allSNPdf, function(x){
df <- x %>%
rename(Iso = X) %>%
mutate(Iso = str_replace(Iso, "(_|\\s|Assembly|\\.).*$", "")) %>%
mutate(Iso = str_replace(Iso, "\\.", "-"))
names(df) <- str_replace(names(df), "(_|\\s|Assembly|\\.).*$", "")
names(df) <- str_replace(names(df), "\\.", "-")
df
})
# In this function we are replacing the NA values with zero
allSNPdf <- map(allSNPdf, function(x){
x[is.na(x)] <- 0
x
})
# We might have data duplicates
# Let's check for those
duplicate_positions <- map(allSNPdf, ~ which(duplicated(.x$Iso))) %>%
discard( ~ length(.x) == 0)
duplicate_positions
# $atpa_gapped
# [1] 26
#
# $pheS_gapped
# [1] 176
allSNPdf$atpa_gapped <- allSNPdf$atpa_gapped[-26,-27]
allSNPdf$pheS_gapped <- allSNPdf$pheS_gapped[-176,-177]
allSNPdf <- map(allSNPdf, function(x) {
df <- x %>%
mutate(Iso = make.names(Iso))
df
})
# Merge isolate metadata with SNP distances -------------------------------
allSNPdfMeta <- map(allSNPdf, function(x){
meta <- left_join(x, metadata, by="Iso")
meta
})
allSNPdfMeta <- map(allSNPdfMeta, function(x){
meta <- dplyr::select(x, `Final ID`, everything()) %>%
dplyr::select(-dplyr::contains("Iso")) %>%
dplyr::select(-dplyr::contains("Location")) %>%
rename(Species=`Final ID`)
})
allSNPdfMetaSub <- discard(allSNPdfMeta, str_detect(names(allSNPdfMeta), "ungapped"))
# Generate all heatmaps ---------------------------------------------------
allSNPdfHeatmaps <- imap(allSNPdfMetaSub, function(x, y) {
hm <- heatmaply(
x,
dendrogram = "both",
plot_method = "plotly",
main = y,
# margins = c(50,32,NA,11),
fontsize_row = 8,
fontsize_col = 11,
column_text_angle = 45,
key.title = "SNP distance",
# colorbar_xpos = -1,
colorbar_ypos = 2,
showticklabels = FALSE
)
hm
})
# Export heatmaps as PNG --------------------------------------------------
allSNPdfHeatmaps %>%
iwalk( ~ export(
p = .x,
file = paste0(here("snp_heatmaps/"),
.y,
".png"),
vwidth = 1246,
vheight = 814
))
# Export heatmaps as HTML -------------------------------------------------
allSNPdfHeatmaps %>%
iwalk(
~ htmlwidgets::saveWidget(as_widget(.x), paste0(here("snp_heatmaps/"),.y,".html"))
)
# Cluster analysis for Comparing Partitions: one example ------------------
# Purpose: extract clusters at all thresholds for analysis with the Comparing
# Partitions tool (http://www.comparingpartitions.info/)
# Example with one dataset
groel_gapped <- groel_gapped[,2:ncol(groel_gapped)]
groel_gapped_rep_na <- lapply(groel_gapped, function(x){
x <- str_replace_na(x, replacement = 0)
x
})
groel_gapped_rep_na <- do.call("rbind", groel_gapped_rep_na)
dist_groel_gapped <- as.dist(groel_gapped_rep_na)
all_dists <- unique(as.vector(dist_groel_gapped))
hc_dist_groel_gapped <- hclust(dist_groel_gapped, method = "complete")
# clusters_groel_strains <- metadata[match(hc_dist_groel_gapped$labels, metadata$Iso),]
# write.csv(clusters_groel_strains, 'reference_clusters.csv')
# hc_dist_groel_gapped$labels <- hc_dist_groel_gapped$labels[hc_dist_groel_gapped$order]
clusters_groel <- cutree(hc_dist_groel_gapped, h = all_dists)
write.csv(clusters_groel, file = here('cluster_analysis', 'clusters_at_all_SNP_thresholds.csv'))
clusters_groel_species <- cutree(hc_dist_groel_gapped, k = 7)
# dend_hc <- as.dendrogram(hc_dist_groel_gapped)
clusters_groel_sum <- apply(clusters_groel, 2, max)
clusters_groel_sum <- data.frame(
distances = as.numeric(names(clusters_groel_sum)),
clusters = clusters_groel_sum
) %>%
arrange(distances)
# Cluster analysis for Comparing Partitions: in batch ---------------------
all_dists <-
allSNPdfMetaSub %>%
map( ~ (.x[, 2:ncol(.x)] %>%
as.dist(.)))
all_clusts <-
all_dists %>%
map( ~ (hclust(.x, method = "complete")))
all_flat_clusters <-
all_clusts %>%
map( ~ (cutree(.x, k = 7))) # Check for high and low thresholds
all_flat_clusters_df <-
all_flat_clusters %>%
map( ~ enframe(.x, name = "Iso", value = "Cluster"))
# all_flat_clusters_df <-
# all_flat_clusters_df %>%
# map(~ .x[match(metadata$Iso, .x$Iso), ] %>%
# na.omit())
all_flat_clusters_df <-
all_flat_clusters_df %>%
map_dfr(~ left_join(.x, metadata,by="Iso"), .id = 'Scheme')
butter_spread <- spread(all_flat_clusters_df, key = Scheme, value = Cluster)
butter_spread <- butter_spread[match(metadata$Iso, butter_spread$Iso),]
butter_spread <- butter_spread %>% na.omit()
write.csv(butter_spread, here('cluster_analysis', 'all_seven_clusters.csv', row.names = FALSE, quote = FALSE)