-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2_MedInfCIN2024_Vis.Rmd
383 lines (268 loc) · 10.2 KB
/
2_MedInfCIN2024_Vis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
---
title: "Data visualization with ggplot and pheatmap"
output:
html_document:
toc: yes
toc_float: yes
editor_options:
markdown:
wrap: 80
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Overview
Get acquainted with data visualization with ggplot and pheatmap.
You can find a useful cheatsheet for `ggplot2` and other common R packages in
the RStudio website: <https://www.rstudio.com/resources/cheatsheets/>
## 1) Loading packages and data
Please install the following packages from CRAN:
- ggplot2
- pheatmap
- dplyr
- tidyr
- stringr
- uwot
### Loading necessary packages
```{r packages, message=F}
# plotting
library(ggplot2)
library(pheatmap)
# manipulation of data frames
library(dplyr)
library(tidyr)
# manipulation of strings
library(stringr)
# for UMAP transformation
library(uwot)
```
### Load the data
Load the expression and annotation data you saved during your previous
exercise.If you don't have them in your current working directory you can find
them in the data folder of the course repository.
```{r load, results=F, message=F}
data <- readRDS("data/lapatinib_expression.rds")
meta <- readRDS("data/lapatinib_phenotype.rds")
```
## 4) Generating barplot
Let's create a barplot to see how many samples we have per tissue. First we will
count how many samples per tissue we have in our data set and generate a list of
colors for each one.
```{r barplot_tissue, results=F, message=F}
# Remove special characters from metadata column names
colnames(meta) <- str_replace_all(colnames(meta), ":", ".")
# Create summary table
tissue.sum <- data.frame(table(meta$tissue.ch1))
# Create ggplot item and map the axes
ggp <- ggplot(tissue.sum, aes(x = Var1, y = Freq))
# add a bar plot
ggp + geom_col()
# Change x and y labels
ggp + geom_col() +
xlab("Tissue type") +
ylab("Frequency")
# Add colors
# Recreate ggplot item
ggp <- ggplot(tissue.sum, aes(x = Var1, y = Freq, fill = Var1))
ggp + geom_col() +
xlab("Tissue type") +
ylab("Frequency")
# Change data order -- working with factors
# Look at current factor
tissue.sum$Var1
# What happens if we sort the table and check the factor again
tissue.sum[order(tissue.sum$Freq, decreasing = FALSE), ]$Var1
# Create correct levels order
new.levels <- as.character(tissue.sum[order(tissue.sum$Freq, decreasing = FALSE), ]$Var1)
tissue.sum$Var1 <- factor(tissue.sum$Var1, new.levels)
# Recreate ggplot item
ggp <- ggplot(tissue.sum, aes(x = Var1, y = Freq, fill = Var1))
ggp + geom_col() +
xlab("Tissue type") +
ylab("Frequency")
########
# TASK #
########
## Replot with the tissue types in decreasing order (largest first)
```
#Summarising data with group_by
The pipeline `%>%` operator passes the object/variable/output from its left side
as a first argument to the function on its right side
```{r groups, results=F, message=F}
# Select specific columns
meta.drugcount <- meta %>% subset(select = c(tissue.ch1, drug_conc)) %>%
# Group by columns of interest
group_by(tissue.ch1, drug_conc) %>%
# Count number of occurences of each drug concentration by evaluating the length of the group
mutate(count = length(drug_conc)) %>%
# Remove duplicate rows
unique()
########
# TASK #
########
## Create meta.drugtime using the above technique and the drug_tpoint column
```
## 5) Basic statistics
Let us first compute some summary statistics and plot them across samples in
order to get a preliminary idea of our expression data.
```{r descriptive_stats, results=F, message=F}
# Calculate key stats of expression data
c_mean <- colMeans(data)
c_median <- apply(data, 2, median)
c_std <- apply(data, 2, sd)
# Create numeric values of samples (equivalent to sample names)
rng <- 1:length(c_mean)
# Put into dataframe
sumstats <- data.frame(
c_mean = c_mean,
c_median = c_median,
c_std = c_std,
rng = rng
)
# Create geom_point plot using x as a range of sample "numbers" and y as log2(expr), adding both median and mean values
statplot <- ggplot(sumstats, aes(x = rng, y = c_median)) +
# Add points for median (setting legend label to median)
geom_point(aes(color = "median")) +
# Add points for mean
geom_point(aes(x = rng, y = c_mean, color = "mean")) +
# Set y range from lower to upper std deviations
ylim(range(c(c_mean - c_std, c_mean + c_std))) +
# Change y axis label
ylab("log2(expr)") +
# Change x axis label
xlab("Samples")
statplot
# Adding error bars
statplot +
geom_errorbar(aes(ymin = c_mean - c_std, ymax = c_mean + c_std), width = .2)
```
## 6) Boxplots
Let's see now how this data looks using a box plot, coloring by tissue:
```{r distribution2, results=F, message=F}
# Box plots usually calculate their own summary statistics, so they need to be given the entire sample matrix
# However we first need to put that into long format
long.data <- data %>% pivot_longer(everything())
# Select first four samples only (otherwise boxplot would be unreadable)
smps <- meta$title[1:4]
short <- long.data[long.data$name %in% smps, ]
# Create boxplot
ggplot(short, aes(x = name, y = value)) +
geom_boxplot()
# Merge summary stats with meta data
merged.meta <- merge(long.data, meta, by.x = "name", by.y = "title", all.x = TRUE)
# Create boxplot
ggplot(merged.meta, aes(x = tissue.ch1, y = value)) +
geom_boxplot()
########
# TASK #
########
## Create box plot showing expression statistics per drug concentration
# Did it work? are you sure?
```
## 7) Histogram + density
Now we select a sample and see how the expression data distribution looks like.
To do this we can plot the histogram or the density plot (or both).
```{r distrib, results=F, message=F}
# Set sample that we want to look at
sample_name <- "A498_lapatinib_10000nM_24h"
# Select that sample from our data
data.smp <- long.data[long.data$name == sample_name, ]
# Create histogram of expression values with density
ggplot(data.smp, aes(x = value)) +
geom_histogram(aes(y = ..density..), color = "black", fill = "grey", bins = 50) +
geom_density(alpha = .2, fill = "#FF6666")
# Now lets do this for 4 samples and show it in a facet_wrap
# Select that sample from our data (using smps variable we created before)
data.smp <- long.data[long.data$name %in% smps, ]
# Create histogram of expression values with density
ggplot(data.smp, aes(x = value)) +
geom_histogram(aes(y = ..density..), color = "black", fill = "grey", bins = 50) +
geom_density(alpha = .2, fill = "#FF6666") +
facet_wrap(~name)
```
### Finished with our quality/summary stats, lets clean our environment
```{r clean, results=F, message=F}
rm(list = setdiff(ls(), c("data", "meta")))
```
## 8) Heatmap + hierarchical clustering
Now we will plot the heatmap of our expression profiles along with the
hierarchical clustering. On top of that, we will show the tissue of origin with
colors (which can nicely be done in a single function in base R):
```{r hmap, results=F, message=F}
## Create an annotation df
# Create patient name column (sapply: apply over list, '[': get element, 1: first element)
meta$patient <- sapply(strsplit(meta$title, "_"), `[`, 1)
anno <- meta %>% subset(select = c(title, patient, tissue.ch1))
row.names(anno) <- anno$title
anno <- anno[-1]
# Create short version for quick plotting
data.short <- data[1:500, ]
# Plot data
pheatmap(as.matrix(data.short), annotation_col = anno, show_rownames = FALSE, show_colnames = FALSE)
```
## 9) Scatter plot and correlation
Melanoma samples seem to cluster together, let's see how well they correlate:
```{r scatter_melanoma, results=F, message=F}
# Find names of melanoma samples
usecols <- meta[meta$tissue.ch1 == "Melanoma", ]$title
## Plot correlation of first two samples
# Select data for first two samples
data.corr <- data %>% subset(select = c(usecols[1], usecols[2]))
# Plot
ggplot(data.corr, aes_string(x = usecols[1], y = usecols[2])) +
geom_point()
# Find correlation of all samples
cor(data[, usecols])
# Find mean of correlations
cor(data[, usecols]) %>% mean()
```
## 10) PCA on gene expression profiles of treated cell lines
Next, a Principal Component Analysis (PCA) is performed on the data to assess
the main sources of variability across the gene expression profiles. In order to
do this, we first define a simple function that shows the most relevant
information from the PCA (i.e. the first 2 principal components and how much
variance each one explains).
```{r, func_pca, results=F, message=F}
# Perform PCA using prcomp function
# Note: the matrix must be transposed, so samples are rows
pca.n <- prcomp(t(data))
# Generating function created by colleague that plots PCA data
plotPCA <- function(pca, pchs = 21, colour = NULL, PCs = c("PC1", "PC2")) {
tmp <- summary(pca)
# importance: variance explained
PCx <- format(round(tmp$importance[2, PCs[1]] * 100, digits = 2), nsmall = 2)
PCy <- format(round(tmp$importance[2, PCs[2]] * 100, digits = 2), nsmall = 2)
x <- data.frame(pca$x)
# Merge PCs back to metadata for annotation (may need to change by.y for other metadatas)
merged <- x %>%
subset(select = PCs) %>%
merge(., meta, by.x = "row.names", by.y = "title")
# Plot
ggplot(merged, aes_string(x = PCs[1], y = PCs[2], colour = colour)) +
geom_point() +
xlab(paste(PCs[1], "(", PCx, "% var. expl.)", sep = "")) +
ylab(paste(PCs[2], "(", PCy, "% var. expl.)", sep = ""))
}
```
Now let's plot it!
```{r plot_pca, results=F, message=F}
# Run previously generated function
plotPCA(pca.n, PCs = c("PC1", "PC2"), colour = "tissue.ch1")
########
# TASK #
########
# Try plotting the PCA using a different annotation column such as patient
```
## 11) UMAP
Uniform Manifold Approximation and Projection is a dimensionality reduction algorithm that has gained a lot
of importance in the last years. Unlike PCA it is a method for non-linear dimensionality reduction. UMAP tries to preserve the global similarity relationship between the data points while projecting them in a lower dimensional space for visualization purposes (here in 2D).
```{r umap, results=F, message=F}
umap_data <- umap(t(data))
# Bind umap values with metadata
merged_umap <- cbind(umap_data, meta)
# Add colnames for umap values
colnames(merged_umap)[1:2] <- c("UMAP1", "UMAP2")
# Plot
ggplot(merged_umap, aes(x = UMAP1, y = UMAP2, colour = tissue.ch1)) +
geom_point()
```