-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathSTAMPS_Intro2R_OptionC.R
275 lines (233 loc) · 9.34 KB
/
STAMPS_Intro2R_OptionC.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
##########################################
#
# Introduction to R
# Option C: For users with a lot of experience with R
# A tutorial for STAMPS 2017
#
#########################################
#
# This tutorial was written in July 2017
# A collaborative effort by
# Julia Fukuyama, Lan Huong Nguyen,
# Kris Sankaran, and Amy Willis
#
# [please feel free to contribute!]
#
#########################################
#
# Disclaimer: TA support is primarily for Options A and B.
# Please make every effort to resolve any difficulties that you encounter
# on your own. That said, we hope that you learn something new from this
# tutorial!
#
#########################################
# Set your working directory and load data
setwd("~/Documents/MyProjectDirectory")
covariates <- read.table("FWS_covariates.txt", header=TRUE, sep="\t", as.is=TRUE)
abundances <- read.table("FWS_OTUs.txt", header=TRUE, row.names = 1, sep="\t", as.is=TRUE)
# take a moment to have a look at the data here
###############################################
#
# more ggplot, reshape, cast, melt
# By Kris Sankaran
#
###############################################
library(ggplot2)
library(reshape2) # melt and cast
library(dplyr) # filter, group_by, and mutate
# let's try to plot the abundance of the dominant 2
# taxa against each other, and colour the points
# based on Season
# We need to reshape the data so that what will go
# in the x and y for the scatterplot are their own
# columns. We need to *melt* the data. This takes it
# from "wide" (one sample per column) to tall
# (different samples stacked on top of one another).
# First let's retrieve the taxonomic information,
# using strsplit to split the rownames according to
# the ";" character
taxa <- strsplit(rownames(abundances), split = ";")
taxa <- do.call(rbind, taxa) # bind the list into a matrix
colnames(taxa) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")
# We can use the %>% operation so that we don't need
# to keep track of temporary variables. Melt converts
# the data from wide to tall
melted_abundances <- data.frame(taxa, abundances) %>%
melt(
id.vars = colnames(taxa),
variable.name = "SampleName",
value.name = "abundance"
)
# We can incorporate the season information using
# left_join, and get the taxa abundances using
# group_by_at and summarise
melted_abundances <- melted_abundances %>%
group_by_at(colnames(taxa)) %>%
mutate(total = sum(abundance)) %>%
left_join(covariates) %>%
ungroup()
# To get the most abundant taxa, we filter down
# to the top two totals. Then, we cast the data.frame
# so that there is one column for what will become
# each axis in the plot
top_abundances <- melted_abundances %>%
filter(total >= sort(rowSums(abundances), decreasing = TRUE)[2]) %>%
dcast(SampleName + Season ~ Genus, value.var = "abundance")
# Now we can make the plot! Notice that most of
# the work was in defining data with the appropriate
# input format
ggplot(top_abundances) +
geom_point(aes(x = Burkholderia, y = Ralstonia, col = Season))
###############################################
#
# source and system
#
###############################################
# Exercise: Write a script to separate out the different
# taxonomy data in shell
# and run it through R
# Hint: Use source() to run an R script in this session
# Also works for URLs!
?source
# Hint: Use system() to run a system command.
# In the alpha diversity lab we will see an example
# of this (running CatchAll via the command line)
?system
###############################################
#
# parallelisation
# By Lan Huong Nguyen
#
###############################################
# In Option B we calculated the relative abundance of the taxa
# using for loops
# Now try to perform the same but in parallel
# If you haven't done it before, install the 'foreach' and 'doParallel' packages
# install.packages("foreach")
# install.packages("doParallel")
# Load the packages
library(doParallel)
# Set the number of cores
no_cores <- detectCores() - 1
cl<-makeCluster(no_cores)
registerDoParallel(cl)
# Use foreach function to compute the relative abundances
relative_abundances <-
foreach(current_index = 1:(dim(abundances)[2]),
.combine = cbind) %dopar% {
number_reads <- sum(abundances[, current_index])
abundances[, current_index]/number_reads
}
# Check if results match the standard calculations
all(relative_abundances == scale(abundances, center = F, scale = colSums(abundances)))
# Note that:'.combine' argument tells foreach function how to combine results
# from multiple threads. '.combine' can be specified as a function or
# a character string e.g. c, cbind, rbind, list, '+', '*'
# Also observe that variables from the local environment are by default available
# for each thread without specification. This is not true of the variables
# in the parent environment, e.g. the following gives an error:
test <- function () {
foreach(current_index = 1:(dim(abundances)[2]),
.combine = cbind) %dopar% {
number_reads <- sum(abundances[, current_index])
abundances[, current_index]/number_reads
}
}
relative_abundances <- test()
# To export specific variables to the cluster use the 'export' argument
test <- function () {
foreach(current_index = 1:(dim(abundances)[2]),
.combine = cbind,
.export = "abundances") %dopar% {
number_reads <- sum(abundances[, current_index])
abundances[, current_index]/number_reads
}
}
relative_abundances <- test()
# Check if results match the standard calculations
all(relative_abundances == scale(abundances, center = F, scale = colSums(abundances)))
# When you are done working in paralel close the cluster so that resources
# are returned to the operating system
stopCluster(cl)
# In R lapply() is a faster and cleaner way to perform for loop tasks.
# You can speed things up even further by using parallel equivalent of
# lapply() and sapply(), parLapply() and parSapply() respectively. These
# functions are available in the package, 'parallel'.
# install.packages("parallel")
library(parallel)
# Calculate the number of cores
no_cores <- detectCores() - 1
# Initiate cluster
cl <- makeCluster(no_cores)
# Export variable 'abundances' to the cluster
clusterExport(cl, "abundances")
# If you are using some specific packages inside parLapply, you need to load
# them through clusterEvalQ(), e.g. clusterEvalQ(cl, library(phyloseq))
# Use parLapply() function
relative_abundances <-
parLapply(cl,
1:(dim(abundances)[2]),
function(current_index) {
print(class(current_index))
number_reads <- sum(abundances[, current_index])
abundances[, current_index]/number_reads
})
# Close the cluster
stopCluster(cl)
# Currently, 'relative_abundances' is a list. To converti it to a matrix do
# the following:
relative_abundances <- do.call("cbind", relative_abundances)
# Check if results match the standard calculations
all(relative_abundances == scale(abundances, center = F, scale = colSums(abundances)))
# For more details on parallelisms check the following link:
# http://gforge.se/2015/02/how-to-go-parallel-in-r-basics-tips/
###############################################
#
# Markdown
#
###############################################
## Rmarkdown is a great way to combine R code with text, either to
## share your work with others or to keep notes for yourself.
## Markdown is a lightweight text formatting system. It gives you
## control over some simple things, like headings, bold/italic, and
## links, but the idea is that when you write a markdown document you
## focus on the text and not on the formatting. Markdown documents are
## readable as plain text, and can later be formatted as html.
##
## There is a short tutorial at http://www.markdowntutorial.com/ that
## allows you to go through some examples. You should work through it,
## as it only takes a couple of minutes.
##
## Once you know about markdown, Rmarkdown is a simple extension. It
## allows you to place R code into markdown documents, and the R code
## and markdown text can then be processed with knitr to create an
## html document containing both. Knitr will run the R code, create
## any output or plots described in the code, and create a single html
## document containing everything. Adding R code to a markdown
## document is just a matter of putting
##
## ```{r chunk_name}
##
## before the R code (without the leading ##, and with chunk_name
## replaced with a descriptive name for what the code does), and
## putting
##
## ```
##
## below the R code to indicate the end of a chunk (again, without the
## leading ##).
##
## Rmarkdown (Rmd) documents can be converted to html using the render
## command in rmarkdown package, so if you have a file called
## my_rmarkdown.Rmd it can be converted to html using
## render("my_rmarkdown.Rmd"). If you are using Rstudio, there is a
## button that will do this for you.
##
## Exercise 1: There is an example document called
## rmarkdown_example.Rmd in this directory. Find it and use knitr to
## convert it to an html document, and look at the result.
##
## Exercise 2: Make your own Rmd document with some of the code and
## plots from the option B tutorial. Try modifying the width and
## height of the figures (look at the fig.width and fig.height options
## on this web page: https://yihui.name/knitr/options/).