-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathSTAMPS_Intro2R_OptionB.R
286 lines (227 loc) · 9.54 KB
/
STAMPS_Intro2R_OptionB.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
##########################################
#
# Introduction to R
# Option B: For users with some experience with R
# A tutorial for STAMPS 2017
#
#########################################
#
# This tutorial is written by Amy Willis, July 2017
# Contributions also made by [please feel free to contribute!]
#
#########################################
#
# If you downloaded this from the website,
# be sure to rename as "STAMPS_Intro2R_OptionB.R",
# R and R Studio will not interpret is as an R script with ".txt" suffix
#########################################
#
# Disclaimer: The level of hand-holding in this tutorial will be
# less than in Option A. There will be some expectation on you to
# attempt to solve your issues. By all means request help from the TAs,
# but in order to obtain mastery of any language it is important to practice
# correcting yourself, and now is a great opportunity to practice :)
#
#########################################
###############################################
#
# Get some context
#
###############################################
# Set your working directory (refer to Option A if you're not sure how)
setwd("~/Documents/MyProjectDirectory")
# If you already worked through Option A, feel free to skip until the next section
# In Option A, we loaded in some covariate and OTU data and briefly
# made some plots and looked at some summaries
covariates <- read.table("FWS_covariates.txt", header=TRUE, sep="\t", as.is=TRUE)
abundances <- read.table("FWS_OTUs.txt", header=TRUE, row.names = 1, sep="\t", as.is=TRUE)
# take a moment to have a look at the data here
###############################################
#
# ggplot
#
###############################################
# Last night you learned how to install a package from CRAN
# We're now going to use the package ggplot2
# If this doesn't work for you, look at the Installing R website
library(ggplot2)
# check that the following produces a plot
ggplot(data.frame("x"=c(1,2,3), "y"=c(4,4.6, 5)))+geom_point(aes(x,y))
# and don't worry about what this means for now
# base graphics in R, such as plot() and boxplot(), are not very pretty
# enter: ggplot! (everyone calls the package ggplot2 "ggplot")
# ggplot makes simple things difficult, difficult things simple, and
# everything beautiful
# ggplot has a different structure to base graphics
# here is how you make a ggplot
abundances <- data.frame(abundances)
ggplot(abundances)
# nothing happened! why?
# we need to add components, such as a scatterplot or a histogram,
# to build up a plot
ggplot(abundances, aes(x = JPA_Jan)) +
geom_histogram()
# this says that "abundances" is the data frame that we want to plot,
# JPA_Jan is the only variable that we care about
# and that we want a histogram
# obviously there are lots of taxa that were not observed in this sample, hence the zeros
# let's restrict our attention to taxa that we did see
# Subsetting 2 different variables is done like this
ggplot(subset(abundances, JPA_Jan > 0), aes(JPA_Jan)) +
geom_histogram()
# the first argument is the data frame of interest
# aes() specifies which column we want
# and the geom_histogram() says that we want a histogram
# The range of this histogram is huge, so we want to focus on the distribution of
# OTUs that were observed once or more but less than 60 times, we would do
ggplot(subset(abundances, JPA_Jan > 0 & JPA_Jan < 60), aes(JPA_Jan)) +
geom_histogram()
# a scatterplot shows how correlated the abundances are between january and february
ggplot(subset(abundances, JPA_Jan > 0 & JPA_Feb > 0),
aes(JPA_Jan, JPA_Feb)) +
geom_point()
# again, ggplot() specifies what we want plotted
# geom_point() specifies that we want a scatterplot
ggplot(subset(abundances, JPA_Jan > 0 & JPA_Feb > 0),
aes(JPA_Jan, JPA_Feb)) +
geom_point() +
theme_classic() # changes the styling
ggplot(abundances, aes(JPA_Jan, JPA_Feb)) +
geom_point() +
labs(title="My first scatterplot") + # adds a title
xlab("January OTU counts") + # adds labels
ylab("February OTU counts")
# one tedious thing about ggplot is that all of the information that
# you want to plot must be in the same data frame
# So incorporating Season (from the data frame "covariates")
# to colour the points our scatterplot is not trivial
# We need to create a new data frame that contains all this information
# together...
# Exercise: Use the internet to learn how to change the colour
# of the points. Create a new data frame that contains a
# logical [TRUE/FALSE] variable indicating if the January abundance
# is higher than 4000. Use this variable to differentially colour the single
# observation that fits this category
# Don't look at the solution until you've given it a good attempt
# (this is equally good practice for troubleshooting your problems)
# but the solution is available in STAMPS_Intro2R_OptionB_solutions.R
###############################################
#
# loops
#
###############################################
# R is a vectorised language, so for a function that takes 1 argument,
# you can apply across a vector
abundances[, 1] + 5
exp(abundances[, 1])
# what about for more complex functions?
# In Option A we calculated the relative abundance of the taxa
# using scale()
# This was a good way to do it, are we going to try to do the
# same thing with a loop
relative_abundances <- abundances # create a matrix of the same size as abundances
for (current_index in 1:(dim(abundances)[2])) { # cycling through the columns...
number_reads <- sum(abundances[, current_index]) # find the column total
relative_abundance <- abundances[, current_index]/number_reads # use it to find relative abundance
relative_abundances[, current_index] <- relative_abundance # save the current column into our new matrix
}
# we can track that this matches our original calculation with
all(relative_abundances == scale(abundances, center = F, scale = colSums(abundances)))
# what does all() do?
?all
# Exercise: use a for loop to find which genus was observed most frequently
# when combining all samples
###############################################
#
# The apply family
#
###############################################
# Unfortunately, for loops, while intuitive and broadly useful,
# are relatively slow in R
# apply() is a much better way of doing this!
fast_total_reads <- apply(abundances, 2, sum)
# what does the second argument 2 do? Compare to second argument of 1
# Find out with ?apply
# sweep() is related to apply()
sweep(abundances, 2, fast_total_reads, "/")
# "go down the columns of abundances, applying "/" (division) to each column,
# with the second argument of the division being the element of
# fast_total_reads corresponding to the column number of abundances
all(sweep(abundances, 2, fast_total_reads, "/") == relative_abundances)
# careful! dividing a matrix by a vector divides across the rows,
# not the columns!
all(abundances/fast_total_reads == relative_abundances)
# mapply(), tapply(), lapply() are all variations on apply for different
# situations
# In writing this tutorial, I just learnt about prop.table,
# which turns a matrix into its relative abundances across a fixed margin.
# Cool!
all(prop.table(as.matrix(abundances), margin = 2) == relative_abundances)
###############################################
#
# Lists
#
###############################################
# Lists are great! Lists allow you to combine many different types of data
# of many different sizes into groups
# We're going to make a list corresponding to all of the information
# about the first sample
sample1 <- list()
sample1$name <- covariates$SampleName[1]
sample1$season <- covariates$Season[1]
sample1$place <- covariates$Location[1]
sample1
sample1$counts <- abundances[,1]
# to see all the information in a list, go
sample1
# and to see a particular element of the list,
# place the name of that element after the sign $
sample1$name
# you can also make a list of lists!
all_samples <- list()
all_samples$sample1 <- sample1
all_samples$sample2 <- list()
all_samples$sample2$name <- covariates$SampleName[2]
all_samples
# Exercise: Use a loop to loop through all of the samples,
# creating a list of the information corresponding to that sample.
# (name, location, month, relative abundance table, abundance table)
# Hint:
all_samples[[1]]
all_samples[[2]]
# Double brackets can also be used to refer to the elements of a list,
# instead of naming them individually
names(all_samples)
# Bonus: do this with a apply loop (maybe after finishing the function
# writing section)
###############################################
#
# Function writing
#
###############################################
# writing functions is easy in R!
my_first_function <- function() {
cat("Hello World!")
}
my_first_function()
# functions can have no arguments, or several arguments:
my_second_function <- function(times) {
counter <- 1
while (counter <= times) { # this is a different type of loop
cat("Hello World!\n")
counter <- counter + 1
}
}
my_second_function(3)
# If you are dealing with one OTU table, turning it into
# relative abundance table is pretty easy. But what if you have
# to do it for many OTU tables?
# Exercise: Write a function that takes in an OTU table
# and returns the relative abundance table
# Note that you can construct functions inline:
all(apply(abundances, 2, function(x) x/sum(x)) == relative_abundances)
# Save your work!
# Congratulations! You are at a level with your R understanding
# that will get you through most of STAMPS! If you feel so inclined,
# have a look through Option C, but at this point, you probably deserve
# a cold beverage and a break!