-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcompile_raw.R
40 lines (32 loc) · 1.36 KB
/
compile_raw.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#
# Step 2: compile sample-specific files
#
# # # # # # # # # # # # # # # # # # # # # # #
# 'treatments.csv' is not available from MG-RAST
treatments <- read.csv("data/treatments.csv", row.names=1)
# 'orders.csv' is generated by 'get_valid_taxa.R'
# We are not interested in Eukaryotic or viral order counts
allOrders <- read.csv("data/orders.csv")
ordersOfInterest <- subset(allOrders, superkingdom != "Eukaryota" & superkingdom != "Viruses")
ordersOfInterest <- ordersOfInterest$rast.name
# Library for rbind.fill
library(plyr)
# For each raw data file:
treatments$reads <- NA
counts <- data.frame()
samples <- rownames(treatments)
for (sample in 1:length(samples)) {
sampleCounts <- data.frame(t(read.table(paste(c("data/organism_order_hits", samples[sample], "tsv"), collapse="."), sep="\t", row.names=1)))
# include only those orders we selected above
sampleCounts <- sampleCounts[,intersect(ordersOfInterest, colnames(sampleCounts))]
counts <- rbind.fill(counts, sampleCounts)
treatments[samples[sample],"reads"] <- sum(sampleCounts[1,])
}
# Clean up count data by replacing NA's with 0's, sort columns alphabeticaly
counts[is.na(counts)] <- 0
counts <- counts[, order(colnames(counts))]
# Combine treatment information with corresponding count information
result <- cbind(treatments, counts)
# Save output to disk
write.csv(result, file="data/order_counts.csv")
q(save="no")