compile_raw.R

#
#	Step 2: compile sample-specific files
#
# # # # # # # # # # # # # # # # # # # # # # #

# 'treatments.csv' is not available from MG-RAST
treatments <- read.csv("data/treatments.csv", row.names=1)

# 'orders.csv' is generated by 'get_valid_taxa.R'
# We are not interested in Eukaryotic or viral order counts
allOrders <- read.csv("data/orders.csv")
ordersOfInterest <- subset(allOrders, superkingdom != "Eukaryota" & superkingdom != "Viruses")
ordersOfInterest <- ordersOfInterest$rast.name

# Library for rbind.fill
library(plyr)

# For each raw data file:
treatments$reads <- NA
counts <- data.frame()
samples <- rownames(treatments)
for (sample in 1:length(samples)) {
	sampleCounts <- data.frame(t(read.table(paste(c("data/organism_order_hits", samples[sample], "tsv"), collapse="."), sep="\t", row.names=1)))
	# include only those orders we selected above
	sampleCounts <- sampleCounts[,intersect(ordersOfInterest, colnames(sampleCounts))]
	counts <- rbind.fill(counts, sampleCounts)

	treatments[samples[sample],"reads"] <- sum(sampleCounts[1,])
}

# Clean up count data by replacing NA's with 0's, sort columns alphabeticaly
counts[is.na(counts)] <- 0
counts <- counts[, order(colnames(counts))]

# Combine treatment information with corresponding count information
result <- cbind(treatments, counts)

# Save output to disk
write.csv(result, file="data/order_counts.csv")
q(save="no")