Skip to content

Commit

Permalink
Added batch-correction and data details to worked metabo example md
Browse files Browse the repository at this point in the history
  • Loading branch information
mwalzer committed Sep 17, 2024
1 parent 2ca8aed commit 443855e
Show file tree
Hide file tree
Showing 2 changed files with 260 additions and 4 deletions.
249 changes: 249 additions & 0 deletions docs/pages/worked-examples/demo_set3only.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
library(BatchCorrMetabolomics)
data(BC)

set.3.lod <- min(set.3[!is.na(set.3)])
minBatchOccurrence.Ave <- 2
minBatchOccurrence.Line <- 4


TOF.ngenotypes <- nlevels(set.3.Y$SCode)-1 ## take away the ref samples
TOF.nNA <- apply(set.3, 2, function(x) sum(is.na(x)))
TOF.nref <- sum(set.3.Y$SCode == "ref")


conditions <- c("", "0", "1", "2", "c")
experiments <- c(t(outer(c("Q", "S"), conditions, paste, sep = "")))
methods <- rep("lm", length(experiments))
methods[grep("c", experiments)] <- "tobit"
imputeValues <- rep(NA, length(experiments))
imputeValues[grep("0", experiments)] <- 0
imputeValues[grep("1", experiments)] <- set.3.lod / 2
imputeValues[grep("2", experiments)] <- set.3.lod
imputeValues[grep("c", experiments)] <- set.3.lod - .01
refSamples <- list("Q" = which(set.3.Y$SCode == "ref"),
"S" = which(set.3.Y$SCode != "ref"))
strategies <- rep(c("Q", "S"), each = length(conditions))

## leave out the censored regressions for A
exp.idx <- (1:length(experiments))[-grep("c", experiments)]
suppressMessages(allResultsAve <-
lapply(exp.idx,
function(ii)
apply(set.3, 2, doBC,
ref.idx = refSamples[[ strategies[[ii]] ]],
batch.idx = set.3.Y$Batch,
minBsamp = minBatchOccurrence.Ave,
correctionFormula = formula("X ~ B"),
seq.idx = set.3.Y$SeqNr,
method = methods[ii],
imputeVal = imputeValues[ii])))
names(allResultsAve) <- experiments[exp.idx]

suppressMessages(allResultsLine <-
lapply(seq(along = experiments),
function(ii)
apply(set.3, 2, doBC,
ref.idx = refSamples[[ strategies[[ii]] ]],
batch.idx = set.3.Y$Batch,
minBsamp = minBatchOccurrence.Line,
seq.idx = set.3.Y$SeqNr,
method = methods[ii],
imputeVal = imputeValues[ii])))
names(allResultsLine) <- experiments

library("RUVSeq")

idx <- which(set.3.Y$SCode == "ref")
replicates.ind <- matrix(-1, nrow(set.3) - length(idx) + 1, length(idx))
replicates.ind[1,] <- idx
replicates.ind[-1,1] <- (1:nrow(set.3))[-idx]

allResultsRUV <-
lapply(0:2,
function(ImpVal) {
huhn <- set.3
huhn[is.na(huhn)] <- ImpVal * set.3.lod / 2
woppa <- t(RUVs(t(huhn),
1:ncol(huhn),
k = 3,
replicates.ind,
round = FALSE, isLog = TRUE)$normalizedCounts)
woppa[is.na(set.3)] <- NA
woppa
})


# figure 2

par(mfrow = c(1,1))
huhnPCA <- evaluateCorrection(set.3, set.3.Y, what = "PCA",
plot = TRUE, legend.loc = "bottomright")
title(main = paste("Uncorrected Interbatch Distance:", round(huhnPCA, 3)))

huhnPCA.A <- evaluateCorrection(allResultsAve[["Q"]], set.3.Y, what = "PCA",
plot = TRUE, legend.loc = "bottomright")
title(main = paste("Corrected Interbatch Distance:", round(huhnPCA.A, 3)))

# huhnPCA.A <- evaluateCorrection(allResultsAve[["Q0"]], set.3.Y, what = "PCA",
# plot = TRUE, legend.loc = "bottomright")
# title(main = paste("Q0: Interbatch distance:", round(huhnPCA.A, 3)))



results.ave <- results.line <- matrix(NA, length(experiments) + 1, 2)
dimnames(results.line) <- dimnames(results.ave) <-
list(c("No correction", experiments), c("PCA", "duplo"))

results.line[1,1] <- results.ave[1,1] <-
evaluateCorrection(set.3, set.3.Y, what = "PCA", plot = FALSE)
results.line[1,2] <- results.ave[1,2] <-
evaluateCorrection(set.3, set.3.Y, what = "duplo", plot = FALSE)

for (exp in experiments[exp.idx]) {
x <- allResultsAve[[exp]]
woppa <- set.3
woppa[!is.na(x)] <- x[!is.na(x)]
results.ave[exp, 1] <-
evaluateCorrection(woppa, set.3.Y, "PCA", plot = FALSE)
results.ave[exp, 2] <-
evaluateCorrection(woppa, set.3.Y, "duplo", plot = FALSE)
}

for (exp in experiments) {
x <- allResultsLine[[exp]]
woppa <- set.3
woppa[!is.na(x)] <- x[!is.na(x)]
results.line[exp, 1] <-
evaluateCorrection(woppa, set.3.Y, "PCA", plot = FALSE)
results.line[exp, 2] <-
evaluateCorrection(woppa, set.3.Y, "duplo", plot = FALSE)
}

results.ruv <-
cbind(sapply(allResultsRUV,
evaluateCorrection, set.3.Y,
what = "PCA", plot = FALSE),
sapply(allResultsRUV,
evaluateCorrection, set.3.Y,
what = "duplo", plot = FALSE))
dimnames(results.ruv) <- list(paste("R", 0:2, sep = ""),
c("PCA", "duplo"))

# figure 6
floodresults.ave <- rbind(results.ave, results.ruv)
floodresults.ave.label <- factor(substr(rownames(floodresults.ave), 1, 1))
floodresults.line <- rbind(results.line, results.ruv)
floodresults.line.label <- factor(substr(rownames(floodresults.line), 1, 1))
PCA.range <- range(c(floodresults.ave[,1], floodresults.line[,1]),
na.rm = TRUE)
duplo.range <- range(c(floodresults.ave[,2], floodresults.line[,2]),
na.rm = TRUE)
par(mfrow = c(1,1))
plot(floodresults.ave[,1], floodresults.ave[,2],
main = "Data set III - only batch correction",
ylab= "Repeatability", xlab = "Interbatch distance",
xlim = PCA.range, ylim = duplo.range,
col = as.integer(floodresults.ave.label))
text(floodresults.ave[,1], floodresults.ave[,2],
pos = ifelse(floodresults.ave.label == "N", 2, 4),
col = as.integer(floodresults.ave.label),
labels = rownames(floodresults.ave))

plot(floodresults.line[,1], floodresults.line[,2],
main = "Data set III - batch and order correction",
xlim = PCA.range, ylim = duplo.range,
ylab= "Repeatability", xlab = "Interbatch distance",
col = as.integer(floodresults.line.label))
text(floodresults.line[,1], floodresults.line[,2],
pos = ifelse(floodresults.line.label %in% c("N", "Q"), 2, 4),
col = as.integer(floodresults.line.label),
labels = rownames(floodresults.line))


# log peak area mean
library("dplyr")
library("ggplot2")

raw <- data.frame(set.3)
#corrected <- data.frame(allResultsAve[["Q"]])
corrected <- data.frame(allResultsRUV[[0]])
row.names(corrected) <- row.names(raw)

raw[is.na(raw)] <- 0
corrected[is.na(corrected)] <- 0

raw['mean'] <- apply(raw, 1, mean)
corrected['mean'] <- apply(corrected, 1, mean)
raw_plus_meta <- merge(raw, set.3.Y, by="row.names", all=TRUE)
corrected_plus_meta <- merge(corrected, set.3.Y, by="row.names", all=TRUE)

plot_areas_batchwise<- function(df,title) {
gm = mean(df$mean)
ggplot() +
geom_point(data=df, mapping=aes(x=SeqNr, y=mean, color=Batch)) +
xlab("injection order") + ylab("log mean peak area") + ggtitle(title) +
geom_hline(yintercept=gm, linetype="dashed", color = "black") +
stat_smooth(method="lm", formula=y~1, se=FALSE)
}

plot_areas_batchwise(raw_plus_meta,"Raw")
plot_areas_batchwise(corrected_plus_meta,"Corrected")

raw_refs <- raw_plus_meta[ which(raw_plus_meta$SCode=='ref'),]
raw_samples <- raw_plus_meta[ which(raw_plus_meta$SCode!='ref'),]
corrected_refs <- corrected_plus_meta[ which(corrected_plus_meta$SCode=='ref'),]
corrected_samples <- corrected_plus_meta[ which(corrected_plus_meta$SCode!='ref'),]

plot_areas_batchwise(raw_refs,"raw refs")
plot_areas_batchwise(corrected_refs,"corrected refs")
plot_areas_batchwise(raw_samples,"raw samples")
plot_areas_batchwise(corrected_samples,"corrected samples")



raw_plus_meta$Batch <- as.character(raw_plus_meta$Batch)
raw_plus_meta[ which(raw_plus_meta$SCode=='ref'),]$Batch = "QC"
raw_plus_meta$Batch <- as.factor(raw_plus_meta$Batch)

corrected_plus_meta$Batch <- as.character(corrected_plus_meta$Batch)
corrected_plus_meta[ which(corrected_plus_meta$SCode=='ref'),]$Batch = "QC"
corrected_plus_meta$Batch <- as.factor(corrected_plus_meta$Batch)

plot_areas_batchwise(raw_plus_meta,"raw")
plot_areas_batchwise(corrected_plus_meta,"corrected")


getPCA <- function(X, Y, GRAPH)
{
nbatches <- nlevels(Y$Batch)
noref.idx <- which(Y$SCode != "ref")
Xsample <- X[noref.idx,]
YSample <- Y[noref.idx,]

Xsample <- Xsample[, apply(Xsample, 2, function(x) !all(is.na(x)))]

## replace NA values with column means
for (i in 1:ncol(Xsample))
Xsample[is.na(Xsample[,i]),i] <- mean(Xsample[,i], na.rm = TRUE)

Xsample <- Xsample[, apply(Xsample, 2, sd, na.rm = TRUE) > 0]
## Original is using ChemometricsWithR which reports an unintelligible result object, hence switching to something well maintained to get the PCA scores
X.PCA <- FactoMineR::PCA(Xsample, scale.unit = TRUE, graph=GRAPH)

return (X.PCA)

}



set3.uncorrected.PCA <- getPCA(set.3, set.3.Y,GRAPH = FALSE)
set3.uncorrected.PCAscores <- data.frame(set3.uncorrected.PCA$ind$coord)
set3.uncorrected.PCAscores <- merge(set3.uncorrected.PCAscores, set.3.Y, by="row.names", all=TRUE)
write.csv(set3.uncorrected.PCAscores,"set3.uncorrected.PCA.csv", row.names = TRUE)

set3.corrected.PCA <- getPCA(corrected, set.3.Y,GRAPH = FALSE)
set3.corrected.PCAscores <- data.frame(set3.corrected.PCA$ind$coord)
set3.corrected.PCAscores <- merge(set3.corrected.PCAscores, set.3.Y, by="row.names", all=TRUE)
write.csv(set3.corrected.PCAscores,"set3.corrected.PCA.csv", row.names = TRUE)


15 changes: 11 additions & 4 deletions docs/pages/worked-examples/metabo-batches.mzQC.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,19 @@ permalink: /examples/metabo-batches/
Here, we describe details of a metabolomics mzQC JSON document used to describe a Studies' quality before and after batch correction methods are applied.
For description of the general structure of mzQC, see the Single-Run Example of mzQC.
Find the complete file at the bottom of this document or in the example folder.
The mzQC file is made from the acquisions of GC-ToF-MS polar metabolite data of an Arabidopsis nucleotype-plasmotype diallel study as described in [Improved batch correction in untargeted MS-based metabolomics](https://dx.doi.org/10.1007%2Fs11306-016-1015-8).
The mzQC file is made from the acquisions of GC-ToF-MS polar metabolite data of an Arabidopsis nucleotype-plasmotype diallel study as described in [Improved batch correction in untargeted MS-based metabolomics](https://dx.doi.org/10.1007%2Fs11306-016-1015-8) by Wehrens, et al.(2016).

```
"description": "This is a metabolomics batch-correction use-case study for mzQC using the method described in 'Improved batch correction in untargeted MS-based metabolomics' by Wehrens, et al.(2016) [https://doi.org/10.1007%2Fs11306-016-1015-8]. The data comes from set3 of the supplementary material.",
"description": "This is a metabolomics batch-correction use-case study for mzQC using the method described in 'Improved batch correction in untargeted MS-based metabolomics' by Wehrens, et al.(2016) [https://doi.org/10.1007%2Fs11306-016-1015-8]. The data comes from set3 of the supplementary material.",
```
## Data
This example has 240 runs, for which each a `qualityMetric` object is present in `qualityMetrics`, for sake of brevity only containing the number of successfully identified primary metabolites.
The data is taken from *set3* of the publication only, extracted from `BC.RData` and stored as `set3.peakarea.csv` and `set3.uncorrected.PCA.csv` (same data).
All data from [Wehrens, et al.(2016) can be found on GitHub](https://github.com/rwehrens/BatchCorrMetabolomics).
## Batch correction
Batch correction was applied as described in the supplementary with BatchCorrMetabolomics (see github repo) scripted in [`demo_set3only.R`](demo_set3only.R). Output is `set3.corrected.PCA.csv`.
## mzQC
From the above information and data, we can formulate a mzQC file.
```
{
"metadata": {
Expand Down Expand Up @@ -61,7 +68,7 @@ This example has 240 runs, for which each a `qualityMetric` object is present in
},
...
```
To be able to estimate the effect of the batch correction, we first describe all runs that are batch corrected by a principle component analysis of their peak area features. Hence, we use a `setQuality` composed of all involved `inputFiles`, and add related metrics in the qualityMetric section.
To be able to estimate the effect of the batch correction, we first describe all runs that are batch corrected by a principal component analysis of their peak area features. Hence, we use a `setQuality` composed of all involved `inputFiles`, and add related metrics in the qualityMetric section.
```
{
"accession":"MS:4000092",
Expand Down Expand Up @@ -106,4 +113,4 @@ before | after


### This is the mzQC file once again, in full:
**[metabo-batches.mzQC](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/metabo-batches.mzQC)**
**[metabo-batches.mzQC](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/metabo-batches.mzQC)**

0 comments on commit 443855e

Please sign in to comment.