Skip to content

Commit

Permalink
Major updates: modify ancombc2 variance formula and sensitivity analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
FrederickHuangLin committed Jul 6, 2023
1 parent 92b41f2 commit e3dc16d
Show file tree
Hide file tree
Showing 27 changed files with 1,472 additions and 1,228 deletions.
16 changes: 8 additions & 8 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Package: ANCOMBC
Type: Package
Title: Microbiome differential abudance and correlation analyses with bias
correction
Version: 2.2.0
Version: 2.2.1
Description: ANCOMBC is a package containing differential abundance (DA) and
correlation analyses for microbiome data. Specifically, the package includes
Analysis of Compositions of Microbiomes with Bias Correction 2 (ANCOM-BC2),
Expand All @@ -14,7 +14,7 @@ Description: ANCOMBC is a package containing differential abundance (DA) and
sequencing efficiencies (taxon-specific biases). Methodologies included in
the ANCOMBC package are designed to correct these biases and construct
statistically consistent estimators.
Date: 2023-04-18
Date: 2023-07-05
Authors@R: c(
person(given = "Huang",
family = "Lin",
Expand All @@ -23,12 +23,12 @@ Authors@R: c(
comment = c(ORCID = "0000-0002-4892-7871")))
License: Artistic-2.0
Imports: mia, stats,
CVXR, DescTools, Hmisc, MASS, Rdpack, S4Vectors, SingleCellExperiment,
SummarizedExperiment, TreeSummarizedExperiment, doParallel, doRNG, dplyr,
emmeans, energy, foreach, lme4, lmerTest, magrittr, nloptr, parallel, rlang,
rngtools, tibble, tidyr, utils
Suggests: knitr, rmarkdown, testthat,
DT, caret, microbiome, tidyverse
CVXR, DescTools, Hmisc, MASS, Matrix, Rdpack, S4Vectors,
SingleCellExperiment, SummarizedExperiment, TreeSummarizedExperiment,
doParallel, doRNG, energy, foreach, gtools, lme4, lmerTest, multcomp,
nloptr, parallel, utils
Suggests: dplyr, knitr, rmarkdown, testthat,
DT, tidyr, tidyverse, microbiome, magrittr
biocViews:
DifferentialExpression,
Microbiome,
Expand Down
19 changes: 5 additions & 14 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ export(secom_dist)
export(secom_linear)
export(sim_plnm)
import(stats, except = filter)
import(stats, except = filter)

importFrom(CVXR,Minimize)
importFrom(CVXR,Problem)
importFrom(CVXR,Variable)
Expand All @@ -17,6 +15,7 @@ importFrom(CVXR,solve)
importFrom(DescTools,Winsorize)
importFrom(Hmisc,rcorr)
importFrom(MASS,ginv)
importFrom(Matrix,nearPD)
importFrom(Rdpack,reprompt)
importFrom(S4Vectors,DataFrame)
importFrom(S4Vectors,SimpleList)
Expand All @@ -27,30 +26,22 @@ importFrom(SummarizedExperiment,rowData)
importFrom(TreeSummarizedExperiment,TreeSummarizedExperiment)
importFrom(doParallel,registerDoParallel)
importFrom(doRNG,"%dorng%")
importFrom(dplyr,bind_rows)
importFrom(dplyr,filter)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,right_join)
importFrom(emmeans,contrast)
importFrom(emmeans,emmeans)
importFrom(energy,dcor)
importFrom(energy,dcor.test)
importFrom(foreach,"%:%")
importFrom(foreach,"%dopar%")
importFrom(foreach,foreach)
importFrom(foreach,registerDoSEQ)
importFrom(gtools,smartbind)
importFrom(lme4,lmerControl)
importFrom(lmerTest,lmer)
importFrom(magrittr,"%>%")
importFrom(mia,agglomerateByRank)
importFrom(mia,makeTreeSummarizedExperimentFromPhyloseq)
importFrom(mia,taxonomyRanks)
importFrom(multcomp,adjusted)
importFrom(multcomp,glht)
importFrom(multcomp,mcp)
importFrom(nloptr,neldermead)
importFrom(parallel,makeCluster)
importFrom(parallel,stopCluster)
importFrom(rlang,.data)
importFrom(rngtools,setRNG)
importFrom(tibble,rownames_to_column)
importFrom(tidyr,pivot_longer)
importFrom(utils,combn)
3 changes: 3 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
version 2.2.xx (last update 2023-07-05)
+ Modify ancombc2 function and the sensitivity analysis

version 2.0.xx (last update 2022-12-17)
+ Add ancombc2 function

Expand Down
55 changes: 28 additions & 27 deletions R/ancom.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#' In this example, taxon A is declared to be differentially abundant between
#' g1 and g2, g1 and g3, and consequently, it is globally differentially
#' abundant with respect to this group variable.
#' Such taxa are not further analyzed using ANCOM-BC, but the results are
#' Such taxa are not further analyzed using ANCOM, but the results are
#' summarized in the overall summary. For more details about the structural
#' zeros, please go to the
#' \href{https://doi.org/10.3389/fmicb.2017.02114}{ANCOM-II} paper.
Expand All @@ -26,16 +26,15 @@
#' recommended to set \code{neg_lb = TRUE} when the sample size per group is
#' relatively large (e.g. > 30).
#'
#' @param data the input data. A
#' \code{phyloseq}, \code{SummarizedExperiment}, or
#' \code{TreeSummarizedExperiment} object, which consists of
#' a feature table (microbial count table), a sample metadata, a
#' taxonomy table (optional), and a phylogenetic tree (optional). The row names
#' of the metadata must match the sample names of the feature table, and the
#' row names of the taxonomy table must match the taxon (feature) names of the
#' feature table. See \code{?phyloseq::phyloseq},
#' \code{?SummarizedExperiment::SummarizedExperiment}, or
#' \code{?TreeSummarizedExperiment::TreeSummarizedExperiment} for more details.
#' @param data the input data. The \code{data} parameter should be either a
#' \code{phyloseq} or a \code{TreeSummarizedExperiment} object, which
#' consists of a feature table (microbial count table), a sample metadata table,
#' a taxonomy table (optional), and a phylogenetic tree (optional).
#' Ensure that the row names of the metadata table match the sample names in the
#' feature table, and the row names of the taxonomy table match the taxon
#' (feature) names in the feature table. For detailed information, refer to
#' \code{?phyloseq::phyloseq} or
#' \code{?TreeSummarizedExperiment::TreeSummarizedExperiment}.
#' @param assay_name character. Name of the count table in the data object
#' (only applicable if data object is a \code{(Tree)SummarizedExperiment}).
#' Default is "counts".
Expand All @@ -50,15 +49,18 @@
#' Options include "holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
#' "fdr", "none". See \code{?stats::p.adjust} for more details.
#' @param prv_cut a numerical fraction between 0 and 1. Taxa with prevalences
#' less than \code{prv_cut} will be excluded in the analysis. For instance,
#' suppose there are 100 samples, if a taxon has nonzero counts presented in
#' less than 10 samples, it will not be further analyzed. Default is 0.10.
#' (the proportion of samples in which the taxon is present)
#' less than \code{prv_cut} will be excluded in the analysis. For example,
#' if there are 100 samples, and a taxon has nonzero counts present in less than
#' 100*prv_cut samples, it will not be considered in the analysis.
#' Default is 0.10.
#' @param lib_cut a numerical threshold for filtering samples based on library
#' sizes. Samples with library sizes less than \code{lib_cut} will be
#' excluded in the analysis. Default is 0, i.e. do not discard any sample.
#' @param main_var character. The name of the main variable of interest.
#' @param adj_formula character string representing the formula for
#' covariate adjustment. Default is \code{NULL}.
#' covariate adjustment. Please note that you should NOT include the
#' \code{main_var} in the formula. Default is \code{NULL}.
#' @param rand_formula the character string expresses how the microbial absolute
#' abundances for each taxon depend on the random effects in metadata. ANCOM
#' follows the \code{lmerTest} package in formulating the random effects. See
Expand Down Expand Up @@ -152,12 +154,9 @@
#' @importFrom S4Vectors DataFrame SimpleList
#' @importFrom lmerTest lmer
#' @importFrom lme4 lmerControl
#' @importFrom dplyr mutate
#' @importFrom parallel makeCluster stopCluster
#' @importFrom foreach foreach %dopar% registerDoSEQ
#' @importFrom doParallel registerDoParallel
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @importFrom Rdpack reprompt
#'
#' @export
Expand All @@ -166,6 +165,9 @@ ancom = function(data = NULL, assay_name = "counts", tax_level = NULL,
lib_cut = 0, main_var, adj_formula = NULL, rand_formula = NULL,
lme_control = lme4::lmerControl(), struc_zero = FALSE,
neg_lb = FALSE, alpha = 0.05, n_cl = 1){
message("'ancom' has been fully evolved to 'ancombc2'. \n",
"Explore the enhanced capabilities of our refined method!")

# 1. Data pre-processing
# TSE data construction
tse_obj = tse_construct(data = data, assay_name = assay_name,
Expand Down Expand Up @@ -293,14 +295,14 @@ ancom = function(data = NULL, assay_name = "counts", tax_level = NULL,
}

idx1 = NULL

if (n_cl > 1) {
cl = parallel::makeCluster(n_cl)
doParallel::registerDoParallel(cl)
} else {
foreach::registerDoSEQ()
}

if (main_cat == 0) {
result = foreach(idx1 = seq_len(n_tax), .combine = comb, .multicombine = TRUE) %dopar% {
alr_data = apply(comp_table, 1, function(x) x - comp_table[idx1, ])
Expand Down Expand Up @@ -385,7 +387,7 @@ ancom = function(data = NULL, assay_name = "counts", tax_level = NULL,
list(p_vec, beta_vec)
}
}

if (n_cl > 1) {
parallel::stopCluster(cl)
}
Expand All @@ -405,13 +407,12 @@ ancom = function(data = NULL, assay_name = "counts", tax_level = NULL,

# 5. Primary results
q_data = apply(p_data, 2, function(x) p.adjust(x, method = p_adj_method))
W = apply(q_data, 2, function(x) sum(x < alpha))
W = apply(q_data, 2, function(x) sum(x <= alpha))
res_comp = data.frame(taxon = taxon_id, W, row.names = NULL, check.names = FALSE)
res_comp = res_comp %>%
mutate(detected_0.9 = ifelse(.data$W > 0.9 * (n_tax - 1), TRUE, FALSE),
detected_0.8 = ifelse(.data$W > 0.8 * (n_tax - 1), TRUE, FALSE),
detected_0.7 = ifelse(.data$W > 0.7 * (n_tax - 1), TRUE, FALSE),
detected_0.6 = ifelse(.data$W > 0.6 * (n_tax - 1), TRUE, FALSE))
res_comp$detected_0.9 = ifelse(res_comp$W > 0.9 * (n_tax - 1), TRUE, FALSE)
res_comp$detected_0.8 = ifelse(res_comp$W > 0.8 * (n_tax - 1), TRUE, FALSE)
res_comp$detected_0.7 = ifelse(res_comp$W > 0.7 * (n_tax - 1), TRUE, FALSE)
res_comp$detected_0.6 = ifelse(res_comp$W > 0.6 * (n_tax - 1), TRUE, FALSE)

# Combine the information of structural zeros
if (struc_zero){
Expand Down
68 changes: 38 additions & 30 deletions R/ancombc.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,15 @@
#' recommended to set \code{neg_lb = TRUE} when the sample size per group is
#' relatively large (e.g. > 30).
#'
#' @param data the input data. A
#' \code{phyloseq}, \code{SummarizedExperiment}, or
#' \code{TreeSummarizedExperiment} object, which consists of
#' a feature table (microbial count table), a sample metadata, a
#' taxonomy table (optional), and a phylogenetic tree (optional). The row names
#' of the metadata must match the sample names of the feature table, and the
#' row names of the taxonomy table must match the taxon (feature) names of the
#' feature table. See \code{?phyloseq::phyloseq},
#' \code{?SummarizedExperiment::SummarizedExperiment}, or
#' \code{?TreeSummarizedExperiment::TreeSummarizedExperiment} for more details.
#' @param data the input data. The \code{data} parameter should be either a
#' \code{phyloseq} or a \code{TreeSummarizedExperiment} object, which
#' consists of a feature table (microbial count table), a sample metadata table,
#' a taxonomy table (optional), and a phylogenetic tree (optional).
#' Ensure that the row names of the metadata table match the sample names in the
#' feature table, and the row names of the taxonomy table match the taxon
#' (feature) names in the feature table. For detailed information, refer to
#' \code{?phyloseq::phyloseq} or
#' \code{?TreeSummarizedExperiment::TreeSummarizedExperiment}.
#' @param assay_name character. Name of the count table in the data object
#' (only applicable if data object is a \code{(Tree)SummarizedExperiment}).
#' Default is "counts".
Expand All @@ -49,22 +48,30 @@
#' input \code{data}.
#' @param phyloseq a \code{phyloseq} object. Will be deprecated.
#' @param formula the character string expresses how microbial absolute
#' abundances for each taxon depend on the variables in metadata.
#' abundances for each taxon depend on the variables in metadata. When
#' specifying the \code{formula}, make sure to include the \code{group} variable
#' in the formula if it is not NULL.
#' @param p_adj_method character. method to adjust p-values. Default is "holm".
#' Options include "holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
#' "fdr", "none". See \code{?stats::p.adjust} for more details.
#' @param prv_cut a numerical fraction between 0 and 1. Taxa with prevalences
#' less than \code{prv_cut} will be excluded in the analysis. For instance,
#' suppose there are 100 samples, if a taxon has nonzero counts presented in
#' less than 10 samples, it will not be further analyzed. Default is 0.10.
#' (the proportion of samples in which the taxon is present)
#' less than \code{prv_cut} will be excluded in the analysis. For example,
#' if there are 100 samples, and a taxon has nonzero counts present in less than
#' 100*prv_cut samples, it will not be considered in the analysis.
#' Default is 0.10.
#' @param lib_cut a numerical threshold for filtering samples based on library
#' sizes. Samples with library sizes less than \code{lib_cut} will be
#' excluded in the analysis. Default is 0, i.e. do not discard any sample.
#' @param group character. the name of the group variable in metadata.
#' \code{group} should be discrete. Specifying \code{group} is required for
#' detecting structural zeros and performing global test.
#' Default is NULL. If the \code{group} of interest contains only two
#' categories, leave it as NULL.
#' The \code{group} parameter should be a character string representing the name
#' of the group variable in the metadata. The \code{group} variable should be
#' discrete, meaning it consists of categorical values. Specifying the
#' \code{group} variable is required if you are interested in detecting
#' structural zeros and performing global tests. However, if these analyses are
#' not of interest to you, you can leave the \code{group} parameter as NULL.
#' If the \code{group} variable of interest contains only two categories, you
#' can also leave the \code{group} parameter as NULL. Default is NULL.
#' @param struc_zero logical. whether to detect structural zeros based on
#' \code{group}. Default is FALSE.
#' @param neg_lb logical. whether to classify a taxon as a structural zero using
Expand Down Expand Up @@ -224,8 +231,9 @@ ancombc = function(data = NULL, assay_name = "counts",
neg_lb = FALSE, tol = 1e-05, max_iter = 100,
conserve = FALSE, alpha = 0.05, global = FALSE,
n_cl = 1, verbose = FALSE){
message("'ancombc' is deprecated \n", "Use 'ancombc2' instead")

message("'ancombc' has been fully evolved to 'ancombc2'. \n",
"Explore the enhanced capabilities of our refined method!")

if (n_cl > 1) {
cl = parallel::makeCluster(n_cl)
doParallel::registerDoParallel(cl)
Expand Down Expand Up @@ -294,7 +302,7 @@ ancombc = function(data = NULL, assay_name = "counts",

para = iter_mle(x = x, y = y, meta_data = meta_data,
formula = formula, theta = NULL, tol = tol,
max_iter = max_iter, verbose = FALSE, type = "ancombc")
max_iter = max_iter, verbose = FALSE)
beta = para$beta
vcov_hat = para$vcov_hat
var_hat = para$var_hat
Expand Down Expand Up @@ -344,7 +352,7 @@ ancombc = function(data = NULL, assay_name = "counts",
W = beta_hat/se_hat
p = 2 * pnorm(abs(W), mean = 0, sd = 1, lower.tail = FALSE)
q = apply(p, 2, function(x) p.adjust(x, method = p_adj_method))
diff_abn = q < alpha & !is.na(q)
diff_abn = q <= alpha & !is.na(q)

beta_prim = cbind(taxon = data.frame(taxon = tax_name),
data.frame(beta_hat, check.names = FALSE,
Expand Down Expand Up @@ -377,11 +385,11 @@ ancombc = function(data = NULL, assay_name = "counts",
if (verbose) {
message("ANCOM-BC global test ...")
}
res_global = ancombc_global(x = x, group = group,
beta_hat = beta_hat,
vcov_hat = vcov_hat,
p_adj_method = p_adj_method,
alpha = alpha)
res_global = ancombc_global_F(x = x, group = group,
beta_hat = beta_hat,
vcov_hat = vcov_hat,
p_adj_method = p_adj_method,
alpha = alpha)
} else { res_global = NULL }

# 8. Combine the information of structural zeros
Expand All @@ -400,7 +408,7 @@ ancombc = function(data = NULL, assay_name = "counts",
res$se[, group_ind] = res$se[, group_ind] * zero_mask
res$p_val[, group_ind] = res$p_val[, group_ind] * zero_mask
res$q_val[, group_ind] = res$q_val[, group_ind] * zero_mask
res$diff_abn[, group_ind] = data.frame(res$q_val[, group_ind] < alpha &
res$diff_abn[, group_ind] = data.frame(res$q_val[, group_ind] <= alpha &
!is.na(res$q_val[, group_ind]),
check.names = FALSE)

Expand All @@ -410,7 +418,7 @@ ancombc = function(data = NULL, assay_name = "counts",
sum(x) > 0 & sum(x) < ncol(zero_idx))
res_global[, "p_val"] = res_global[, "p_val"] * zero_mask
res_global[, "q_val"] = res_global[, "q_val"] * zero_mask
res_global[, "diff_abn"] = res_global[, "q_val"] < alpha &
res_global[, "diff_abn"] = res_global[, "q_val"] <= alpha &
!is.na(res_global[, "q_val"])
}
}
Expand All @@ -423,7 +431,7 @@ ancombc = function(data = NULL, assay_name = "counts",
if (n_cl > 1) {
parallel::stopCluster(cl)
}

return(out)
}

Expand Down
Loading

0 comments on commit e3dc16d

Please sign in to comment.