Major updates: modify ancombc2 variance formula and sensitivity analysis

microbiome · Jul 6, 2023 · e3dc16d · e3dc16d
1 parent 92b41f2
commit e3dc16d
Show file tree

Hide file tree

Showing 27 changed files with 1,472 additions and 1,228 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,7 +2,7 @@ Package: ANCOMBC
 Type: Package
 Title: Microbiome differential abudance and correlation analyses with bias 
     correction
-Version: 2.2.0
+Version: 2.2.1
 Description: ANCOMBC is a package containing differential abundance (DA) and 
     correlation analyses for microbiome data. Specifically, the package includes 
     Analysis of Compositions of Microbiomes with Bias Correction 2 (ANCOM-BC2),
@@ -14,7 +14,7 @@ Description: ANCOMBC is a package containing differential abundance (DA) and
     sequencing efficiencies (taxon-specific biases). Methodologies included in 
     the ANCOMBC package are designed to correct these biases and construct 
     statistically consistent estimators.
-Date: 2023-04-18
+Date: 2023-07-05
 Authors@R: c(
     person(given = "Huang", 
            family = "Lin", 
@@ -23,12 +23,12 @@ Authors@R: c(
            comment = c(ORCID = "0000-0002-4892-7871")))
 License: Artistic-2.0
 Imports: mia, stats, 
-    CVXR, DescTools, Hmisc, MASS, Rdpack, S4Vectors, SingleCellExperiment,
-    SummarizedExperiment, TreeSummarizedExperiment, doParallel, doRNG, dplyr, 
-    emmeans, energy, foreach, lme4, lmerTest, magrittr, nloptr, parallel, rlang, 
-    rngtools, tibble, tidyr, utils
-Suggests: knitr, rmarkdown, testthat, 
-    DT, caret, microbiome, tidyverse
+    CVXR, DescTools, Hmisc, MASS, Matrix, Rdpack, S4Vectors, 
+    SingleCellExperiment, SummarizedExperiment, TreeSummarizedExperiment, 
+    doParallel, doRNG, energy, foreach, gtools, lme4, lmerTest, multcomp, 
+    nloptr, parallel, utils
+Suggests: dplyr, knitr, rmarkdown, testthat, 
+    DT, tidyr, tidyverse, microbiome, magrittr
 biocViews: 
     DifferentialExpression, 
     Microbiome, 

diff --git a/NAMESPACE b/NAMESPACE
@@ -7,8 +7,6 @@ export(secom_dist)
 export(secom_linear)
 export(sim_plnm)
 import(stats, except = filter)
-import(stats, except = filter)
-
 importFrom(CVXR,Minimize)
 importFrom(CVXR,Problem)
 importFrom(CVXR,Variable)
@@ -17,6 +15,7 @@ importFrom(CVXR,solve)
 importFrom(DescTools,Winsorize)
 importFrom(Hmisc,rcorr)
 importFrom(MASS,ginv)
+importFrom(Matrix,nearPD)
 importFrom(Rdpack,reprompt)
 importFrom(S4Vectors,DataFrame)
 importFrom(S4Vectors,SimpleList)
@@ -27,30 +26,22 @@ importFrom(SummarizedExperiment,rowData)
 importFrom(TreeSummarizedExperiment,TreeSummarizedExperiment)
 importFrom(doParallel,registerDoParallel)
 importFrom(doRNG,"%dorng%")
-importFrom(dplyr,bind_rows)
-importFrom(dplyr,filter)
-importFrom(dplyr,left_join)
-importFrom(dplyr,mutate)
-importFrom(dplyr,right_join)
-importFrom(emmeans,contrast)
-importFrom(emmeans,emmeans)
 importFrom(energy,dcor)
 importFrom(energy,dcor.test)
 importFrom(foreach,"%:%")
 importFrom(foreach,"%dopar%")
 importFrom(foreach,foreach)
 importFrom(foreach,registerDoSEQ)
+importFrom(gtools,smartbind)
 importFrom(lme4,lmerControl)
 importFrom(lmerTest,lmer)
-importFrom(magrittr,"%>%")
 importFrom(mia,agglomerateByRank)
 importFrom(mia,makeTreeSummarizedExperimentFromPhyloseq)
 importFrom(mia,taxonomyRanks)
+importFrom(multcomp,adjusted)
+importFrom(multcomp,glht)
+importFrom(multcomp,mcp)
 importFrom(nloptr,neldermead)
 importFrom(parallel,makeCluster)
 importFrom(parallel,stopCluster)
-importFrom(rlang,.data)
-importFrom(rngtools,setRNG)
-importFrom(tibble,rownames_to_column)
-importFrom(tidyr,pivot_longer)
 importFrom(utils,combn)
diff --git a/NEWS b/NEWS
@@ -1,3 +1,6 @@
+version 2.2.xx (last update 2023-07-05)
+        + Modify ancombc2 function and the sensitivity analysis
+
 version 2.0.xx (last update 2022-12-17)
         + Add ancombc2 function
 

diff --git a/R/ancom.R b/R/ancom.R
@@ -14,7 +14,7 @@
 #' In this example, taxon A is declared to be differentially abundant between
 #' g1 and g2, g1 and g3, and consequently, it is globally differentially
 #' abundant with respect to this group variable.
-#' Such taxa are not further analyzed using ANCOM-BC, but the results are
+#' Such taxa are not further analyzed using ANCOM, but the results are
 #' summarized in the overall summary. For more details about the structural
 #' zeros, please go to the
 #' \href{https://doi.org/10.3389/fmicb.2017.02114}{ANCOM-II} paper.
@@ -26,16 +26,15 @@
 #' recommended to set \code{neg_lb = TRUE} when the sample size per group is
 #' relatively large (e.g. > 30).
 #'
-#' @param data the input data. A
-#' \code{phyloseq}, \code{SummarizedExperiment}, or
-#' \code{TreeSummarizedExperiment} object, which consists of
-#' a feature table (microbial count table), a sample metadata, a
-#' taxonomy table (optional), and a phylogenetic tree (optional). The row names
-#' of the metadata must match the sample names of the feature table, and the
-#' row names of the taxonomy table must match the taxon (feature) names of the
-#' feature table. See \code{?phyloseq::phyloseq},
-#' \code{?SummarizedExperiment::SummarizedExperiment}, or
-#' \code{?TreeSummarizedExperiment::TreeSummarizedExperiment} for more details.
+#' @param data the input data. The \code{data} parameter should be either a
+#' \code{phyloseq} or a \code{TreeSummarizedExperiment} object, which
+#' consists of a feature table (microbial count table), a sample metadata table,
+#' a taxonomy table (optional), and a phylogenetic tree (optional).
+#' Ensure that the row names of the metadata table match the sample names in the
+#' feature table, and the row names of the taxonomy table match the taxon
+#' (feature) names in the feature table. For detailed information, refer to
+#' \code{?phyloseq::phyloseq} or
+#' \code{?TreeSummarizedExperiment::TreeSummarizedExperiment}.
 #' @param assay_name character. Name of the count table in the data object
 #' (only applicable if data object is a \code{(Tree)SummarizedExperiment}).
 #' Default is "counts".
@@ -50,15 +49,18 @@
 #' Options include "holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
 #' "fdr", "none". See \code{?stats::p.adjust} for more details.
 #' @param prv_cut a numerical fraction between 0 and 1. Taxa with prevalences
-#' less than \code{prv_cut} will be excluded in the analysis. For instance,
-#' suppose there are 100 samples, if a taxon has nonzero counts presented in
-#' less than 10 samples, it will not be further analyzed. Default is 0.10.
+#' (the proportion of samples in which the taxon is present)
+#' less than \code{prv_cut} will be excluded in the analysis. For example,
+#' if there are 100 samples, and a taxon has nonzero counts present in less than
+#' 100*prv_cut samples, it will not be considered in the analysis.
+#' Default is 0.10.
 #' @param lib_cut a numerical threshold for filtering samples based on library
 #' sizes. Samples with library sizes less than \code{lib_cut} will be
 #' excluded in the analysis. Default is 0, i.e. do not discard any sample.
 #' @param main_var character. The name of the main variable of interest.
 #' @param adj_formula  character string representing the formula for
-#' covariate adjustment. Default is \code{NULL}.
+#' covariate adjustment. Please note that you should NOT include the
+#' \code{main_var} in the formula. Default is \code{NULL}.
 #' @param rand_formula the character string expresses how the microbial absolute
 #' abundances for each taxon depend on the random effects in metadata. ANCOM
 #' follows the \code{lmerTest} package in formulating the random effects. See
@@ -152,12 +154,9 @@
 #' @importFrom S4Vectors DataFrame SimpleList
 #' @importFrom lmerTest lmer
 #' @importFrom lme4 lmerControl
-#' @importFrom dplyr mutate
 #' @importFrom parallel makeCluster stopCluster
 #' @importFrom foreach foreach %dopar% registerDoSEQ
 #' @importFrom doParallel registerDoParallel
-#' @importFrom magrittr %>%
-#' @importFrom rlang .data
 #' @importFrom Rdpack reprompt
 #'
 #' @export
@@ -166,6 +165,9 @@ ancom = function(data = NULL, assay_name = "counts", tax_level = NULL,
                  lib_cut = 0, main_var, adj_formula = NULL, rand_formula = NULL,
                  lme_control = lme4::lmerControl(), struc_zero = FALSE,
                  neg_lb = FALSE, alpha = 0.05, n_cl = 1){
+    message("'ancom' has been fully evolved to 'ancombc2'. \n",
+            "Explore the enhanced capabilities of our refined method!")
+
     # 1. Data pre-processing
     # TSE data construction
     tse_obj = tse_construct(data = data, assay_name = assay_name,
@@ -293,14 +295,14 @@ ancom = function(data = NULL, assay_name = "counts", tax_level = NULL,
     }
 
     idx1 = NULL
-    
+
     if (n_cl > 1) {
       cl = parallel::makeCluster(n_cl)
       doParallel::registerDoParallel(cl)
     } else {
       foreach::registerDoSEQ()
     }
-    
+
     if (main_cat == 0) {
         result = foreach(idx1 = seq_len(n_tax), .combine = comb, .multicombine = TRUE) %dopar% {
             alr_data = apply(comp_table, 1, function(x) x - comp_table[idx1, ])
@@ -385,7 +387,7 @@ ancom = function(data = NULL, assay_name = "counts", tax_level = NULL,
             list(p_vec, beta_vec)
         }
     }
-    
+
     if (n_cl > 1) {
       parallel::stopCluster(cl)
     }
@@ -405,13 +407,12 @@ ancom = function(data = NULL, assay_name = "counts", tax_level = NULL,
 
     # 5. Primary results
     q_data = apply(p_data, 2, function(x) p.adjust(x, method = p_adj_method))
-    W = apply(q_data, 2, function(x) sum(x < alpha))
+    W = apply(q_data, 2, function(x) sum(x <= alpha))
     res_comp = data.frame(taxon = taxon_id, W, row.names = NULL, check.names = FALSE)
-    res_comp = res_comp %>%
-        mutate(detected_0.9 = ifelse(.data$W > 0.9 * (n_tax - 1), TRUE, FALSE),
-               detected_0.8 = ifelse(.data$W > 0.8 * (n_tax - 1), TRUE, FALSE),
-               detected_0.7 = ifelse(.data$W > 0.7 * (n_tax - 1), TRUE, FALSE),
-               detected_0.6 = ifelse(.data$W > 0.6 * (n_tax - 1), TRUE, FALSE))
+    res_comp$detected_0.9 = ifelse(res_comp$W > 0.9 * (n_tax - 1), TRUE, FALSE)
+    res_comp$detected_0.8 = ifelse(res_comp$W > 0.8 * (n_tax - 1), TRUE, FALSE)
+    res_comp$detected_0.7 = ifelse(res_comp$W > 0.7 * (n_tax - 1), TRUE, FALSE)
+    res_comp$detected_0.6 = ifelse(res_comp$W > 0.6 * (n_tax - 1), TRUE, FALSE)
 
     # Combine the information of structural zeros
     if (struc_zero){

diff --git a/R/ancombc.R b/R/ancombc.R
@@ -28,16 +28,15 @@
 #' recommended to set \code{neg_lb = TRUE} when the sample size per group is
 #' relatively large (e.g. > 30).
 #'
-#' @param data the input data. A
-#' \code{phyloseq}, \code{SummarizedExperiment}, or
-#' \code{TreeSummarizedExperiment} object, which consists of
-#' a feature table (microbial count table), a sample metadata, a
-#' taxonomy table (optional), and a phylogenetic tree (optional). The row names
-#' of the metadata must match the sample names of the feature table, and the
-#' row names of the taxonomy table must match the taxon (feature) names of the
-#' feature table. See \code{?phyloseq::phyloseq},
-#' \code{?SummarizedExperiment::SummarizedExperiment}, or
-#' \code{?TreeSummarizedExperiment::TreeSummarizedExperiment} for more details.
+#' @param data the input data. The \code{data} parameter should be either a
+#' \code{phyloseq} or a \code{TreeSummarizedExperiment} object, which
+#' consists of a feature table (microbial count table), a sample metadata table,
+#' a taxonomy table (optional), and a phylogenetic tree (optional).
+#' Ensure that the row names of the metadata table match the sample names in the
+#' feature table, and the row names of the taxonomy table match the taxon
+#' (feature) names in the feature table. For detailed information, refer to
+#' \code{?phyloseq::phyloseq} or
+#' \code{?TreeSummarizedExperiment::TreeSummarizedExperiment}.
 #' @param assay_name character. Name of the count table in the data object
 #' (only applicable if data object is a \code{(Tree)SummarizedExperiment}).
 #' Default is "counts".
@@ -49,22 +48,30 @@
 #' input \code{data}.
 #' @param phyloseq a \code{phyloseq} object. Will be deprecated.
 #' @param formula the character string expresses how microbial absolute
-#' abundances for each taxon depend on the variables in metadata.
+#' abundances for each taxon depend on the variables in metadata. When
+#' specifying the \code{formula}, make sure to include the \code{group} variable
+#' in the formula if it is not NULL.
 #' @param p_adj_method character. method to adjust p-values. Default is "holm".
 #' Options include "holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
 #' "fdr", "none". See \code{?stats::p.adjust} for more details.
 #' @param prv_cut a numerical fraction between 0 and 1. Taxa with prevalences
-#' less than \code{prv_cut} will be excluded in the analysis. For instance,
-#' suppose there are 100 samples, if a taxon has nonzero counts presented in
-#' less than 10 samples, it will not be further analyzed. Default is 0.10.
+#' (the proportion of samples in which the taxon is present)
+#' less than \code{prv_cut} will be excluded in the analysis. For example,
+#' if there are 100 samples, and a taxon has nonzero counts present in less than
+#' 100*prv_cut samples, it will not be considered in the analysis.
+#' Default is 0.10.
 #' @param lib_cut a numerical threshold for filtering samples based on library
 #' sizes. Samples with library sizes less than \code{lib_cut} will be
 #' excluded in the analysis. Default is 0, i.e. do not discard any sample.
 #' @param group character. the name of the group variable in metadata.
-#' \code{group} should be discrete. Specifying \code{group} is required for
-#' detecting structural zeros and performing global test.
-#' Default is NULL. If the \code{group} of interest contains only two
-#' categories, leave it as NULL.
+#' The \code{group} parameter should be a character string representing the name
+#' of the group variable in the metadata. The \code{group} variable should be
+#' discrete, meaning it consists of categorical values. Specifying the
+#' \code{group} variable is required if you are interested in detecting
+#' structural zeros and performing global tests. However, if these analyses are
+#' not of interest to you, you can leave the \code{group} parameter as NULL.
+#' If the \code{group} variable of interest contains only two categories, you
+#' can also leave the \code{group} parameter as NULL. Default is NULL.
 #' @param struc_zero logical. whether to detect structural zeros based on
 #' \code{group}. Default is FALSE.
 #' @param neg_lb logical. whether to classify a taxon as a structural zero using
@@ -224,8 +231,9 @@ ancombc = function(data = NULL, assay_name = "counts",
                    neg_lb = FALSE, tol = 1e-05, max_iter = 100,
                    conserve = FALSE, alpha = 0.05, global = FALSE,
                    n_cl = 1, verbose = FALSE){
-    message("'ancombc' is deprecated \n", "Use 'ancombc2' instead")
-
+    message("'ancombc' has been fully evolved to 'ancombc2'. \n",
+            "Explore the enhanced capabilities of our refined method!")
+
     if (n_cl > 1) {
       cl = parallel::makeCluster(n_cl)
       doParallel::registerDoParallel(cl)
@@ -294,7 +302,7 @@ ancombc = function(data = NULL, assay_name = "counts",
 
     para = iter_mle(x = x, y = y, meta_data = meta_data,
                     formula = formula, theta = NULL, tol = tol,
-                    max_iter = max_iter, verbose = FALSE, type = "ancombc")
+                    max_iter = max_iter, verbose = FALSE)
     beta = para$beta
     vcov_hat = para$vcov_hat
     var_hat = para$var_hat
@@ -344,7 +352,7 @@ ancombc = function(data = NULL, assay_name = "counts",
     W = beta_hat/se_hat
     p = 2 * pnorm(abs(W), mean = 0, sd = 1, lower.tail = FALSE)
     q = apply(p, 2, function(x) p.adjust(x, method = p_adj_method))
-    diff_abn = q < alpha & !is.na(q)
+    diff_abn = q <= alpha & !is.na(q)
 
     beta_prim = cbind(taxon = data.frame(taxon = tax_name),
                       data.frame(beta_hat, check.names = FALSE,
@@ -377,11 +385,11 @@ ancombc = function(data = NULL, assay_name = "counts",
         if (verbose) {
             message("ANCOM-BC global test ...")
         }
-        res_global = ancombc_global(x = x, group = group,
-                                    beta_hat = beta_hat,
-                                    vcov_hat = vcov_hat,
-                                    p_adj_method = p_adj_method,
-                                    alpha = alpha)
+        res_global = ancombc_global_F(x = x, group = group,
+                                      beta_hat = beta_hat,
+                                      vcov_hat = vcov_hat,
+                                      p_adj_method = p_adj_method,
+                                      alpha = alpha)
     } else { res_global = NULL }
 
     # 8. Combine the information of structural zeros
@@ -400,7 +408,7 @@ ancombc = function(data = NULL, assay_name = "counts",
         res$se[, group_ind] = res$se[, group_ind] * zero_mask
         res$p_val[, group_ind] = res$p_val[, group_ind] * zero_mask
         res$q_val[, group_ind] = res$q_val[, group_ind] * zero_mask
-        res$diff_abn[, group_ind] = data.frame(res$q_val[, group_ind] < alpha &
+        res$diff_abn[, group_ind] = data.frame(res$q_val[, group_ind] <= alpha &
                                                    !is.na(res$q_val[, group_ind]),
                                   check.names = FALSE)
 
@@ -410,7 +418,7 @@ ancombc = function(data = NULL, assay_name = "counts",
                 sum(x) > 0 & sum(x) < ncol(zero_idx))
             res_global[, "p_val"] = res_global[, "p_val"] * zero_mask
             res_global[, "q_val"] = res_global[, "q_val"] * zero_mask
-            res_global[, "diff_abn"] = res_global[, "q_val"] < alpha &
+            res_global[, "diff_abn"] = res_global[, "q_val"] <= alpha &
                 !is.na(res_global[, "q_val"])
         }
     }
@@ -423,7 +431,7 @@ ancombc = function(data = NULL, assay_name = "counts",
     if (n_cl > 1) {
       parallel::stopCluster(cl)
     }
-    
+
     return(out)
 }