From 513cce4fa042d3bc684e86c603abbb3bf12f3abc Mon Sep 17 00:00:00 2001 From: stemangiola Date: Sat, 20 Jul 2024 14:05:40 +1000 Subject: [PATCH 1/4] use matrix multiplication --- R/methods_SE.R | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/R/methods_SE.R b/R/methods_SE.R index 89edcddf..8cf96454 100755 --- a/R/methods_SE.R +++ b/R/methods_SE.R @@ -188,17 +188,18 @@ setMethod("tidybulk", "RangedSummarizedExperiment", .tidybulk_se) my_counts_scaled = list( - assays(.data) %>% - as.list() %>% - .[[1]] %>% - multiply_by( - rep(multiplier, rep(nrow(.),length(multiplier))) - ) + assay(.data) %*% + diag(multiplier) + ) %>% setNames(value_scaled) + colnames(my_counts_scaled[[1]]) = assay(.data) |> colnames() + + as.list() %>% + .[[1]]colnames() # Add the assay - assays(.data) = assays(.data) %>% c(my_counts_scaled) + assays(.data, withDimnames=FALSE) = assays(.data) %>% c(my_counts_scaled) .data %>% @@ -313,7 +314,7 @@ setMethod("scale_abundance", as.matrix() if(is.null(target_distribution)) target_distribution = preprocessCore::normalize.quantiles.determine.target(.data_norm) - + .data_norm = .data_norm |> preprocessCore::normalize.quantiles.use.target( From ddff6cbbd0ed52a8b581d9e69ee42f50502934cf Mon Sep 17 00:00:00 2001 From: stemangiola Date: Fri, 2 Aug 2024 23:48:46 +1000 Subject: [PATCH 2/4] add scaling argument to DE --- R/functions_SE.R | 21 ++++++--- R/methods.R | 52 ++++++++++----------- R/methods_SE.R | 2 - man/quantile_normalise_abundance-methods.Rd | 6 +-- man/test_differential_abundance-methods.Rd | 2 +- 5 files changed, 44 insertions(+), 39 deletions(-) diff --git a/R/functions_SE.R b/R/functions_SE.R index 262d1468..8a1693fc 100755 --- a/R/functions_SE.R +++ b/R/functions_SE.R @@ -744,9 +744,9 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, # Replace `:` with ___ because it creates error with edgeR if(design |> colnames() |> str_detect(":") |> any()) { message("tidybulk says: the interaction term `:` has been replaced with `___` in the design matrix, in order to work with edgeR.") - colnames(design) = design |> colnames() |> str_replace(":", "___") + colnames(design) = design |> colnames() |> str_replace(":", "___") } - + # Print the design column names in case I want contrasts message( sprintf( @@ -1070,6 +1070,7 @@ get_differential_transcript_abundance_bulk_voom_SE <- function(.data, #' @param .contrasts A character vector. See edgeR makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest) #' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT) #' @param scaling_method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") +#' @param .scaling_factor A tidyeval (column name) for the precalculated TMM scaling #' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames. #' @param ... Additional arguments for glmmSeq #' @@ -1085,6 +1086,7 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, test_above_log2_fold_change = NULL, scaling_method = "TMM", + .scaling_factor = NULL, omit_contrast_in_colnames = FALSE, prefix = "", .dispersion = NULL, @@ -1092,6 +1094,7 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, .abundance = enquo(.abundance) .dispersion = enquo(.dispersion) + .scaling_factor = enquo(.scaling_factor) # Check if contrasts are of the same form if( @@ -1145,8 +1148,8 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, object = .formula |> lme4::nobars(), data = metadata ) - - if(quo_is_symbolic(.dispersion)) + + if(.dispersion |> quo_is_symbolic()) dispersion = rowData(.data)[,quo_name(.dispersion),drop=FALSE] |> as_tibble(rownames = feature__$name) |> deframe() else dispersion = counts |> edgeR::estimateDisp(design = design) %$% tagwise.dispersion |> setNames(rownames(counts)) @@ -1161,9 +1164,13 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, dispersion = dispersion[rownames(counts)] # Scaling - sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method) - - + if(.scaling_factor |> quo_is_symbolic()) + sizeFactors = .data |> pivot_sample() |> pull(!!.scaling_factor) + else + sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method) + + + glmmSeq_object = glmmSeq( .formula, countdata = counts , diff --git a/R/methods.R b/R/methods.R index 4bf025e5..3283a0c3 100755 --- a/R/methods.R +++ b/R/methods.R @@ -594,14 +594,14 @@ setMethod("scale_abundance", "tidybulk", .scale_abundance) #' @details Tranform the feature abundance across samples so to have the same quantile distribution (using preprocessCore). #' #' Underlying method -#' +#' #' If `limma_normalize_quantiles` is chosen -#' +#' #' .data |>limma::normalizeQuantiles() -#' +#' #' If `preprocesscore_normalize_quantiles_use_target` is chosen -#' -#' .data |> +#' +#' .data |> #' preprocessCore::normalize.quantiles.use.target( #' target = preprocessCore::normalize.quantiles.determine.target(.data) #' ) @@ -638,7 +638,7 @@ setGeneric("quantile_normalise_abundance", function(.data, .abundance = NULL, method = "limma_normalize_quantiles", target_distribution = NULL, - + action = "add") { @@ -695,7 +695,7 @@ setGeneric("quantile_normalise_abundance", function(.data, } if(is.null(target_distribution)) target_distribution = preprocessCore::normalize.quantiles.determine.target(.data_norm) - + .data_norm_quant = .data_norm |> preprocessCore::normalize.quantiles.use.target( @@ -1154,14 +1154,14 @@ setGeneric("reduce_dimensions", function(.data, # adjust top for the max number of features I have if(top > .data |> distinct(!!.feature) |> nrow()){ warning(sprintf( - "tidybulk says: the \"top\" argument %s is higher than the number of features %s", - top, + "tidybulk says: the \"top\" argument %s is higher than the number of features %s", + top, .data |> distinct(!!.feature) |> nrow() )) - + top = min(top, .data |> distinct(!!.feature) |> nrow()) } - + # Validate data frame if(do_validate()) { validation(.data, !!.element, !!.feature, !!.abundance) @@ -2604,14 +2604,14 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' @param significance_threshold DEPRECATED - A real between 0 and 1 (usually 0.05). #' @param fill_missing_values DEPRECATED - A boolean. Whether to fill missing sample/transcript values with the median of the transcript. This is rarely needed. #' @param .contrasts DEPRECATED - This parameter takes the format of the contrast parameter of the method of choice. For edgeR and limma-voom is a character vector. For DESeq2 is a list including a character vector of length three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest) -#' @param ... Further arguments passed to some of the internal functions. Currently, it is needed just for internal debug. +#' @param ... Further arguments passed to some of the internal experimental functions. For example for glmmSeq, it is possible to pass .dispersion, and .scaling_factor column tidyeval to skip the caluclation of dispersion and scaling and use precalculated values. This is helpful is you want to calculate those quantities on many genes and do DE testing on fewer genes. .scaling_factor is the TMM value that can be obtained with tidybulk::scale_abundance. #' #' #' @details This function provides the option to use edgeR \url{https://doi.org/10.1093/bioinformatics/btp616}, limma-voom \url{https://doi.org/10.1186/gb-2014-15-2-r29}, limma_voom_sample_weights \url{https://doi.org/10.1093/nar/gkv412} or DESeq2 \url{https://doi.org/10.1186/s13059-014-0550-8} to perform the testing. #' All methods use raw counts, irrespective of if scale_abundance or adjust_abundance have been calculated, therefore it is essential to add covariates such as batch effects (if applicable) in the formula. #' #' Underlying method for edgeR framework: -#' +#' #' .data |> #' #' # Filter @@ -2638,7 +2638,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' #' #' Underlying method for DESeq2 framework: -#' +#' #' keep_abundant( #' factor_of_interest = !!as.symbol(parse_formula(.formula)[[1]]), #' minimum_counts = minimum_counts, @@ -2657,16 +2657,16 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' counts = #' .data %>% #' assay(my_assay) -#' +#' #' # Create design matrix for dispersion, removing random effects #' design = #' model.matrix( #' object = .formula |> lme4::nobars(), #' data = metadata #' ) -#' +#' #' dispersion = counts |> edgeR::estimateDisp(design = design) %$% tagwise.dispersion |> setNames(rownames(counts)) -#' +#' #' glmmSeq( .formula, #' countdata = counts , #' metadata = metadata |> as.data.frame(), @@ -2674,8 +2674,8 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' progress = TRUE, #' method = method |> str_remove("(?i)^glmmSeq_" ), #' ) -#' -#' +#' +#' #' @return A consistent object (to the input) with additional columns for the statistics from the test (e.g., log fold change, p-value and false discovery rate). #' #' @@ -3980,12 +3980,12 @@ setGeneric("test_gene_rank", function(.data, # DEPRECATION OF reference function if (is_present(.sample) & !is.null(.sample)) { - + # Signal the deprecation to the user deprecate_warn("1.13.2", "tidybulk::test_gene_rank(.sample = )", details = "The argument .sample is now deprecated and not needed anymore.") } - + # Get column names .arrange_desc = enquo(.arrange_desc) .entrez = enquo(.entrez) @@ -4023,14 +4023,14 @@ setGeneric("test_gene_rank", function(.data, .data |> select(!!.entrez, !!.arrange_desc) |> - distinct() |> - + distinct() |> + # Select one entrez - NEEDED? - with_groups(c(!!.entrez,!!.arrange_desc ), slice, 1) |> + with_groups(c(!!.entrez,!!.arrange_desc ), slice, 1) |> - # arrange + # arrange arrange(desc(!!.arrange_desc)) |> - + # Format deframe() |> entrez_rank_to_gsea(species, gene_collections = gene_sets ) |> diff --git a/R/methods_SE.R b/R/methods_SE.R index 8cf96454..59af5326 100755 --- a/R/methods_SE.R +++ b/R/methods_SE.R @@ -195,8 +195,6 @@ setMethod("tidybulk", "RangedSummarizedExperiment", .tidybulk_se) setNames(value_scaled) colnames(my_counts_scaled[[1]]) = assay(.data) |> colnames() - as.list() %>% - .[[1]]colnames() # Add the assay assays(.data, withDimnames=FALSE) = assays(.data) %>% c(my_counts_scaled) diff --git a/man/quantile_normalise_abundance-methods.Rd b/man/quantile_normalise_abundance-methods.Rd index 5c527cd8..d7938677 100644 --- a/man/quantile_normalise_abundance-methods.Rd +++ b/man/quantile_normalise_abundance-methods.Rd @@ -111,10 +111,10 @@ Underlying method If `limma_normalize_quantiles` is chosen .data |>limma::normalizeQuantiles() - + If `preprocesscore_normalize_quantiles_use_target` is chosen - -.data |> + +.data |> preprocessCore::normalize.quantiles.use.target( target = preprocessCore::normalize.quantiles.determine.target(.data) ) diff --git a/man/test_differential_abundance-methods.Rd b/man/test_differential_abundance-methods.Rd index 01472dbe..d8beb5cf 100755 --- a/man/test_differential_abundance-methods.Rd +++ b/man/test_differential_abundance-methods.Rd @@ -149,7 +149,7 @@ test_differential_abundance( \item{action}{A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).} -\item{...}{Further arguments passed to some of the internal functions. Currently, it is needed just for internal debug.} +\item{...}{Further arguments passed to some of the internal experimental functions. For example for glmmSeq, it is possible to pass .dispersion, and .scaling_factor column tidyeval to skip the caluclation of dispersion and scaling and use precalculated values. This is helpful is you want to calculate those quantities on many genes and do DE testing on fewer genes. .scaling_factor is the TMM value that can be obtained with tidybulk::scale_abundance.} \item{significance_threshold}{DEPRECATED - A real between 0 and 1 (usually 0.05).} From f39f763b1c1b3e059c7af7ad67c2e72aa8b579d0 Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Tue, 17 Sep 2024 22:10:40 +0300 Subject: [PATCH 3/4] Update DESCRIPTION --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 34c8a527..bad82c8f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: tidybulk Title: Brings transcriptomics to the tidyverse -Version: 1.17.3 +Version: 1.17.4 Authors@R: c(person("Stefano", "Mangiola", email = "mangiolastefano@gmail.com", role = c("aut", "cre")), person("Maria", "Doyle", email = "Maria.Doyle@petermac.org", From a0bdd3ff739b9f05576d6e9154f736b3f95ed127 Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Wed, 16 Oct 2024 20:18:57 +1030 Subject: [PATCH 4/4] fix tests --- R/methods_SE.R | 2 ++ .../testthat/test-bulk_methods_SummarizedExperiment.R | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/R/methods_SE.R b/R/methods_SE.R index 9a260e2e..d3aeac8d 100755 --- a/R/methods_SE.R +++ b/R/methods_SE.R @@ -2895,6 +2895,8 @@ setMethod("describe_transcript", "RangedSummarizedExperiment", .describe_transcr combination_of_factors_of_NON_interest = # Factors se[1,1, drop=FALSE] |> + colData() |> + as_tibble(rownames = ".sample") |> select(...) |> suppressWarnings() |> colnames() |> diff --git a/tests/testthat/test-bulk_methods_SummarizedExperiment.R b/tests/testthat/test-bulk_methods_SummarizedExperiment.R index 84bc5a8f..3aae6428 100755 --- a/tests/testthat/test-bulk_methods_SummarizedExperiment.R +++ b/tests/testthat/test-bulk_methods_SummarizedExperiment.R @@ -747,7 +747,7 @@ test_that("gene over representation",{ species="Homo sapiens" ) - expect_equal( ncol(res), 10 ) + expect_equal( ncol(res), 13 ) @@ -854,8 +854,8 @@ test_that("Only reduced dimensions UMAP - no object",{ test_that("resolve_complete_confounders_of_non_interest",{ - library(tidySummarizedExperiment) - library(tidybulk) + #library(tidySummarizedExperiment) + library(SummarizedExperiment) # Sample annotations sample_annotations <- data.frame( @@ -890,7 +890,9 @@ test_that("resolve_complete_confounders_of_non_interest",{ se |> resolve_complete_confounders_of_non_interest(A, B, C) |> - distinct(.sample, A, B, C) |> + colData() |> + _[, c("A", "B", "C")] |> + as_tibble(rownames = ".sample") |> expect_identical(expected_tibble )