From 073ebe65a6af2f4938d0be6dd2e954abbd897c31 Mon Sep 17 00:00:00 2001 From: "Sabanes Bove, Daniel {MDBR~Basel}" Date: Thu, 1 Jul 2021 09:33:51 +0200 Subject: [PATCH] 141: Polish documentation of hermes. (#176) --- DESCRIPTION | 1 + R/HermesData-class.R | 25 +- R/HermesData-methods.R | 77 ++++-- R/HermesData-validate.R | 2 +- R/assertthat.R | 42 ++-- R/calc_cor.R | 32 ++- R/connections.R | 16 +- R/data.R | 16 +- R/differential.R | 28 ++- R/graphs.R | 12 +- R/normalization.R | 220 ++++++++---------- R/pca.R | 27 ++- R/pca_cor_samplevar.R | 48 +++- R/quality.R | 102 ++++---- R/top_genes.R | 17 +- _pkgdown.yml | 1 - man/HermesData-class.Rd | 25 +- man/annotation.Rd | 16 +- man/assertions.Rd | 32 ++- man/calc_cor.Rd | 32 ++- man/calc_pca.Rd | 28 ++- man/cbind.Rd | 11 +- man/connect_biomart.Rd | 2 +- man/control_normalize.Rd | 10 +- man/control_quality.Rd | 6 +- man/diff_expression.Rd | 14 +- man/draw_genes_barplot.Rd | 6 +- man/draw_libsize_densities.Rd | 2 +- man/draw_libsize_qq.Rd | 4 +- man/expression_set.Rd | 6 +- man/filter.Rd | 6 + man/genes.Rd | 3 + man/get_quality_flags.Rd | 40 ---- man/h_cpm.Rd | 27 --- man/h_diff_expr_deseq2.Rd | 6 + man/h_diff_expr_voom.Rd | 6 + man/h_get_annotation_biomart.Rd | 5 +- man/h_has_req_annotations.Rd | 3 + man/h_pca_df_r2_matrix.Rd | 33 ++- man/h_pca_var_rsquared.Rd | 6 + man/h_rpkm.Rd | 30 --- man/h_tpm.Rd | 27 --- man/h_voom.Rd | 27 --- man/metadata.Rd | 2 + man/multi_assay_experiment.Rd | 9 +- man/normalize.Rd | 51 +++- man/pca_cor_samplevar.Rd | 13 +- ...{add_quality_flags.Rd => quality_flags.Rd} | 82 ++++--- man/query.Rd | 9 +- man/rbind.Rd | 16 +- man/samples.Rd | 3 + man/subset.Rd | 14 ++ man/summarized_experiment.Rd | 2 +- man/summary.Rd | 1 + man/top_genes.Rd | 17 +- man/validate.Rd | 2 +- tests/testthat/test-HermesData-methods.R | 2 +- 57 files changed, 728 insertions(+), 574 deletions(-) delete mode 100644 man/get_quality_flags.Rd delete mode 100644 man/h_cpm.Rd delete mode 100644 man/h_rpkm.Rd delete mode 100644 man/h_tpm.Rd delete mode 100644 man/h_voom.Rd rename man/{add_quality_flags.Rd => quality_flags.Rd} (52%) diff --git a/DESCRIPTION b/DESCRIPTION index 3bc5cc8a..3fa6f469 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,6 +46,7 @@ Imports: utils.nest (>= 0.2.9) Suggests: DT, + grid, knitr, test.nest (>= 0.2.9), testthat (>= 2.0), diff --git a/R/HermesData-class.R b/R/HermesData-class.R index 9a08d473..cd7e670c 100644 --- a/R/HermesData-class.R +++ b/R/HermesData-class.R @@ -55,9 +55,11 @@ NULL #' @importFrom S4Vectors setValidity2 #' #' @examples -#' # Convert to `SummarizedExperiment` using the default naive range mapper. -#' se <- makeSummarizedExperimentFromExpressionSet(expression_set) -#' # Then convert to `HermesData`. +#' # Convert an `ExpressionSet` to a `RangedSummarizedExperiment`. +#' ranged_summarized_experiment <- makeSummarizedExperimentFromExpressionSet(expression_set) +#' +#' # Then convert to `RangedHermesData`. +#' HermesData(ranged_summarized_experiment) .HermesData <- setClass( # nolint "HermesData", contains = "SummarizedExperiment", @@ -94,17 +96,15 @@ S4Vectors::setValidity2("AnyHermesData", function(object) { # HermesData-constructors ---- #' @rdname HermesData-class -#' @param object (`SummarizedExperiment`)\cr input to create [`HermesData`] from. +#' @param object (`SummarizedExperiment`)\cr input to create the [`HermesData`] object from. #' If this is a `RangedSummarizedExperiment`, then the result will be #' [`RangedHermesData`]. #' @export #' @examples -#' # Create objects starting from a `SummarizedExperiment.` +#' +#' # Create objects starting from a `SummarizedExperiment`. #' hermes_data <- HermesData(summarized_experiment) #' hermes_data -#' ranged_summarized_experiment <- as(summarized_experiment, "RangedSummarizedExperiment") -#' ranged_hermes_data <- HermesData(ranged_summarized_experiment) -#' ranged_hermes_data HermesData <- function(object) { # nolint assert_that( is_class(object, "SummarizedExperiment"), @@ -143,13 +143,10 @@ HermesData <- function(object) { # nolint #' is passed instead of `rowData`, then the result will be a [`RangedHermesData`] object. #' @export #' @examples -#' # Create objects from a matrix and additional arguments. +#' +#' # Create objects from a matrix. Note that additional arguments are not required but possible. #' counts_matrix <- assay(summarized_experiment) -#' HermesDataFromMatrix( -#' counts = counts_matrix, -#' rowData = rowData(summarized_experiment), -#' colData = colData(summarized_experiment) -#' ) +#' counts_hermes_data <- HermesDataFromMatrix(counts_matrix) HermesDataFromMatrix <- function(counts, ...) { # nolint assert_that(is.matrix(counts)) diff --git a/R/HermesData-methods.R b/R/HermesData-methods.R index 6382433e..c4dda1a1 100644 --- a/R/HermesData-methods.R +++ b/R/HermesData-methods.R @@ -7,12 +7,15 @@ #' This method combines [`AnyHermesData`] objects with the same samples but different #' features of interest (rows in assays). #' -#' @note Note that this just inherits -#' [SummarizedExperiment::rbind,SummarizedExperiment-method()]. When binding a -#' [`AnyHermesData`] object with a [`SummarizedExperiment::SummarizedExperiment`] -#' object, then the result will be a -#' [`SummarizedExperiment::SummarizedExperiment`] object (the more general -#' class). +#' @note +#' - Note that this just inherits +#' [SummarizedExperiment::rbind,SummarizedExperiment-method()]. When binding a +#' [`AnyHermesData`] object with a [`SummarizedExperiment::SummarizedExperiment`] +#' object, then the result will be a +#' [`SummarizedExperiment::SummarizedExperiment`] object (the more general +#' class). +#' - Note that we need to have unique gene IDs (row names) and the same prefix +#' across the combined object. #' #' @name rbind #' @@ -20,14 +23,13 @@ #' #' @return The combined [`AnyHermesData`] object. #' +#' @seealso [`cbind`] to column bind objects. +#' #' @examples -#' a <- HermesData(summarized_experiment[1:2542]) -#' b <- HermesData(summarized_experiment[2543: 5085]) +#' a <- HermesData(summarized_experiment[1:2542, ]) +#' b <- HermesData(summarized_experiment[2543:5085, ]) #' result <- rbind(a, b) #' class(result) -#' -#' result2 <- rbind(summarized_experiment, b) -#' class(result2) NULL # cbind ---- @@ -39,12 +41,14 @@ NULL #' This method combines [`AnyHermesData`] objects with the same ranges but different #' samples (columns in assays). #' -#' @note Note that this just inherits -#' [SummarizedExperiment::cbind,SummarizedExperiment-method()]. When binding a -#' [`AnyHermesData`] object with a [`SummarizedExperiment::SummarizedExperiment`] -#' object, then the result will be a -#' [`SummarizedExperiment::SummarizedExperiment`] object (the more general -#' class). +#' @note +#' - Note that this just inherits +#' [SummarizedExperiment::cbind,SummarizedExperiment-method()]. When binding a +#' [`AnyHermesData`] object with a [`SummarizedExperiment::SummarizedExperiment`] +#' object, then the result will be a +#' [`SummarizedExperiment::SummarizedExperiment`] object (the more general +#' class). +#' - Note that the combined object needs to have unique sample IDs (column names). #' #' @name cbind #' @@ -52,14 +56,13 @@ NULL #' #' @return The combined [`AnyHermesData`] object. #' +#' @seealso [`rbind`] to row bind objects. +#' #' @examples #' a <- HermesData(summarized_experiment[, 1:10]) #' b <- HermesData(summarized_experiment[, 11:20]) #' result <- cbind(a, b) #' class(result) -#' -#' result2 <- cbind(summarized_experiment[, 1:10], b) -#' class(result2) NULL # metadata ---- @@ -75,6 +78,7 @@ NULL #' @name metadata #' #' @param x (`AnyHermesData`)\cr object to access the metadata from. +#' @param value (`list`)\cr the list to replace the current metadata with. #' #' @return The metadata which is a list. #' @importFrom S4Vectors `metadata<-` @@ -100,7 +104,7 @@ NULL #' @rdname annotation #' @aliases annotation #' -#' @param object (`AnyHermesData`)\cr object to access the counts from. +#' @param object (`AnyHermesData`)\cr object to access the annotations from. #' @param ... not used. #' #' @return The [`S4Vectors::DataFrame`] with the gene annotations: @@ -128,7 +132,7 @@ setMethod( ) #' @rdname annotation -#' @note - The returned column names are available in the exported +#' @format The annotation column names are available in the exported #' character vector `.row_data_annotation_cols`. #' @export .row_data_annotation_cols <- c( @@ -142,9 +146,9 @@ setMethod( "ProteinTranscript" ) -#' @param value (`matrix`)\cr what should the counts assay be replaced with. +#' @param value (`DataFrame`)\cr what should the annotations be replaced with. #' -#' @note - When trying to replace the annotation with completely missing values for any genes, +#' @note When trying to replace the annotation with completely missing values for any genes, #' a warning will be given and the corresponding gene IDs will be saved in the #' attribute `annotation.missing.genes`. #' @@ -257,6 +261,8 @@ setGeneric("prefix", def = function(object, ...) { #' #' @return The character vector with the gene IDs. #' +#' @seealso [samples()] to access the sample IDs. +#' #' @export setGeneric("genes", def = function(object) standardGeneric("genes")) @@ -289,6 +295,8 @@ setMethod( #' @rdname samples #' @aliases samples #' +#' @seealso [genes()] to access the gene IDs. +#' #' @importFrom Biobase samples #' @export #' @examples @@ -317,12 +325,25 @@ setMethod( #' @name subset #' #' @param x (`AnyHermesData`)\cr object to subset from. +#' @param subset (`expression`)\cr logical expression based on the `rowData` columns to +#' select genes. +#' @param select (`expression`)\cr logical expression based on the `colData` columns to +#' select samples. +#' #' @return The subsetted [`AnyHermesData`] object. #' #' @examples #' a <- HermesData(summarized_experiment) #' a +#' +#' # Subset both genes and samples. #' subset(a, subset = LowExpressionFlag, select = DISCSTUD == "N") +#' +#' # Subset only genes. +#' subset(a, subset = Chromosome == "2") +#' +#' # Subset only samples. +#'subset(a, select = AGE > 18) NULL # filter ---- @@ -353,6 +374,7 @@ setGeneric("filter", function(object, ...) standardGeneric("filter")) #' @return Named logical vector with one value for each gene in `object`, which is `TRUE` if all #' required annotation columns are filled, and otherwise `FALSE`. #' +#' @seealso [filter()] where this is used internally. #' @export #' #' @examples @@ -392,13 +414,19 @@ h_has_req_annotations <- function(object, #' @examples #' a <- HermesData(summarized_experiment) #' dim(a) +#' #' # Filter genes and samples on default QC flags. #' result <- filter(a) #' dim(result) +#' #' # Filter only genes without low expression. #' result <- filter(a, what = "genes") +#' #' # Filter only samples with low depth and technical failure. #' result <- filter(a, what = "samples") +#' +#' # Filter only genes, and require certain annotations to be present. +#' result <- filter(a, what = "genes", annotation_required = c("StartBP", "EndBP", "WidthBP")) setMethod( f = "filter", signature = signature(object = "AnyHermesData"), @@ -584,6 +612,7 @@ setMethod( #' @export #' #' @examples +#' #' # Just calling the summary method like this will use the `show()` method. #' summary(object) setMethod( diff --git a/R/HermesData-validate.R b/R/HermesData-validate.R index 97e88aac..8a769c52 100644 --- a/R/HermesData-validate.R +++ b/R/HermesData-validate.R @@ -106,7 +106,7 @@ validate_names <- function(object) { } #' @describeIn validate validates that the object prefix is a string -#' without whitespace, special characters or digits. +#' and only contains alphabetic characters. validate_prefix <- function(object) { prefix <- object@prefix msg <- NULL diff --git a/R/assertthat.R b/R/assertthat.R index 6cc8e4a6..2ea17823 100644 --- a/R/assertthat.R +++ b/R/assertthat.R @@ -5,6 +5,8 @@ #' We provide additional assertion functions which can be used together with #' [assertthat::assert_that()]. #' +#' @param x an object to check. +#' #' @name assertions #' @import assertthat NULL @@ -12,47 +14,49 @@ NULL # is_class ---- #' @describeIn assertions checks the class. -#' @param object any object. -#' @param class2 (`character` or class definition)\cr the class to which `object` could belong. +#' @param class2 (`character` or class definition)\cr the class to which `x` could belong. #' @export #' @examples +#' # Assert a general class. #' a <- 5 #' is_class(a, "character") -is_class <- function(object, class2) { - is(object, class2) +is_class <- function(x, class2) { + is(x, class2) } on_failure(is_class) <- function(call, env) { - obj_name <- deparse(call$object) + obj_name <- deparse(call$x) class <- eval(call$class2, env) paste(obj_name, "is not of class", class) } # is_hermes_data ---- -#' @describeIn assertions checks the class. -#' @param object any object. +#' @describeIn assertions checks whether `x` is an [`AnyHermesData`] object. #' @export #' @examples +#' +#' # Assert a `AnyHermesData` object. #' is_hermes_data(HermesData(summarized_experiment)) #' is_hermes_data(42) -is_hermes_data <- function(object) { - is_class(object, "AnyHermesData") +is_hermes_data <- function(x) { + is_class(x, "AnyHermesData") } on_failure(is_hermes_data) <- function(call, env) { - obj_name <- deparse(call$object) + obj_name <- deparse(call$x) paste(obj_name, "is not a HermesData or RangedHermesData object") } # is_counts_vector ---- #' @describeIn assertions checks for a vector of counts (positive integers). -#' @param x vector to check. #' @export #' @examples -#' a <- 5 -#' is_class(a, "character") +#' +#' # Assert a counts vector. +#' a <- 5L +#' is_counts_vector(a) is_counts_vector <- function(x) { is.integer(x) && all(x > 0) && noNA(x) && not_empty(x) } @@ -69,6 +73,8 @@ on_failure(is_counts_vector) <- function(call, env) { #' @importFrom utils.nest is_character_vector is_fully_named_list #' @export #' @examples +#' +#' # Assert a list containing certain elements. #' b <- list(a = 5, b = 3) #' is_list_with(b, c("a", "c")) #' is_list_with(b, c("a", "b")) @@ -89,12 +95,14 @@ on_failure(is_list_with) <- function(call, env) { # one_provided ---- -#' @describeIn assertions checks that exactly one of two inputs is not `NULL`. +#' @describeIn assertions checks that exactly one of the two inputs `one`, `two` is not `NULL`. #' @param one first input. #' @param two second input. #' @export #' #' @examples +#' +#' # Assert that exactly one of two arguments is provided. #' a <- 10 #' b <- 10 #' one_provided(a, b) @@ -115,11 +123,13 @@ on_failure(one_provided) <- function(call, env) { # is_constant ---- -#' @describeIn assertions checks for a column being constant. -#' @param x An object to check. +#' @describeIn assertions checks whether the vector `x` is constant (only supports `numeric`, `factor`, +#' `character`, `logical`). `NA`s are removed first. #' @export #' #' @examples +#' +#' # Assert a constant vector. #' is_constant(c(1, 2)) #' is_constant(c(NA, 1)) #' is_constant(c("a", "a")) diff --git a/R/calc_cor.R b/R/calc_cor.R index 3d72898d..06c98045 100644 --- a/R/calc_cor.R +++ b/R/calc_cor.R @@ -5,16 +5,19 @@ NULL #' #' @description `r lifecycle::badge("experimental")` #' -#' This calculates the correlation matrix between the sample vectors of counts from -#' a specified assay, as a [`HermesDataCor`] object which is an extension of a [`matrix`] with -#' additional quality flags in the slot `flag_data`: This contains the -#' `TechnicalFailureFlag` and `LowDepthFlag` columns describing the original input samples. +#' The `correlate()` method can calculate the correlation matrix between the sample vectors of +#' counts from a specified assay. This produces a [`HermesDataCor`] object, which is an extension +#' of a [`matrix`] with additional quality flags in the slot `flag_data` +#' (containing the `TechnicalFailureFlag` and `LowDepthFlag` columns describing the original +#' input samples). +#' +#' An `autoplot()` method then afterwards can produce the corresponding heatmap. #' #' @rdname calc_cor #' @aliases calc_cor #' #' @param object (`AnyHermesData`)\cr object to calculate the correlation. -#' @param assay_name (`string`)\cr the assay name where the counts are located in. +#' @param assay_name (`string`)\cr the name of the assay to use. #' @param method (`string`)\cr the correlation method, see [stats::cor()] for details. #' #' @return A [`HermesDataCor`] object. @@ -25,8 +28,12 @@ NULL #' #' @examples #' object <- HermesData(summarized_experiment) +#' +#' # Calculate the sample correlation matrix. #' correlate(object) -#' result <- correlate(object, method = "pearson") +#' +#' # We can specify another correlation coefficient to be calculated. +#' result <- correlate(object, method = "spearman") setMethod( f = "correlate", signature = "AnyHermesData", @@ -55,7 +62,7 @@ setMethod( slots = c(flag_data = "DataFrame") ) -#' @describeIn calc_cor This plot method uses the [ComplexHeatmap::Heatmap()] function +#' @describeIn calc_cor This `autoplot()` method uses the [ComplexHeatmap::Heatmap()] function #' to plot the correlations between samples saved in a [`HermesDataCor`] object. #' #' @param flag_colors (named `character`)\cr a vector that specifies the colors for `TRUE` and `FALSE` @@ -69,8 +76,19 @@ setMethod( #' @export #' #' @examples +#' +#' # Plot the correlation matrix. #' autoplot(result) +#' +#' # We can customize the heatmap. #' autoplot(result, show_column_names = FALSE, show_row_names = FALSE) +#' +#' # Including changing the axis label text size. +#' autoplot( +#' result, +#' row_names_gp = grid::gpar(fontsize = 8), +#' column_names_gp = grid::gpar(fontsize = 8) +#' ) setMethod( f = "autoplot", signature = c(object = "HermesDataCor"), diff --git a/R/connections.R b/R/connections.R index c0206df4..59ae1a20 100644 --- a/R/connections.R +++ b/R/connections.R @@ -2,7 +2,7 @@ #' #' @description `r lifecycle::badge("experimental")` #' -#' Creates a connection object of class [`ConnectionBiomart`] which contains +#' `connect_biomart()` creates a connection object of class [`ConnectionBiomart`] which contains #' the `biomaRt` object of class [`biomaRt::Mart`][biomaRt::Mart-class] and the prefix of the object #' which is used downstream for the query. #' @@ -44,8 +44,9 @@ connect_biomart <- function(prefix = c("ENSG", "GeneID")) { #' Helper function to query annotations from `biomaRt`, for cleaned up gene IDs of #' a specific ID variable and given [`biomaRt::Mart`][biomaRt::Mart-class]. #' -#' @param gene_ids (`character`)\cr gene IDs. -#' @param id_var (`string`)\cr gene ID variable, e.g. `entrezgene_id`. +#' @param gene_ids (`character`)\cr gene IDs, e.g. `10329`. +#' @param id_var (`string`)\cr corresponding gene ID variable name in Biomart, +#' e.g. `entrezgene_id`. #' @param mart (`Mart`)\cr given [`biomaRt::Mart`][biomaRt::Mart-class] object. #' #' @return A data frame with columns: @@ -110,7 +111,7 @@ h_get_annotation_biomart <- function(gene_ids, #' #' @description `r lifecycle::badge("experimental")` #' -#' This generic function is the interface for querying gene annotations from +#' The generic function `query()` is the interface for querying gene annotations from #' a data base connection. #' #' @details @@ -119,15 +120,16 @@ h_get_annotation_biomart <- function(gene_ids, #' for other data bases, e.g. company internal data bases. Please make sure to #' follow the required format of the returned value. #' - The Biomart queries might not return information for all the genes. This can be -#' due to different versions being used in the gene IDs and the queries Ensembl data base. +#' due to different versions being used in the gene IDs and the queried Ensembl data base. #' #' @param genes (`character`)\cr gene IDs. #' @param connection (connection class)\cr data base connection object. #' #' @return A [`S4Vectors::DataFrame`] with the gene annotations. It is required that: #' - The `rownames` are identical to the input `genes`. -#' - The `colnames` are equal to the annotation columns, see [`annotation`]. -#' - Therefore, missing information needs to be properly included with `NA` entries. +#' - The `colnames` are equal to the annotation columns [`.row_data_annotation_cols`]. +#' - Therefore, missing information needs to be properly included in the `DataFrame` +#' with `NA` entries. #' #' @export setGeneric( diff --git a/R/data.R b/R/data.R index ec33acb2..bb594fd9 100644 --- a/R/data.R +++ b/R/data.R @@ -8,8 +8,11 @@ #' @format A [`Biobase::ExpressionSet`] object with 20 samples covering 5085 #' features (Entrez gene IDs). #' @source This is an artificial dataset designed to resemble real data. -#' @seealso summarized_experiment which contains similar data as -#' [`SummarizedExperiment::SummarizedExperiment`]. +#' @seealso +#' - [SummarizedExperiment::makeSummarizedExperimentFromExpressionSet()] to convert into a +#' [`SummarizedExperiment::SummarizedExperiment`]. +#' - [`summarized_experiment`] which contains similar data already as a +#' [`SummarizedExperiment::SummarizedExperiment`]. "expression_set" #' Example `SummarizedExperiment` Data @@ -22,7 +25,7 @@ #' @format A [SummarizedExperiment::SummarizedExperiment] object with 20 samples covering #' 5085 features (Entrez gene IDs). #' @source This is an artificial dataset designed to resemble real data. -#' @seealso expression_set which contains similar data as [`Biobase::ExpressionSet`]. +#' @seealso [`expression_set`] which contains similar data as a [`Biobase::ExpressionSet`]. "summarized_experiment" #' Example `MultiAssayExperiment` Data @@ -31,8 +34,9 @@ #' #' This example [`MultiAssayExperiment::MultiAssayExperiment`] can be used as test data. #' -#' @format A [`MultiAssayExperiment::MultiAssayExperiment`] object with 3 separate SE objects. The first SE object -#' contains 5 samples and covers 1000 features (Entrez gene IDs). The second SE object contains 9 samples with -#' 2500 features. The third SE object contains 6 samples with 1300 features. +#' @format A [`MultiAssayExperiment::MultiAssayExperiment`] object with 3 separate SE objects. +#' - The first SE object contains 5 samples and covers 1000 features (Entrez gene IDs). +#' - The second SE object contains 9 samples with 2500 features. +#' - The third SE object contains 6 samples with 1300 features. #' @source This is an artificial dataset designed to resemble real data. "multi_assay_experiment" diff --git a/R/differential.R b/R/differential.R index 121ed96d..bb39b79e 100644 --- a/R/differential.R +++ b/R/differential.R @@ -23,9 +23,15 @@ #' #' @examples #' object <- HermesData(summarized_experiment) +#' +#' # Create the design matrix corresponding to the factor of interest. #' design <- model.matrix(~SEX, colData(object)) +#' +#' # Then perform the differential expression analysis. #' result <- h_diff_expr_voom(object, design) #' head(result) +#' +#' # Sometimes we might want to specify method details. #' result2 <- h_diff_expr_voom(object, design, trend = TRUE, robust = TRUE) #' head(result2) h_diff_expr_voom <- function(object, design, ...) { @@ -77,9 +83,15 @@ h_diff_expr_voom <- function(object, design, ...) { #' #' @examples #' object <- HermesData(summarized_experiment) +#' +#' # Create the design matrix corresponding to the factor of interest. #' design <- model.matrix(~SEX, colData(object)) +#' +#' # Then perform the `DESeq2` differential expression analysis. #' result <- h_diff_expr_deseq2(object, design) #' head(result) +#' +#' # Change of the `fitType` can be required in some cases. #' result2 <- h_diff_expr_deseq2(object, design, fitType = "local") #' head(result2) h_diff_expr_deseq2 <- function(object, design, ...) { @@ -118,9 +130,11 @@ h_diff_expr_deseq2 <- function(object, design, ...) { #' #' @description `r lifecycle::badge("experimental")` #' -#' This function performs differential expression analysis +#' The `diff_expression()` function performs differential expression analysis #' using a method of preference. #' +#' A corresponding `autoplot()` method is visualizing the results as a volcano plot. +#' #' @details Possible method choices are: #' - `voom`: uses [limma::voom()], see [h_diff_expr_voom()] for details. #' - `deseq2`: uses [DESeq2::DESeq()], see [h_diff_expr_deseq2()] for details. @@ -134,11 +148,11 @@ h_diff_expr_deseq2 <- function(object, design, ...) { #' #' @return A [`HermesDataDiffExpr`] object which is a data frame with the following columns for each gene #' in the [`HermesData`] object: -#' - `log2_fc` (estimate of the log2 fold change between the 2 levels of the -#' provided factor) +#' - `log2_fc` (the estimate of the log2 fold change between the 2 levels of the +#' provided factor) #' - `stat` (the test statistic, which one depends on the method used) -#' - `p_val` (the raw p-value), -#' - `adj_p_val` (the adjusted p-value) values from differential expression analysis for each feature / gene . +#' - `p_val` (the raw p-value) +#' - `adj_p_val` (the multiplicity adjusted p-value value) #' #' @note #' - We provide the [df_char_to_factor()] utility function that makes it easy to convert the @@ -163,7 +177,7 @@ h_diff_expr_deseq2 <- function(object, design, ...) { #' res2 <- diff_expression(object, group = "SEX", method = "deseq2") #' head(res2) #' -#' # Passing method arguments to the internally used helper functions. +#' # Pass method arguments to the internally used helper functions. #' res3 <- diff_expression(object, group = "SEX", method = "voom", robust = TRUE, trend = TRUE) #' head(res3) #' res4 <- diff_expression(object, group = "SEX", method = "deseq2", fitType = "local") @@ -247,7 +261,7 @@ S4Vectors::setValidity2( #' #' @examples #' -#' # Creating the corresponding volcano plots. +#' # Create the corresponding volcano plots. #' autoplot(res1) #' autoplot(res3) setMethod( diff --git a/R/graphs.R b/R/graphs.R index 1041b29f..31d80a3c 100644 --- a/R/graphs.R +++ b/R/graphs.R @@ -53,8 +53,10 @@ draw_libsize_hist <- function(object, #' result <- HermesData(summarized_experiment) #' draw_libsize_qq(result) #' draw_libsize_qq(result, color = "blue", linetype = "solid") +#' #' # We can also add sample names as labels. -#' draw_libsize_qq(result) + geom_text(label = colnames(result), stat = "qq") +#' library(ggrepel) +#' draw_libsize_qq(result) + geom_text_repel(label = colnames(result), stat = "qq") draw_libsize_qq <- function(object, color = "grey", linetype = "dashed") { @@ -92,7 +94,7 @@ draw_libsize_qq <- function(object, #' @examples #' result <- HermesData(summarized_experiment) #' draw_libsize_densities(result) -#' draw_libsize_densities(result, FALSE) +#' draw_libsize_densities(result, log = FALSE) draw_libsize_densities <- function(object, log = TRUE) { assert_that( @@ -186,7 +188,11 @@ draw_nonzero_boxplot <- function(object, #' object <- HermesData(summarized_experiment) #' #' # Display chromosomes 1-22, X, Y, and MT. Other chromosomes are displayed in "Others". -#' draw_genes_barplot(object) +#' # To increase readability, we can have flip the coordinate axes. +#' draw_genes_barplot(object) + coord_flip() +#' +#' # Alternatively we can also rotate the x-axis tick labels. +#' draw_genes_barplot(object) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) #' #' # Display chromosomes 1 and 2. Other chromosomes are displayed in "Others". #' draw_genes_barplot(object, chromosomes = c("1", "2")) diff --git a/R/normalization.R b/R/normalization.R index 02c1b5f1..a453f79d 100644 --- a/R/normalization.R +++ b/R/normalization.R @@ -5,14 +5,14 @@ #' This control function allows for easy customization of the normalization settings. #' #' @param log (`flag`)\cr whether `log2` values are returned, otherwise original scale is used. -#' @param lib_sizes (`numeric`)\cr library sizes, the default is the vector with the sum of the -#' counts for each of the samples. -#' @param prior_count (`count`)\cr average count to be added to each observation to avoid -#' taking log of zero, used only when `log = TRUE`. +#' @param lib_sizes (`NULL` or `counts`)\cr library sizes, if `NULL` the vector with the sum of the +#' counts for each of the samples will be used. +#' @param prior_count (non-negative `number`)\cr average count to be added to each observation to +#' avoid taking log of zero, used only when `log = TRUE`. #' #' @return List with the above settings used to perform the normalization procedure. #' -#' @note To be used with the `normalize()` function. +#' @note To be used with the [normalize()] function. #' #' @export #' @examples @@ -33,22 +33,92 @@ control_normalize <- function(log = TRUE, ) } -#' Counts per Million (CPM) Normalization +# normalize ---- + +#' Normalization of `AnyHermesData` Objects #' #' @description `r lifecycle::badge("stable")` #' -#' This helper function calculates the CPM normalized counts. +#' The `normalize()` method is normalizing the input [`AnyHermesData`] according to one or more +#' specified normalization methods. The results are saved as additional assays +#' in the object. +#' +#' Possible normalization methods (which are implemented with separate helper functions): +#' - `cpm`: Counts per Million (CPM). Separately by sample, the original counts of the genes +#' are divided by the library size of this sample, and multiplied by one million. This is the +#' appropriate normalization for between-sample comparisons. +#' - `rpkm`: Reads per Kilobase of transcript per Million reads mapped (RPKM). Each gene count is +#' divided by the gene width (in kilobases) and then again divided by the library sizes of each +#' sample (in millions). This allows for within-sample comparisons, as it takes +#' into account the gene lengths - longer genes will always have more counts than shorter genes. +#' - `tpm`: Transcripts per Million (TPM). This addresses the problem of RPKM being inconsistent +#' across samples (which can be seen that the sum of all RPKM values will vary from sample to +#' sample). Therefore here we divide the RPKM by the sum of all RPKM values for each sample, +#' and multiply by one million. +#' - `voom`: VOOM normalization. This is essentially just a slight variation of CPM where +#' a `prior_count` of 0.5 is combined with `lib_sizes` increased by 1 for each sample. Note that +#' this is not required for the corresponding differential expression analysis, but just provided +#' as a complementary experimental normalization approach here. #' -#' @param object (`HermesData`) \cr input. -#' @param control (`list`) \cr list of settings used to perform the normalization procedure. -#' @return A numeric matrix with normalized counts using the CPM method. +#' @rdname normalize +#' @aliases normalize +#' +#' @param object (`AnyHermesData`)\cr object to normalize. +#' @param methods (`character`)\cr which normalization methods to use, see details. +#' @param control (named `list`)\cr settings produced by [control_normalize()]. +#' @param ... not used. +#' +#' @return The [`AnyHermesData`] object with additional assays containing the normalized counts. +#' The `control` is saved in the `metadata` of the object for future reference. +#' +#' @seealso [control_normalize()] to define the normalization method settings. +#' +#' @importFrom BiocGenerics normalize +#' @export +#' @examples +#' a <- HermesData(summarized_experiment) +#' +#' # By default, log values are used with a prior count of 1 added to original counts. +#' result <- normalize(a) +#' assayNames(result) +#' tpm <- assay(result, "tpm") +#' tpm[1:3, 1:3] +#' +#' # We can also work on original scale. +#' result_orig <- normalize(a, control = control_normalize(log = FALSE)) +#' tpm_orig <- assay(result_orig, "tpm") +#' tpm_orig[1:3, 1:3] +setMethod( + f = "normalize", + signature = "AnyHermesData", + definition = function(object, + methods = c("cpm", "rpkm", "tpm", "voom"), + control = control_normalize(), + ...) { + method_choices <- c("cpm", "rpkm", "tpm", "voom") + assert_that(all(methods %in% method_choices)) + methods <- match.arg(methods, choices = method_choices, several.ok = TRUE) + for (method in methods) { + fun_name <- paste0("h_", method) + assay(object, method) <- do.call( + fun_name, + list(object = object, control = control), + envir = as.environment("package:hermes") + ) + } + metadata(object) <- c(metadata(object), list(control_normalize = control)) + object + } +) + +#' @describeIn normalize calculates the Counts per Million (CPM) normalized counts. #' #' @export #' @importFrom edgeR cpm #' @examples -#' h <- HermesData(summarized_experiment) -#' cont <- control_normalize() -#' counts_cpm <- h_cpm(h, cont) +#' +#' # Separate calculation of the CPM normalized counts. +#' counts_cpm <- h_cpm(a) #' str(counts_cpm) h_cpm <- function(object, control = control_normalize()) { @@ -64,24 +134,14 @@ h_cpm <- function(object, ) } -#' Reads per Kilobase per Million (RPKM) Normalization -#' -#' @description `r lifecycle::badge("stable")` -#' -#' This helper function calculates the RPKM normalized counts. -#' -#' @param object (`AnyHermesData`)\cr input object. -#' @param control (`list`)\cr list of settings used to perform the normalization procedure. -#' @return A numeric matrix with normalized counts using the RPKM method. -#' -#' @note To be used with the `normalize()` function. +#' @describeIn normalize calculates the Reads per Kilobase per Million (RPKM) normalized counts. #' #' @export #' @importFrom edgeR rpkm #' @examples -#' h <- HermesData(summarized_experiment) -#' cont <- control_normalize(log = FALSE, lib_sizes = rep(1e6L, 20)) -#' counts_rpkm <- h_rpkm(h, cont) +#' +#' # Separate calculation of the RPKM normalized counts. +#' counts_rpkm <- h_rpkm(a) #' str(counts_rpkm) h_rpkm <- function(object, control = control_normalize()) { @@ -99,21 +159,13 @@ h_rpkm <- function(object, ) } -#' Transcripts per Million (TPM) Normalization -#' -#' @description `r lifecycle::badge("stable")` -#' -#' This helper function calculates the TPM normalized counts. -#' -#' @param object (`HermesData`)\cr input. -#' @param control (`list`)\cr list of settings used to perform the normalization procedure. -#' @return A numeric matrix with normalized counts using the TPM method. +#' @describeIn normalize calculates the Transcripts per Million (TPM) normalized counts. #' #' @export #' @examples -#' h <- HermesData(summarized_experiment) -#' cont <- control_normalize() -#' counts_tpm <- h_tpm(h, cont) +#' +#' # Separate calculation of the TPM normalized counts. +#' counts_tpm <- h_tpm(a) #' str(counts_tpm) h_tpm <- function(object, control = control_normalize()) { @@ -127,24 +179,15 @@ h_tpm <- function(object, } } -#' VOOM Normalization -#' -#' @description `r lifecycle::badge("experimental")` -#' -#' This helper function calculates the VOOM normalized counts. -#' -#' @param object (`HermesData`)\cr input. -#' @param control (`list`)\cr list of settings used to perform the normalization procedure. -#' -#' @return A numeric matrix with normalized counts using the VOOM method. +#' @describeIn normalize calculates the VOOM normalized counts. `r lifecycle::badge("experimental")` #' #' @export #' @importFrom limma voom #' @importFrom S4Vectors isEmpty #' @examples -#' h <- HermesData(summarized_experiment) -#' cont <- control_normalize() -#' counts_voom <- h_voom(h, cont) +#' +#' # Separate calculation of the VOOM normalized counts. +#' counts_voom <- h_voom(a) #' str(counts_voom) h_voom <- function(object, control = control_normalize()) { @@ -166,76 +209,3 @@ h_voom <- function(object, 2^norm_log2 } } - -#' Normalization of `AnyHermesData` Objects -#' -#' @description `r lifecycle::badge("stable")` -#' -#' This method is normalizing the input [`AnyHermesData`] according to one or more -#' specified normalization methods. The results are saved as additional assays -#' in the object. -#' -#' Possible normalization methods are: -#' - `cpm`: Counts per Million (CPM). Separately by sample, the original counts of the genes -#' are divided by the library size of this sample, and multiplied by one million. This is the -#' appropriate normalization for between-sample comparisons. -#' - `rpkm`: Reads per Kilobase of transcript per Million reads mapped (RPKM). Each gene count is -#' divided by the gene width (in kilobases) and then again divided by the library sizes of each -#' sample (in millions). This allows for within-sample comparisons, as it takes -#' into account the gene lengths - longer genes will always have more counts than shorter genes. -#' - `tpm`: Transcripts per Million (TPM). This addresses the problem of RPKM being inconsistent -#' across samples (which can be seen that the sum of all RPKM values will vary from sample to -#' sample). Therefore here we divide the RPKM by the sum of all RPKM values for each sample, -#' and multiply by one million. -#' - `voom`: VOOM normalization. This is essentially just a slight variation of CPM where -#' a `prior_count` of 0.5 is combined with `lib_sizes` increased by 1 for each sample. It is used -#' as a starting point for the corresponding differential expression analysis. -#' -#' @rdname normalize -#' @aliases normalize -#' -#' @param object (`AnyHermesData`)\cr object to normalize. -#' @param methods (`character`)\cr which normalization methods to use, see details. -#' @param control (named `list`)\cr settings produced by [control_normalize()]. -#' @param ... not used. -#' -#' @return The [`AnyHermesData`] object with additional assays containing the normalized counts. -#' The `control` is saved in the `metadata` of the object for future reference. -#' -#' @importFrom BiocGenerics normalize -#' @export -#' @examples -#' a <- HermesData(summarized_experiment) -#' -#' # By default, log values are used with a prior count of 1 added to original counts. -#' result <- normalize(a) -#' assayNames(result) -#' tpm <- assay(result, "tpm") -#' tpm[1:3, 1:3] -#' -#' # We can also work on original scale. -#' result_orig <- normalize(a, control = control_normalize(log = FALSE)) -#' tpm_orig <- assay(result_orig, "tpm") -#' tpm_orig[1:3, 1:3] -setMethod( - f = "normalize", - signature = "AnyHermesData", - definition = function(object, - methods = c("cpm", "rpkm", "tpm", "voom"), - control = control_normalize(), - ...) { - method_choices <- c("cpm", "rpkm", "tpm", "voom") - assert_that(all(methods %in% method_choices)) - methods <- match.arg(methods, choices = method_choices, several.ok = TRUE) - for (method in methods) { - fun_name <- paste0("h_", method) - assay(object, method) <- do.call( - fun_name, - list(object = object, control = control), - envir = as.environment("package:hermes") - ) - } - metadata(object) <- c(metadata(object), list(control_normalize = control)) - object - } -) diff --git a/R/pca.R b/R/pca.R index e9117206..4d5b0db0 100644 --- a/R/pca.R +++ b/R/pca.R @@ -2,27 +2,29 @@ #' #' @description `r lifecycle::badge("experimental")` #' -#' Perform principal components analysis of the gene count vectors across all -#' samples. +#' The `calc_pca()` function performs principal components analysis of the gene count +#' vectors across all samples. +#' +#' A corresponding `autoplot()` method then can visualize the results. #' #' @details -#' - PCA should be performed after filtering out low quality genes and samples and -#' normalization. +#' - PCA should be performed after filtering out low quality genes and samples, as well as +#' normalization of counts. #' - In addition, genes with constant counts across all samples are excluded from -#' the analysis internally. -#' - Centering and scaling is applied internally. +#' the analysis internally in `calc_pca()`. Centering and scaling is also applied internally. #' - Plots can be obtained with the [ggplot2::autoplot()] function #' with the corresponding method from the `ggfortify` package to plot the -#' results of a principal components analysis saved in a [HermesDataPca] +#' results of a principal components analysis saved in a [`HermesDataPca`] #' object. See [ggfortify::autoplot.prcomp()] for details. #' #' @param object (`AnyHermesData`) \cr input. -#' @param assay_name (`string`) \cr Indicating the name of the assay -#' of interest, with possible options: `counts`, `cpm`, `tpm`, `rpkm`, `voom`. -#' Default assay is `counts`. +#' @param assay_name (`string`) \cr name of the assay to use. #' #' @return A [HermesDataPca] object which is an extension of the [stats::prcomp] class. #' +#' @seealso Afterwards correlations between principal components +#' and sample variables can be calculated, see [`pca_cor_samplevar`]. +#' #' @importFrom S4Vectors isConstant #' @importFrom stats prcomp #' @export @@ -32,13 +34,16 @@ #' add_quality_flags() %>% #' filter() %>% #' normalize() +#' +#' # Perform PCA. #' result <- calc_pca(object, assay_name = "tpm") #' summary(result) #' +#' # Plot the results. #' autoplot(result) #' autoplot(result, x = 2, y = 3) #' autoplot(result, variance_percentage = FALSE) -#' autoplot(result, label = TRUE) +#' autoplot(result, label = TRUE, label.repel = TRUE) calc_pca <- function(object, assay_name = "counts") { assert_that( diff --git a/R/pca_cor_samplevar.R b/R/pca_cor_samplevar.R index 78a161a9..935d7a84 100644 --- a/R/pca_cor_samplevar.R +++ b/R/pca_cor_samplevar.R @@ -27,8 +27,14 @@ NULL #' add_quality_flags() %>% #' filter() %>% #' normalize() +#' +#' # Obtain the principal components. #' pca <- calc_pca(object)$x +#' +#' # Obtain the sample variable. #' x <- colData(object)$LowDepthFlag +#' +#' # Correlate them. #' r2 <- h_pca_var_rsquared(pca, x) h_pca_var_rsquared <- function(pca, x) { assert_that( @@ -59,12 +65,23 @@ h_pca_var_rsquared <- function(pca, x) { #' This function processes sample variables from [`AnyHermesData`] and the #' corresponding principal components matrix, and then generates the matrix of R2 values. #' +#' @details +#' - Note that only the `df` columns which are `numeric`, `character`, `factor` or +#' `logical` are included in the resulting matrix, because other variable types are not +#' supported. +#' - In addition, `df` columns which are constant, all `NA`, or `character` or `factor` +#' columns with too many levels are also dropped before the analysis. +#' #' @param pca (`matrix`)\cr comprises principal components generated by [calc_pca()]. -#' @param df (`dataframe`)\cr [SummarizedExperiment::colData()] of a [`AnyHermesData`] object. +#' @param df (`data.frame`)\cr from the [SummarizedExperiment::colData()] of a +#' [`AnyHermesData`] object. #' -#' @return A matrix with R2 values for all combinations of sample variables and principle +#' @return A matrix with R2 values for all combinations of sample variables and principal #' components. #' +#' @seealso [h_pca_var_rsquared()] which is used internally to calculate the R2 for one +#' sample variable. +#' #' @export #' #' @examples @@ -72,8 +89,21 @@ h_pca_var_rsquared <- function(pca, x) { #' add_quality_flags() %>% #' filter() %>% #' normalize() +#' +#' # Obtain the principal components. #' pca <- calc_pca(object)$x -#' r2_all <- h_pca_df_r2_matrix(pca, as.data.frame(colData(object))) +#' +#' # Obtain the `colData` as a `data.frame`. +#' df <- as.data.frame(colData(object)) +#' +#' # Correlate them. +#' r2_all <- h_pca_df_r2_matrix(pca, df) +#' str(r2_all) +#' +#' # We can see that only about half of the columns from `df` were +#' # used for the correlations. +#' ncol(r2_all) +#' ncol(df) h_pca_df_r2_matrix <- function(pca, df) { assert_that( is.matrix(pca), @@ -113,8 +143,10 @@ h_pca_df_r2_matrix <- function(pca, df) { #' #' @description `r lifecycle::badge("stable")` #' -#' This method analyses the correlations (in R2 values) between all sample variables -#' in [AnyHermesData] object and the principal components of the samples. +#' This `correlate()` method analyses the correlations (in R2 values) between all sample variables +#' in a [`AnyHermesData`] object and the principal components of the samples. +#' +#' A corresponding `autoplot()` method then can visualize the results in a heatmap. #' #' @rdname pca_cor_samplevar #' @aliases pca_cor_samplevar @@ -125,6 +157,8 @@ h_pca_df_r2_matrix <- function(pca, df) { #' #' @return A [`HermesDataPcaCor`] object with R2 values for all sample variables. #' +#' @seealso [h_pca_df_r2_matrix()] which is used internally for the details. +#' #' @export #' #' @examples @@ -132,6 +166,8 @@ h_pca_df_r2_matrix <- function(pca, df) { #' add_quality_flags() %>% #' filter() %>% #' normalize() +#' +#' # Perform PCA and then correlate the prinicipal components with the sample variables. #' object_pca <- calc_pca(object) #' result <- correlate(object_pca, object) setMethod( @@ -174,6 +210,8 @@ setMethod( #' @export #' #' @examples +#' +#' # Visualize the correlations in a heatmap. #' autoplot(result) #' #' # We can also choose to not reorder the columns. diff --git a/R/quality.R b/R/quality.R index 48ba90ca..c0c169fa 100644 --- a/R/quality.R +++ b/R/quality.R @@ -5,9 +5,9 @@ #' Control function which specifies the quality flag settings. #' One or more settings can be customized. Not specified settings are left at defaults. #' -#' @param min_cpm (non-negative `number`)\cr minimum CPM for each gene within -#' the sample. -#' @param min_cpm_prop (`proportion`)\cr minimum percentage of samples with +#' @param min_cpm (non-negative `number`)\cr minimum Counts per Million (CPM) for +#' each gene within the sample. +#' @param min_cpm_prop (`proportion`)\cr minimum proportion of samples with #' acceptable CPM of certain gene for low expression flagging. #' @param min_corr (`proportion`)\cr minimum Pearson correlation coefficient of #' CPM between samples for technical failure flagging. @@ -53,28 +53,39 @@ control_quality <- function(min_cpm = 1, #' #' @description `r lifecycle::badge("stable")` #' -#' This function adds quality flag information to a [AnyHermesData] object: +#' The function `add_quality_flags()` adds quality flag information to a [`AnyHermesData`] object: #' - `LowExpressionFlag`: for each gene, counts how many samples don't pass a minimum -#' expression CPM threshold. If too many, then it flags this gene as "low expression" gene. +#' expression Counts per Million (CPM) threshold. If too many, then it flags this gene +#' as a "low expression" gene. #' - `TechnicalFailureFlag`: first calculates the Pearson correlation matrix of the sample wise #' CPM values, resulting in a matrix measuring the correlation between samples. #' Then compares the average correlation per sample with a threshold - if it is too low, -#' then the sample is flagged as "technical failure". -#' - `LowDepthFlag`: computes the library size (total number of counts) per sample -#' (removing any `NA`s). If this number is too low, the sample is flagged as "low depth". +#' then the sample is flagged as a "technical failure" sample. +#' - `LowDepthFlag`: computes the library size (total number of counts) per sample. +#' If this number is too low, the sample is flagged as a "low depth" sample. #' -#' @details While `object` already has the variables above (as this is enforced by the validation -#' method for [`AnyHermesData`]), they are usually still `NA` after the initial creation. +#' Separate helper functions are internally used to create the flags, and +#' separate getter functions allow easy access to the quality control flags in an object. +#' +#' @rdname quality_flags +#' +#' @details While `object` already has the variables mentioned above as part of the +#' `rowData` and `colData` (as this is enforced by the validation +#' method for [`AnyHermesData`]), they are usually still `NA` after the initial +#' object creation. #' #' @param object (`AnyHermesData`) \cr input. -#' @param control (`list`) \cr list of settings used to perform the quality control procedure, -#' produced by [control_quality()]. -#' @param overwrite (`flag`)\cr whether previous results need to be overwritten. +#' @param control (`list`) \cr list of settings (thresholds etc.) used to compute the +#' quality control flags, produced by [control_quality()]. +#' @param overwrite (`flag`)\cr whether previously added flags may be overwritten. #' #' @return The input object with added quality flags. #' -#' @seealso [control_quality()] for the detailed settings specifications. +#' @seealso +#' - [control_quality()] for the detailed settings specifications; +#' - [set_tech_failure()] to manually flag samples as technical failures. #' +#' @importFrom stats setNames #' @export #' #' @examples @@ -82,9 +93,9 @@ control_quality <- function(min_cpm = 1, #' object <- HermesData(summarized_experiment) #' result <- add_quality_flags(object) #' which(get_tech_failure(result) != get_tech_failure(object)) -#' head(rowData(result)$LowExpressionFlag) -#' head(colData(result)$TechnicalFailureFlag) -#' head(colData(result)$LowDepthFlag) +#' head(get_low_expression(result)) +#' head(get_tech_failure(result)) +#' head(get_low_depth(result)) #' #' # It is possible to overwrite flags if needed, which will trigger a message. #' result2 <- add_quality_flags(result, control_quality(min_cpm = 1000), overwrite = TRUE) @@ -113,23 +124,21 @@ add_quality_flags <- function(object, object } -#' @describeIn add_quality_flags creates the low expression flag for genes +#' @describeIn quality_flags creates the low expression flag for genes #' given control settings. #' #' @importFrom edgeR cpm #' @export #' #' @examples -#' # Separate calculation of low expression flag. -#' low_expr_flag <- h_low_expression_flag(object) -#' head(low_expr_flag) -#' length(low_expr_flag) == nrow(object) #' -#' low_expr_flag2 <- h_low_expression_flag( +#' # Separate calculation of low expression flag. +#' low_expr_flag <- h_low_expression_flag( #' object, #' control_quality(min_cpm = 500, min_cpm_prop = 0.9) #' ) -#' head(low_expr_flag2) +#' length(low_expr_flag) == nrow(object) +#' head(low_expr_flag) h_low_expression_flag <- function(object, control = control_quality()) { assert_that( @@ -142,19 +151,18 @@ h_low_expression_flag <- function(object, n_samples_below_min_cpm > threshold_n_samples } -#' @describeIn add_quality_flags creates the low depth (library size) flag for samples +#' @describeIn quality_flags creates the low depth (library size) flag for samples #' given control settings. #' #' @importFrom stats quantile #' @export #' #' @examples -#' low_depth_flag <- h_low_depth_flag(object) -#' head(low_depth_flag) -#' length(low_depth_flag) == ncol(object) #' -#' low_depth_flag2 <- h_low_depth_flag(object, control_quality(min_depth = 5)) -#' head(low_depth_flag2) +#' # Separate calculation of low depth flag. +#' low_depth_flag <- h_low_depth_flag(object, control_quality(min_depth = 5)) +#' length(low_depth_flag) == ncol(object) +#' head(low_depth_flag) h_low_depth_flag <- function(object, control = control_quality()) { assert_that( @@ -169,20 +177,18 @@ h_low_depth_flag <- function(object, lib_sizes < control$min_depth } -#' @describeIn add_quality_flags creates the technical failure flag for samples +#' @describeIn quality_flags creates the technical failure flag for samples #' given control settings. #' #' @importFrom edgeR cpm #' @export #' #' @examples -#' object <- HermesData(summarized_experiment) -#' tech_failure_flag <- h_tech_failure_flag(object) -#' head(tech_failure_flag) -#' length(tech_failure_flag) == ncol(object) #' -#' tech_failure_flag2 <- h_tech_failure_flag(object, control_quality(min_corr = 0.35)) -#' head(tech_failure_flag2) +#' # Separate calculation of technical failure flag. +#' tech_failure_flag <- h_tech_failure_flag(object, control_quality(min_corr = 0.35)) +#' length(tech_failure_flag) == ncol(object) +#' head(tech_failure_flag) h_tech_failure_flag <- function(object, control = control_quality()) { assert_that( @@ -194,23 +200,7 @@ h_tech_failure_flag <- function(object, colMeans(corr_matrix) < control$min_corr } -#' Get Quality Flags -#' -#' Separate getter functions which allow easy access to the quality control flags. -#' -#' @param object (`AnyHermesData`)\cr input. -#' -#' @return Named logical vector containing the failure flags for all samples or genes, -#' respectively. -#' -#' @importFrom stats setNames -#' @name get_quality_flags -#' -#' @examples -#' object <- HermesData(summarized_experiment) -NULL - -#' @describeIn get_quality_flags get the technical failure flags for all samples. +#' @describeIn quality_flags get the technical failure flags for all samples. #' @export #' @examples #' head(get_tech_failure(object)) @@ -221,7 +211,7 @@ get_tech_failure <- function(object) { stats::setNames(flag_vals, samples) } -#' @describeIn get_quality_flags get the low depth failure flags for all samples. +#' @describeIn quality_flags get the low depth failure flags for all samples. #' @export #' @examples #' head(get_low_depth(object)) @@ -232,7 +222,7 @@ get_low_depth <- function(object) { stats::setNames(flag_vals, samples) } -#' @describeIn get_quality_flags get the low expression failure flags for all genes. +#' @describeIn quality_flags get the low expression failure flags for all genes. #' @export #' @examples #' head(get_low_expression(object)) diff --git a/R/top_genes.R b/R/top_genes.R index b116a9b3..e2e9da88 100644 --- a/R/top_genes.R +++ b/R/top_genes.R @@ -2,11 +2,13 @@ #' #' @description `r lifecycle::badge("experimental")` #' -#' This creates a [`HermesDataTopGenes`] object, which extends [`data.frame`]. It +#' `top_genes()` creates a [`HermesDataTopGenes`] object, which extends [`data.frame`]. It #' contains two columns: #' - `expression`: containing the statistic values calculated by `summary_fun` across columns. #' - `name`: the gene names. #' +#' The corresponding `autoplot()` method then visualizes the result as a barplot. +#' #' @details #' - The data frame is sorted in descending order of `expression` and only the top #' entries according to the selection criteria are included. @@ -15,8 +17,8 @@ #' #' @param object (`AnyHermedData`)\cr input. #' @param assay_name (`string`)\cr name of the assay to use for the sorting of genes. -#' @param summary_fun (`function`)\cr summary statistics function to apply to the assay resulting in -#' a numeric vector with one value per gene. +#' @param summary_fun (`function`)\cr summary statistics function to apply across the samples in +#' the assay resulting in a numeric vector with one value per gene. #' @param n_top (`count` or `NULL`)\cr selection criteria based on number of entries. #' @param min_threshold (`number` or `NULL` )\cr selection criteria based on a minimum #' summary statistics threshold. @@ -26,8 +28,15 @@ #' #' @examples #' object <- HermesData(summarized_experiment) +#' +#' # Default uses average of raw counts across samples to rank genes. #' top_genes(object) +#' +#' # Instead of showing top 10 genes, can also set a minimum threshold on average counts. #' top_genes(object, n_top = NULL, min_threshold = 50000) +#' +#' # We can also use the maximum of raw counts across samples, by specifying a different +#' # summary statistics function. #' result <- top_genes(object, summary_fun = rowMax) top_genes <- function(object, assay_name = "counts", @@ -102,6 +111,8 @@ top_genes <- function(object, #' @param title (`string`)\cr plot title. #' #' @examples +#' +#' # Finally we can produce barplots based on the results. #' autoplot(result, title = "My top genes") #' autoplot(result, y_lab = "Counts", title = "My top genes") setMethod( diff --git a/_pkgdown.yml b/_pkgdown.yml index 2c8871e2..61b31b15 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -75,7 +75,6 @@ reference: - add_quality_flags - calc_pca - diff_expression - - get_quality_flags - set_tech_failure - top_genes - connect_biomart diff --git a/man/HermesData-class.Rd b/man/HermesData-class.Rd index bb894a55..7017e5ae 100644 --- a/man/HermesData-class.Rd +++ b/man/HermesData-class.Rd @@ -18,7 +18,7 @@ HermesData(object) HermesDataFromMatrix(counts, ...) } \arguments{ -\item{object}{(\code{SummarizedExperiment})\cr input to create \code{\link{HermesData}} from. +\item{object}{(\code{SummarizedExperiment})\cr input to create the \code{\link{HermesData}} object from. If this is a \code{RangedSummarizedExperiment}, then the result will be \code{\link{RangedHermesData}}.} @@ -88,20 +88,17 @@ for full control of the details. } } \examples{ -# Convert to `SummarizedExperiment` using the default naive range mapper. -se <- makeSummarizedExperimentFromExpressionSet(expression_set) -# Then convert to `HermesData`. -# Create objects starting from a `SummarizedExperiment.` +# Convert an `ExpressionSet` to a `RangedSummarizedExperiment`. +ranged_summarized_experiment <- makeSummarizedExperimentFromExpressionSet(expression_set) + +# Then convert to `RangedHermesData`. +HermesData(ranged_summarized_experiment) + +# Create objects starting from a `SummarizedExperiment`. hermes_data <- HermesData(summarized_experiment) hermes_data -ranged_summarized_experiment <- as(summarized_experiment, "RangedSummarizedExperiment") -ranged_hermes_data <- HermesData(ranged_summarized_experiment) -ranged_hermes_data -# Create objects from a matrix and additional arguments. + +# Create objects from a matrix. Note that additional arguments are not required but possible. counts_matrix <- assay(summarized_experiment) -HermesDataFromMatrix( - counts = counts_matrix, - rowData = rowData(summarized_experiment), - colData = colData(summarized_experiment) -) +counts_hermes_data <- HermesDataFromMatrix(counts_matrix) } diff --git a/man/annotation.Rd b/man/annotation.Rd index 8a6ec25f..00a52413 100644 --- a/man/annotation.Rd +++ b/man/annotation.Rd @@ -8,7 +8,8 @@ \alias{annotation<-,AnyHermesData,DataFrame-method} \title{Annotation Accessor and Setter} \format{ -An object of class \code{character} of length 8. +The annotation column names are available in the exported +character vector \code{.row_data_annotation_cols}. } \usage{ \S4method{annotation}{AnyHermesData}(object, ...) @@ -18,11 +19,11 @@ An object of class \code{character} of length 8. \S4method{annotation}{AnyHermesData,DataFrame}(object) <- value } \arguments{ -\item{object}{(\code{AnyHermesData})\cr object to access the counts from.} +\item{object}{(\code{AnyHermesData})\cr object to access the annotations from.} \item{...}{not used.} -\item{value}{(\code{matrix})\cr what should the counts assay be replaced with.} +\item{value}{(\code{DataFrame})\cr what should the annotations be replaced with.} } \value{ The \code{\link[S4Vectors:DataFrame-class]{S4Vectors::DataFrame}} with the gene annotations: @@ -43,17 +44,10 @@ The \code{\link[S4Vectors:DataFrame-class]{S4Vectors::DataFrame}} with the gene These methods access and set the gene annotations stored in a \code{\link{AnyHermesData}} object. } \note{ -\itemize{ -\item The returned column names are available in the exported -character vector \code{.row_data_annotation_cols}. -} - -\itemize{ -\item When trying to replace the annotation with completely missing values for any genes, +When trying to replace the annotation with completely missing values for any genes, a warning will be given and the corresponding gene IDs will be saved in the attribute \code{annotation.missing.genes}. } -} \examples{ object <- HermesData(summarized_experiment) head(annotation(object)) diff --git a/man/assertions.Rd b/man/assertions.Rd index c26b576d..a529ad63 100644 --- a/man/assertions.Rd +++ b/man/assertions.Rd @@ -10,9 +10,9 @@ \alias{is_constant} \title{Additional Assertions for \code{assert_that}} \usage{ -is_class(object, class2) +is_class(x, class2) -is_hermes_data(object) +is_hermes_data(x) is_counts_vector(x) @@ -23,11 +23,9 @@ one_provided(one, two) is_constant(x) } \arguments{ -\item{object}{any object.} +\item{x}{an object to check.} -\item{class2}{(\code{character} or class definition)\cr the class to which \code{object} could belong.} - -\item{x}{An object to check.} +\item{class2}{(\code{character} or class definition)\cr the class to which \code{x} could belong.} \item{elements}{(\code{character})\cr names of elements which should be in the list \code{x}.} @@ -45,31 +43,43 @@ We provide additional assertion functions which can be used together with \itemize{ \item \code{is_class}: checks the class. -\item \code{is_hermes_data}: checks the class. +\item \code{is_hermes_data}: checks whether \code{x} is an \code{\link{AnyHermesData}} object. \item \code{is_counts_vector}: checks for a vector of counts (positive integers). \item \code{is_list_with}: checks for a list containing elements. -\item \code{one_provided}: checks that exactly one of two inputs is not \code{NULL}. +\item \code{one_provided}: checks that exactly one of the two inputs \code{one}, \code{two} is not \code{NULL}. -\item \code{is_constant}: checks for a column being constant. +\item \code{is_constant}: checks whether the vector \code{x} is constant (only supports \code{numeric}, \code{factor}, +\code{character}, \code{logical}). \code{NA}s are removed first. }} \examples{ +# Assert a general class. a <- 5 is_class(a, "character") + +# Assert a `AnyHermesData` object. is_hermes_data(HermesData(summarized_experiment)) is_hermes_data(42) -a <- 5 -is_class(a, "character") + +# Assert a counts vector. +a <- 5L +is_counts_vector(a) + +# Assert a list containing certain elements. b <- list(a = 5, b = 3) is_list_with(b, c("a", "c")) is_list_with(b, c("a", "b")) + +# Assert that exactly one of two arguments is provided. a <- 10 b <- 10 one_provided(a, b) one_provided(a, NULL) + +# Assert a constant vector. is_constant(c(1, 2)) is_constant(c(NA, 1)) is_constant(c("a", "a")) diff --git a/man/calc_cor.Rd b/man/calc_cor.Rd index aa233665..3bd464f7 100644 --- a/man/calc_cor.Rd +++ b/man/calc_cor.Rd @@ -22,7 +22,7 @@ \arguments{ \item{object}{(\code{AnyHermesData})\cr object to calculate the correlation.} -\item{assay_name}{(\code{string})\cr the assay name where the counts are located in.} +\item{assay_name}{(\code{string})\cr the name of the assay to use.} \item{method}{(\code{string})\cr the correlation method, see \code{\link[stats:cor]{stats::cor()}} for details.} @@ -40,21 +40,39 @@ A \code{\link{HermesDataCor}} object. \description{ \ifelse{html}{\out{Experimental lifecycle}}{\strong{Experimental}} -This calculates the correlation matrix between the sample vectors of counts from -a specified assay, as a \code{\link{HermesDataCor}} object which is an extension of a \code{\link{matrix}} with -additional quality flags in the slot \code{flag_data}: This contains the -\code{TechnicalFailureFlag} and \code{LowDepthFlag} columns describing the original input samples. +The \code{correlate()} method can calculate the correlation matrix between the sample vectors of +counts from a specified assay. This produces a \code{\link{HermesDataCor}} object, which is an extension +of a \code{\link{matrix}} with additional quality flags in the slot \code{flag_data} +(containing the \code{TechnicalFailureFlag} and \code{LowDepthFlag} columns describing the original +input samples). + +An \code{autoplot()} method then afterwards can produce the corresponding heatmap. } \section{Functions}{ \itemize{ -\item \code{autoplot,HermesDataCor-method}: This plot method uses the \code{\link[ComplexHeatmap:Heatmap]{ComplexHeatmap::Heatmap()}} function +\item \code{autoplot,HermesDataCor-method}: This \code{autoplot()} method uses the \code{\link[ComplexHeatmap:Heatmap]{ComplexHeatmap::Heatmap()}} function to plot the correlations between samples saved in a \code{\link{HermesDataCor}} object. }} \examples{ object <- HermesData(summarized_experiment) + +# Calculate the sample correlation matrix. correlate(object) -result <- correlate(object, method = "pearson") + +# We can specify another correlation coefficient to be calculated. +result <- correlate(object, method = "spearman") + +# Plot the correlation matrix. autoplot(result) + +# We can customize the heatmap. autoplot(result, show_column_names = FALSE, show_row_names = FALSE) + +# Including changing the axis label text size. +autoplot( + result, + row_names_gp = grid::gpar(fontsize = 8), + column_names_gp = grid::gpar(fontsize = 8) +) } diff --git a/man/calc_pca.Rd b/man/calc_pca.Rd index 899b2697..a77159c3 100644 --- a/man/calc_pca.Rd +++ b/man/calc_pca.Rd @@ -13,9 +13,7 @@ calc_pca(object, assay_name = "counts") \arguments{ \item{object}{(\code{AnyHermesData}) \cr input.} -\item{assay_name}{(\code{string}) \cr Indicating the name of the assay -of interest, with possible options: \code{counts}, \code{cpm}, \code{tpm}, \code{rpkm}, \code{voom}. -Default assay is \code{counts}.} +\item{assay_name}{(\code{string}) \cr name of the assay to use.} } \value{ A \link{HermesDataPca} object which is an extension of the \link[stats:prcomp]{stats::prcomp} class. @@ -23,19 +21,20 @@ A \link{HermesDataPca} object which is an extension of the \link[stats:prcomp]{s \description{ \ifelse{html}{\out{Experimental lifecycle}}{\strong{Experimental}} -Perform principal components analysis of the gene count vectors across all -samples. +The \code{calc_pca()} function performs principal components analysis of the gene count +vectors across all samples. + +A corresponding \code{autoplot()} method then can visualize the results. } \details{ \itemize{ -\item PCA should be performed after filtering out low quality genes and samples and -normalization. +\item PCA should be performed after filtering out low quality genes and samples, as well as +normalization of counts. \item In addition, genes with constant counts across all samples are excluded from -the analysis internally. -\item Centering and scaling is applied internally. +the analysis internally in \code{calc_pca()}. Centering and scaling is also applied internally. \item Plots can be obtained with the \code{\link[ggplot2:autoplot]{ggplot2::autoplot()}} function with the corresponding method from the \code{ggfortify} package to plot the -results of a principal components analysis saved in a \link{HermesDataPca} +results of a principal components analysis saved in a \code{\link{HermesDataPca}} object. See \code{\link[ggfortify:autoplot.pca_common]{ggfortify::autoplot.prcomp()}} for details. } } @@ -44,11 +43,18 @@ object <- HermesData(summarized_experiment) \%>\% add_quality_flags() \%>\% filter() \%>\% normalize() + +# Perform PCA. result <- calc_pca(object, assay_name = "tpm") summary(result) +# Plot the results. autoplot(result) autoplot(result, x = 2, y = 3) autoplot(result, variance_percentage = FALSE) -autoplot(result, label = TRUE) +autoplot(result, label = TRUE, label.repel = TRUE) +} +\seealso{ +Afterwards correlations between principal components +and sample variables can be calculated, see \code{\link{pca_cor_samplevar}}. } diff --git a/man/cbind.Rd b/man/cbind.Rd index 410277f6..f3665ed5 100644 --- a/man/cbind.Rd +++ b/man/cbind.Rd @@ -16,19 +16,22 @@ This method combines \code{\link{AnyHermesData}} objects with the same ranges bu samples (columns in assays). } \note{ -Note that this just inherits +\itemize{ +\item Note that this just inherits \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment::cbind,SummarizedExperiment-method()}}. When binding a \code{\link{AnyHermesData}} object with a \code{\link[SummarizedExperiment:RangedSummarizedExperiment-class]{SummarizedExperiment::SummarizedExperiment}} object, then the result will be a \code{\link[SummarizedExperiment:RangedSummarizedExperiment-class]{SummarizedExperiment::SummarizedExperiment}} object (the more general class). +\item Note that the combined object needs to have unique sample IDs (column names). +} } \examples{ a <- HermesData(summarized_experiment[, 1:10]) b <- HermesData(summarized_experiment[, 11:20]) result <- cbind(a, b) class(result) - -result2 <- cbind(summarized_experiment[, 1:10], b) -class(result2) +} +\seealso{ +\code{\link{rbind}} to row bind objects. } diff --git a/man/connect_biomart.Rd b/man/connect_biomart.Rd index bf8540df..90445b73 100644 --- a/man/connect_biomart.Rd +++ b/man/connect_biomart.Rd @@ -19,7 +19,7 @@ connect_biomart(prefix = c("ENSG", "GeneID")) \description{ \ifelse{html}{\out{Experimental lifecycle}}{\strong{Experimental}} -Creates a connection object of class \code{\link{ConnectionBiomart}} which contains +\code{connect_biomart()} creates a connection object of class \code{\link{ConnectionBiomart}} which contains the \code{biomaRt} object of class \code{\link[biomaRt:Mart-class]{biomaRt::Mart}} and the prefix of the object which is used downstream for the query. } diff --git a/man/control_normalize.Rd b/man/control_normalize.Rd index f317408e..7d9cf509 100644 --- a/man/control_normalize.Rd +++ b/man/control_normalize.Rd @@ -9,11 +9,11 @@ control_normalize(log = TRUE, lib_sizes = NULL, prior_count = 1) \arguments{ \item{log}{(\code{flag})\cr whether \code{log2} values are returned, otherwise original scale is used.} -\item{lib_sizes}{(\code{numeric})\cr library sizes, the default is the vector with the sum of the -counts for each of the samples.} +\item{lib_sizes}{(\code{NULL} or \code{counts})\cr library sizes, if \code{NULL} the vector with the sum of the +counts for each of the samples will be used.} -\item{prior_count}{(\code{count})\cr average count to be added to each observation to avoid -taking log of zero, used only when \code{log = TRUE}.} +\item{prior_count}{(non-negative \code{number})\cr average count to be added to each observation to +avoid taking log of zero, used only when \code{log = TRUE}.} } \value{ List with the above settings used to perform the normalization procedure. @@ -24,7 +24,7 @@ List with the above settings used to perform the normalization procedure. This control function allows for easy customization of the normalization settings. } \note{ -To be used with the \code{normalize()} function. +To be used with the \code{\link[=normalize]{normalize()}} function. } \examples{ control_normalize() diff --git a/man/control_quality.Rd b/man/control_quality.Rd index 3c96ae54..044917ea 100644 --- a/man/control_quality.Rd +++ b/man/control_quality.Rd @@ -12,10 +12,10 @@ control_quality( ) } \arguments{ -\item{min_cpm}{(non-negative \code{number})\cr minimum CPM for each gene within -the sample.} +\item{min_cpm}{(non-negative \code{number})\cr minimum Counts per Million (CPM) for +each gene within the sample.} -\item{min_cpm_prop}{(\code{proportion})\cr minimum percentage of samples with +\item{min_cpm_prop}{(\code{proportion})\cr minimum proportion of samples with acceptable CPM of certain gene for low expression flagging.} \item{min_corr}{(\code{proportion})\cr minimum Pearson correlation coefficient of diff --git a/man/diff_expression.Rd b/man/diff_expression.Rd index ad78e864..80030397 100644 --- a/man/diff_expression.Rd +++ b/man/diff_expression.Rd @@ -34,18 +34,20 @@ to flag up- or down-regulation of transcription.} A \code{\link{HermesDataDiffExpr}} object which is a data frame with the following columns for each gene in the \code{\link{HermesData}} object: \itemize{ -\item \code{log2_fc} (estimate of the log2 fold change between the 2 levels of the +\item \code{log2_fc} (the estimate of the log2 fold change between the 2 levels of the provided factor) \item \code{stat} (the test statistic, which one depends on the method used) -\item \code{p_val} (the raw p-value), -\item \code{adj_p_val} (the adjusted p-value) values from differential expression analysis for each feature / gene . +\item \code{p_val} (the raw p-value) +\item \code{adj_p_val} (the multiplicity adjusted p-value value) } } \description{ \ifelse{html}{\out{Experimental lifecycle}}{\strong{Experimental}} -This function performs differential expression analysis +The \code{diff_expression()} function performs differential expression analysis using a method of preference. + +A corresponding \code{autoplot()} method is visualizing the results as a volcano plot. } \details{ Possible method choices are: @@ -81,13 +83,13 @@ head(res1) res2 <- diff_expression(object, group = "SEX", method = "deseq2") head(res2) -# Passing method arguments to the internally used helper functions. +# Pass method arguments to the internally used helper functions. res3 <- diff_expression(object, group = "SEX", method = "voom", robust = TRUE, trend = TRUE) head(res3) res4 <- diff_expression(object, group = "SEX", method = "deseq2", fitType = "local") head(res4) -# Creating the corresponding volcano plots. +# Create the corresponding volcano plots. autoplot(res1) autoplot(res3) } diff --git a/man/draw_genes_barplot.Rd b/man/draw_genes_barplot.Rd index b0eb0154..01e86b39 100644 --- a/man/draw_genes_barplot.Rd +++ b/man/draw_genes_barplot.Rd @@ -29,7 +29,11 @@ This creates a barplot of chromosomes for the \link{AnyHermesData} object with t object <- HermesData(summarized_experiment) # Display chromosomes 1-22, X, Y, and MT. Other chromosomes are displayed in "Others". -draw_genes_barplot(object) +# To increase readability, we can have flip the coordinate axes. +draw_genes_barplot(object) + coord_flip() + +# Alternatively we can also rotate the x-axis tick labels. +draw_genes_barplot(object) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) # Display chromosomes 1 and 2. Other chromosomes are displayed in "Others". draw_genes_barplot(object, chromosomes = c("1", "2")) diff --git a/man/draw_libsize_densities.Rd b/man/draw_libsize_densities.Rd index c54ad8e2..54714def 100644 --- a/man/draw_libsize_densities.Rd +++ b/man/draw_libsize_densities.Rd @@ -23,5 +23,5 @@ on the plot corresponds to a sample. \examples{ result <- HermesData(summarized_experiment) draw_libsize_densities(result) -draw_libsize_densities(result, FALSE) +draw_libsize_densities(result, log = FALSE) } diff --git a/man/draw_libsize_qq.Rd b/man/draw_libsize_qq.Rd index 3d5f49a0..08c4c135 100644 --- a/man/draw_libsize_qq.Rd +++ b/man/draw_libsize_qq.Rd @@ -25,6 +25,8 @@ This creates a Q-Q plot of the library sizes of the \link{AnyHermesData} object. result <- HermesData(summarized_experiment) draw_libsize_qq(result) draw_libsize_qq(result, color = "blue", linetype = "solid") + # We can also add sample names as labels. -draw_libsize_qq(result) + geom_text(label = colnames(result), stat = "qq") +library(ggrepel) +draw_libsize_qq(result) + geom_text_repel(label = colnames(result), stat = "qq") } diff --git a/man/expression_set.Rd b/man/expression_set.Rd index 80e1106c..99f0f684 100644 --- a/man/expression_set.Rd +++ b/man/expression_set.Rd @@ -21,7 +21,11 @@ This example data can be used to try out conversion of a \code{\link[Biobase:cla object into a \code{\link{HermesData}} object. } \seealso{ -summarized_experiment which contains similar data as +\itemize{ +\item \code{\link[SummarizedExperiment:makeSummarizedExperimentFromExpressionSet]{SummarizedExperiment::makeSummarizedExperimentFromExpressionSet()}} to convert into a \code{\link[SummarizedExperiment:RangedSummarizedExperiment-class]{SummarizedExperiment::SummarizedExperiment}}. +\item \code{\link{summarized_experiment}} which contains similar data already as a +\code{\link[SummarizedExperiment:RangedSummarizedExperiment-class]{SummarizedExperiment::SummarizedExperiment}}. +} } \keyword{datasets} diff --git a/man/filter.Rd b/man/filter.Rd index 8b7801ae..085a9ec6 100644 --- a/man/filter.Rd +++ b/man/filter.Rd @@ -44,11 +44,17 @@ requires non-standard evaluation of arguments. \examples{ a <- HermesData(summarized_experiment) dim(a) + # Filter genes and samples on default QC flags. result <- filter(a) dim(result) + # Filter only genes without low expression. result <- filter(a, what = "genes") + # Filter only samples with low depth and technical failure. result <- filter(a, what = "samples") + +# Filter only genes, and require certain annotations to be present. +result <- filter(a, what = "genes", annotation_required = c("StartBP", "EndBP", "WidthBP")) } diff --git a/man/genes.Rd b/man/genes.Rd index 191a4f82..6eb71881 100644 --- a/man/genes.Rd +++ b/man/genes.Rd @@ -25,3 +25,6 @@ nicely named accessor method. a <- HermesData(summarized_experiment) genes(a) } +\seealso{ +\code{\link[=samples]{samples()}} to access the sample IDs. +} diff --git a/man/get_quality_flags.Rd b/man/get_quality_flags.Rd deleted file mode 100644 index f8c2c993..00000000 --- a/man/get_quality_flags.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quality.R -\name{get_quality_flags} -\alias{get_quality_flags} -\alias{get_tech_failure} -\alias{get_low_depth} -\alias{get_low_expression} -\title{Get Quality Flags} -\usage{ -get_tech_failure(object) - -get_low_depth(object) - -get_low_expression(object) -} -\arguments{ -\item{object}{(\code{AnyHermesData})\cr input.} -} -\value{ -Named logical vector containing the failure flags for all samples or genes, -respectively. -} -\description{ -Separate getter functions which allow easy access to the quality control flags. -} -\section{Functions}{ -\itemize{ -\item \code{get_tech_failure}: get the technical failure flags for all samples. - -\item \code{get_low_depth}: get the low depth failure flags for all samples. - -\item \code{get_low_expression}: get the low expression failure flags for all genes. -}} - -\examples{ -object <- HermesData(summarized_experiment) -head(get_tech_failure(object)) -head(get_low_depth(object)) -head(get_low_expression(object)) -} diff --git a/man/h_cpm.Rd b/man/h_cpm.Rd deleted file mode 100644 index df61d7a6..00000000 --- a/man/h_cpm.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/normalization.R -\name{h_cpm} -\alias{h_cpm} -\title{Counts per Million (CPM) Normalization} -\usage{ -h_cpm(object, control = control_normalize()) -} -\arguments{ -\item{object}{(\code{HermesData}) \cr input.} - -\item{control}{(\code{list}) \cr list of settings used to perform the normalization procedure.} -} -\value{ -A numeric matrix with normalized counts using the CPM method. -} -\description{ -\ifelse{html}{\out{Stable lifecycle}}{\strong{Stable}} - -This helper function calculates the CPM normalized counts. -} -\examples{ -h <- HermesData(summarized_experiment) -cont <- control_normalize() -counts_cpm <- h_cpm(h, cont) -str(counts_cpm) -} diff --git a/man/h_diff_expr_deseq2.Rd b/man/h_diff_expr_deseq2.Rd index 331d4a9e..84ee92e6 100644 --- a/man/h_diff_expr_deseq2.Rd +++ b/man/h_diff_expr_deseq2.Rd @@ -26,9 +26,15 @@ This helper functions performs the differential expression analysis with } \examples{ object <- HermesData(summarized_experiment) + +# Create the design matrix corresponding to the factor of interest. design <- model.matrix(~SEX, colData(object)) + +# Then perform the `DESeq2` differential expression analysis. result <- h_diff_expr_deseq2(object, design) head(result) + +# Change of the `fitType` can be required in some cases. result2 <- h_diff_expr_deseq2(object, design, fitType = "local") head(result2) } diff --git a/man/h_diff_expr_voom.Rd b/man/h_diff_expr_voom.Rd index 08489b70..ea00d2ec 100644 --- a/man/h_diff_expr_voom.Rd +++ b/man/h_diff_expr_voom.Rd @@ -27,9 +27,15 @@ for given counts in a \link{AnyHermesData} object and a corresponding \code{desi } \examples{ object <- HermesData(summarized_experiment) + +# Create the design matrix corresponding to the factor of interest. design <- model.matrix(~SEX, colData(object)) + +# Then perform the differential expression analysis. result <- h_diff_expr_voom(object, design) head(result) + +# Sometimes we might want to specify method details. result2 <- h_diff_expr_voom(object, design, trend = TRUE, robust = TRUE) head(result2) } diff --git a/man/h_get_annotation_biomart.Rd b/man/h_get_annotation_biomart.Rd index 447d04ef..aac7b4f6 100644 --- a/man/h_get_annotation_biomart.Rd +++ b/man/h_get_annotation_biomart.Rd @@ -7,9 +7,10 @@ h_get_annotation_biomart(gene_ids, id_var, mart) } \arguments{ -\item{gene_ids}{(\code{character})\cr gene IDs.} +\item{gene_ids}{(\code{character})\cr gene IDs, e.g. \code{10329}.} -\item{id_var}{(\code{string})\cr gene ID variable, e.g. \code{entrezgene_id}.} +\item{id_var}{(\code{string})\cr corresponding gene ID variable name in Biomart, +e.g. \code{entrezgene_id}.} \item{mart}{(\code{Mart})\cr given \code{\link[biomaRt:Mart-class]{biomaRt::Mart}} object.} } diff --git a/man/h_has_req_annotations.Rd b/man/h_has_req_annotations.Rd index a17ee5a1..de5a15f1 100644 --- a/man/h_has_req_annotations.Rd +++ b/man/h_has_req_annotations.Rd @@ -28,3 +28,6 @@ all(result) rowData(object)$WidthBP[1] <- NA # nolint which(!h_has_req_annotations(object, "WidthBP")) } +\seealso{ +\code{\link[=filter]{filter()}} where this is used internally. +} diff --git a/man/h_pca_df_r2_matrix.Rd b/man/h_pca_df_r2_matrix.Rd index 395e1bbe..4194d5d8 100644 --- a/man/h_pca_df_r2_matrix.Rd +++ b/man/h_pca_df_r2_matrix.Rd @@ -9,10 +9,11 @@ h_pca_df_r2_matrix(pca, df) \arguments{ \item{pca}{(\code{matrix})\cr comprises principal components generated by \code{\link[=calc_pca]{calc_pca()}}.} -\item{df}{(\code{dataframe})\cr \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment::colData()}} of a \code{\link{AnyHermesData}} object.} +\item{df}{(\code{data.frame})\cr from the \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment::colData()}} of a +\code{\link{AnyHermesData}} object.} } \value{ -A matrix with R2 values for all combinations of sample variables and principle +A matrix with R2 values for all combinations of sample variables and principal components. } \description{ @@ -21,11 +22,37 @@ components. This function processes sample variables from \code{\link{AnyHermesData}} and the corresponding principal components matrix, and then generates the matrix of R2 values. } +\details{ +\itemize{ +\item Note that only the \code{df} columns which are \code{numeric}, \code{character}, \code{factor} or +\code{logical} are included in the resulting matrix, because other variable types are not +supported. +\item In addition, \code{df} columns which are constant, all \code{NA}, or \code{character} or \code{factor} +columns with too many levels are also dropped before the analysis. +} +} \examples{ object <- HermesData(summarized_experiment) \%>\% add_quality_flags() \%>\% filter() \%>\% normalize() + +# Obtain the principal components. pca <- calc_pca(object)$x -r2_all <- h_pca_df_r2_matrix(pca, as.data.frame(colData(object))) + +# Obtain the `colData` as a `data.frame`. +df <- as.data.frame(colData(object)) + +# Correlate them. +r2_all <- h_pca_df_r2_matrix(pca, df) +str(r2_all) + +# We can see that only about half of the columns from `df` were +# used for the correlations. +ncol(r2_all) +ncol(df) +} +\seealso{ +\code{\link[=h_pca_var_rsquared]{h_pca_var_rsquared()}} which is used internally to calculate the R2 for one +sample variable. } diff --git a/man/h_pca_var_rsquared.Rd b/man/h_pca_var_rsquared.Rd index e6dce4c4..9394a454 100644 --- a/man/h_pca_var_rsquared.Rd +++ b/man/h_pca_var_rsquared.Rd @@ -29,7 +29,13 @@ object <- HermesData(summarized_experiment) \%>\% add_quality_flags() \%>\% filter() \%>\% normalize() + +# Obtain the principal components. pca <- calc_pca(object)$x + +# Obtain the sample variable. x <- colData(object)$LowDepthFlag + +# Correlate them. r2 <- h_pca_var_rsquared(pca, x) } diff --git a/man/h_rpkm.Rd b/man/h_rpkm.Rd deleted file mode 100644 index 9244439f..00000000 --- a/man/h_rpkm.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/normalization.R -\name{h_rpkm} -\alias{h_rpkm} -\title{Reads per Kilobase per Million (RPKM) Normalization} -\usage{ -h_rpkm(object, control = control_normalize()) -} -\arguments{ -\item{object}{(\code{AnyHermesData})\cr input object.} - -\item{control}{(\code{list})\cr list of settings used to perform the normalization procedure.} -} -\value{ -A numeric matrix with normalized counts using the RPKM method. -} -\description{ -\ifelse{html}{\out{Stable lifecycle}}{\strong{Stable}} - -This helper function calculates the RPKM normalized counts. -} -\note{ -To be used with the \code{normalize()} function. -} -\examples{ -h <- HermesData(summarized_experiment) -cont <- control_normalize(log = FALSE, lib_sizes = rep(1e6L, 20)) -counts_rpkm <- h_rpkm(h, cont) -str(counts_rpkm) -} diff --git a/man/h_tpm.Rd b/man/h_tpm.Rd deleted file mode 100644 index d24aed4f..00000000 --- a/man/h_tpm.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/normalization.R -\name{h_tpm} -\alias{h_tpm} -\title{Transcripts per Million (TPM) Normalization} -\usage{ -h_tpm(object, control = control_normalize()) -} -\arguments{ -\item{object}{(\code{HermesData})\cr input.} - -\item{control}{(\code{list})\cr list of settings used to perform the normalization procedure.} -} -\value{ -A numeric matrix with normalized counts using the TPM method. -} -\description{ -\ifelse{html}{\out{Stable lifecycle}}{\strong{Stable}} - -This helper function calculates the TPM normalized counts. -} -\examples{ -h <- HermesData(summarized_experiment) -cont <- control_normalize() -counts_tpm <- h_tpm(h, cont) -str(counts_tpm) -} diff --git a/man/h_voom.Rd b/man/h_voom.Rd deleted file mode 100644 index 51070dcd..00000000 --- a/man/h_voom.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/normalization.R -\name{h_voom} -\alias{h_voom} -\title{VOOM Normalization} -\usage{ -h_voom(object, control = control_normalize()) -} -\arguments{ -\item{object}{(\code{HermesData})\cr input.} - -\item{control}{(\code{list})\cr list of settings used to perform the normalization procedure.} -} -\value{ -A numeric matrix with normalized counts using the VOOM method. -} -\description{ -\ifelse{html}{\out{Experimental lifecycle}}{\strong{Experimental}} - -This helper function calculates the VOOM normalized counts. -} -\examples{ -h <- HermesData(summarized_experiment) -cont <- control_normalize() -counts_voom <- h_voom(h, cont) -str(counts_voom) -} diff --git a/man/metadata.Rd b/man/metadata.Rd index 324d2c38..7f14da5a 100644 --- a/man/metadata.Rd +++ b/man/metadata.Rd @@ -5,6 +5,8 @@ \title{Metadata Accessor and Setter} \arguments{ \item{x}{(\code{AnyHermesData})\cr object to access the metadata from.} + +\item{value}{(\code{list})\cr the list to replace the current metadata with.} } \value{ The metadata which is a list. diff --git a/man/multi_assay_experiment.Rd b/man/multi_assay_experiment.Rd index 9cb99cc9..cb9fa65f 100644 --- a/man/multi_assay_experiment.Rd +++ b/man/multi_assay_experiment.Rd @@ -5,9 +5,12 @@ \alias{multi_assay_experiment} \title{Example \code{MultiAssayExperiment} Data} \format{ -A \code{\link[MultiAssayExperiment:MultiAssayExperiment]{MultiAssayExperiment::MultiAssayExperiment}} object with 3 separate SE objects. The first SE object -contains 5 samples and covers 1000 features (Entrez gene IDs). The second SE object contains 9 samples with -2500 features. The third SE object contains 6 samples with 1300 features. +A \code{\link[MultiAssayExperiment:MultiAssayExperiment]{MultiAssayExperiment::MultiAssayExperiment}} object with 3 separate SE objects. +\itemize{ +\item The first SE object contains 5 samples and covers 1000 features (Entrez gene IDs). +\item The second SE object contains 9 samples with 2500 features. +\item The third SE object contains 6 samples with 1300 features. +} } \source{ This is an artificial dataset designed to resemble real data. diff --git a/man/normalize.Rd b/man/normalize.Rd index a0fdec6d..bb30e8e1 100644 --- a/man/normalize.Rd +++ b/man/normalize.Rd @@ -3,6 +3,10 @@ \name{normalize,AnyHermesData-method} \alias{normalize,AnyHermesData-method} \alias{normalize} +\alias{h_cpm} +\alias{h_rpkm} +\alias{h_tpm} +\alias{h_voom} \title{Normalization of \code{AnyHermesData} Objects} \usage{ \S4method{normalize}{AnyHermesData}( @@ -11,6 +15,14 @@ control = control_normalize(), ... ) + +h_cpm(object, control = control_normalize()) + +h_rpkm(object, control = control_normalize()) + +h_tpm(object, control = control_normalize()) + +h_voom(object, control = control_normalize()) } \arguments{ \item{object}{(\code{AnyHermesData})\cr object to normalize.} @@ -28,11 +40,11 @@ The \code{control} is saved in the \code{metadata} of the object for future refe \description{ \ifelse{html}{\out{Stable lifecycle}}{\strong{Stable}} -This method is normalizing the input \code{\link{AnyHermesData}} according to one or more +The \code{normalize()} method is normalizing the input \code{\link{AnyHermesData}} according to one or more specified normalization methods. The results are saved as additional assays in the object. -Possible normalization methods are: +Possible normalization methods (which are implemented with separate helper functions): \itemize{ \item \code{cpm}: Counts per Million (CPM). Separately by sample, the original counts of the genes are divided by the library size of this sample, and multiplied by one million. This is the @@ -46,10 +58,22 @@ across samples (which can be seen that the sum of all RPKM values will vary from sample). Therefore here we divide the RPKM by the sum of all RPKM values for each sample, and multiply by one million. \item \code{voom}: VOOM normalization. This is essentially just a slight variation of CPM where -a \code{prior_count} of 0.5 is combined with \code{lib_sizes} increased by 1 for each sample. It is used -as a starting point for the corresponding differential expression analysis. +a \code{prior_count} of 0.5 is combined with \code{lib_sizes} increased by 1 for each sample. Note that +this is not required for the corresponding differential expression analysis, but just provided +as a complementary experimental normalization approach here. } } +\section{Functions}{ +\itemize{ +\item \code{h_cpm}: calculates the Counts per Million (CPM) normalized counts. + +\item \code{h_rpkm}: calculates the Reads per Kilobase per Million (RPKM) normalized counts. + +\item \code{h_tpm}: calculates the Transcripts per Million (TPM) normalized counts. + +\item \code{h_voom}: calculates the VOOM normalized counts. \ifelse{html}{\out{Experimental lifecycle}}{\strong{Experimental}} +}} + \examples{ a <- HermesData(summarized_experiment) @@ -63,4 +87,23 @@ tpm[1:3, 1:3] result_orig <- normalize(a, control = control_normalize(log = FALSE)) tpm_orig <- assay(result_orig, "tpm") tpm_orig[1:3, 1:3] + +# Separate calculation of the CPM normalized counts. +counts_cpm <- h_cpm(a) +str(counts_cpm) + +# Separate calculation of the RPKM normalized counts. +counts_rpkm <- h_rpkm(a) +str(counts_rpkm) + +# Separate calculation of the TPM normalized counts. +counts_tpm <- h_tpm(a) +str(counts_tpm) + +# Separate calculation of the VOOM normalized counts. +counts_voom <- h_voom(a) +str(counts_voom) +} +\seealso{ +\code{\link[=control_normalize]{control_normalize()}} to define the normalization method settings. } diff --git a/man/pca_cor_samplevar.Rd b/man/pca_cor_samplevar.Rd index b71189ef..67c13b69 100644 --- a/man/pca_cor_samplevar.Rd +++ b/man/pca_cor_samplevar.Rd @@ -36,8 +36,10 @@ A \code{\link{HermesDataPcaCor}} object with R2 values for all sample variables. \description{ \ifelse{html}{\out{Stable lifecycle}}{\strong{Stable}} -This method analyses the correlations (in R2 values) between all sample variables -in \link{AnyHermesData} object and the principal components of the samples. +This \code{correlate()} method analyses the correlations (in R2 values) between all sample variables +in a \code{\link{AnyHermesData}} object and the principal components of the samples. + +A corresponding \code{autoplot()} method then can visualize the results in a heatmap. } \section{Functions}{ \itemize{ @@ -50,10 +52,17 @@ object <- HermesData(summarized_experiment) \%>\% add_quality_flags() \%>\% filter() \%>\% normalize() + +# Perform PCA and then correlate the prinicipal components with the sample variables. object_pca <- calc_pca(object) result <- correlate(object_pca, object) + +# Visualize the correlations in a heatmap. autoplot(result) # We can also choose to not reorder the columns. autoplot(result, cluster_columns = FALSE) } +\seealso{ +\code{\link[=h_pca_df_r2_matrix]{h_pca_df_r2_matrix()}} which is used internally for the details. +} diff --git a/man/add_quality_flags.Rd b/man/quality_flags.Rd similarity index 52% rename from man/add_quality_flags.Rd rename to man/quality_flags.Rd index e13a9fcf..927da111 100644 --- a/man/add_quality_flags.Rd +++ b/man/quality_flags.Rd @@ -5,6 +5,9 @@ \alias{h_low_expression_flag} \alias{h_low_depth_flag} \alias{h_tech_failure_flag} +\alias{get_tech_failure} +\alias{get_low_depth} +\alias{get_low_expression} \title{Add Quality Flags} \usage{ add_quality_flags(object, control = control_quality(), overwrite = FALSE) @@ -14,14 +17,20 @@ h_low_expression_flag(object, control = control_quality()) h_low_depth_flag(object, control = control_quality()) h_tech_failure_flag(object, control = control_quality()) + +get_tech_failure(object) + +get_low_depth(object) + +get_low_expression(object) } \arguments{ \item{object}{(\code{AnyHermesData}) \cr input.} -\item{control}{(\code{list}) \cr list of settings used to perform the quality control procedure, -produced by \code{\link[=control_quality]{control_quality()}}.} +\item{control}{(\code{list}) \cr list of settings (thresholds etc.) used to compute the +quality control flags, produced by \code{\link[=control_quality]{control_quality()}}.} -\item{overwrite}{(\code{flag})\cr whether previous results need to be overwritten.} +\item{overwrite}{(\code{flag})\cr whether previously added flags may be overwritten.} } \value{ The input object with added quality flags. @@ -29,21 +38,27 @@ The input object with added quality flags. \description{ \ifelse{html}{\out{Stable lifecycle}}{\strong{Stable}} -This function adds quality flag information to a \link{AnyHermesData} object: +The function \code{add_quality_flags()} adds quality flag information to a \code{\link{AnyHermesData}} object: \itemize{ \item \code{LowExpressionFlag}: for each gene, counts how many samples don't pass a minimum -expression CPM threshold. If too many, then it flags this gene as "low expression" gene. +expression Counts per Million (CPM) threshold. If too many, then it flags this gene +as a "low expression" gene. \item \code{TechnicalFailureFlag}: first calculates the Pearson correlation matrix of the sample wise CPM values, resulting in a matrix measuring the correlation between samples. Then compares the average correlation per sample with a threshold - if it is too low, -then the sample is flagged as "technical failure". -\item \code{LowDepthFlag}: computes the library size (total number of counts) per sample -(removing any \code{NA}s). If this number is too low, the sample is flagged as "low depth". +then the sample is flagged as a "technical failure" sample. +\item \code{LowDepthFlag}: computes the library size (total number of counts) per sample. +If this number is too low, the sample is flagged as a "low depth" sample. } + +Separate helper functions are internally used to create the flags, and +separate getter functions allow easy access to the quality control flags in an object. } \details{ -While \code{object} already has the variables above (as this is enforced by the validation -method for \code{\link{AnyHermesData}}), they are usually still \code{NA} after the initial creation. +While \code{object} already has the variables mentioned above as part of the +\code{rowData} and \code{colData} (as this is enforced by the validation +method for \code{\link{AnyHermesData}}), they are usually still \code{NA} after the initial +object creation. } \section{Functions}{ \itemize{ @@ -55,6 +70,12 @@ given control settings. \item \code{h_tech_failure_flag}: creates the technical failure flag for samples given control settings. + +\item \code{get_tech_failure}: get the technical failure flags for all samples. + +\item \code{get_low_depth}: get the low depth failure flags for all samples. + +\item \code{get_low_expression}: get the low expression failure flags for all genes. }} \examples{ @@ -62,36 +83,37 @@ given control settings. object <- HermesData(summarized_experiment) result <- add_quality_flags(object) which(get_tech_failure(result) != get_tech_failure(object)) -head(rowData(result)$LowExpressionFlag) -head(colData(result)$TechnicalFailureFlag) -head(colData(result)$LowDepthFlag) +head(get_low_expression(result)) +head(get_tech_failure(result)) +head(get_low_depth(result)) # It is possible to overwrite flags if needed, which will trigger a message. result2 <- add_quality_flags(result, control_quality(min_cpm = 1000), overwrite = TRUE) -# Separate calculation of low expression flag. -low_expr_flag <- h_low_expression_flag(object) -head(low_expr_flag) -length(low_expr_flag) == nrow(object) -low_expr_flag2 <- h_low_expression_flag( +# Separate calculation of low expression flag. +low_expr_flag <- h_low_expression_flag( object, control_quality(min_cpm = 500, min_cpm_prop = 0.9) ) -head(low_expr_flag2) -low_depth_flag <- h_low_depth_flag(object) -head(low_depth_flag) +length(low_expr_flag) == nrow(object) +head(low_expr_flag) + +# Separate calculation of low depth flag. +low_depth_flag <- h_low_depth_flag(object, control_quality(min_depth = 5)) length(low_depth_flag) == ncol(object) +head(low_depth_flag) -low_depth_flag2 <- h_low_depth_flag(object, control_quality(min_depth = 5)) -head(low_depth_flag2) -object <- HermesData(summarized_experiment) -tech_failure_flag <- h_tech_failure_flag(object) -head(tech_failure_flag) +# Separate calculation of technical failure flag. +tech_failure_flag <- h_tech_failure_flag(object, control_quality(min_corr = 0.35)) length(tech_failure_flag) == ncol(object) - -tech_failure_flag2 <- h_tech_failure_flag(object, control_quality(min_corr = 0.35)) -head(tech_failure_flag2) +head(tech_failure_flag) +head(get_tech_failure(object)) +head(get_low_depth(object)) +head(get_low_expression(object)) } \seealso{ -\code{\link[=control_quality]{control_quality()}} for the detailed settings specifications. +\itemize{ +\item \code{\link[=control_quality]{control_quality()}} for the detailed settings specifications; +\item \code{\link[=set_tech_failure]{set_tech_failure()}} to manually flag samples as technical failures. +} } diff --git a/man/query.Rd b/man/query.Rd index 3fd9d320..6c23e893 100644 --- a/man/query.Rd +++ b/man/query.Rd @@ -18,14 +18,15 @@ query(genes, connection) A \code{\link[S4Vectors:DataFrame-class]{S4Vectors::DataFrame}} with the gene annotations. It is required that: \itemize{ \item The \code{rownames} are identical to the input \code{genes}. -\item The \code{colnames} are equal to the annotation columns, see \code{\link{annotation}}. -\item Therefore, missing information needs to be properly included with \code{NA} entries. +\item The \code{colnames} are equal to the annotation columns \code{\link{.row_data_annotation_cols}}. +\item Therefore, missing information needs to be properly included in the \code{DataFrame} +with \code{NA} entries. } } \description{ \ifelse{html}{\out{Experimental lifecycle}}{\strong{Experimental}} -This generic function is the interface for querying gene annotations from +The generic function \code{query()} is the interface for querying gene annotations from a data base connection. } \details{ @@ -35,7 +36,7 @@ is extensible: It is simple to add new connections and corresponding query metho for other data bases, e.g. company internal data bases. Please make sure to follow the required format of the returned value. \item The Biomart queries might not return information for all the genes. This can be -due to different versions being used in the gene IDs and the queries Ensembl data base. +due to different versions being used in the gene IDs and the queried Ensembl data base. } } \examples{ diff --git a/man/rbind.Rd b/man/rbind.Rd index 8657ca11..d53f5726 100644 --- a/man/rbind.Rd +++ b/man/rbind.Rd @@ -16,19 +16,23 @@ This method combines \code{\link{AnyHermesData}} objects with the same samples b features of interest (rows in assays). } \note{ -Note that this just inherits +\itemize{ +\item Note that this just inherits \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment::rbind,SummarizedExperiment-method()}}. When binding a \code{\link{AnyHermesData}} object with a \code{\link[SummarizedExperiment:RangedSummarizedExperiment-class]{SummarizedExperiment::SummarizedExperiment}} object, then the result will be a \code{\link[SummarizedExperiment:RangedSummarizedExperiment-class]{SummarizedExperiment::SummarizedExperiment}} object (the more general class). +\item Note that we need to have unique gene IDs (row names) and the same prefix +across the combined object. +} } \examples{ -a <- HermesData(summarized_experiment[1:2542]) -b <- HermesData(summarized_experiment[2543: 5085]) +a <- HermesData(summarized_experiment[1:2542, ]) +b <- HermesData(summarized_experiment[2543:5085, ]) result <- rbind(a, b) class(result) - -result2 <- rbind(summarized_experiment, b) -class(result2) +} +\seealso{ +\code{\link{cbind}} to column bind objects. } diff --git a/man/samples.Rd b/man/samples.Rd index 11143225..1e67bbd6 100644 --- a/man/samples.Rd +++ b/man/samples.Rd @@ -23,3 +23,6 @@ nicely named accessor method. a <- HermesData(summarized_experiment) samples(a) } +\seealso{ +\code{\link[=genes]{genes()}} to access the gene IDs. +} diff --git a/man/subset.Rd b/man/subset.Rd index 5da47c11..b71ff5a6 100644 --- a/man/subset.Rd +++ b/man/subset.Rd @@ -5,6 +5,12 @@ \title{Subsetting \code{AnyHermesData} Objects} \arguments{ \item{x}{(\code{AnyHermesData})\cr object to subset from.} + +\item{subset}{(\code{expression})\cr logical expression based on the \code{rowData} columns to +select genes.} + +\item{select}{(\code{expression})\cr logical expression based on the \code{colData} columns to +select samples.} } \value{ The subsetted \code{\link{AnyHermesData}} object. @@ -22,5 +28,13 @@ Note that this just inherits \examples{ a <- HermesData(summarized_experiment) a + +# Subset both genes and samples. subset(a, subset = LowExpressionFlag, select = DISCSTUD == "N") + +# Subset only genes. +subset(a, subset = Chromosome == "2") + +# Subset only samples. +subset(a, select = AGE > 18) } diff --git a/man/summarized_experiment.Rd b/man/summarized_experiment.Rd index 91292838..1811a9a6 100644 --- a/man/summarized_experiment.Rd +++ b/man/summarized_experiment.Rd @@ -21,6 +21,6 @@ This example \code{\link[SummarizedExperiment:RangedSummarizedExperiment-class]{ \code{\link{HermesData}} object. It already contains the required columns in \code{rowData} and \code{colData}. } \seealso{ -expression_set which contains similar data as \code{\link[Biobase:class.ExpressionSet]{Biobase::ExpressionSet}}. +\code{\link{expression_set}} which contains similar data as a \code{\link[Biobase:class.ExpressionSet]{Biobase::ExpressionSet}}. } \keyword{datasets} diff --git a/man/summary.Rd b/man/summary.Rd index e2504401..40341721 100644 --- a/man/summary.Rd +++ b/man/summary.Rd @@ -44,6 +44,7 @@ object_summary <- summary(object) str(object_summary) slotNames(object_summary) object_summary@lib_sizes + # Just calling the summary method like this will use the `show()` method. summary(object) } diff --git a/man/top_genes.Rd b/man/top_genes.Rd index dfaff1eb..3a0297d0 100644 --- a/man/top_genes.Rd +++ b/man/top_genes.Rd @@ -29,8 +29,8 @@ top_genes( \item{assay_name}{(\code{string})\cr name of the assay to use for the sorting of genes.} -\item{summary_fun}{(\code{function})\cr summary statistics function to apply to the assay resulting in -a numeric vector with one value per gene.} +\item{summary_fun}{(\code{function})\cr summary statistics function to apply across the samples in +the assay resulting in a numeric vector with one value per gene.} \item{n_top}{(\code{count} or \code{NULL})\cr selection criteria based on number of entries.} @@ -49,12 +49,14 @@ A \link{HermesDataTopGenes} object. \description{ \ifelse{html}{\out{Experimental lifecycle}}{\strong{Experimental}} -This creates a \code{\link{HermesDataTopGenes}} object, which extends \code{\link{data.frame}}. It +\code{top_genes()} creates a \code{\link{HermesDataTopGenes}} object, which extends \code{\link{data.frame}}. It contains two columns: \itemize{ \item \code{expression}: containing the statistic values calculated by \code{summary_fun} across columns. \item \code{name}: the gene names. } + +The corresponding \code{autoplot()} method then visualizes the result as a barplot. } \details{ \itemize{ @@ -73,9 +75,18 @@ on the x-axis. \examples{ object <- HermesData(summarized_experiment) + +# Default uses average of raw counts across samples to rank genes. top_genes(object) + +# Instead of showing top 10 genes, can also set a minimum threshold on average counts. top_genes(object, n_top = NULL, min_threshold = 50000) + +# We can also use the maximum of raw counts across samples, by specifying a different +# summary statistics function. result <- top_genes(object, summary_fun = rowMax) + +# Finally we can produce barplots based on the results. autoplot(result, title = "My top genes") autoplot(result, y_lab = "Counts", title = "My top genes") } diff --git a/man/validate.Rd b/man/validate.Rd index 86c91fa9..8c9b06a2 100644 --- a/man/validate.Rd +++ b/man/validate.Rd @@ -55,6 +55,6 @@ required columns. \item \code{validate_names}: validates that the object contains row and column names. \item \code{validate_prefix}: validates that the object prefix is a string -without whitespace, special characters or digits. +and only contains alphabetic characters. }} diff --git a/tests/testthat/test-HermesData-methods.R b/tests/testthat/test-HermesData-methods.R index 9c832507..d76d9ac7 100644 --- a/tests/testthat/test-HermesData-methods.R +++ b/tests/testthat/test-HermesData-methods.R @@ -110,7 +110,7 @@ test_that("annotation setter works as expected", { test_that("annotation setter gives a warning, saves gene IDs in attribute if gene info is missing", { object <- get_se() - h1 <- .HermesData(object) + h1 <- HermesData(object) # Value where information for one gene is completely missing, only partially missing for the other. value <- S4Vectors::DataFrame( StartBP = c(NA, 10),