diff --git a/NAMESPACE b/NAMESPACE index 9ff2b6b..76b1aeb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,9 @@ export(add_metabo) export(add_pheno) +export(create_multiomics_set) +export(create_omics_set) +export(create_omics_set_factory) export(hclust_matrix_rows) export(import_dataset_csv) export(import_dataset_csv_factory) diff --git a/R/create_omics_sets.R b/R/create_omics_sets.R new file mode 100644 index 0000000..e1cd6a3 --- /dev/null +++ b/R/create_omics_sets.R @@ -0,0 +1,387 @@ +#' Create a Biobase set object to store omics data +#' +#' Creates a Biobase object to store an omics dataset and associated samples and +#' features metadata. +#' +#' @param dataset Matrix, the omics dataset in matrix form with features as rows +#' and samples as columns. +#' @param omics_type Character, which type of omics data is being stored? +#' Possible values are `'genomics'`, `'transcriptomics'`, `'metabolomics'` and +#' `'phenomics'`. Use `'phenomics'` for any other omics. +#' @param features_metadata Data.frame, a feature annotation data-frame with +#' features as rows and information about the features as columns. The number +#' of rows and row names must match those of the `dataset` matrix. +#' @param samples_metadata Data.frame, a samples information data-frame with +#' samples as rows and information about the samples as columns. The number of +#' rows and row names must match the number of columns and column names of the +#' `dataset` matrix. +#' @returns An object derived from [Biobase::eSet-class]: +#' * if `omics_type = 'genomics'`: a [Biobase::SnpSet-class] object; +#' * if `omics_type = 'transcriptomics'`: a [Biobase::ExpressionSet-class] +#' object. +#' * if `omics_type = 'metabolomics'`: a [MetabolomeSet-class] object. +#' * if `omics_type = 'phenomics'` a [PhenotypeSet-class] object. +#' @examples +#' \dontrun{ +#' data_geno <- import_dataset_csv( +#' "genotype_dataset.csv", +#' col_id = "Marker", +#' features_as_rows = TRUE +#' ) +#' geno_info_features <- import_fmetadata_csv( +#' "genotype_features_info.csv", +#' col_id = "Marker" +#' ) +#' samples_information <- import_smetadata_csv( +#' "samples_information.csv", +#' col_id = "Sample" +#' ) +#' create_omics_set( +#' dataset = data_geno, +#' omics_type = "genomics", +#' features_metadata = geno_info_features, +#' samples_metadata = samples_information +#' ) +#' } +#' @export +create_omics_set <- function(dataset, + omics_type = c("phenomics", + "genomics", + "transcriptomics", + "metabolomics"), + features_metadata = NULL, + samples_metadata = NULL) { + + omics_types <- rlang::arg_match(omics_types) + + res <- switch(omics_type, + "genomics" = new("SnpSet", call = dataset), + "transcriptomics" = Biobase::ExpressionSet(assayData = dataset), + "metabolomics" = new("MetabolomeSet", call = dataset), + "phenomics" = new("PhenotypeSet", call = dataset) + ) + + if (!is.null(features_metadata)) { + + missing_features <- setdiff(rownames(dataset), rownames(features_metadata)) + if (length(missing_features) > 0) { + warning( + length(missing_features), + " features are not present in feature metadata." + ) + ## adding the missing features as NAs in the metadata data-frame + features_metadata[missing_features, ] <- NA + features_metadata[missing_features, "feature_id"] <- missing_features + } + + irrelevant_features <- setdiff( + rownames(features_metadata), + rownames(dataset) + ) + if (length(irrelevant_features) > 0) { + warning( + length(irrelevant_features), " features", + " in feature metadata not in dataset, will be removed from metadata." + ) + } + + features_metadata <- features_metadata[rownames(dataset), ] + feature_data <- Biobase::AnnotatedDataFrame(features_metadata) + + Biobase::featureData(res) <- feature_data + } else { + ## we need to have in the featureData a column with feature ID to be able + ## to perform subsetting later on + temp <- data.frame("feature_id" = rownames(dataset)) + rownames(temp) <- rownames(dataset) + + Biobase::featureData(res) <- Biobase::AnnotatedDataFrame(temp) + } + + if (!is.null(samples_metadata)) { + + missing_samples <- setdiff(colnames(dataset), rownames(samples_metadata)) + if (length(missing_samples)) { + warning( + length(missing_samples), + " samples are not present in samples metadata." + ) + ## adding the missing samples as NAs in the metadata data-frame + samples_metadata[missing_samples, ] <- NA + samples_metadata[missing_samples, "id"] <- missing_samples + } + + irrelevant_samples <- setdiff( + rownames(samples_metadata), + colnames(dataset) + ) + if (length(irrelevant_samples)) { + warning( + length(irrelevant_samples), " samples", + " in samples metadata not in dataset, will be removed from metadata." + ) + } + + samples_metadata <- samples_metadata[colnames(dataset), ] + samples_data <- Biobase::AnnotatedDataFrame(samples_metadata) + + Biobase::phenoData(res) <- samples_data + } + + return(res) +} + +#' Target factory for omics sets creation +#' +#' Creates a list of targets that generate omics sets from targets containing +#' datasets, features and samples metadata. +#' +#' @param datasets Vector of symbols, the names of the targets containing the +#' omics datasets. +#' @param omics_types Character vector, which type of omics data is being stored +#' for each dataset? Possible values are `'genomics'`, `'transcriptomics'`, +#' `'metabolomics'` and `'phenomics'`. Use `'phenomics'` for any other omics. +#' Use `'phenomics'` for any other omics. +#' @param features_metadatas Vector of symbols, the names of the targets +#' containing the features metadata data-frame associated with each omics +#' dataset. Use `NULL` if no feature metadata exists for a dataset. +#' @param samples_metadatas Vector of symbols, the names of the targets +#' containing the samples metadata data-frame associated with each omics +#' dataset. Use `NULL` if no samples metadata exists for a dataset. +#' @param target_name_suffixes Character vector, a suffix to add to the name of +#' the targets created by this target factory for each dataset. If none +#' provided, the suffixes will be extracted from the `datasets` argument. +#' Default value is NULL. +#' @return A list of target objects, with three datasets provided, and +#' `target_name_suffixes = c("geno", "transcripto", "metabo")`, the following +#' targets will be returned: `set_geno`, `set_transcripto` and `set_metabo`. +#' @examples +#' \dontrun{ +#' ## in the _targets.R +#' library(moiraine) +#' library(targets) +#' +#' list( +#' ## targets to import the different datasets +#' +#' ## Example where genomics dataset has no features metadata information +#' ## Will generate the following targets: set_geno, set_transcripto +#' create_omics_set_factory( +#' datasets = c(data_geno, data_transcripto), +#' omics_types = c("genomics", "transcriptomics"), +#' features_metadata = c(NULL, fmeta_transcripto), +#' samples_metadata = c(smeta_geno, smeta_transcripto) +#' ) +#' ) +#' } +#' @export +create_omics_set_factory <- function(datasets, + omics_types, + features_metadatas = NULL, + samples_metadatas = NULL, + target_name_suffixes = NULL) { + n_datasets <- length(.input2symVect(rlang::enquo(datasets))) + n_fmeta <- length(.input2symVect(rlang::enquo(features_metadatas))) + n_smeta <- length(.input2symVect(rlang::enquo(samples_metadatas))) + + if (length(omics_types) != n_datasets) { + stop("'datasets' and 'omics_types' vectors must have the same length.") + } + + if (length(target_name_suffixes) != n_datasets && + !is.null(target_name_suffixes)) { + stop("'datasets' and 'target_name_suffixes' vectors must have the same length.") + } + + if (n_fmeta != n_datasets && !is.null(substitute(features_metadatas))) { + stop("'datasets' and 'features_metadatas' vectors must have the same length.") + } + + if (n_smeta != n_datasets && !is.null(substitute(samples_metadatas))) { + stop("'datasets' and 'samples_metadatas' vectors must have the same length.") + } + + values <- tibble::tibble( + ds = .input2symVect(rlang::enquo(datasets)), + ot = omics_types, + fmet = .input2symVect(rlang::enquo(features_metadatas)), + smet = .input2symVect(rlang::enquo(samples_metadatas)) + ) + + if (is.null(target_name_suffixes)) { + target_name_suffixes <- stringr::str_remove( + .symbolVect2charVect(rlang::enquo(datasets)), + "data_" + ) + } + + values$target_name_suffix <- target_name_suffixes + + targets <- tarchetypes::tar_map( + values = values, + names = "target_name_suffix", + targets::tar_target_raw( + "set", + substitute( + create_omics_set( + dataset = ds, + omics_type = ot, + features_metadata = fmet, + samples_metadata = smet + ) + ) + ) + ) + + return(targets) +} + +#' Adds an omics set to a MultiDataSet object +#' +#' Adds a omics set to an existing MultiDataSet object. +#' +#' @param mo_data A [MultiDataSet::MultiDataSet-class] object. +#' @param omics_set A [Biobase::eSet-class] object, created via +#' [create_omics_set()]. Currently accepted objects: [Biobase::SnpSet-class], +#' [Biobase::ExpressionSet-class], [MetabolomeSet-class], +#' [PhenotypeSet-class]. +#' @param ds_name Character, name of the dataset (will be used as suffix for the +#' name of the dataset in the resulting MultiDataSet object). +#' @param ... Further arguments passed to `[MultiDataSet::add_snps()], +#' [MultiDataSet::add_rnaseq()], [add_metabo()] or[add_pheno()] (depending on +#' `omics_set` class). +#' @returns A [MultiDataSet::MultiDataSet-class] object, the `mo_data` with +#' `omics_set` as an additional dataset. +#' @examples +#' \dontrun{ +#' add_omics_set(mo_data, omics_set, "exp1") +#' } +#' @export +add_omics_set <- function(mo_data, omics_set, ds_name, ...) { + type <- class(omics_set)[[1]] + fct <- switch(type, + "SnpSet" = MultiDataSet::add_snps, + "ExpressionSet" = MultiDataSet::add_rnaseq, + "MetabolomeSet" = add_metabo, + "PhenotypeSet" = add_pheno + ) + fct(mo_data, omics_set, dataset.name = ds_name, ...) +} + +##### STOPPED HERE + +#' Create a MultiDataSet object to store multi-omics data +#' +#' Creates a MultiDataSet object from a list of Biobase Set objects to store the different omics sets. +#' +#' @param sets_list List of Biobase Set objects, created via \code{\link{create_omics_set}}. Currently accepted ojects: \code{SnpSet}, +#' \code{ExpressionSet}, \code{MetabolomeSet}, \code{PhenotypeSet}. +#' @param datasets_names Optional, vector of character, name for each Set object. Will be appended to the data type in the resulting +#' object. If the `sets_list` list contains several objects of the same data type (e.g. several SnpSets), their names must be unique. +#' If "" is provided, no name will be appended to the data type for the corresponding dataset. +#' @param show_warnings Logical, should warnings be displayed when adding a set to the MultiDataSet object? Default value is `TRUE`. +#' @return a \code{\link[MultiDataSet]{MultiDataSet-class}}. +#' @export +create_multiomics_set <- function(sets_list, datasets_names = NULL, show_warnings = TRUE) { + # if(is.null(names(sets_list))) stop("The sets_list list must be named.") + # if(length(setdiff(names(sets_list), c("genomics", "transcriptomics", "metabolomics", "phenomics")))) stop("Names of the sets_list list must be one of 'genomics', 'transcriptomics', 'metabolomics', 'phenomics'.") + + if (!length(sets_list)) stop("sets_list list is empty.") + + cl <- sapply(sets_list, class) + if (length(setdiff(cl, c("SnpSet", "ExpressionSet", "MetabolomeSet", "PhenotypeSet")))) stop("Elements in sets_list must be SnpSet, ExpressionSet, MetabolomeSet or PhenotypeSet objects.") + + if (!is.null(datasets_names)) { + if (length(datasets_names) != length(sets_list)) stop("dataset_names vector must have same length as sets_list list.") + + if (any(duplicated(paste0(cl, datasets_names)))) stop("Dataset names for objects of a same type must be unique.") + } else { + ## if several sets_list elements have the same data type, must provide unique datasets_names + datasets_names <- .make_unique_ids(cl) + } + + res <- MultiDataSet::createMultiDataSet() + + for (i in 1:length(sets_list)) { + x <- sets_list[i] + ds_name <- datasets_names[i] + if (ds_name == "") ds_name <- NULL + + res <- .add_omics_set(res, sets_list[[i]], cl[i], ds_name, warnings = show_warnings) + } + + ## Check that the samples metadata is consistent across the datasets + temp <- get_samples_metadata(res) |> + purrr::reduce( + function(.x, .y) { + cc <- intersect(colnames(.x), colnames(.y)) + # .x <- dplyr::select(.x, tidyselect::all_of(cc)) + # .y <- dplyr::select(.y, tidyselect::all_of(cc)) + dplyr::full_join(.x, .y, by = cc) + } + ) + + duplicated_samples <- duplicated(temp$id) + if (any(duplicated_samples)) { + stop( + "Conflicting information in samples metadata for samples ", + paste0(temp$id[duplicated_samples], collapse = ", "), + "." + ) + } + + return(res) +} + + +.symbolVect2charVect <- function(x) { + string <- rlang::quo_text(x, nlines = Inf) + unlist(strsplit(gsub("(c\\(|\\)|\\s)", "", string), ",")) +} + +.input2symVect <- function(x) { + string <- rlang::quo_text(x, nlines = Inf) + + if (string == "NULL") { + return(str2expression("NULL")) + } + + string <- unlist(strsplit(gsub("(c\\(|\\)|\\s)", "", string), ",")) + # res <- rlang::syms(string) + res <- str2expression(string) + + return(res) +} + +#' Generate suffixes to make values unique. +#' +#' For a given vector of values, returns a vector of the same size containing +#' suffixes to make each value in the input vector unique. +#' +#' @param x Character vector of IDs. +#' @returns Character vector of the same size as `x`, containing at each +#' position: +#' * `''` if the corresponding value in `x` is unique; +#' * A number as character (e.g. `'1'`, `'2`, etc) if the corresponding value +#' in `x` is not unique, such that by adding this value to the values in `x`, +#' it becomes unique. +#' @examples +#' \dontrun{ +#' .make_unique_ids(1:5) +#' #> "" "" "" "" "" +#' +#' .make_unique_ids(c(1:5, 1)) +#' #> "1" "" "" "" "" "2" +#' } +#' +#' @noRd +.make_unique_ids <- function(x) { + are_duplicates <- unique(x[duplicated(x)]) + res <- character(length(x)) + for (v in are_duplicates) { + indx <- x == v + res[indx] <- seq_len(sum(indx)) + } + + return(res) +} diff --git a/R/data_import.R b/R/data_import.R index bf02e77..22c037a 100644 --- a/R/data_import.R +++ b/R/data_import.R @@ -306,7 +306,7 @@ import_smetadata_csv <- function(file, col_id, ...) { #' @examples #' \dontrun{ #' ## in the _targets.R -#' library(MOITestPipeline) +#' library(moiraine) #' #' list( #' import_dataset_csv_factory( @@ -383,7 +383,7 @@ import_dataset_csv_factory <- function(files, #' @examples #' \dontrun{ #' ## in the _targets.R -#' library(MOITestPipeline) +#' library(moiraine) #' #' list( #' import_fmetadata_csv_factory( @@ -460,7 +460,7 @@ import_fmetadata_csv_factory <- function(files, #' @examples #' \dontrun{ #' ## in the _targets.R -#' library(MOITestPipeline) +#' library(moiraine) #' #' list( #' import_fmetadata_gff_factory( @@ -542,7 +542,7 @@ import_fmetadata_gff_factory <- function(files, #' @examples #' \dontrun{ #' ## in the _targets.R -#' library(MOITestPipeline) +#' library(moiraine) #' #' list( #' import_smetadata_csv_factory( diff --git a/man/create_multiomics_set.Rd b/man/create_multiomics_set.Rd new file mode 100644 index 0000000..4e75775 --- /dev/null +++ b/man/create_multiomics_set.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_omics_sets.R +\name{create_multiomics_set} +\alias{create_multiomics_set} +\title{Create a MultiDataSet object to store multi-omics data} +\usage{ +create_multiomics_set(sets_list, datasets_names = NULL, show_warnings = TRUE) +} +\arguments{ +\item{sets_list}{List of Biobase Set objects, created via \code{\link{create_omics_set}}. Currently accepted ojects: \code{SnpSet}, +\code{ExpressionSet}, \code{MetabolomeSet}, \code{PhenotypeSet}.} + +\item{datasets_names}{Optional, vector of character, name for each Set object. Will be appended to the data type in the resulting +object. If the \code{sets_list} list contains several objects of the same data type (e.g. several SnpSets), their names must be unique. +If "" is provided, no name will be appended to the data type for the corresponding dataset.} + +\item{show_warnings}{Logical, should warnings be displayed when adding a set to the MultiDataSet object? Default value is \code{TRUE}.} +} +\value{ +a \code{\link[MultiDataSet]{MultiDataSet-class}}. +} +\description{ +Creates a MultiDataSet object from a list of Biobase Set objects to store the different omics sets. +} diff --git a/man/create_omics_set.Rd b/man/create_omics_set.Rd new file mode 100644 index 0000000..ef1e5ba --- /dev/null +++ b/man/create_omics_set.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_omics_sets.R +\name{create_omics_set} +\alias{create_omics_set} +\title{Create a Biobase set object to store omics data} +\usage{ +create_omics_set( + dataset, + omics_type, + features_metadata = NULL, + samples_metadata = NULL +) +} +\arguments{ +\item{dataset}{Matrix, the omics dataset in matrix form with features as rows and samples as columns.} + +\item{omics_type}{Character, which type of omics data is being stored? Possible values are \code{'genomics'}, \code{'transcriptomics'}, +\code{'metabolomics'} and \code{'phenomics'}.} + +\item{features_metadata}{Data.frame, a feature annotation dataframe with features as rows and information about the features as columns. +The number of rows and rownames must match those of the \code{dataset} matrix.} + +\item{samples_metadata}{Data.frame, a samples information dataframe with samples as rows and information about the samples as columns. +The number of rows and rownames must match the number of columns and colnames of the \code{dataset} matrix.} +} +\value{ +\itemize{ +\item if \code{omics_type = 'genomics'}: a \code{\link[Biobase]{SnpSet}} object. +\item if \code{omics_type = 'transcriptomics'}: an \code{\link[Biobase]{ExpressionSet}} object. +\item if \code{omics_type = 'metabolomics'}: a \code{\link{MetabolomeSet}} object. +\item if \code{omics_type = 'phenomics'}: a \code{\link{PhenotypeSet}} object. +} +} +\description{ +Creates a Biobase object to store an omics dataset and associated samples and features information. +} +\examples{ +\dontrun{ +data_geno <- import_dataset_csv("genotype_dataset.csv", + col_id = "Marker", + features_as_rows = TRUE +) +geno_info_features <- import_fmetadata_csv("genotype_features_info.csv", + col_id = "Marker", + features_as_rows = TRUE +) +samples_information <- import_smetadata_csv("samples_information.csv", + col_id = "Sample" +) +create_omics_set( + dataset = data_geno, + features_metadata = geno_info_features, + samples_metadata = samples_information, + omics_type = "genomics" +) +} +} diff --git a/man/create_omics_set_factory.Rd b/man/create_omics_set_factory.Rd new file mode 100644 index 0000000..868ed9d --- /dev/null +++ b/man/create_omics_set_factory.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_omics_sets.R +\name{create_omics_set_factory} +\alias{create_omics_set_factory} +\title{Target factory for omics set creation} +\usage{ +create_omics_set_factory( + datasets, + omics_types, + features_metadatas = NULL, + samples_metadatas = NULL, + target_name_suffixes = NULL +) +} +\arguments{ +\item{datasets}{Vector of symbols, the names of the targets containing the omics datasets.} + +\item{omics_types}{Character vector, which type of omics data is being stored for each dataset? +Possible values are 'genomics', 'transcriptomics', 'metabolomics' and 'phenomics'.} + +\item{features_metadatas}{Vector of symbols, the names of the targets containing the features metadata +data-frame associated with each omics dataset. Use \code{NULL} if no feature metadata exists for a dataset.} + +\item{samples_metadatas}{Vector of symbols, the names of the targets containing the samples metadata +data-frame associated with each omics dataset. Use \code{NULL} if no samples metadata exists for a dataset.} + +\item{target_name_suffixes}{Vector of symbols, a suffix to add to the name of the targets created by this +target factory for each dataset. If none provided, the suffixes will be extracted from the \code{datasets} argument. +Default value is NULL.} +} +\value{ +A list of target objects, with three datasets provided, and +\code{target_name_suffixes = c("geno", "transcripto", "metabo")}, the following targets will be returned: \code{set_geno}, +\code{set_transcripto} and \code{set_metabo}. +} +\description{ +Creates a list of targets that generate omics sets from targets containing +datasets, features and samples metadata. +} +\examples{ +\dontrun{ +## in the _targets.R +library(MOITestPipeline) + +list( + ## targets to import the different datasets + + ## Example where genomics dataset has no features metadata information + create_omics_set_factory( + datasets = c(data_geno, data_transcripto), + omics_types = c("genomics", "transcriptomics"), + features_metadata = c(NULL, fmeta_transcripto), + samples_metadata = c(smeta_geno, smeta_transcripto) + ) +) +} +}