Working on create_omics_sets (not over)

Plant-Food-Research-Open · Jan 11, 2024 · 8b2f0c9 · 8b2f0c9
1 parent acb7e08
commit 8b2f0c9
Show file tree

Hide file tree

Showing 6 changed files with 532 additions and 4 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,9 @@
 
 export(add_metabo)
 export(add_pheno)
+export(create_multiomics_set)
+export(create_omics_set)
+export(create_omics_set_factory)
 export(hclust_matrix_rows)
 export(import_dataset_csv)
 export(import_dataset_csv_factory)

diff --git a/R/create_omics_sets.R b/R/create_omics_sets.R
@@ -0,0 +1,387 @@
+#' Create a Biobase set object to store omics data
+#'
+#' Creates a Biobase object to store an omics dataset and associated samples and
+#' features metadata.
+#'
+#' @param dataset Matrix, the omics dataset in matrix form with features as rows
+#'   and samples as columns.
+#' @param omics_type Character, which type of omics data is being stored?
+#'   Possible values are `'genomics'`, `'transcriptomics'`, `'metabolomics'` and
+#'   `'phenomics'`. Use `'phenomics'` for any other omics.
+#' @param features_metadata Data.frame, a feature annotation data-frame with
+#'   features as rows and information about the features as columns. The number
+#'   of rows and row names must match those of the `dataset` matrix.
+#' @param samples_metadata Data.frame, a samples information data-frame with
+#'   samples as rows and information about the samples as columns. The number of
+#'   rows and row names must match the number of columns and column names of the
+#'   `dataset` matrix.
+#' @returns An object derived from [Biobase::eSet-class]:
+#' * if `omics_type = 'genomics'`: a [Biobase::SnpSet-class] object;
+#' * if `omics_type = 'transcriptomics'`: a [Biobase::ExpressionSet-class]
+#'   object.
+#' * if `omics_type = 'metabolomics'`: a [MetabolomeSet-class] object.
+#' * if `omics_type = 'phenomics'` a [PhenotypeSet-class] object.
+#' @examples
+#' \dontrun{
+#' data_geno <- import_dataset_csv(
+#' "genotype_dataset.csv",
+#'   col_id = "Marker",
+#'   features_as_rows = TRUE
+#' )
+#' geno_info_features <- import_fmetadata_csv(
+#' "genotype_features_info.csv",
+#'   col_id = "Marker"
+#' )
+#' samples_information <- import_smetadata_csv(
+#' "samples_information.csv",
+#'   col_id = "Sample"
+#' )
+#' create_omics_set(
+#'   dataset = data_geno,
+#'   omics_type = "genomics",
+#'   features_metadata = geno_info_features,
+#'   samples_metadata = samples_information
+#' )
+#' }
+#' @export
+create_omics_set <- function(dataset,
+                             omics_type = c("phenomics",
+                                            "genomics",
+                                            "transcriptomics",
+                                            "metabolomics"),
+                             features_metadata = NULL,
+                             samples_metadata = NULL) {
+
+  omics_types <- rlang::arg_match(omics_types)
+
+  res <- switch(omics_type,
+                "genomics" = new("SnpSet", call = dataset),
+                "transcriptomics" = Biobase::ExpressionSet(assayData = dataset),
+                "metabolomics" = new("MetabolomeSet", call = dataset),
+                "phenomics" = new("PhenotypeSet", call = dataset)
+  )
+
+  if (!is.null(features_metadata)) {
+
+    missing_features <- setdiff(rownames(dataset), rownames(features_metadata))
+    if (length(missing_features) > 0) {
+      warning(
+        length(missing_features),
+        " features are not present in feature metadata."
+      )
+      ## adding the missing features as NAs in the metadata data-frame
+      features_metadata[missing_features, ] <- NA
+      features_metadata[missing_features, "feature_id"] <- missing_features
+    }
+
+    irrelevant_features <- setdiff(
+      rownames(features_metadata),
+      rownames(dataset)
+    )
+    if (length(irrelevant_features) > 0) {
+      warning(
+        length(irrelevant_features), " features",
+        " in feature metadata not in dataset, will be removed from metadata."
+      )
+    }
+
+    features_metadata <- features_metadata[rownames(dataset), ]
+    feature_data <- Biobase::AnnotatedDataFrame(features_metadata)
+
+    Biobase::featureData(res) <- feature_data
+  } else {
+    ## we need to have in the featureData a column with feature ID to be able
+    ## to perform subsetting later on
+    temp <- data.frame("feature_id" = rownames(dataset))
+    rownames(temp) <- rownames(dataset)
+
+    Biobase::featureData(res) <- Biobase::AnnotatedDataFrame(temp)
+  }
+
+  if (!is.null(samples_metadata)) {
+
+    missing_samples <- setdiff(colnames(dataset), rownames(samples_metadata))
+    if (length(missing_samples)) {
+      warning(
+        length(missing_samples),
+        " samples are not present in samples metadata."
+      )
+      ## adding the missing samples as NAs in the metadata data-frame
+      samples_metadata[missing_samples, ] <- NA
+      samples_metadata[missing_samples, "id"] <- missing_samples
+    }
+
+    irrelevant_samples <- setdiff(
+      rownames(samples_metadata),
+      colnames(dataset)
+    )
+    if (length(irrelevant_samples)) {
+      warning(
+        length(irrelevant_samples), " samples",
+        " in samples metadata not in dataset, will be removed from metadata."
+      )
+    }
+
+    samples_metadata <- samples_metadata[colnames(dataset), ]
+    samples_data <- Biobase::AnnotatedDataFrame(samples_metadata)
+
+    Biobase::phenoData(res) <- samples_data
+  }
+
+  return(res)
+}
+
+#' Target factory for omics sets creation
+#'
+#' Creates a list of targets that generate omics sets from targets containing
+#' datasets, features and samples metadata.
+#'
+#' @param datasets Vector of symbols, the names of the targets containing the
+#'   omics datasets.
+#' @param omics_types Character vector, which type of omics data is being stored
+#'   for each dataset? Possible values are `'genomics'`, `'transcriptomics'`,
+#'   `'metabolomics'` and `'phenomics'`.  Use `'phenomics'` for any other omics.
+#'   Use `'phenomics'` for any other omics.
+#' @param features_metadatas Vector of symbols, the names of the targets
+#'   containing the features metadata data-frame associated with each omics
+#'   dataset. Use `NULL` if no feature metadata exists for a dataset.
+#' @param samples_metadatas Vector of symbols, the names of the targets
+#'   containing the samples metadata data-frame associated with each omics
+#'   dataset. Use `NULL` if no samples metadata exists for a dataset.
+#' @param target_name_suffixes Character vector, a suffix to add to the name of
+#'   the targets created by this target factory for each dataset. If none
+#'   provided, the suffixes will be extracted from the `datasets` argument.
+#'   Default value is NULL.
+#' @return A list of target objects, with three datasets provided, and
+#'   `target_name_suffixes = c("geno", "transcripto", "metabo")`, the following
+#'   targets will be returned: `set_geno`, `set_transcripto` and `set_metabo`.
+#' @examples
+#' \dontrun{
+#' ## in the _targets.R
+#' library(moiraine)
+#' library(targets)
+#'
+#' list(
+#'   ## targets to import the different datasets
+#'
+#'   ## Example where genomics dataset has no features metadata information
+#'   ## Will generate the following targets: set_geno, set_transcripto
+#'   create_omics_set_factory(
+#'     datasets = c(data_geno, data_transcripto),
+#'     omics_types = c("genomics", "transcriptomics"),
+#'     features_metadata = c(NULL, fmeta_transcripto),
+#'     samples_metadata = c(smeta_geno, smeta_transcripto)
+#'   )
+#' )
+#' }
+#' @export
+create_omics_set_factory <- function(datasets,
+                                     omics_types,
+                                     features_metadatas = NULL,
+                                     samples_metadatas = NULL,
+                                     target_name_suffixes = NULL) {
+  n_datasets <- length(.input2symVect(rlang::enquo(datasets)))
+  n_fmeta <- length(.input2symVect(rlang::enquo(features_metadatas)))
+  n_smeta <- length(.input2symVect(rlang::enquo(samples_metadatas)))
+
+  if (length(omics_types) != n_datasets) {
+    stop("'datasets' and 'omics_types' vectors must have the same length.")
+  }
+
+  if (length(target_name_suffixes) != n_datasets &&
+      !is.null(target_name_suffixes)) {
+    stop("'datasets' and 'target_name_suffixes' vectors must have the same length.")
+  }
+
+  if (n_fmeta != n_datasets && !is.null(substitute(features_metadatas))) {
+    stop("'datasets' and 'features_metadatas' vectors must have the same length.")
+  }
+
+  if (n_smeta != n_datasets && !is.null(substitute(samples_metadatas))) {
+    stop("'datasets' and 'samples_metadatas' vectors must have the same length.")
+  }
+
+  values <- tibble::tibble(
+    ds = .input2symVect(rlang::enquo(datasets)),
+    ot = omics_types,
+    fmet = .input2symVect(rlang::enquo(features_metadatas)),
+    smet = .input2symVect(rlang::enquo(samples_metadatas))
+  )
+
+  if (is.null(target_name_suffixes)) {
+    target_name_suffixes <- stringr::str_remove(
+      .symbolVect2charVect(rlang::enquo(datasets)),
+      "data_"
+    )
+  }
+
+  values$target_name_suffix <- target_name_suffixes
+
+  targets <- tarchetypes::tar_map(
+    values = values,
+    names = "target_name_suffix",
+    targets::tar_target_raw(
+      "set",
+      substitute(
+        create_omics_set(
+          dataset = ds,
+          omics_type = ot,
+          features_metadata = fmet,
+          samples_metadata = smet
+        )
+      )
+    )
+  )
+
+  return(targets)
+}
+
+#' Adds an omics set to a MultiDataSet object
+#'
+#' Adds a omics set to an existing MultiDataSet object.
+#'
+#' @param mo_data A [MultiDataSet::MultiDataSet-class] object.
+#' @param omics_set A [Biobase::eSet-class] object, created via
+#'   [create_omics_set()]. Currently accepted objects: [Biobase::SnpSet-class],
+#'   [Biobase::ExpressionSet-class], [MetabolomeSet-class],
+#'   [PhenotypeSet-class].
+#' @param ds_name Character, name of the dataset (will be used as suffix for the
+#'   name of the dataset in the resulting MultiDataSet object).
+#' @param ... Further arguments passed to `[MultiDataSet::add_snps()],
+#'   [MultiDataSet::add_rnaseq()], [add_metabo()] or[add_pheno()] (depending on
+#'   `omics_set` class).
+#' @returns A [MultiDataSet::MultiDataSet-class] object, the `mo_data` with
+#'   `omics_set` as an additional dataset.
+#' @examples
+#' \dontrun{
+#' add_omics_set(mo_data, omics_set, "exp1")
+#' }
+#' @export
+add_omics_set <- function(mo_data, omics_set, ds_name, ...) {
+  type <- class(omics_set)[[1]]
+  fct <- switch(type,
+         "SnpSet" = MultiDataSet::add_snps,
+         "ExpressionSet" = MultiDataSet::add_rnaseq,
+         "MetabolomeSet" = add_metabo,
+         "PhenotypeSet" = add_pheno
+  )
+  fct(mo_data, omics_set, dataset.name = ds_name, ...)
+}
+
+##### STOPPED HERE
+
+#' Create a MultiDataSet object to store multi-omics data
+#'
+#' Creates a MultiDataSet object from a list of Biobase Set objects to store the different omics sets.
+#'
+#' @param sets_list List of Biobase Set objects, created via \code{\link{create_omics_set}}. Currently accepted ojects: \code{SnpSet},
+#' \code{ExpressionSet}, \code{MetabolomeSet}, \code{PhenotypeSet}.
+#' @param datasets_names Optional, vector of character, name for each Set object. Will be appended to the data type in the resulting
+#' object. If the `sets_list` list contains several objects of the same data type (e.g. several SnpSets), their names must be unique.
+#' If "" is provided, no name will be appended to the data type for the corresponding dataset.
+#' @param show_warnings Logical, should warnings be displayed when adding a set to the MultiDataSet object? Default value is `TRUE`.
+#' @return a \code{\link[MultiDataSet]{MultiDataSet-class}}.
+#' @export
+create_multiomics_set <- function(sets_list, datasets_names = NULL, show_warnings = TRUE) {
+  # if(is.null(names(sets_list))) stop("The sets_list list must be named.")
+  # if(length(setdiff(names(sets_list), c("genomics", "transcriptomics", "metabolomics", "phenomics")))) stop("Names of the sets_list list must be one of 'genomics', 'transcriptomics', 'metabolomics', 'phenomics'.")
+
+  if (!length(sets_list)) stop("sets_list list is empty.")
+
+  cl <- sapply(sets_list, class)
+  if (length(setdiff(cl, c("SnpSet", "ExpressionSet", "MetabolomeSet", "PhenotypeSet")))) stop("Elements in sets_list must be SnpSet, ExpressionSet, MetabolomeSet or PhenotypeSet objects.")
+
+  if (!is.null(datasets_names)) {
+    if (length(datasets_names) != length(sets_list)) stop("dataset_names vector must have same length as sets_list list.")
+
+    if (any(duplicated(paste0(cl, datasets_names)))) stop("Dataset names for objects of a same type must be unique.")
+  } else {
+    ## if several sets_list elements have the same data type, must provide unique datasets_names
+    datasets_names <- .make_unique_ids(cl)
+  }
+
+  res <- MultiDataSet::createMultiDataSet()
+
+  for (i in 1:length(sets_list)) {
+    x <- sets_list[i]
+    ds_name <- datasets_names[i]
+    if (ds_name == "") ds_name <- NULL
+
+    res <- .add_omics_set(res, sets_list[[i]], cl[i], ds_name, warnings = show_warnings)
+  }
+
+  ## Check that the samples metadata is consistent across the datasets
+  temp <- get_samples_metadata(res) |>
+    purrr::reduce(
+      function(.x, .y) {
+        cc <- intersect(colnames(.x), colnames(.y))
+        # .x <- dplyr::select(.x, tidyselect::all_of(cc))
+        # .y <- dplyr::select(.y, tidyselect::all_of(cc))
+        dplyr::full_join(.x, .y, by = cc)
+      }
+    )
+
+  duplicated_samples <- duplicated(temp$id)
+  if (any(duplicated_samples)) {
+    stop(
+      "Conflicting information in samples metadata for samples ",
+      paste0(temp$id[duplicated_samples], collapse = ", "),
+      "."
+    )
+  }
+
+  return(res)
+}
+
+
+.symbolVect2charVect <- function(x) {
+  string <- rlang::quo_text(x, nlines = Inf)
+  unlist(strsplit(gsub("(c\\(|\\)|\\s)", "", string), ","))
+}
+
+.input2symVect <- function(x) {
+  string <- rlang::quo_text(x, nlines = Inf)
+
+  if (string == "NULL") {
+    return(str2expression("NULL"))
+  }
+
+  string <- unlist(strsplit(gsub("(c\\(|\\)|\\s)", "", string), ","))
+  # res <- rlang::syms(string)
+  res <- str2expression(string)
+
+  return(res)
+}
+
+#' Generate suffixes to make values unique.
+#'
+#' For a given vector of values, returns a vector of the same size containing
+#' suffixes to make each value in the input vector unique.
+#'
+#' @param x Character vector of IDs.
+#' @returns Character vector of the same size as `x`, containing at each
+#'   position:
+#' * `''` if the corresponding value in `x` is unique;
+#' * A number as character (e.g. `'1'`, `'2`, etc) if the corresponding value
+#'   in `x` is not unique, such that by adding this value to the values in `x`,
+#'   it becomes unique.
+#' @examples
+#' \dontrun{
+#' .make_unique_ids(1:5)
+#' #> "" "" "" "" ""
+#'
+#' .make_unique_ids(c(1:5, 1))
+#' #> "1" ""  ""  ""  ""  "2"
+#' }
+#'
+#' @noRd
+.make_unique_ids <- function(x) {
+  are_duplicates <- unique(x[duplicated(x)])
+  res <- character(length(x))
+  for (v in are_duplicates) {
+    indx <- x == v
+    res[indx] <- seq_len(sum(indx))
+  }
+
+  return(res)
+}