Skip to content

Commit

Permalink
Working on create_omics_sets (not over)
Browse files Browse the repository at this point in the history
  • Loading branch information
oliviaAB committed Jan 11, 2024
1 parent acb7e08 commit 8b2f0c9
Show file tree
Hide file tree
Showing 6 changed files with 532 additions and 4 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

export(add_metabo)
export(add_pheno)
export(create_multiomics_set)
export(create_omics_set)
export(create_omics_set_factory)
export(hclust_matrix_rows)
export(import_dataset_csv)
export(import_dataset_csv_factory)
Expand Down
387 changes: 387 additions & 0 deletions R/create_omics_sets.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,387 @@
#' Create a Biobase set object to store omics data
#'
#' Creates a Biobase object to store an omics dataset and associated samples and
#' features metadata.
#'
#' @param dataset Matrix, the omics dataset in matrix form with features as rows
#' and samples as columns.
#' @param omics_type Character, which type of omics data is being stored?
#' Possible values are `'genomics'`, `'transcriptomics'`, `'metabolomics'` and
#' `'phenomics'`. Use `'phenomics'` for any other omics.
#' @param features_metadata Data.frame, a feature annotation data-frame with
#' features as rows and information about the features as columns. The number
#' of rows and row names must match those of the `dataset` matrix.
#' @param samples_metadata Data.frame, a samples information data-frame with
#' samples as rows and information about the samples as columns. The number of
#' rows and row names must match the number of columns and column names of the
#' `dataset` matrix.
#' @returns An object derived from [Biobase::eSet-class]:
#' * if `omics_type = 'genomics'`: a [Biobase::SnpSet-class] object;
#' * if `omics_type = 'transcriptomics'`: a [Biobase::ExpressionSet-class]
#' object.
#' * if `omics_type = 'metabolomics'`: a [MetabolomeSet-class] object.
#' * if `omics_type = 'phenomics'` a [PhenotypeSet-class] object.
#' @examples
#' \dontrun{
#' data_geno <- import_dataset_csv(
#' "genotype_dataset.csv",
#' col_id = "Marker",
#' features_as_rows = TRUE
#' )
#' geno_info_features <- import_fmetadata_csv(
#' "genotype_features_info.csv",
#' col_id = "Marker"
#' )
#' samples_information <- import_smetadata_csv(
#' "samples_information.csv",
#' col_id = "Sample"
#' )
#' create_omics_set(
#' dataset = data_geno,
#' omics_type = "genomics",
#' features_metadata = geno_info_features,
#' samples_metadata = samples_information
#' )
#' }
#' @export
create_omics_set <- function(dataset,
omics_type = c("phenomics",
"genomics",
"transcriptomics",
"metabolomics"),
features_metadata = NULL,
samples_metadata = NULL) {

omics_types <- rlang::arg_match(omics_types)

res <- switch(omics_type,
"genomics" = new("SnpSet", call = dataset),
"transcriptomics" = Biobase::ExpressionSet(assayData = dataset),
"metabolomics" = new("MetabolomeSet", call = dataset),
"phenomics" = new("PhenotypeSet", call = dataset)
)

if (!is.null(features_metadata)) {

missing_features <- setdiff(rownames(dataset), rownames(features_metadata))
if (length(missing_features) > 0) {
warning(
length(missing_features),
" features are not present in feature metadata."
)
## adding the missing features as NAs in the metadata data-frame
features_metadata[missing_features, ] <- NA
features_metadata[missing_features, "feature_id"] <- missing_features
}

irrelevant_features <- setdiff(
rownames(features_metadata),
rownames(dataset)
)
if (length(irrelevant_features) > 0) {
warning(
length(irrelevant_features), " features",
" in feature metadata not in dataset, will be removed from metadata."
)
}

features_metadata <- features_metadata[rownames(dataset), ]
feature_data <- Biobase::AnnotatedDataFrame(features_metadata)

Biobase::featureData(res) <- feature_data
} else {
## we need to have in the featureData a column with feature ID to be able
## to perform subsetting later on
temp <- data.frame("feature_id" = rownames(dataset))
rownames(temp) <- rownames(dataset)

Biobase::featureData(res) <- Biobase::AnnotatedDataFrame(temp)
}

if (!is.null(samples_metadata)) {

missing_samples <- setdiff(colnames(dataset), rownames(samples_metadata))
if (length(missing_samples)) {
warning(
length(missing_samples),
" samples are not present in samples metadata."
)
## adding the missing samples as NAs in the metadata data-frame
samples_metadata[missing_samples, ] <- NA
samples_metadata[missing_samples, "id"] <- missing_samples
}

irrelevant_samples <- setdiff(
rownames(samples_metadata),
colnames(dataset)
)
if (length(irrelevant_samples)) {
warning(
length(irrelevant_samples), " samples",
" in samples metadata not in dataset, will be removed from metadata."
)
}

samples_metadata <- samples_metadata[colnames(dataset), ]
samples_data <- Biobase::AnnotatedDataFrame(samples_metadata)

Biobase::phenoData(res) <- samples_data
}

return(res)
}

#' Target factory for omics sets creation
#'
#' Creates a list of targets that generate omics sets from targets containing
#' datasets, features and samples metadata.
#'
#' @param datasets Vector of symbols, the names of the targets containing the
#' omics datasets.
#' @param omics_types Character vector, which type of omics data is being stored
#' for each dataset? Possible values are `'genomics'`, `'transcriptomics'`,
#' `'metabolomics'` and `'phenomics'`. Use `'phenomics'` for any other omics.
#' Use `'phenomics'` for any other omics.
#' @param features_metadatas Vector of symbols, the names of the targets
#' containing the features metadata data-frame associated with each omics
#' dataset. Use `NULL` if no feature metadata exists for a dataset.
#' @param samples_metadatas Vector of symbols, the names of the targets
#' containing the samples metadata data-frame associated with each omics
#' dataset. Use `NULL` if no samples metadata exists for a dataset.
#' @param target_name_suffixes Character vector, a suffix to add to the name of
#' the targets created by this target factory for each dataset. If none
#' provided, the suffixes will be extracted from the `datasets` argument.
#' Default value is NULL.
#' @return A list of target objects, with three datasets provided, and
#' `target_name_suffixes = c("geno", "transcripto", "metabo")`, the following
#' targets will be returned: `set_geno`, `set_transcripto` and `set_metabo`.
#' @examples
#' \dontrun{
#' ## in the _targets.R
#' library(moiraine)
#' library(targets)
#'
#' list(
#' ## targets to import the different datasets
#'
#' ## Example where genomics dataset has no features metadata information
#' ## Will generate the following targets: set_geno, set_transcripto
#' create_omics_set_factory(
#' datasets = c(data_geno, data_transcripto),
#' omics_types = c("genomics", "transcriptomics"),
#' features_metadata = c(NULL, fmeta_transcripto),
#' samples_metadata = c(smeta_geno, smeta_transcripto)
#' )
#' )
#' }
#' @export
create_omics_set_factory <- function(datasets,
omics_types,
features_metadatas = NULL,
samples_metadatas = NULL,
target_name_suffixes = NULL) {
n_datasets <- length(.input2symVect(rlang::enquo(datasets)))
n_fmeta <- length(.input2symVect(rlang::enquo(features_metadatas)))
n_smeta <- length(.input2symVect(rlang::enquo(samples_metadatas)))

if (length(omics_types) != n_datasets) {
stop("'datasets' and 'omics_types' vectors must have the same length.")
}

if (length(target_name_suffixes) != n_datasets &&
!is.null(target_name_suffixes)) {
stop("'datasets' and 'target_name_suffixes' vectors must have the same length.")
}

if (n_fmeta != n_datasets && !is.null(substitute(features_metadatas))) {
stop("'datasets' and 'features_metadatas' vectors must have the same length.")
}

if (n_smeta != n_datasets && !is.null(substitute(samples_metadatas))) {
stop("'datasets' and 'samples_metadatas' vectors must have the same length.")
}

values <- tibble::tibble(
ds = .input2symVect(rlang::enquo(datasets)),
ot = omics_types,
fmet = .input2symVect(rlang::enquo(features_metadatas)),
smet = .input2symVect(rlang::enquo(samples_metadatas))
)

if (is.null(target_name_suffixes)) {
target_name_suffixes <- stringr::str_remove(
.symbolVect2charVect(rlang::enquo(datasets)),
"data_"
)
}

values$target_name_suffix <- target_name_suffixes

targets <- tarchetypes::tar_map(
values = values,
names = "target_name_suffix",
targets::tar_target_raw(
"set",
substitute(
create_omics_set(
dataset = ds,
omics_type = ot,
features_metadata = fmet,
samples_metadata = smet
)
)
)
)

return(targets)
}

#' Adds an omics set to a MultiDataSet object
#'
#' Adds a omics set to an existing MultiDataSet object.
#'
#' @param mo_data A [MultiDataSet::MultiDataSet-class] object.
#' @param omics_set A [Biobase::eSet-class] object, created via
#' [create_omics_set()]. Currently accepted objects: [Biobase::SnpSet-class],
#' [Biobase::ExpressionSet-class], [MetabolomeSet-class],
#' [PhenotypeSet-class].
#' @param ds_name Character, name of the dataset (will be used as suffix for the
#' name of the dataset in the resulting MultiDataSet object).
#' @param ... Further arguments passed to `[MultiDataSet::add_snps()],
#' [MultiDataSet::add_rnaseq()], [add_metabo()] or[add_pheno()] (depending on
#' `omics_set` class).
#' @returns A [MultiDataSet::MultiDataSet-class] object, the `mo_data` with
#' `omics_set` as an additional dataset.
#' @examples
#' \dontrun{
#' add_omics_set(mo_data, omics_set, "exp1")
#' }
#' @export
add_omics_set <- function(mo_data, omics_set, ds_name, ...) {
type <- class(omics_set)[[1]]
fct <- switch(type,
"SnpSet" = MultiDataSet::add_snps,
"ExpressionSet" = MultiDataSet::add_rnaseq,
"MetabolomeSet" = add_metabo,
"PhenotypeSet" = add_pheno
)
fct(mo_data, omics_set, dataset.name = ds_name, ...)
}

##### STOPPED HERE

#' Create a MultiDataSet object to store multi-omics data
#'
#' Creates a MultiDataSet object from a list of Biobase Set objects to store the different omics sets.
#'
#' @param sets_list List of Biobase Set objects, created via \code{\link{create_omics_set}}. Currently accepted ojects: \code{SnpSet},
#' \code{ExpressionSet}, \code{MetabolomeSet}, \code{PhenotypeSet}.
#' @param datasets_names Optional, vector of character, name for each Set object. Will be appended to the data type in the resulting
#' object. If the `sets_list` list contains several objects of the same data type (e.g. several SnpSets), their names must be unique.
#' If "" is provided, no name will be appended to the data type for the corresponding dataset.
#' @param show_warnings Logical, should warnings be displayed when adding a set to the MultiDataSet object? Default value is `TRUE`.
#' @return a \code{\link[MultiDataSet]{MultiDataSet-class}}.
#' @export
create_multiomics_set <- function(sets_list, datasets_names = NULL, show_warnings = TRUE) {
# if(is.null(names(sets_list))) stop("The sets_list list must be named.")
# if(length(setdiff(names(sets_list), c("genomics", "transcriptomics", "metabolomics", "phenomics")))) stop("Names of the sets_list list must be one of 'genomics', 'transcriptomics', 'metabolomics', 'phenomics'.")

if (!length(sets_list)) stop("sets_list list is empty.")

cl <- sapply(sets_list, class)
if (length(setdiff(cl, c("SnpSet", "ExpressionSet", "MetabolomeSet", "PhenotypeSet")))) stop("Elements in sets_list must be SnpSet, ExpressionSet, MetabolomeSet or PhenotypeSet objects.")

if (!is.null(datasets_names)) {
if (length(datasets_names) != length(sets_list)) stop("dataset_names vector must have same length as sets_list list.")

if (any(duplicated(paste0(cl, datasets_names)))) stop("Dataset names for objects of a same type must be unique.")
} else {
## if several sets_list elements have the same data type, must provide unique datasets_names
datasets_names <- .make_unique_ids(cl)
}

res <- MultiDataSet::createMultiDataSet()

for (i in 1:length(sets_list)) {
x <- sets_list[i]
ds_name <- datasets_names[i]
if (ds_name == "") ds_name <- NULL

res <- .add_omics_set(res, sets_list[[i]], cl[i], ds_name, warnings = show_warnings)
}

## Check that the samples metadata is consistent across the datasets
temp <- get_samples_metadata(res) |>
purrr::reduce(
function(.x, .y) {
cc <- intersect(colnames(.x), colnames(.y))
# .x <- dplyr::select(.x, tidyselect::all_of(cc))
# .y <- dplyr::select(.y, tidyselect::all_of(cc))
dplyr::full_join(.x, .y, by = cc)
}
)

duplicated_samples <- duplicated(temp$id)
if (any(duplicated_samples)) {
stop(
"Conflicting information in samples metadata for samples ",
paste0(temp$id[duplicated_samples], collapse = ", "),
"."
)
}

return(res)
}


.symbolVect2charVect <- function(x) {
string <- rlang::quo_text(x, nlines = Inf)
unlist(strsplit(gsub("(c\\(|\\)|\\s)", "", string), ","))
}

.input2symVect <- function(x) {
string <- rlang::quo_text(x, nlines = Inf)

if (string == "NULL") {
return(str2expression("NULL"))
}

string <- unlist(strsplit(gsub("(c\\(|\\)|\\s)", "", string), ","))
# res <- rlang::syms(string)
res <- str2expression(string)

return(res)
}

#' Generate suffixes to make values unique.
#'
#' For a given vector of values, returns a vector of the same size containing
#' suffixes to make each value in the input vector unique.
#'
#' @param x Character vector of IDs.
#' @returns Character vector of the same size as `x`, containing at each
#' position:
#' * `''` if the corresponding value in `x` is unique;
#' * A number as character (e.g. `'1'`, `'2`, etc) if the corresponding value
#' in `x` is not unique, such that by adding this value to the values in `x`,
#' it becomes unique.
#' @examples
#' \dontrun{
#' .make_unique_ids(1:5)
#' #> "" "" "" "" ""
#'
#' .make_unique_ids(c(1:5, 1))
#' #> "1" "" "" "" "" "2"
#' }
#'
#' @noRd
.make_unique_ids <- function(x) {
are_duplicates <- unique(x[duplicated(x)])
res <- character(length(x))
for (v in are_duplicates) {
indx <- x == v
res[indx] <- seq_len(sum(indx))
}

return(res)
}
Loading

0 comments on commit 8b2f0c9

Please sign in to comment.