random effects now in design formula

csbg · Jan 28, 2025 · 668c3b2 · 668c3b2
1 parent 6827ade
commit 668c3b2
Show file tree

Hide file tree

Showing 8 changed files with 178 additions and 81 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -38,5 +38,7 @@ Examples:
 
 ### Changed
 - The design formula must now contain the string 'Time' rather than 'X' like it was before. X from
-  before stood for the time. This change is intendet to make the design formula more explicit and 
+  before stood for the time. This change is intended to make the design formula more explicit and 
   self explanatory.
+- Random effects can now be directly be specified in the design formula, rather
+  than being passed as part of the dream_params.
diff --git a/R/cluster_hits.R b/R/cluster_hits.R
@@ -92,7 +92,7 @@ cluster_hits <- function(
     report_dir = here::here(),
     report = TRUE
     ) {
-  
+
   report_dir <- normalizePath(
     report_dir,
     mustWork = FALSE
@@ -151,13 +151,12 @@ cluster_hits <- function(
     spline_params = spline_params,
     mode = mode
   )
-
-  dream_params <- splineomics[["dream_params"]]
-
+
   # Put them in there under those names, so that the report generation fun
   # can access them directly like this.
-  report_info[["Fixed effects (design)"]] <- splineomics[["design"]] 
-  report_info[["Random effects"]] <- dream_params[["random_effects"]] 
+  effects <- extract_effects(design)
+  report_info[["Fixed effects"]] <- effects[["fixed_effects"]] 
+  report_info[["Random effects"]] <- effects[["random_effects"]] 
   report_info[["meta_condition"]] <- c(condition)
   report_info[["plot_data_batch_correction"]] <- paste(
     meta_batch_column,
@@ -639,6 +638,9 @@ make_clustering_report <- function(
     raw_data
     ) {
 
+  design <- gsub("Time", "X", design)  
+  effects <- extract_effects(design)
+
   # Optionally remove the batch-effect with the batch column and design matrix
   # For mode == "integrated", the batch-effect is removed from the whole data
   # For mode == "isolated", the batch-effect is removed for every level
@@ -648,7 +650,7 @@ make_clustering_report <- function(
     condition = condition,
     meta_batch_column = meta_batch_column,
     meta_batch2_column = meta_batch2_column,
-    design = design,
+    design = effects[["fixed_effects"]],
     mode = mode,
     spline_params = spline_params
   )

diff --git a/R/preprocess_rna_seq_data.R b/R/preprocess_rna_seq_data.R
@@ -10,24 +10,38 @@
 #' require a different normalization method, you can supply your own
 #' custom normalization function.
 #'
-#' @param raw_counts A matrix of raw RNA-seq counts (genes as rows, samples as
-#'  columns).
-#' @param meta A dataframe containing the metadata for data.
-#' @param spline_params Parameters for spline functions (optional). Must contain
-#' the named elements spline_type, which must contain either the string "n" for
-#' natural cubic splines, or "b", for B-splines, the named element degree in the
-#' case of B-splines, that must contain only an integer, and the named element
-#' dof, specifying the degree of freedom, containing an integer and required
-#' both for natural and B-splines.
-#' @param design A design formula for the limma analysis, such as
-#' '~ 1 + Phase*X + Reactor'.
-#' @param dream_params A named list or NULL. When not NULL, it must at least 
-#' contain the named element 'random_effects', which must contain a string that
-#' is a formula for the random effects of the mixed models by dream. 
-#' Additionally, it can contain the named elements dof, which must be a int
-#' bigger than 1, which is the degree of freedom for the dream topTable, and
-#' the named element KenwardRoger, which must be a bool, specifying whether
-#' to use that method or not.
+#' @param splineomics An S3 object of class `SplineOmics` that must contain the 
+#' following elements:
+#' \itemize{
+#'   \item \code{data}: A matrix of the omics dataset, with feature names 
+#'   optionally as row headers (genes as rows, samples as columns).
+#'   \item \code{meta}: A dataframe containing metadata corresponding to the 
+#'   \code{data}. The dataframe must include a 'Time' column and a column 
+#'   specified by the \code{condition}.
+#'   \item \code{design}: A character string representing the design formula 
+#'   for the limma analysis (e.g., \code{'~ 1 + Phase*X + Reactor'}).
+#'   \item \code{spline_params}: A list of spline parameters used in the 
+#'   analysis. This can include:
+#'     \itemize{
+#'       \item \code{spline_type}: A character string specifying the type of 
+#'       spline. Must be either \code{'n'} for natural cubic splines or 
+#'       \code{'b'} for B-splines.
+#'       \item \code{dof}: An integer specifying the degrees of freedom. 
+#'       Required for both natural cubic splines and B-splines.
+#'       \item \code{degree}: An integer specifying the degree of the spline 
+#'       (for B-splines only).
+#'       \item \code{knots}: Positions of the internal knots (for B-splines).
+#'       \item \code{bknots}: Boundary knots (for B-splines).
+#'     }
+#'   \item \code{dream_params}: A named list or \code{NULL}. When not 
+#'   \code{NULL}, it can contain:
+#'     \itemize{
+#'       \item \code{dof}: An integer greater than 1, specifying the degrees 
+#'       of freedom for the dream topTable.
+#'       \item \code{KenwardRoger}: A boolean indicating whether to use the 
+#'       Kenward-Roger method.
+#'     }
+#' }
 #' @param normalize_func An optional normalization function. If provided, this
 #' function will be used to normalize the `DGEList` object. If not provided,
 #' TMM normalization (via `edgeR::calcNormFactors`) will be used by default.
@@ -37,19 +51,22 @@
 #' @return A `voom` object, which includes the log2-counts per million (logCPM)
 #'  matrix and observation-specific weights.
 #'
+#' @importFrom edgeR DGEList calcNormFactors
 #' @importFrom limma voom
+#' @importFrom variancePartition voomWithDreamWeights
 #'
 #' @export
 #'
 preprocess_rna_seq_data <- function(
-    raw_counts,
-    meta,
-    spline_params,
-    design,
-    dream_params = NULL,
+    splineomics,
     normalize_func = NULL
     ) {
 
+  check_splineomics_elements(
+    splineomics = splineomics,
+    func_type = "preprocess_rna_seq_data"
+  )
+
   args <- lapply(
     as.list(match.call()[-1]),
     eval,
@@ -59,13 +76,20 @@ preprocess_rna_seq_data <- function(
   check_null_elements(args)
   input_control <- InputControl$new(args)
   input_control$auto_validate()
+
+  raw_counts <- splineomics[["data"]]
+  meta <- splineomics[["meta"]]
+  spline_params <- splineomics[["spline_params"]]
+  design <- splineomics[["design"]]
 
   # Because at first I enforced that X in the design formula stands for the time
   # and I heavily oriented my code towards that. But then I realised that it is
   # nonsense to encode the time as X, and now it is explicitly "Time" (because
   # meta must contain the exact name "Time" for this respective column).
   design <- gsub("Time", "X", design)  
 
+  effects <- extract_effects(design)
+
   message("Preprocessing RNA-seq data (normalization + voom)...")
 
   # Check if edgeR is installed; if not, inform the user
@@ -81,6 +105,9 @@ preprocess_rna_seq_data <- function(
 
   }
 
+  # voomWithDreamWeights wants it like this
+  colnames(raw_counts) <- rownames(meta)  
+
   # Step 1: Create DGEList object from raw counts
   y <- edgeR::DGEList(counts = raw_counts)
 
@@ -91,26 +118,20 @@ preprocess_rna_seq_data <- function(
     # Default: Normalize the counts using TMM normalization
     y <- edgeR::calcNormFactors(y)
   }
-  
+
   # Step 3: Create design matrix
   result <- design2design_matrix(
     meta = meta,
     spline_params = spline_params,
     level_index = 1,
-    design = design
+    design = effects[["fixed_effects"]]
   )
 
   # Step 4: Apply voom transformation to get logCPM values and weights
-  if (!is.null(dream_params)) {
-    full_formula <- paste(
-      design,
-      dream_params[["random_effects"]],
-      sep = " + "
-      )
+  if (effects[["random_effects"]] != "") {
     voom_obj <- variancePartition::voomWithDreamWeights(
       counts = y,
-      formula = stats::as.formula(full_formula),
-      # random.formula = stats::as.formula(dream_params[["random_effects"]]),
+      formula = stats::as.formula(design),
       data = result[["meta"]]    # spline transformed meta.
     )
   }

diff --git a/R/run_limma_splines.R b/R/run_limma_splines.R
@@ -244,12 +244,14 @@ between_level <- function(
   data <- data[, samples]
 
   meta <- meta[meta[[condition]] %in% compared_levels, ]
-
+
+  effects <- extract_effects(design)
+
   result <- design2design_matrix(
     meta = meta,
     spline_params = spline_params,
     level_index = 1,
-    design = design
+    design = effects[["fixed_effects"]]
   )
 
   design_matrix <- result$design_matrix
@@ -277,7 +279,7 @@ between_level <- function(
     seq_len(num_matching_columns)
   )
 
-  if (!is.null(dream_params)) {
+  if (effects[["random_effects"]] != "") {
     colnames(data) <- rownames(meta)  # dream requires this format
 
     # Apply the Kenward-Roger method if specified
@@ -290,8 +292,7 @@ between_level <- function(
     fit <- variancePartition::dream(
       exprObj = data,
       formula = stats::as.formula(design),
-      random.formula = stats::as.formula(dream_params[["random_effects"]]),
-      data = result$meta,    # Spline transformed meta.
+      data = result[["meta"]],    # Spline transformed meta.
       ddf = method
     )
 
@@ -513,10 +514,8 @@ process_top_table <- function(
 #' such as the output from `limma::voom` or a similar preprocessing pipeline.
 #' @param meta A dataframe containing metadata, including a 'Time' column.
 #' @param design A design formula or matrix for the limma analysis.
-#' @param dream_params A named list or NULL. When not NULL, it must at least 
-#' contain the named element 'random_effects', which must contain a string that
-#' is a formula for the random effects of the mixed models by dream. 
-#' Additionally, it can contain the named elements dof, which must be a int
+#' @param dream_params A named list or NULL. When not NULL, it it can contain 
+#' the named elements dof, which must be a int
 #' bigger than 1, which is the degree of freedom for the dream topTable, and
 #' the named element KenwardRoger, which must be a bool, specifying whether
 #' to use that method or not.
@@ -547,34 +546,34 @@ process_within_level <- function(
     padjust_method
     ) {
 
+  effects <- extract_effects(design)
+
   result <- design2design_matrix(
     meta,
     spline_params,
     level_index,
-    design
+    design = effects[["fixed_effects"]]
   )
 
-  design_matrix <- result$design_matrix
+  design_matrix <- result[["design_matrix"]]
 
   if (!is.null(rna_seq_data)) {
     data <- rna_seq_data
   }
 
-  if (!is.null(dream_params)) {
+  if (effects[["random_effects"]] != "") {
     colnames(data) <- rownames(meta)  # dream wants it like this.
-    meta <- result$meta  # Only do it after, new meta has more columns
 
     if (isTRUE(dream_params[["KenwardRoger"]])) {
       method <- "Kenward-Roger"
     } else {
       method = NULL
     }
-    
+
     fit <- variancePartition::dream(
       exprObj = data,
       formula = stats::as.formula(design),
-      data = meta,
-      random.formula = stats::as.formula(dream_params[["random_effects"]]),
+      data = result[["meta"]],
       ddf = method
     )
 

diff --git a/R/utils_general.R b/R/utils_general.R
@@ -224,3 +224,65 @@ stop_call_false <- function(...) {
   # Call stop with the concatenated message and call. = FALSE
   stop(message_text, call. = FALSE)
 }
+
+
+#' Extract fixed and random effects from a model formula string.
+#'
+#' @description
+#' This function processes a model formula string by separating it into 
+#' fixed effects and random effects. Random effects are substrings in the
+#' format "(...)", and fixed effects are the remaining part of the string.
+#'
+#' @param formula_string A character string representing the model formula.
+#'
+#' @return A list containing two components:
+#'         - `fixed_effects`: The cleaned fixed effects string.
+#'         - `random_effects`: The concatenated random effects string.
+#'
+#' @examples
+#' extract_effects("~ 1 + Condition*Time + Plate + (1|Reactor)")
+#' # Returns:
+#' # $fixed_effects
+#' # [1] "~ 1 + Condition*Time + Plate"
+#' #
+#' # $random_effects
+#' # [1] "(1|Reactor)"
+extract_effects <- function(formula_string) {
+  # Match all substrings in the format "(...)"
+  matches <- gregexpr("\\([^)]*\\)", formula_string, perl = TRUE)
+  substrings <- regmatches(formula_string, matches)[[1]]
+
+  # If there are random effects, clean them
+  if (length(substrings) > 0) {
+    # Remove one non-whitespace character before and after each match
+    cleaned_random_effects <- sapply(substrings, function(substring) {
+      substring <- sub("\\S\\(", "(", substring)
+      substring <- sub("\\)\\S", ")", substring)
+      return(substring)
+    }, USE.NAMES = FALSE)
+
+    # Concatenate random effects into a single string
+    random_effects <- paste(cleaned_random_effects, collapse = " ")
+  } else {
+    random_effects <- ""
+  }
+
+  # Remove all random effects from the original string to get fixed effects
+  fixed_effects <- formula_string
+  if (length(substrings) > 0) {
+    for (substring in substrings) {
+      fixed_effects <- sub(substring, "", fixed_effects, fixed = TRUE)
+    }
+  }
+
+  # Clean up any extra '+' or whitespace from fixed effects
+  fixed_effects <- gsub("\\s*\\+\\s*$", "", fixed_effects)
+  fixed_effects <- trimws(fixed_effects)
+
+  # Return both fixed and random effects
+  return(list(
+    fixed_effects = fixed_effects,
+    random_effects = random_effects
+  ))
+}
+