From eb88b89a110a7a89d753c3256aeeb6c7eed1fb28 Mon Sep 17 00:00:00 2001 From: rdmorin Date: Fri, 7 Feb 2025 10:42:29 -0800 Subject: [PATCH 01/19] to allow pipes to work in examples --- R/GAMBLR.data-package.R | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 R/GAMBLR.data-package.R diff --git a/R/GAMBLR.data-package.R b/R/GAMBLR.data-package.R new file mode 100644 index 0000000..ebc9994 --- /dev/null +++ b/R/GAMBLR.data-package.R @@ -0,0 +1,7 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +#' @importFrom magrittr %>% +## usethis namespace: end +NULL From b38aa824e6205c09a5237bbe665f2cb800678243 Mon Sep 17 00:00:00 2001 From: rdmorin Date: Fri, 7 Feb 2025 10:44:06 -0800 Subject: [PATCH 02/19] to allow pipes to work in examples --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index fcc05d1..aee9af3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,6 +32,7 @@ LazyData: true Imports: dplyr, ggplot2, + magrittr, purrr, tibble, tidyr From c633f3698ecc8f41fdcf03bfb380e5f490b3ee7e Mon Sep 17 00:00:00 2001 From: rdmorin Date: Fri, 7 Feb 2025 10:49:59 -0800 Subject: [PATCH 03/19] to allow pipes to work in examples --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index 3678176..06fbe94 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,3 +45,4 @@ import(ggplot2) import(purrr) import(tibble) import(tidyr) +importFrom(magrittr,"%>%") From aae650f23c7efb8a8b452cd4d157a16793b888f8 Mon Sep 17 00:00:00 2001 From: rdmorin Date: Fri, 7 Feb 2025 11:15:13 -0800 Subject: [PATCH 04/19] ensure pipe is always available for examples --- NAMESPACE | 1 + R/utils-pipe.R | 14 ++++++++++++++ man/GAMBLR.data-package.Rd | 21 +++++++++++++++++++++ man/pipe.Rd | 20 ++++++++++++++++++++ 4 files changed, 56 insertions(+) create mode 100644 R/utils-pipe.R create mode 100644 man/GAMBLR.data-package.Rd create mode 100644 man/pipe.Rd diff --git a/NAMESPACE b/NAMESPACE index 06fbe94..2124a72 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ S3method(print,maf_data) S3method(rename,genomic_data) S3method(select,genomic_data) S3method(ungroup,genomic_data) +export("%>%") export(annotate_hotspots) export(assign_cn_to_ssm) export(bind_genomic_data) diff --git a/R/utils-pipe.R b/R/utils-pipe.R new file mode 100644 index 0000000..fd0b1d1 --- /dev/null +++ b/R/utils-pipe.R @@ -0,0 +1,14 @@ +#' Pipe operator +#' +#' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. +#' +#' @name %>% +#' @rdname pipe +#' @keywords internal +#' @export +#' @importFrom magrittr %>% +#' @usage lhs \%>\% rhs +#' @param lhs A value or the magrittr placeholder. +#' @param rhs A function call using the magrittr semantics. +#' @return The result of calling `rhs(lhs)`. +NULL diff --git a/man/GAMBLR.data-package.Rd b/man/GAMBLR.data-package.Rd new file mode 100644 index 0000000..e4a78c4 --- /dev/null +++ b/man/GAMBLR.data-package.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GAMBLR.data-package.R +\docType{package} +\name{GAMBLR.data-package} +\alias{GAMBLR.data} +\alias{GAMBLR.data-package} +\title{GAMBLR.data: Collection of Curated Data for Genomic Analysis of Mature B-cell Lymphomas in R} +\description{ +The package contains manually curated data for the genomic Analysis of mature B-cell lymphomas in R, such as regions of somatic hypermutation, lymphoma genes, etc. +} +\author{ +\strong{Maintainer}: Kostiantyn Dreval \email{kdreval@sfu.ca} (\href{https://orcid.org/0000-0002-6214-2843}{ORCID}) + +Authors: +\itemize{ + \item Ryan Morin \email{rdmorin@sfu.ca} (\href{https://orcid.org/0000-0003-2932-7800}{ORCID}) + \item Adam Mattsson \email{cmattsson@bcgsc.ca} (\href{https://orcid.org/0000-0002-6318-7912}{ORCID}) +} + +} +\keyword{internal} diff --git a/man/pipe.Rd b/man/pipe.Rd new file mode 100644 index 0000000..a648c29 --- /dev/null +++ b/man/pipe.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-pipe.R +\name{\%>\%} +\alias{\%>\%} +\title{Pipe operator} +\usage{ +lhs \%>\% rhs +} +\arguments{ +\item{lhs}{A value or the magrittr placeholder.} + +\item{rhs}{A function call using the magrittr semantics.} +} +\value{ +The result of calling \code{rhs(lhs)}. +} +\description{ +See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. +} +\keyword{internal} From 982289be9e00faa3475aad6196380f405be45f36 Mon Sep 17 00:00:00 2001 From: rdmorin Date: Fri, 7 Feb 2025 11:25:05 -0800 Subject: [PATCH 05/19] improved and working examples --- R/annotate_hotspots.R | 6 +++++- man/annotate_hotspots.Rd | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/R/annotate_hotspots.R b/R/annotate_hotspots.R index 4f64000..0edcb10 100644 --- a/R/annotate_hotspots.R +++ b/R/annotate_hotspots.R @@ -16,9 +16,13 @@ #' my_metadata = get_gambl_metadata() #' all_coding_ssm = get_coding_ssm(these_samples_metadata = my_metadata, #' projection = "grch37", -#' this_seq_type = "genome") +#' this_seq_type = "genome") %>% +#' dplyr::filter(Hugo_Symbol %in% c("EZH2","MEF2B","MYD88","KMT2D")) %>% +#' dplyr::arrange(Hugo_Symbol) #' #' hot_ssms = annotate_hotspots(all_coding_ssm) +#' hot_ssms %>% dplyr::filter(!is.na(hot_spot)) %>% +#' dplyr::select(1:5,37,hot_spot) #' annotate_hotspots = function( mutation_maf, diff --git a/man/annotate_hotspots.Rd b/man/annotate_hotspots.Rd index 6040620..68be61d 100644 --- a/man/annotate_hotspots.Rd +++ b/man/annotate_hotspots.Rd @@ -24,8 +24,12 @@ This function takes an already loaded MAF data frame with the \code{mutation_maf my_metadata = get_gambl_metadata() all_coding_ssm = get_coding_ssm(these_samples_metadata = my_metadata, projection = "grch37", - this_seq_type = "genome") + this_seq_type = "genome") \%>\% + dplyr::filter(Hugo_Symbol \%in\% c("EZH2","MEF2B","MYD88","KMT2D")) \%>\% + dplyr::arrange(Hugo_Symbol) hot_ssms = annotate_hotspots(all_coding_ssm) +hot_ssms \%>\% dplyr::filter(!is.na(hot_spot)) \%>\% + dplyr::select(1:5,37,hot_spot) } From abd5c7430a401471dc5af2be391581ee0dd55c81 Mon Sep 17 00:00:00 2001 From: rdmorin Date: Fri, 7 Feb 2025 14:41:24 -0800 Subject: [PATCH 06/19] overhaul and improve assign_cn_to_ssm --- R/assign_cn_to_ssm.R | 161 ++++++++++++++++++++++------------ R/genomic_data.R | 9 +- R/get_sample_cn_segments.R | 90 ------------------- man/assign_cn_to_ssm.Rd | 73 ++++++++++----- man/get_sample_cn_segments.Rd | 65 -------------- 5 files changed, 161 insertions(+), 237 deletions(-) delete mode 100644 R/get_sample_cn_segments.R delete mode 100644 man/get_sample_cn_segments.Rd diff --git a/R/assign_cn_to_ssm.R b/R/assign_cn_to_ssm.R index 8faa091..35e2884 100644 --- a/R/assign_cn_to_ssm.R +++ b/R/assign_cn_to_ssm.R @@ -2,26 +2,26 @@ #' #' @description Annotate mutations with their copy number information. #' -#' @details This function takes a sample ID with the `this_sample_id` parameter -#' and annotates mutations with copy number information. A variety of -#' parameters are at hand for a customized workflow. For example, -#' the user can specify if only coding mutations are of interest. To do so, -#' set `coding_only = TRUE`. This function internally calls -#' `get_ssm_by_samples` and `get_sample_cn_segments`. This function can -#' also take a vector with genes of interest (`genes`) that the returned -#' data frame will be restricted to. -#' -#' @param this_sample_id Sample ID of the sample you want to annotate. -#' @param genes A vector of characters with gene symbols (Hugo). -#' @param this_seq_type Specified seq type for returned data. Default is genome. -#' @param projection Specified genome projection that returned data is in -#' reference to. Default is grch37. -#' @param coding_only Optional. Set to TRUE to restrict to only coding variants -#' (ssm). Deafult is FALSE. +#' @details This function takes a metadata table and returns all mutations +#' for the samples in that metadata. Each mutation is annotated with the +#' local copy number state of each mutated site. The user can specify if +#' only coding mutations are of interest. To do so, +#' set `coding_only = TRUE`. When necessary, this function relies on +#' `get_ssm_by_samples` and `get_cn_segments` to obtain the required data. +#' @param these_samples_metadata Metadata table with one or more rows to specify +#' the samples to process. +#' @param maf_data A data frame of mutations in MAF format or maf_data object +#' (e.g. from `get_coding_ssm` or `get_ssm_by_sample`). +#' @param seg_data A data frame of segmented copy number data or seg_data object +#' @param projection Specified genome projection that returned data is relative to. +#' This is only required when it cannot be inferred from maf_df or seg_df +#' (or they are not provided). +#' @param coding_only Optional. Set to TRUE to restrict to only variants in coding space +#' Default is to work with genome-wide variants. #' @param assume_diploid Optional, this parameter annotates every mutation as #' copy neutral. Default is FALSE. #' @param include_silent Logical parameter indicating whether to include silent -#' mutations into coding mutations. Default is FALSE. This parameter only +#' mutations in coding space. Default is FALSE. This parameter only #' makes sense if `coding_only` is set to TRUE. #' @param ... Any additional parameters. #' @@ -36,41 +36,98 @@ #' @export #' #' @examples -#' cn_list = assign_cn_to_ssm( -#' this_sample_id = "DOHH-2", -#' coding_only = TRUE -#' ) +#' # long-handed way +#' # 1. get some metadata for a collection of samples +#' some_meta = get_gambl_metadata() %>% +#' dplyr::filter(cohort=="FL_Dreval", +#' grepl("SP",sample_id)) +#' # 2. Get the SSMs for these samples +#' +#' ssm_genomes_grch37 = get_coding_ssm(projection = "grch37", +#' these_samples_metadata = some_meta) +#' # peek at the results +#' ssm_genomes_grch37 %>% dplyr::select(1:8) +#' +#' # 3. Lazily let this function obtain the corresponding seg_data for the right genome_build +#' cn_list = assign_cn_to_ssm(some_meta,ssm_genomes_grch37) +#' +#' cn_list$maf %>% dplyr::select(1:8,log.ratio,CN) +#' +#' # This won't work because the hg38 seg_data is not bundled +#' ssm_genomes_hg38 = get_coding_ssm(projection = "hg38", +#' these_samples_metadata = some_meta) +#' cn_list = assign_cn_to_ssm(some_meta,ssm_genomes_hg38) +#' +#' # Easiest/laziest way: +#' cn_list = assign_cn_to_ssm(projection = "grch37") +#' +#' +#' cn_list$maf %>% dplyr::group_by(Tumor_Sample_Barcode,CN) %>% +#' dplyr::count() #' assign_cn_to_ssm = function( - this_sample_id, - genes, - this_seq_type = "genome", - projection = "grch37", + these_samples_metadata, + maf_data, + seg_data, + projection, coding_only = FALSE, assume_diploid = FALSE, include_silent = FALSE, ... ){ - - #warn/notify the user what version of this function they are using - message("Using the bundled CN segments (.seg) calls in GAMBLR.data...") - + if(missing(these_samples_metadata)){ + stop("No metadata provided. these_samples_metadata is required") + } #check if any invalid parameters are provided check_excess_params(...) - - #ensure only one sample ID is provided - if(length(this_sample_id) > 1){ - stop( - "This function only supports queries of 1 sample ID at the time..." - ) + genomic_data = list() + if(!missing(maf_data)){ + genomic_data[["maf_data"]] = maf_data + } + if(!missing(seg_data)){ + genomic_data[["seg_data"]] = seg_data } - #get maf - maf_sample = get_ssm_by_sample( - this_sample_id = this_sample_id, + projection <- check_get_projection(genomic_data, suggested = projection) + + if(missing(seg_data)){ + seg_sample = get_cn_segments( + these_samples_metadata = these_samples_metadata, + projection = projection + ) + missing_from_seg = dplyr::filter(these_samples_metadata, + !sample_id %in% seg_sample$ID) %>% + pull(sample_id) %>% + unique() + if(length(missing_from_seg) == length(unique(these_samples_metadata$sample_id))){ + stop(paste("No seg_data could be found for ANY of the samples provided for",projection)) + } + if(length(missing_from_seg)){ + warning(paste("missing seg_data for",length(missing_from_seg),"samples")) + } + }else{ + seg_sample = seg_data + } + + if(missing(maf_data)){ + #get maf + maf_sample = get_ssm_by_samples( + these_samples_metadata = these_samples_metadata, projection = projection, - this_seq_type = this_seq_type - ) + ) + missing_from_maf = dplyr::filter(these_samples_metadata, + !sample_id %in% maf_sample$Tumor_Sample_Barcode) %>% + pull(sample_id) %>% + unique() + if(length(missing_from_maf) == length(unique(these_samples_metadata$sample_id))){ + stop(paste("No mutation could be found for ANY of the samples provided for",projection)) + } + if(length(missing_from_maf)){ + warning(paste("missing mutation for",length(missing_from_maf),"samples")) + } + }else{ + maf_sample = maf_data + } #maf filtering #silent mutations @@ -86,20 +143,7 @@ assign_cn_to_ssm = function( ) } - #subset to genes of interest - if(!missing(genes)){ - maf_sample = dplyr::filter(maf_sample, Hugo_Symbol %in% genes) - if(nrow(maf_sample) == 0){ - stop("No variants left after filtering on the provided genes...") - } - } - #get seg - seg_sample = get_sample_cn_segments( - these_sample_ids = this_sample_id, - projection = projection, - this_seq_type = this_seq_type - ) #annotate all CN segments as copy number neutral if(assume_diploid){ @@ -110,18 +154,21 @@ assign_cn_to_ssm = function( #wrangle the seg file seg_sample = seg_sample %>% dplyr::filter(end - start > 100) %>% - mutate(chrom = gsub("chr", "", chrom)) %>% rename( Chromosome = chrom, Start_Position = start, End_Position = end, - LOH = LOH_flag + LOH = LOH_flag, + Tumor_Sample_Barcode = ID ) %>% mutate(across(LOH, as.factor)) - + #perform an overlap join and add CN columns from the seg file and subset # MAF to basic columns (first 45) - maf_tmp = cool_overlaps(maf_sample, seg_sample, type = "any") + maf_tmp = cool_overlaps(maf_sample, seg_sample, + type = "any", + columns1=c("Chromosome","Start_Position","End_Position","Tumor_Sample_Barcode"), + columns2=c("Chromosome","Start_Position","End_Position","Tumor_Sample_Barcode")) #rename and change order of columns to match expected format maf_with_segs = maf_tmp %>% diff --git a/R/genomic_data.R b/R/genomic_data.R index e92b7c7..eada284 100644 --- a/R/genomic_data.R +++ b/R/genomic_data.R @@ -49,8 +49,15 @@ get_genome_build <- function(data) { #' @return A data frame with preserved genomic attributes. #' @export preserve_genomic_attributes <- function(new_data, old_data) { + # Preserve the genome_build attribute attr(new_data, "genome_build") <- attr(old_data, "genome_build") - class(new_data) <- class(old_data) + + # Combine the new data’s classes with the genomic classes + new_data_classes <- class(new_data) + # Ensure the genomic classes are at the front + new_classes <- unique(c("maf_data", "genomic_data", new_data_classes)) + class(new_data) <- new_classes + return(new_data) } diff --git a/R/get_sample_cn_segments.R b/R/get_sample_cn_segments.R deleted file mode 100644 index 17388ea..0000000 --- a/R/get_sample_cn_segments.R +++ /dev/null @@ -1,90 +0,0 @@ -#' @title Get Sample CN Segments. -#' -#' @description Get all segments for a single (or multiple) sample_id(s). -#' -#' @details This function returns CN segments. This works for single sample or multiple samples. -#' Specify the sample IDs you are interested in with `these_sample_ids` (as a vector of characters), -#' Or call this function with `these_samples_metadata` if you already have a metadata table subset to the sample IDs of interest. -#' If none of the above parameters are specified, the function will return CN segments for available samples (from get_gambl_metadata). -#' Note, this. function internally calls [GAMBLR.data::id_ease] for dealing with sample IDs and metadata tables. -#' -#' @param these_sample_ids Optional, a vector of multiple sample_id (or a single sample ID as a string) that you want results for. -#' @param these_samples_metadata Optional, a metadata table (with sample IDs in a column) to subset the return to. -#' If not provided (and if `these_sample_ids` is not provided), the function will return all samples from the specified seq_type in the metadata. -#' @param projection Selected genome projection for returned CN segments. Default is "grch37". -#' @param this_seq_type Seq type for returned CN segments. Default is genome. -#' @param with_chr_prefix Set to TRUE to add a chr prefix to chromosome names. Default is FALSE. -#' @param streamlined Return a minimal output rather than full details. Default is FALSE. -#' @param verbose Set to FALSE to minimize the output to console. Default is TRUE. This parameter also dictates the verbosity of any helper function internally called inside the main function. -#' @param ... Any additional parameters. -#' -#' @return A data frame of segments for a specific or multiple sample ID(s). -#' -#' @import dplyr -#' @export -#' -#' @examples -#' #load pacakges -#' library(dplyr) -#' -#' #get CN segments for one sample -#' dohh2_segs = get_sample_cn_segments(these_sample_ids = "DOHH-2", -#' projection = "hg38", -#' streamlined = TRUE) -#' -#' #get CN segments for DLBCL cell line -#' cell_line_meta = GAMBLR.data::sample_data$meta %>% -#' dplyr::filter(cohort == "DLBCL_cell_lines") -#' -#' dlbcl_segs = get_sample_cn_segments(these_samples_metadata = cell_line_meta, -#' streamlined = TRUE) -#' -get_sample_cn_segments = function(these_sample_ids = NULL, - these_samples_metadata = NULL, - projection = "grch37", - this_seq_type = "genome", - with_chr_prefix = FALSE, - streamlined = FALSE, - verbose = FALSE, - ...){ - - #warn/notify the user what version of this function they are using - message("Using the bundled CN segments (.seg) calls in GAMBLR.data...") - - #check if any invalid parameters are provided - check_excess_params(...) - - #get samples with the dedicated helper function - metadata = id_ease(these_samples_metadata = these_samples_metadata, - these_sample_ids = these_sample_ids, - verbose = verbose, - this_seq_type = this_seq_type) - - sample_ids = metadata$sample_id - - #get valid projections - valid_projections = grep("meta", names(GAMBLR.data::sample_data), value = TRUE, invert = TRUE) - - #return CN segments based on the selected projection - if(projection %in% valid_projections){ - all_segs = GAMBLR.data::sample_data[[projection]]$seg %>% - dplyr::filter(ID %in% sample_ids) - }else{ - stop(paste("please provide a valid projection. The following are available:", - paste(valid_projections,collapse=", "))) - } - - #deal with chr prefixes - if(!with_chr_prefix){ - all_segs = all_segs %>% - dplyr::mutate(chrom = gsub("chr", "", chrom)) - }else{ - if(!grepl("chr", all_segs$chrom[1])){ - all_segs$chrom = paste0("chr", all_segs$chrom) - } - } - - if(streamlined){all_segs = dplyr::select(all_segs, ID, CN)} - - return(all_segs) -} diff --git a/man/assign_cn_to_ssm.Rd b/man/assign_cn_to_ssm.Rd index e536b4b..2c44aed 100644 --- a/man/assign_cn_to_ssm.Rd +++ b/man/assign_cn_to_ssm.Rd @@ -5,10 +5,10 @@ \title{Assign CN to SSM.} \usage{ assign_cn_to_ssm( - this_sample_id, - genes, - this_seq_type = "genome", - projection = "grch37", + these_samples_metadata, + maf_data, + seg_data, + projection, coding_only = FALSE, assume_diploid = FALSE, include_silent = FALSE, @@ -16,23 +16,26 @@ assign_cn_to_ssm( ) } \arguments{ -\item{this_sample_id}{Sample ID of the sample you want to annotate.} +\item{these_samples_metadata}{Metadata table with one or more rows to specify +the samples to process.} -\item{genes}{A vector of characters with gene symbols (Hugo).} +\item{maf_data}{A data frame of mutations in MAF format or maf_data object +(e.g. from \code{get_coding_ssm} or \code{get_ssm_by_sample}).} -\item{this_seq_type}{Specified seq type for returned data. Default is genome.} +\item{seg_data}{A data frame of segmented copy number data or seg_data object} -\item{projection}{Specified genome projection that returned data is in -reference to. Default is grch37.} +\item{projection}{Specified genome projection that returned data is relative to. +This is only required when it cannot be inferred from maf_df or seg_df +(or they are not provided).} -\item{coding_only}{Optional. Set to TRUE to restrict to only coding variants -(ssm). Deafult is FALSE.} +\item{coding_only}{Optional. Set to TRUE to restrict to only variants in coding space +Default is to work with genome-wide variants.} \item{assume_diploid}{Optional, this parameter annotates every mutation as copy neutral. Default is FALSE.} \item{include_silent}{Logical parameter indicating whether to include silent -mutations into coding mutations. Default is FALSE. This parameter only +mutations in coding space. Default is FALSE. This parameter only makes sense if \code{coding_only} is set to TRUE.} \item{...}{Any additional parameters.} @@ -49,19 +52,41 @@ log.ratio, NA when no overlap was found). Annotate mutations with their copy number information. } \details{ -This function takes a sample ID with the \code{this_sample_id} parameter -and annotates mutations with copy number information. A variety of -parameters are at hand for a customized workflow. For example, -the user can specify if only coding mutations are of interest. To do so, -set \code{coding_only = TRUE}. This function internally calls -\code{get_ssm_by_samples} and \code{get_sample_cn_segments}. This function can -also take a vector with genes of interest (\code{genes}) that the returned -data frame will be restricted to. +This function takes a metadata table and returns all mutations +for the samples in that metadata. Each mutation is annotated with the +local copy number state of each mutated site. The user can specify if +only coding mutations are of interest. To do so, +set \code{coding_only = TRUE}. When necessary, this function relies on +\code{get_ssm_by_samples} and \code{get_cn_segments} to obtain the required data. } \examples{ -cn_list = assign_cn_to_ssm( - this_sample_id = "DOHH-2", - coding_only = TRUE -) +# long-handed way +# 1. get some metadata for a collection of samples +some_meta = get_gambl_metadata() \%>\% + dplyr::filter(cohort=="FL_Dreval", + grepl("SP",sample_id)) +# 2. Get the SSMs for these samples + +ssm_genomes_grch37 = get_coding_ssm(projection = "grch37", + these_samples_metadata = some_meta) +# peek at the results +ssm_genomes_grch37 \%>\% dplyr::select(1:8) + +# 3. Lazily let this function obtain the corresponding seg_data for the right genome_build +cn_list = assign_cn_to_ssm(some_meta,ssm_genomes_grch37) + +cn_list$maf \%>\% dplyr::select(1:8,log.ratio,CN) + +# This won't work because the hg38 seg_data is not bundled +ssm_genomes_hg38 = get_coding_ssm(projection = "hg38", + these_samples_metadata = some_meta) +cn_list = assign_cn_to_ssm(some_meta,ssm_genomes_hg38) + +# Easiest/laziest way: +cn_list = assign_cn_to_ssm(projection = "grch37") + + +cn_list$maf \%>\% dplyr::group_by(Tumor_Sample_Barcode,CN) \%>\% + dplyr::count() } diff --git a/man/get_sample_cn_segments.Rd b/man/get_sample_cn_segments.Rd deleted file mode 100644 index df0959c..0000000 --- a/man/get_sample_cn_segments.Rd +++ /dev/null @@ -1,65 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_sample_cn_segments.R -\name{get_sample_cn_segments} -\alias{get_sample_cn_segments} -\title{Get Sample CN Segments.} -\usage{ -get_sample_cn_segments( - these_sample_ids = NULL, - these_samples_metadata = NULL, - projection = "grch37", - this_seq_type = "genome", - with_chr_prefix = FALSE, - streamlined = FALSE, - verbose = FALSE, - ... -) -} -\arguments{ -\item{these_sample_ids}{Optional, a vector of multiple sample_id (or a single sample ID as a string) that you want results for.} - -\item{these_samples_metadata}{Optional, a metadata table (with sample IDs in a column) to subset the return to. -If not provided (and if \code{these_sample_ids} is not provided), the function will return all samples from the specified seq_type in the metadata.} - -\item{projection}{Selected genome projection for returned CN segments. Default is "grch37".} - -\item{this_seq_type}{Seq type for returned CN segments. Default is genome.} - -\item{with_chr_prefix}{Set to TRUE to add a chr prefix to chromosome names. Default is FALSE.} - -\item{streamlined}{Return a minimal output rather than full details. Default is FALSE.} - -\item{verbose}{Set to FALSE to minimize the output to console. Default is TRUE. This parameter also dictates the verbosity of any helper function internally called inside the main function.} - -\item{...}{Any additional parameters.} -} -\value{ -A data frame of segments for a specific or multiple sample ID(s). -} -\description{ -Get all segments for a single (or multiple) sample_id(s). -} -\details{ -This function returns CN segments. This works for single sample or multiple samples. -Specify the sample IDs you are interested in with \code{these_sample_ids} (as a vector of characters), -Or call this function with \code{these_samples_metadata} if you already have a metadata table subset to the sample IDs of interest. -If none of the above parameters are specified, the function will return CN segments for available samples (from get_gambl_metadata). -Note, this. function internally calls \link{id_ease} for dealing with sample IDs and metadata tables. -} -\examples{ -#load pacakges -library(dplyr) - -#get CN segments for one sample -dohh2_segs = get_sample_cn_segments(these_sample_ids = "DOHH-2", - projection = "hg38", - streamlined = TRUE) - -#get CN segments for DLBCL cell line -cell_line_meta = GAMBLR.data::sample_data$meta \%>\% - dplyr::filter(cohort == "DLBCL_cell_lines") - -dlbcl_segs = get_sample_cn_segments(these_samples_metadata = cell_line_meta, - streamlined = TRUE) - -} From 358aadc5c4a18b1ef41969b8ecd504f8e0c954fa Mon Sep 17 00:00:00 2001 From: rdmorin Date: Fri, 7 Feb 2025 14:42:40 -0800 Subject: [PATCH 07/19] helper function for genome_build automation --- R/genomic_data.R | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/R/genomic_data.R b/R/genomic_data.R index eada284..f8df597 100644 --- a/R/genomic_data.R +++ b/R/genomic_data.R @@ -1,5 +1,79 @@ # functions for creating and working with S3 objects +#' Check and set the genome_build/projection +#' +#' This helper function checks the genome build of each genomic data object in +#' \code{genomic_data_list} (using \code{get_genome_build()}) and ensures they are consistent. +#' If all objects share a single, unique genome build, that value is returned. If a +#' user-specified genome build (\code{suggested}) is provided, it is compared to the +#' inferred build and must match; otherwise, an error is raised. If the genomic data +#' objects have conflicting genome builds, or if no genome build can be inferred and +#' no \code{suggested} value is provided, the function stops with an error. +#' +#' @param genomic_data_list A list of genomic data objects. Each object should have a genome build +#' that can be retrieved by \code{get_genome_build()}. +#' @param suggested An optional character string specifying a genome build (projection) to be used. +#' If provided, it must match the genome build inferred from the data objects. +#' +#' @return A character string representing the genome build to be used. +#' @export +#' +#' @examples +#' # Example 1: When genomic data objects all have the same genome build. +#' # Assuming maf_data and seg_data both have a genome build of "hg38": +#' genomic_data <- list(maf_data = maf_data, seg_data = seg_data) +#' projection <- check_get_projection(genomic_data, suggested = "hg38") +#' +#' # Example 2: When the genomic data objects conflict or no genome build is available. +#' # This will throw an error: +#' genomic_data <- list(maf_data = maf_data, seg_data = seg_data_with_different_build) +#' projection <- check_get_projection(genomic_data, suggested = "hg38") +#' +check_get_projection <- function(genomic_data_list, suggested) { + # Extract genome builds from each genomic data object + builds <- sapply(genomic_data_list, get_genome_build) + uniq_builds <- unique(builds) + + if (length(uniq_builds) == 1) { + # A single, consistent genome build was inferred. + if (!missing(suggested) && suggested != uniq_builds) { + stop("Mismatch between user-specified genome_build and the genome_build inferred from objects.") + } + return(uniq_builds) + } + + if (length(uniq_builds) > 1) { + # Conflicting genome builds among the objects. + stop("Conflicting genome_build values found: ", paste(uniq_builds, collapse = ", ")) + } + + # No genome build could be inferred. + if (missing(suggested)) { + stop("No projection provided and genome_build cannot be inferred from the inputs.") + } + + return(suggested) +} + +check_get_projection <- function(genomic_data_list, suggested) { + builds <- sapply(genomic_data_list, get_genome_build) + uniq_builds <- unique(builds) + if (length(uniq_builds) == 1) { + # If a genome build can be inferred unambiguously + if (!missing(suggested) && suggested != uniq_builds) { + stop("Mismatch between user-specified genome_build and the genome_build inferred from objects.") + } + return(uniq_builds) + } + if (length(uniq_builds) > 1) { + stop("Conflicting genome_build values found: ", paste(uniq_builds, collapse = ", ")) + } + # If no genome build could be inferred (uniq_builds is empty) + if (missing(suggested)) { + stop("No projection provided and genome_build cannot be inferred from the inputs.") + } + return(suggested) +} #' Create MAF Data #' From e3685ee3afd1a1d31d79ced6673f4bd94355c3c1 Mon Sep 17 00:00:00 2001 From: rdmorin Date: Fri, 7 Feb 2025 15:30:37 -0800 Subject: [PATCH 08/19] fix namespace --- NAMESPACE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 2124a72..6ad118b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(bind_genomic_data) export(calc_mutation_frequency_bin_region) export(calc_mutation_frequency_bin_regions) export(check_excess_params) +export(check_get_projection) export(collate_results) export(cool_overlaps) export(create_bed_data) @@ -31,7 +32,6 @@ export(get_genes) export(get_genome_build) export(get_manta_sv) export(get_mapped_colours) -export(get_sample_cn_segments) export(get_ssm_by_patients) export(get_ssm_by_regions) export(get_ssm_by_samples) From 254b0aab5f57abe17ab59f71da6470ea8b57985c Mon Sep 17 00:00:00 2001 From: rdmorin Date: Fri, 7 Feb 2025 15:43:29 -0800 Subject: [PATCH 09/19] fix docs --- R/get_ssm_by_patients.R | 22 +++++++++++----------- man/get_ssm_by_patients.Rd | 20 ++++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/R/get_ssm_by_patients.R b/R/get_ssm_by_patients.R index 6f33702..d634ce9 100644 --- a/R/get_ssm_by_patients.R +++ b/R/get_ssm_by_patients.R @@ -29,18 +29,18 @@ #' @export #' #' @examples -#' #load packages -#' library(dplyr) #' -#' #basic usage, these_patient_ids -#' my_patient = get_ssm_by_patients(these_patient_ids = "DOHH-2") -#' -#' #using a subset metadata tablee to retreive patient SSMs -#' cell_line_meta = GAMBLR.data::sample_data$meta %>% -#' dplyr::filter(cohort == "DLBCL_cell_lines") -#' -#' patient_maf = get_ssm_by_patients(these_samples_metadata = cell_line_meta, -#' this_seq_type = "genome") +#' # Lets find which patient_id occur more than once in the metadata first +#' my_ids = get_gambl_metadata(seq_type_filter = c("genome","capture")) %>% +#' dplyr::group_by(patient_id) %>% +#' dplyr::tally() %>% +#' dplyr::filter(n>1) %>% +#' dplyr::pull(patient_id) +#' +#' #now let's get every SSM for all samples from these patients +#' patient_maf = get_ssm_by_patients(these_patient_ids = my_ids) +#' patient_maf %>% dplyr::group_by(Tumor_Sample_Barcode) %>% +#' dplyr::count() %>% head() #' get_ssm_by_patients = function(these_patient_ids, these_samples_metadata, diff --git a/man/get_ssm_by_patients.Rd b/man/get_ssm_by_patients.Rd index b24582c..2acd48a 100644 --- a/man/get_ssm_by_patients.Rd +++ b/man/get_ssm_by_patients.Rd @@ -51,17 +51,17 @@ This function expects either a vector of patient IDs (\code{these_patients_ids}) or an already subset metadata table (\code{these_samples_metadata}). } \examples{ -#load packages -library(dplyr) -#basic usage, these_patient_ids -my_patient = get_ssm_by_patients(these_patient_ids = "DOHH-2") +# Lets find which patient_id occur more than once in the metadata first +my_ids = get_gambl_metadata(seq_type_filter = c("genome","capture")) \%>\% + dplyr::group_by(patient_id) \%>\% + dplyr::tally() \%>\% + dplyr::filter(n>1) \%>\% + dplyr::pull(patient_id) -#using a subset metadata tablee to retreive patient SSMs -cell_line_meta = GAMBLR.data::sample_data$meta \%>\% - dplyr::filter(cohort == "DLBCL_cell_lines") - -patient_maf = get_ssm_by_patients(these_samples_metadata = cell_line_meta, - this_seq_type = "genome") +#now let's get every SSM for all samples from these patients +patient_maf = get_ssm_by_patients(these_patient_ids = my_ids) +patient_maf \%>\% dplyr::group_by(Tumor_Sample_Barcode) \%>\% + dplyr::count() \%>\% head() } From 95add6706b448f646e96589fd6af77082b1254b6 Mon Sep 17 00:00:00 2001 From: Ryan Morin Date: Fri, 7 Feb 2025 17:58:45 -0800 Subject: [PATCH 10/19] update docs and overhaul a few functions --- NAMESPACE | 1 + R/annotate_hotspots.R | 39 ++--- R/calc_mutation_frequency_bin_region.R | 27 ++-- R/cool_overlaps.R | 1 + R/data.R | 49 ++++++ R/genomic_data.R | 146 +++++++++++------- R/get_cn_segments.R | 68 ++++---- R/get_ssm_by_patients.R | 33 ++-- R/get_ssm_by_samples.R | 59 ++++--- man/annotate_hotspots.Rd | 13 +- man/bind_genomic_data.Rd | 1 + man/calc_mutation_frequency_bin_region.Rd | 19 ++- man/calc_mutation_frequency_bin_regions.Rd | 33 ++-- man/check_excess_params.Rd | 1 + man/check_get_projection.Rd | 44 ++++++ man/chromosome_arms_grch37.Rd | 2 +- man/chromosome_arms_hg38.Rd | 2 +- man/colour_codes.Rd | 2 +- man/cool_overlaps.Rd | 1 + man/create_bed_data.Rd | 30 ++-- man/create_seg_data.Rd | 5 +- man/cytobands_grch37.Rd | 2 +- man/cytobands_hg38.Rd | 2 +- man/dhitsig_genes_with_weights.Rd | 2 +- man/dlbcl90_genes.Rd | 2 +- man/gambl_metadata.Rd | 2 +- man/gene_blacklist.Rd | 2 +- man/get_cn_segments.Rd | 18 ++- man/get_genome_build.Rd | 1 + man/get_ssm_by_patients.Rd | 8 +- man/get_ssm_by_samples.Rd | 31 ++-- man/grch37_all_gene_coordinates.Rd | 2 +- man/grch37_gene_coordinates.Rd | 2 +- man/grch37_lymphoma_genes_bed.Rd | 2 +- man/grch37_partners.Rd | 2 +- man/hg38_gene_coordinates.Rd | 2 +- man/hg38_lymphoma_genes_bed.Rd | 2 +- man/hg38_partners.Rd | 2 +- man/hgnc2pfam.df.Rd | 2 +- man/hotspot_regions_grch37.Rd | 2 +- man/hotspot_regions_hg38.Rd | 2 +- man/hotspots_annotations.Rd | 2 +- man/lymphoma_genes_bl_v0.1.Rd | 2 +- man/lymphoma_genes_bl_v0.2.Rd | 2 +- man/lymphoma_genes_comprehensive.Rd | 2 +- man/lymphoma_genes_dlbcl_v0.1.Rd | 2 +- man/lymphoma_genes_dlbcl_v0.2.Rd | 2 +- man/lymphoma_genes_lymphoma_genes_v0.0.Rd | 2 +- man/lymphoma_genes_mcl_v0.1.Rd | 2 +- man/lymphoma_genes_mcl_v0.2.Rd | 2 +- man/mutation.table.df.Rd | 2 +- man/preserve_genomic_attributes.Rd | 1 + man/process_regions.Rd | 5 +- man/protein_domains.Rd | 2 +- man/reddy_genes.Rd | 2 +- man/review_hotspots.Rd | 2 +- man/sample_data.Rd | 2 +- ...tic_hypermutation_locations_GRCh37_v0.0.Rd | 2 +- ...tic_hypermutation_locations_GRCh37_v0.1.Rd | 2 +- ...tic_hypermutation_locations_GRCh37_v0.2.Rd | 2 +- ...tic_hypermutation_locations_GRCh37_v0.3.Rd | 2 +- ...tic_hypermutation_locations_GRCh37_v0.4.Rd | 2 +- ...tic_hypermutation_locations_GRCh37_v0.5.Rd | 2 +- ...hypermutation_locations_GRCh37_v_latest.Rd | 2 +- ...tic_hypermutation_locations_GRCh38_v0.0.Rd | 2 +- ...tic_hypermutation_locations_GRCh38_v0.1.Rd | 2 +- ...tic_hypermutation_locations_GRCh38_v0.2.Rd | 2 +- ...tic_hypermutation_locations_GRCh38_v0.3.Rd | 2 +- ...tic_hypermutation_locations_GRCh38_v0.4.Rd | 2 +- ...tic_hypermutation_locations_GRCh38_v0.5.Rd | 2 +- ...hypermutation_locations_GRCh38_v_latest.Rd | 2 +- man/strip_genomic_classes.Rd | 1 + man/target_regions_grch37.Rd | 2 +- man/target_regions_hg38.Rd | 2 +- man/wright_genes_with_weights.Rd | 2 +- 75 files changed, 446 insertions(+), 289 deletions(-) create mode 100644 man/check_get_projection.Rd diff --git a/NAMESPACE b/NAMESPACE index 6ad118b..9c0a5a9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,6 +43,7 @@ export(review_hotspots) export(strip_genomic_classes) import(dplyr) import(ggplot2) +import(parallel) import(purrr) import(tibble) import(tidyr) diff --git a/R/annotate_hotspots.R b/R/annotate_hotspots.R index 0edcb10..1da616d 100644 --- a/R/annotate_hotspots.R +++ b/R/annotate_hotspots.R @@ -1,8 +1,10 @@ #' @title Annotate Hotspots. #' -#' @description Annotate MAF-like data frome with a hot_spot column indicating recurrent mutations. +#' @description Annotate MAF-like data frome with a hot_spot column indicating +#' recurrent mutations. #' -#' @details This function takes an already loaded MAF data frame with the `mutation_maf` parameter. +#' @details This function takes an already loaded MAF data frame with the +#' `mutation_maf` parameter. #' #' @param mutation_maf A data frame in MAF format. #' @param ... Any other parameter. These parameters will be ignored. @@ -17,27 +19,28 @@ #' all_coding_ssm = get_coding_ssm(these_samples_metadata = my_metadata, #' projection = "grch37", #' this_seq_type = "genome") %>% -#' dplyr::filter(Hugo_Symbol %in% c("EZH2","MEF2B","MYD88","KMT2D")) %>% +#' dplyr::filter(Hugo_Symbol %in% c("EZH2", +#' "MEF2B","MYD88","KMT2D")) %>% #' dplyr::arrange(Hugo_Symbol) #' #' hot_ssms = annotate_hotspots(all_coding_ssm) -#' hot_ssms %>% dplyr::filter(!is.na(hot_spot)) %>% -#' dplyr::select(1:5,37,hot_spot) +#' hot_ssms %>% dplyr::filter(!is.na(hot_spot)) %>% +#' dplyr::select(1:5,37,hot_spot) #' annotate_hotspots = function( - mutation_maf, - ... -){ + mutation_maf, + ... +) { - # check if any invalid parameters are provided - check_excess_params(...) + # check if any invalid parameters are provided + check_excess_params(...) - filled_coords <- GAMBLR.data::hotspots_annotations - # just the ssms that match these coordinates! - hot_ssms <- left_join( - mutation_maf, - filled_coords, - by = c("Chromosome", "Start_Position") - ) - return(hot_ssms) + filled_coords <- GAMBLR.data::hotspots_annotations + # just the ssms that match these coordinates! + hot_ssms <- left_join( + mutation_maf, + filled_coords, + by = c("Chromosome", "Start_Position") + ) + return(hot_ssms) } diff --git a/R/calc_mutation_frequency_bin_region.R b/R/calc_mutation_frequency_bin_region.R index 1082a68..1bf9592 100644 --- a/R/calc_mutation_frequency_bin_region.R +++ b/R/calc_mutation_frequency_bin_region.R @@ -48,14 +48,13 @@ #' @export #' #' @examples -#' myc_mut_freq = calc_mutation_frequency_bin_region(region = "8:128747680-128753674", +#' myc_region = "8:128747680-128753674" +#' myc_mut_freq = calc_mutation_frequency_bin_region(region = myc_region, #' slide_by = 10, #' window_size = 10000) +#' dplyr::arrange(myc_mut_freq,desc(mutation_count)) #' calc_mutation_frequency_bin_region <- function(region, - chromosome, - start_pos, - end_pos, these_samples_metadata = NULL, these_sample_ids = NULL, this_seq_type = "genome", @@ -73,25 +72,19 @@ calc_mutation_frequency_bin_region <- function(region, check_excess_params(...) # Create objects to describe region both as string and individual objects - try(if (missing(region) & missing(chromosome)) { - stop("No region information provided. Please provide a region as a string in the chrom:start-end format, or as individual arguments. ") + try(if (missing(region)) { + stop("No region information provided. Please provide a region as a string in the chrom:start-end format") }) if ((drop_unmutated | min_count_per_bin > 0) & return_format == "wide") { message("To return a wide table, all samples and windows must be kept. Ignoring drop_unmutated and min_count_per_bin arguments. ") } - if (missing(region)) { - region <- paste0( - chromosome, ":", start_pos, "-", - end_pos - ) - } else { - chunks <- region_to_chunks(region) - chromosome <- chunks$chromosome - start_pos <- as.numeric(chunks$start) - end_pos <- as.numeric(chunks$end) - } + + chunks <- region_to_chunks(region) + chromosome <- chunks$chromosome + start_pos <- as.numeric(chunks$start) + end_pos <- as.numeric(chunks$end) # Harmonize metadata and sample IDs metadata <- id_ease( diff --git a/R/cool_overlaps.R b/R/cool_overlaps.R index 12d8289..e20b1c3 100644 --- a/R/cool_overlaps.R +++ b/R/cool_overlaps.R @@ -41,6 +41,7 @@ #' data will match the exact order of rows in the input data1. #' #' @return data frame +#' @keywords internal #' #' @examples #' # obtain maf data diff --git a/R/data.R b/R/data.R index b2bf0c9..4274270 100644 --- a/R/data.R +++ b/R/data.R @@ -40,6 +40,7 @@ #' \item{end}{End coordinates for the specified chromosome arm.} #' \item{arm}{Chromosome arm, either p or q.} #' } +#' @keywords internal "chromosome_arms_grch37" @@ -55,6 +56,7 @@ #' \item{end}{End coordinates for the specified chromosome arm.} #' \item{arm}{Chromosome arm, either p or q.} #' } +#' @keywords internal "chromosome_arms_hg38" @@ -69,6 +71,7 @@ #' \item{ImportanceScore}{Numeric column with importance scores.} #' \item{Hugo_Symbol}{Gene symbols in Hugo format as a factor with 104 levels.} #' } +#' @keywords internal "dhitsig_genes_with_weights" @@ -81,6 +84,7 @@ #' \describe{ #' \item{Gene}{Genes symbols in Hugo format.} #' } +#' @keywords internal "gene_blacklist" @@ -98,6 +102,7 @@ #' \item{gene_name}{The gene name} #' \item{hugo_symbol}{Gene symbol in Hugo format} #' } +#' @keywords internal "grch37_all_gene_coordinates" @@ -115,6 +120,7 @@ #' \item{gene_name}{The gene name} #' \item{hugo_symbol}{Gene symbol in Hugo format} #' } +#' @keywords internal "grch37_gene_coordinates" @@ -130,6 +136,7 @@ #' \item{end_position}{The end coordinate for the gene} #' \item{hgnc_symbol}{Gene symbol in Hugo format} #' } +#' @keywords internal "grch37_lymphoma_genes_bed" @@ -146,6 +153,7 @@ #' \item{gene}{Gene symbol in Hugo format} #' \item{entrez}{Entrez ID} #' } +#' @keywords internal "grch37_partners" #' hg38 Gene Coordinates. @@ -162,6 +170,7 @@ #' \item{gene_name}{The gene name} #' \item{hugo_symbol}{Gene symbol in Hugo format} #' } +#' @keywords internal "hg38_gene_coordinates" @@ -177,6 +186,7 @@ #' \item{end_position}{The end coordinate for the gene} #' \item{hgnc_symbol}{Gene symbol in Hugo format} #' } +#' @keywords internal "hg38_lymphoma_genes_bed" @@ -193,6 +203,7 @@ #' \item{gene}{Gene symbol in Hugo format} #' \item{entrez}{Entrez ID} #' } +#' @keywords internal "hg38_partners" @@ -207,6 +218,7 @@ #' \item{start}{The start coordinate for the region} #' \item{end}{The end coordinate for the region} #' } +#' @keywords internal "hotspot_regions_grch37" @@ -221,6 +233,7 @@ #' \item{start}{The start coordinate for the region} #' \item{end}{The end coordinate for the region} #' } +#' @keywords internal "hotspot_regions_hg38" @@ -241,6 +254,7 @@ #' \item{Lacy}{Boolean flag, TRUE if gene verified by the stated study (Lacy)} #' \item{aSHM}{Boolean flag for annotating aSHM} #' } +#' @keywords internal "lymphoma_genes_comprehensive" @@ -256,6 +270,7 @@ #' \item{Approved name}{Approved name} #' \item{HGNC ID}{HGNC ID} #' } +#' @keywords internal "reddy_genes" @@ -270,6 +285,7 @@ #' \item{start}{Start coordinate of the region} #' \item{end}{End coordiante of the region} #' } +#' @keywords internal "target_regions_grch37" @@ -284,6 +300,7 @@ #' \item{start}{Start coordinate of the region} #' \item{end}{End coordiante of the region} #' } +#' @keywords internal "target_regions_hg38" @@ -298,6 +315,7 @@ #' \item{Hugo_Symbol}{Gene symbol in Hugo format} #' \item{Weight_tValue}{Weight Value for the specified gene} #' } +#' @keywords internal "wright_genes_with_weights" @@ -315,6 +333,7 @@ #' } #' @examples #' mutation.table.df +#' @keywords internal "mutation.table.df" #' Mapping table between gene.symbol, uniprot.id, and pfam @@ -337,6 +356,7 @@ #' @examples #' hgnc2pfam.df #' @source Pfam (v31.0) and UniProt +#' @keywords internal "hgnc2pfam.df" @@ -353,6 +373,7 @@ #' \item{colour}{Colour annotated in HEX format.} #' \item{is_alias}{Describes if the colour has an alias (yes) or not (NA)} #' } +#' @keywords internal "colour_codes" @@ -409,6 +430,7 @@ #' \item{n_BL_Panea_original}{Total number of mutated tumors as originally reported in Panea study.} #' \item{frequency_BL_Panea_original}{Frequency of mutation as originally reported in Panea study.} #' } +#' @keywords internal "lymphoma_genes_bl_v0.1" @@ -429,6 +451,7 @@ #' \item{MutationEffect}{Annotates the effect of the gene mutation.} #' \item{Mutation.PMID}{Pubmed ID to associated study where mutation effect is described.} #' } +#' @keywords internal "lymphoma_genes_bl_v0.2" #' Lymphoma Genes BL Latest @@ -472,6 +495,7 @@ #' \item{common_alias}{Variable annotating other common aliases for the event, if such exists.} #' \item{noncoding_driver_support}{Boolean variable annotating if the event has noncoding driver support or not.} #' } +#' @keywords internal "lymphoma_genes_dlbcl_v0.1" #' Lymphoma Genes DLBCL v0.2 @@ -493,6 +517,7 @@ #' \item{MutationEffect.citekey}{Alphanumeric representation of the citekey to associated study where mutation effect is described.} #' \item{Mutation.PMID}{Whether this gene is a feature in LymphGen classifier.} #' } +#' @keywords internal "lymphoma_genes_dlbcl_v0.2" #' Lymphoma Genes DLBCL Latest @@ -540,6 +565,7 @@ #' \item{Chappuy}{Boolean variable stating if the event is described in the study (Chappuy).} #' \item{entrezgene_id}{Gene ID in entrez fromat.} #' } +#' @keywords internal "lymphoma_genes_lymphoma_genes_v0.0" @@ -564,6 +590,7 @@ #' \item{noncoding_driver_support}{Boolean variable annotating if the event has noncoding driver support or not.} #' \item{aSHM}{Boolean varaible annotating if the event is considered an aSHM or not.} #' } +#' @keywords internal "lymphoma_genes_mcl_v0.1" #' Lymphoma Genes MCL v0.2 @@ -591,6 +618,7 @@ #' \item{Earliest_support}{The earlist study to describe this gene to be mutated in MCL.} #' \item{citekey}{Alphanumeric representation of the citekey where this gene was first described.} #' } +#' @keywords internal "lymphoma_genes_mcl_v0.2" @@ -636,6 +664,7 @@ #' \item{grch37}{A list containing 3 data frames; maf, seg, and bedpe. All in respect to grch37.} #' \item{hg38}{A list containing 3 data frames; maf, seg, and bedpe. All in respect to hg38.} #' } +#' @keywords internal "sample_data" @@ -653,6 +682,7 @@ #' \item{region}{Region name.} #' \item{regulatory_comment}{Annotates region with regulatory information.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh37_v0.0" @@ -670,6 +700,7 @@ #' \item{region}{Region name.} #' \item{regulatory_comment}{Annotates region with regulatory information.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh37_v0.1" @@ -687,6 +718,7 @@ #' \item{region}{Region name.} #' \item{regulatory_comment}{Annotates region with regulatory information.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh37_v0.2" @@ -704,6 +736,7 @@ #' \item{region}{Region name.} #' \item{regulatory_comment}{Annotates region with regulatory information.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh37_v0.3" #' Somatic Hypermutation Locations GRCh37 v0.4 @@ -720,6 +753,7 @@ #' \item{region}{Region name.} #' \item{regulatory_comment}{Annotates region with regulatory information.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh37_v0.4" #' Somatic Hypermutation Locations GRCh37 v0.5 @@ -736,6 +770,7 @@ #' \item{region}{Region name.} #' \item{regulatory_comment}{Annotates region with regulatory information.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh37_v0.5" #' Somatic Hypermutation Locations GRCh37 Latest @@ -752,6 +787,7 @@ #' \item{region}{Region name.} #' \item{regulatory_comment}{Annotates region with regulatory information.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh37_v_latest" @@ -770,6 +806,7 @@ #' \item{regulatory_comment}{Annotates region with regulatory information.} #' \item{name}{Location name.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh38_v0.0" @@ -788,6 +825,7 @@ #' \item{regulatory_comment}{Annotates region with regulatory information.} #' \item{name}{Location name.}v #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh38_v0.1" @@ -805,6 +843,7 @@ #' \item{region}{Region name.} #' \item{regulatory_comment}{Annotates region with regulatory information.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh38_v0.2" @@ -822,6 +861,7 @@ #' \item{region}{Region name.} #' \item{regulatory_comment}{Annotates region with regulatory information.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh38_v0.3" @@ -839,6 +879,7 @@ #' \item{region}{Region name.} #' \item{regulatory_comment}{Annotates region with regulatory information.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh38_v0.4" #' Somatic Hypermutation Locations GRCh38 v0.5 @@ -855,6 +896,7 @@ #' \item{region}{Region name.} #' \item{regulatory_comment}{Annotates region with regulatory information.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh38_v0.5" #' Somatic Hypermutation Locations GRCh38 Latest @@ -871,6 +913,7 @@ #' \item{region}{Region name.} #' \item{regulatory_comment}{Annotates region with regulatory information.} #' } +#' @keywords internal "somatic_hypermutation_locations_GRCh38_v_latest" @@ -909,6 +952,7 @@ #' \item{sex}{Female or Male} #' \item{time_point}{Smaple timepoint.} #' } +#' @keywords internal "gambl_metadata" @@ -955,6 +999,7 @@ #' \item{Chromosome}{Name of gene chromosome.} #' \item{hot_spot}{Hot spot annotation.} #' } +#' @keywords internal "hotspots_annotations" @@ -979,6 +1024,7 @@ #' \item{Description}{Description of the domain.} #' \item{Description}{NA.} #' } +#' @keywords internal "protein_domains" #' Cytobands coordinates (grch37) @@ -994,6 +1040,7 @@ #' \item{cb.name}{Cytoband name.} #' \item{label}{Cytoband label.} #' } +#' @keywords internal "cytobands_grch37" #' Cytobands coordinates (hg38) @@ -1009,6 +1056,7 @@ #' \item{cb.name}{Cytoband name.} #' \item{label}{Cytoband label.} #' } +#' @keywords internal "cytobands_hg38" #' DLBCL90 genes @@ -1025,4 +1073,5 @@ #' \item{gene_id}{ENSEMBL gene id with version.} #' \item{hgnc_symbol}{Human-readable gene symbol matching Gencode 33.} #' } +#' @keywords internal "dlbcl90_genes" diff --git a/R/genomic_data.R b/R/genomic_data.R index f8df597..d56e5a0 100644 --- a/R/genomic_data.R +++ b/R/genomic_data.R @@ -3,30 +3,34 @@ #' Check and set the genome_build/projection #' #' This helper function checks the genome build of each genomic data object in -#' \code{genomic_data_list} (using \code{get_genome_build()}) and ensures they are consistent. -#' If all objects share a single, unique genome build, that value is returned. If a -#' user-specified genome build (\code{suggested}) is provided, it is compared to the -#' inferred build and must match; otherwise, an error is raised. If the genomic data -#' objects have conflicting genome builds, or if no genome build can be inferred and -#' no \code{suggested} value is provided, the function stops with an error. +#' \code{genomic_data_list} (using \code{get_genome_build()}) and ensures +#' they are consistent. If all objects share a single, unique genome build, +#' that value is returned. If a user-specified genome build (\code{suggested}) +#' is provided, it is compared to the inferred build and must match; otherwise, +#' an error is raised. If the genomic data objects have conflicting genome +#' builds or if no genome build can be inferred and no \code{suggested} +#' value is provided, the function stops with an error. #' -#' @param genomic_data_list A list of genomic data objects. Each object should have a genome build -#' that can be retrieved by \code{get_genome_build()}. -#' @param suggested An optional character string specifying a genome build (projection) to be used. -#' If provided, it must match the genome build inferred from the data objects. +#' @param genomic_data_list A list of genomic data objects. Each object should +#' have a genome build that can be retrieved by \code{get_genome_build()}. +#' @param suggested An optional character string specifying a genome build +#' (projection) to be used. If provided, it must match the genome build inferred +#' from the data objects. #' #' @return A character string representing the genome build to be used. #' @export -#' +#' @keywords internal #' @examples #' # Example 1: When genomic data objects all have the same genome build. #' # Assuming maf_data and seg_data both have a genome build of "hg38": #' genomic_data <- list(maf_data = maf_data, seg_data = seg_data) #' projection <- check_get_projection(genomic_data, suggested = "hg38") #' -#' # Example 2: When the genomic data objects conflict or no genome build is available. -#' # This will throw an error: -#' genomic_data <- list(maf_data = maf_data, seg_data = seg_data_with_different_build) +#' # Example 2: When the genomic data objects conflict or no genome build +#' # is available. +#' # This will raise an error: +#' genomic_data <- list(maf_data = maf_data, +#' seg_data = seg_data_with_different_build) #' projection <- check_get_projection(genomic_data, suggested = "hg38") #' check_get_projection <- function(genomic_data_list, suggested) { @@ -37,44 +41,49 @@ check_get_projection <- function(genomic_data_list, suggested) { if (length(uniq_builds) == 1) { # A single, consistent genome build was inferred. if (!missing(suggested) && suggested != uniq_builds) { - stop("Mismatch between user-specified genome_build and the genome_build inferred from objects.") + stop("Mismatch between user-specified genome_build and + the genome_build inferred from objects.") } return(uniq_builds) } if (length(uniq_builds) > 1) { # Conflicting genome builds among the objects. - stop("Conflicting genome_build values found: ", paste(uniq_builds, collapse = ", ")) + stop("Conflicting genome_build values found: ", + paste(uniq_builds, collapse = ", ")) } - + # No genome build could be inferred. if (missing(suggested)) { - stop("No projection provided and genome_build cannot be inferred from the inputs.") + stop("No projection provided and genome_build + cannot be inferred from the inputs.") } - + return(suggested) } -check_get_projection <- function(genomic_data_list, suggested) { - builds <- sapply(genomic_data_list, get_genome_build) - uniq_builds <- unique(builds) - if (length(uniq_builds) == 1) { - # If a genome build can be inferred unambiguously - if (!missing(suggested) && suggested != uniq_builds) { - stop("Mismatch between user-specified genome_build and the genome_build inferred from objects.") - } - return(uniq_builds) - } - if (length(uniq_builds) > 1) { - stop("Conflicting genome_build values found: ", paste(uniq_builds, collapse = ", ")) - } - # If no genome build could be inferred (uniq_builds is empty) - if (missing(suggested)) { - stop("No projection provided and genome_build cannot be inferred from the inputs.") - } - return(suggested) +## GAMBLR.data +#' Create Segmented Data +#' +#' This function creates segmented data from the given input. +#' +#' @param seg_df A data frame containing the segmented data. +#' @param genome_build Required character vector specifying the genome build +#' currently supported: "grch37" or "hg38". +#' @return A data frame with class attributes for segmented data. +#' @export +#' @examples +#' seg_df <- data.frame(...) +#' create_seg_data(seg_df, "grch37") +create_seg_data <- function(seg_df, genome_build) { + if (!inherits(seg_df, "data.frame")) stop("data must be a data frame") + if (!genome_build %in% c("grch37", "hg38")) stop("Invalid genome build") + structure(seg_df, + class = c("seg_data", class(seg_df)), + genome_build = genome_build) } + #' Create MAF Data #' #' This function creates MAF (Mutation Annotation Format) data from the given input. @@ -93,6 +102,7 @@ create_maf_data <- function(maf_df, genome_build) { } #' @export +#' @keywords internal print.maf_data <- function(x, ...) { cat("MAF Data Object\n") cat("Genome Build:", attr(x, "genome_build"), "\n") @@ -110,6 +120,7 @@ print.maf_data <- function(x, ...) { #' @param data A data frame with genome build attribute. #' @return A string specifying the genome build. #' @export +#' @keywords internal get_genome_build <- function(data) { attr(data, "genome_build") } @@ -122,6 +133,7 @@ get_genome_build <- function(data) { #' @param old_data The original data frame with genomic attributes. #' @return A data frame with preserved genomic attributes. #' @export +#' @keywords internal preserve_genomic_attributes <- function(new_data, old_data) { # Preserve the genome_build attribute attr(new_data, "genome_build") <- attr(old_data, "genome_build") @@ -147,6 +159,7 @@ preserve_genomic_attributes <- function(new_data, old_data) { #' c("genomic_data", "maf_data", "bed_data"). #' @return The object with the specified classes removed. #' @export +#' @keywords internal strip_genomic_classes <- function(x, classes = c("genomic_data", "maf_data", "bed_data")) { current_classes <- class(x) new_classes <- setdiff(current_classes, classes) @@ -157,31 +170,37 @@ strip_genomic_classes <- function(x, classes = c("genomic_data", "maf_data", "be # S3 methods for genomic_data class #' @export +#' @keywords internal mutate.genomic_data <- function(.data, ...) { new_data <- dplyr::mutate(as.data.frame(.data), ...) preserve_genomic_attributes(new_data, .data) } #' @export +#' @keywords internal filter.genomic_data <- function(.data, ...) { new_data <- dplyr::filter(as.data.frame(.data), ...) preserve_genomic_attributes(new_data, .data) } #' @export +#' @keywords internal select.genomic_data <- function(.data, ...) { new_data <- dplyr::select(as.data.frame(.data), ...) preserve_genomic_attributes(new_data, .data) } #' @export +#' @keywords internal rename.genomic_data <- function(.data, ...) { new_data <- dplyr::rename(as.data.frame(.data), ...) preserve_genomic_attributes(new_data, .data) } #' @export +#' @keywords internal arrange.genomic_data <- function(.data, ...) { new_data <- dplyr::arrange(as.data.frame(.data), ...) preserve_genomic_attributes(new_data, .data) } #' @export +#' @keywords internal group_by.genomic_data <- function(.data, ..., .add = FALSE) { new_data <- dplyr::group_by(as.data.frame(.data), ..., .add = .add) preserve_genomic_attributes(new_data, .data) @@ -209,8 +228,9 @@ ungroup.genomic_data <- function(x, ...) { #' #' merged_maf = bind_genomic_data(maf1, maf2,check_id=FALSE) #' +#' @keywords internal bind_genomic_data <- function(..., check_id = TRUE) { - + in_list <- list(...) if ("maf_data" %in% class(in_list[[1]])) { @@ -224,7 +244,8 @@ bind_genomic_data <- function(..., check_id = TRUE) { } # Ensure all inputs are either maf_data or seg_data objects - if (!all(sapply(in_list, inherits, "maf_data")) && !all(sapply(in_list, inherits, "seg_data"))) { + if (!all(sapply(in_list, inherits, "maf_data")) && + !all(sapply(in_list, inherits, "seg_data"))) { stop("All inputs must be maf_data objects or seg_data objects.") } @@ -232,11 +253,13 @@ bind_genomic_data <- function(..., check_id = TRUE) { genome_builds <- unique(sapply(in_list, get_genome_build)) if (length(genome_builds) > 1) { - stop("Cannot bind seg_data or maf_data objects with different genome builds: ", + stop("Cannot bind seg_data or maf_data objects + with different genome builds: ", paste(genome_builds, collapse = ", ")) } - # If check_id is TRUE, verify that the expected ID column exists and that IDs are unique. + # If check_id is TRUE, verify that the expected ID column exists and + # that IDs are unique. if (check_id) { # Collect unique sample IDs from each dataset id_sets <- lapply(in_list, function(df) { @@ -252,15 +275,18 @@ bind_genomic_data <- function(..., check_id = TRUE) { # If any ID is found in multiple datasets, throw an error if (length(duplicate_ids) > 0) { - stop("Duplicate IDs found in multiple input data frames: ", paste(duplicate_ids, collapse = ", ")) + stop("Duplicate IDs found in multiple input data frames: ", + paste(duplicate_ids, collapse = ", ")) } } combined <- dplyr::bind_rows(in_list) - attr(combined, "genome_build") <- genome_builds[1] # Assign the common genome build + attr(combined, "genome_build") <- genome_builds[1] +# Assign the common genome build if (!"maf_data" %in% class(combined)) { - class(combined) <- c("maf_data", "genomic_data", class(combined)) # Preserve class + class(combined) <- c("maf_data", "genomic_data", class(combined)) +# Preserve class } return(combined) @@ -294,13 +320,19 @@ bind_genomic_data <- function(..., check_id = TRUE) { #' where they occur. #' #' @param bed_df A data frame containing the BED data. -#' @param genome_build A string specifying the genome build ("grch37" or "hg38"). -#' If NULL, the function will try to infer the genome build from the object name. -#' @param fix_names Either NULL (the default), or one of "chrom_start_end" or "concat". -#' If not NULL and duplicate names are detected, the function will apply the chosen fix. -#' @param concat_cols When `fix_names = "concat"`, a character vector specifying which columns +#' @param genome_build A string specifying the genome build +#' ("grch37" or "hg38"). +#' If NULL, the function will try to infer the genome build +#' from the object name. +#' @param fix_names Either NULL (the default), or one of "chrom_start_end" +#' or "concat". +#' If not NULL and duplicate names are detected, the function will +#' apply the chosen fix. +#' @param concat_cols When `fix_names = "concat"`, a character vector +#' specifying which columns #' from the original data to merge. -#' @param sep The separator to use when concatenating columns if fix_names = "concat". +#' @param sep The separator to use when concatenating columns if +#' fix_names = "concat". #' Defaults to "" (no separator). #' @return A data frame with class attributes for BED data. #' @@ -314,15 +346,13 @@ bind_genomic_data <- function(..., check_id = TRUE) { #' concat_cols = c("gene","region"), #' sep="-") #' # the build is automatically inferred if it is in the variable name -#' # get_genome_build(ashm_bed) -#' # [1] "grch37" -#' +#' get_genome_build(ashm_bed) +#' print(ashm_bed) #' another_bed = create_bed_data(somatic_hypermutation_locations_GRCh37_v_latest, #' fix_names = "concat", #' concat_cols = c("chr_name","hg19_start","hg19_end")) #' -#' # get_genome_build(another_bed) -#' # [1] "grch37" +#' get_genome_build(another_bed) #' #' # get a bed_data object for all gene regions and combine several columns to make a unique name #' gene_regions <- create_bed_data(hg38_gene_coordinates, @@ -330,10 +360,8 @@ bind_genomic_data <- function(..., check_id = TRUE) { #' sep="-", #' concat_cols = c("chromosome","start","end","gene_name")) #' -#' #get_genome_build(gene_regions) -#' # [1] "hg38" -#' -#' +#' get_genome_build(gene_regions) +#' create_bed_data <- function(bed_df, genome_build = NULL, fix_names = NULL, diff --git a/R/get_cn_segments.R b/R/get_cn_segments.R index c3cecb0..18174e9 100644 --- a/R/get_cn_segments.R +++ b/R/get_cn_segments.R @@ -1,33 +1,16 @@ -## GAMBLR.data -#' Create Segmented Data -#' -#' This function creates segmented data from the given input. -#' -#' @param seg_df A data frame containing the segmented data. -#' @param genome_build A string specifying the genome build ("grch37" or "hg38"). -#' @return A data frame with class attributes for segmented data. -#' @export -#' @examples -#' seg_df <- data.frame(...) -#' create_seg_data(seg_df, "grch37") -create_seg_data <- function(seg_df, genome_build) { - if (!inherits(seg_df, "data.frame")) stop("data must be a data frame") - if (!genome_build %in% c("grch37", "hg38")) stop("Invalid genome build") - - structure(seg_df, - class = c("seg_data", class(seg_df)), - genome_build = genome_build) -} - #' @title Get CN Segments. #' #' @description Retrieve all copy number segments from the GAMBL outputs #' -#' @details This function merely loads and returns all the seg_data available for a projection (genome build) -#' @param these_samples_metadata User must provide a metadata table to restrict the data to the samples in your table. -#' The metadata also ensures the proper handling of duplicate sample_id across seq_types and ensures the -#' seq_type in the metadata faithfully represents the seq_type of the data -#' @param projection Desired genome coordinate system for returned CN segments. Default is "grch37". +#' @details This function merely loads and returns all the seg_data +#' available for a projection (genome build) +#' @param these_samples_metadata User must provide a metadata table to +#' restrict the data to the samples in your table. +#' The metadata also ensures the proper handling of duplicate sample_id +#' across seq_types and ensures the seq_type in the metadata faithfully +#' represents the seq_type of the data +#' @param projection Desired genome coordinate system for returned CN segments. +#' Default is "grch37". #' @param this_seq_type Deprecated. #' @param ... Additional parameters to be passed to the function. #' @@ -38,9 +21,9 @@ create_seg_data <- function(seg_df, genome_build) { #' #' @examples #' # Example for the capture samples: -#' -#' genome_metadata = GAMBLR.data::get_gambl_metadata(seq_type_filter="genome") -#' +#' +#' genome_metadata = get_gambl_metadata(seq_type_filter="genome") +#' #' genome_segments_hg38 = get_cn_segments( #' these_samples_metadata = genome_metadata, #' projection="hg38") @@ -48,7 +31,7 @@ create_seg_data <- function(seg_df, genome_build) { #' get_cn_segments = function(these_samples_metadata, projection = "grch37", - this_seq_type,...){ + this_seq_type, ...) { #warn/notify the user what version of this function they are using message("Using the bundled CN segments (.seg) calls in GAMBLR.data...") @@ -56,34 +39,35 @@ get_cn_segments = function(these_samples_metadata, check_excess_params(...) #get valid projections - valid_projections = grep("meta", names(GAMBLR.data::sample_data), value = TRUE, invert = TRUE) + valid_projections = grep("meta", names(GAMBLR.data::sample_data), + value = TRUE, invert = TRUE) metadata = these_samples_metadata sample_ids = metadata$sample_id #return CN segments based on the selected projection - if(projection %in% valid_projections){ + if (projection %in% valid_projections) { all_segs = GAMBLR.data::sample_data[[projection]]$seg %>% dplyr::filter(ID %in% sample_ids) - }else{ - stop(paste("please provide a valid projection. The following are available:", - paste(valid_projections,collapse=", "))) + }else { + stop(paste("please provide a valid projection.", + paste(valid_projections, collapse = ", "))) } - #ensure chr prefixes are there when necessary - if(projection=="grch37"){ - if(grepl("chr",all_segs$chrom[1])){ + #ensure chr prefixes are there when necessary + if(projection == "grch37") { + if(grepl("chr", all_segs$chrom[1])) { all_segs = all_segs %>% dplyr::mutate(chrom = gsub("chr", "", chrom)) } - }else{ - if(!grepl("chr",all_segs$chrom[1])){ + }else { + if (!grepl("chr",all_segs$chrom[1])) { all_segs = all_segs %>% dplyr::mutate(chrom = paste0("chr", chrom)) } } - #return S3 class with CN segments and genome_build - all_segs = create_seg_data(all_segs,projection) + #return S3 class with CN segments and genome_build + all_segs = create_seg_data(all_segs, projection) return(all_segs) } diff --git a/R/get_ssm_by_patients.R b/R/get_ssm_by_patients.R index d634ce9..817e1fe 100644 --- a/R/get_ssm_by_patients.R +++ b/R/get_ssm_by_patients.R @@ -31,12 +31,12 @@ #' @examples #' #' # Lets find which patient_id occur more than once in the metadata first -#' my_ids = get_gambl_metadata(seq_type_filter = c("genome","capture")) %>% -#' dplyr::group_by(patient_id) %>% -#' dplyr::tally() %>% -#' dplyr::filter(n>1) %>% +#' my_ids = get_gambl_metadata(seq_type_filter = c("genome","capture")) %>% +#' dplyr::group_by(patient_id) %>% +#' dplyr::tally() %>% +#' dplyr::filter(n>1) %>% #' dplyr::pull(patient_id) -#' +#' #' #now let's get every SSM for all samples from these patients #' patient_maf = get_ssm_by_patients(these_patient_ids = my_ids) #' patient_maf %>% dplyr::group_by(Tumor_Sample_Barcode) %>% @@ -49,21 +49,24 @@ get_ssm_by_patients = function(these_patient_ids, tool_name = "slms-3", this_study, verbose = FALSE, - ...){ + ...) { #check if any invalid parameters are provided check_excess_params(...) #figure out what patients the user wants - if(missing(these_patient_ids)){ - if(missing(these_samples_metadata)){ - stop("You must provide either patient IDs (`these_patient_ids`) or a metadata table with the patient IDs of interest (`these_samples_metadata`)...") + if(missing(these_patient_ids)) { + if(missing(these_samples_metadata)) { + stop("You must provide patient IDs (`these_patient_ids`)or a metadata + table with the patient IDs of interest (`these_samples_metadata`)...") }else{ - message("No patient IDs were provided, this function will resort to all available patient IDs in the provided metadata.") + message("No patient IDs were provided, this function will resort to + all available patient IDs in the provided metadata.") } }else{ if(missing(these_samples_metadata)){ - these_samples_metadata = GAMBLR.data::get_gambl_metadata(seq_type_filter = this_seq_type) + these_samples_metadata = GAMBLR.data::get_gambl_metadata(seq_type_filter = + this_seq_type) } message("Patient IDs and metadata were provided, this function will resort to all available patient IDs in the provided metadata.") these_samples_metadata = these_samples_metadata %>% @@ -71,13 +74,15 @@ get_ssm_by_patients = function(these_patient_ids, } #run get_ssm_by_samples with these_samples_metadata parameter - samples_ssm = GAMBLR.data::get_ssm_by_samples(these_samples_metadata = these_samples_metadata, - projection = projection, - this_seq_type = this_seq_type, + samples_ssm = get_ssm_by_samples(these_samples_metadata = these_samples_metadata, + projection = projection, + this_seq_type = this_seq_type, tool_name = tool_name, verbose = verbose, ...) + samples_ssm = create_maf_data(samples_ssm,projection) # use S3-safe version of dplyr function + samples_ssm = mutate.genomic_data(samples_ssm,maf_seq_type = this_seq_type) } diff --git a/R/get_ssm_by_samples.R b/R/get_ssm_by_samples.R index 5c93669..2aeb5fb 100644 --- a/R/get_ssm_by_samples.R +++ b/R/get_ssm_by_samples.R @@ -1,17 +1,25 @@ #' @title Get SSM By Samples. #' -#' @description Get the SSMs (i.e. load MAF) for a single sample or a collection of samples. +#' @description Get the SSMs (i.e. load MAF) for a single sample or a +#' collection of samples. #' #' @details Retrieve a maf for a specific sample or a set of samples. #' Either specify the sample IDs of interest with `these_sample_ids`. -#' Or a metadata table subset to the sample IDs of interest with `these_samples_metadata`. +#' Or a metadata table subset to the sample IDs of interest with +#' `these_samples_metadata`. #' -#' @param these_sample_ids A vector of one or more sample IDs that you want results for. -#' @param these_samples_metadata Optional, a metadata table (with sample IDs in a column) to auto-subset the data to samples in that table before returning. -#' If not provided and these_sample_ids is also not provided, the function will return SSM for all samples from the specified seq_type in the bundled metadata. +#' @param these_sample_ids A vector of one or more sample IDs that you +#' want results for. +#' @param these_samples_metadata Optional, a metadata table (with sample_id +#' column) to auto-subset the data to samples in that table before returning. +#' If not provided and these_sample_ids is also not provided, the function will +#' return SSM for all samples from the specified seq_type in the bundled +#' metadata. #' @param this_seq_type Default is genome. #' @param projection The projection genome build. Supports hg38 and grch37. -#' @param tool_name Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers. +#' @param tool_name Optionally specify which tool to report variant from. +#' The default is slms-3, also supports "publication" to return the exact +#' variants as reported in the original papers. #' @param verbose Enable for debugging/noisier output. #' @param ... Any additional parameters. #' @@ -22,22 +30,27 @@ #' @export #' #' @examples -#' #load a common dependency -#' library(dplyr) #' #' #Get genome-wide set of mutations from all DLBCL cell lines -#' cell_line_meta = get_gambl_metadata() %>% +#' +#' # 1. get our metadata for the DLBCL cell lines +#' cell_line_meta = get_gambl_metadata() %>% #' dplyr::filter(cohort == "DLBCL_cell_lines") #' +#' # 2. get the SSMs for the DLBCL cell lines #' dlbcl_maf = get_ssm_by_samples(these_samples_metadata = cell_line_meta) #' +#' # 3. have a look: +#' dlbcl_maf %>% dplyr::group_by(Tumor_Sample_Barcode) %>% +#' dplyr::count() +#' get_ssm_by_samples <- function(these_sample_ids = NULL, these_samples_metadata = NULL, this_seq_type = "genome", projection = "grch37", tool_name = "slms-3", verbose = FALSE, - ...){ + ...) { #warn/notify the user what version of this function they are using message("Using the bundled SSM calls (.maf) calls in GAMBLR.data...") @@ -54,29 +67,33 @@ get_ssm_by_samples <- function(these_sample_ids = NULL, sample_ids = metadata$sample_id #get valid projections - valid_projections = grep("meta", names(GAMBLR.data::sample_data), value = TRUE, invert = TRUE) + valid_projections = grep("meta", names(GAMBLR.data::sample_data), + value = TRUE, invert = TRUE) #return SSMs based on the selected projection - if(projection %in% valid_projections){ + if(projection %in% valid_projections) { sample_ssm = GAMBLR.data::sample_data[[projection]]$maf %>% + dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% + dplyr::filter((tolower(!!sym("Pipeline")) == tool_name)) + sample_ssm <- bind_rows(sample_ssm, + GAMBLR.data::sample_data[[projection]]$ashm %>% dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% dplyr::filter((tolower(!!sym("Pipeline")) == tool_name)) - sample_ssm <- bind_rows( - sample_ssm, - GAMBLR.data::sample_data[[projection]]$ashm %>% - dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter((tolower(!!sym("Pipeline")) == tool_name)) ) - - }else{ - stop(paste("please provide a valid projection. The following are available:", + + }else { + stop(paste("please provide a valid projection. Available options:", paste(valid_projections,collapse=", "))) } # Handle possible duplicates sample_ssm <- sample_ssm %>% - distinct(Tumor_Sample_Barcode, Chromosome, Start_Position, End_Position, .keep_all = TRUE) + distinct(Tumor_Sample_Barcode, + Chromosome, + Start_Position, + End_Position, + .keep_all = TRUE) # bundle genome_build with the maf_data sample_ssm = create_maf_data(sample_ssm,projection) # use S3-safe version of dplyr function diff --git a/man/annotate_hotspots.Rd b/man/annotate_hotspots.Rd index 68be61d..04936a9 100644 --- a/man/annotate_hotspots.Rd +++ b/man/annotate_hotspots.Rd @@ -15,21 +15,24 @@ annotate_hotspots(mutation_maf, ...) The same data frame with one additional column "hot_spot". } \description{ -Annotate MAF-like data frome with a hot_spot column indicating recurrent mutations. +Annotate MAF-like data frome with a hot_spot column indicating +recurrent mutations. } \details{ -This function takes an already loaded MAF data frame with the \code{mutation_maf} parameter. +This function takes an already loaded MAF data frame with the +\code{mutation_maf} parameter. } \examples{ my_metadata = get_gambl_metadata() all_coding_ssm = get_coding_ssm(these_samples_metadata = my_metadata, projection = "grch37", this_seq_type = "genome") \%>\% - dplyr::filter(Hugo_Symbol \%in\% c("EZH2","MEF2B","MYD88","KMT2D")) \%>\% + dplyr::filter(Hugo_Symbol \%in\% c("EZH2", + "MEF2B","MYD88","KMT2D")) \%>\% dplyr::arrange(Hugo_Symbol) hot_ssms = annotate_hotspots(all_coding_ssm) -hot_ssms \%>\% dplyr::filter(!is.na(hot_spot)) \%>\% - dplyr::select(1:5,37,hot_spot) +hot_ssms \%>\% dplyr::filter(!is.na(hot_spot)) \%>\% + dplyr::select(1:5,37,hot_spot) } diff --git a/man/bind_genomic_data.Rd b/man/bind_genomic_data.Rd index 1bc28f6..bca456b 100644 --- a/man/bind_genomic_data.Rd +++ b/man/bind_genomic_data.Rd @@ -25,3 +25,4 @@ An error will also be thrown if the same sample id is found in more than one of merged_maf = bind_genomic_data(maf1, maf2,check_id=FALSE) } +\keyword{internal} diff --git a/man/calc_mutation_frequency_bin_region.Rd b/man/calc_mutation_frequency_bin_region.Rd index 83a50c3..dec76e3 100644 --- a/man/calc_mutation_frequency_bin_region.Rd +++ b/man/calc_mutation_frequency_bin_region.Rd @@ -6,9 +6,6 @@ \usage{ calc_mutation_frequency_bin_region( region, - chromosome, - start_pos, - end_pos, these_samples_metadata = NULL, these_sample_ids = NULL, this_seq_type = "genome", @@ -27,12 +24,6 @@ calc_mutation_frequency_bin_region( \item{region}{A string describing a genomic region in the "chrom:start-end" format. The region must be specified in this format OR as separate chromosome, start_pos, end_pos arguments.} -\item{chromosome}{Chromosome name in region.} - -\item{start_pos}{Start coordinate of region.} - -\item{end_pos}{End coordinate of region.} - \item{these_samples_metadata}{Optional data frame containing a sample_id column. If not providing a maf file, seq_type is also a required column.} @@ -69,6 +60,12 @@ or binary mutated/unmutated status (FALSE). Default is TRUE.} Only effective with "long" return format.} \item{...}{Any additional parameters.} + +\item{chromosome}{Chromosome name in region.} + +\item{start_pos}{Start coordinate of region.} + +\item{end_pos}{End coordinate of region.} } \value{ Either a matrix or a long tidy table of counts per window. @@ -87,8 +84,10 @@ To return a matrix of sliding window counts over multiple regions, see \code{calc_mutation_frequency_bin_regions}. } \examples{ -myc_mut_freq = calc_mutation_frequency_bin_region(region = "8:128747680-128753674", +myc_region = "8:128747680-128753674" +myc_mut_freq = calc_mutation_frequency_bin_region(region = myc_region, slide_by = 10, window_size = 10000) +dplyr::arrange(myc_mut_freq,desc(mutation_count)) } diff --git a/man/calc_mutation_frequency_bin_regions.Rd b/man/calc_mutation_frequency_bin_regions.Rd index 7ef307b..1c091c7 100644 --- a/man/calc_mutation_frequency_bin_regions.Rd +++ b/man/calc_mutation_frequency_bin_regions.Rd @@ -82,20 +82,21 @@ May optionally provide any combination of a maf data frame, existing metadata, or a regions data frame or named vector. } \examples{ -#get some regions -these_regions <- process_regions(only_regions = c("MYC", "BCL2", "BCL6")) -reg_vec <- these_regions$regions_list -reg_bed <- these_regions$regions_bed - -# use a set of user defined regions (from genes) and -# calculate mut frequency across all available samples -mult_freq_all = calc_mutation_frequency_bin_regions(regions_list = reg_vec) -mult_freq_all = calc_mutation_frequency_bin_regions(regions_bed = reg_bed) - -#restrict the analysis to specific samples using the metadata -my_meta = get_gambl_metadata() \%>\% - dplyr::filter(pathology \%in\% c("DLBCL","FL")) -mult_reg_freq_fl_dlbcl = calc_mutation_frequency_bin_regions(regions_list = reg_vec, - these_sample_ids = "DOHH-2") - + #load metadata. + my_meta = get_gambl_metadata() + dlbcl_bl_meta = dplyr::filter(my_meta, pathology \%in\% c("DLBCL", "BL")) + + + #get ashm regions + some_regions = create_bed_data(grch37_ashm_regions, + fix_names = "concat", + concat_cols = c("gene","region"), + sep="-") + print(some_regions) + mut_count_matrix <- calc_mutation_frequency_bin_regions( + these_samples_metadata = dlbcl_bl_meta, + regions_bed = some_regions + ) +dim(mut_count_matrix) +tail(mut_count_matrix[,c(1:10)]) } diff --git a/man/check_excess_params.Rd b/man/check_excess_params.Rd index a4f5064..07ebe45 100644 --- a/man/check_excess_params.Rd +++ b/man/check_excess_params.Rd @@ -20,3 +20,4 @@ This function is designed to work as internal function-call in already available \details{ Catch function calls containing unsupported arguments. } +\keyword{internal} diff --git a/man/check_get_projection.Rd b/man/check_get_projection.Rd new file mode 100644 index 0000000..6afbbd6 --- /dev/null +++ b/man/check_get_projection.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/genomic_data.R +\name{check_get_projection} +\alias{check_get_projection} +\title{Check and set the genome_build/projection} +\usage{ +check_get_projection(genomic_data_list, suggested) +} +\arguments{ +\item{genomic_data_list}{A list of genomic data objects. Each object should +have a genome build that can be retrieved by \code{get_genome_build()}.} + +\item{suggested}{An optional character string specifying a genome build +(projection) to be used. If provided, it must match the genome build inferred +from the data objects.} +} +\value{ +A character string representing the genome build to be used. +} +\description{ +This helper function checks the genome build of each genomic data object in +\code{genomic_data_list} (using \code{get_genome_build()}) and ensures +they are consistent. If all objects share a single, unique genome build, +that value is returned. If a user-specified genome build (\code{suggested}) +is provided, it is compared to the inferred build and must match; otherwise, +an error is raised. If the genomic data objects have conflicting genome +builds or if no genome build can be inferred and no \code{suggested} +value is provided, the function stops with an error. +} +\examples{ +# Example 1: When genomic data objects all have the same genome build. +# Assuming maf_data and seg_data both have a genome build of "hg38": +genomic_data <- list(maf_data = maf_data, seg_data = seg_data) +projection <- check_get_projection(genomic_data, suggested = "hg38") + +# Example 2: When the genomic data objects conflict or no genome build +# is available. +# This will raise an error: +genomic_data <- list(maf_data = maf_data, + seg_data = seg_data_with_different_build) +projection <- check_get_projection(genomic_data, suggested = "hg38") + +} +\keyword{internal} diff --git a/man/chromosome_arms_grch37.Rd b/man/chromosome_arms_grch37.Rd index 093dcdf..d7f3464 100644 --- a/man/chromosome_arms_grch37.Rd +++ b/man/chromosome_arms_grch37.Rd @@ -22,4 +22,4 @@ chromosome_arms_grch37 \description{ A data frame with the chromosome arm coordinates in respect to grch37. } -\keyword{datasets} +\keyword{internal} diff --git a/man/chromosome_arms_hg38.Rd b/man/chromosome_arms_hg38.Rd index ae9f864..ffc0bde 100644 --- a/man/chromosome_arms_hg38.Rd +++ b/man/chromosome_arms_hg38.Rd @@ -22,4 +22,4 @@ chromosome_arms_hg38 \description{ A data frame with the chromosome arm coordinates in respect to hg38. } -\keyword{datasets} +\keyword{internal} diff --git a/man/colour_codes.Rd b/man/colour_codes.Rd index 32c4283..cdbdaef 100644 --- a/man/colour_codes.Rd +++ b/man/colour_codes.Rd @@ -23,4 +23,4 @@ colour_codes \description{ A data frame with colour codes (hex) arranged into different categories, groups. } -\keyword{datasets} +\keyword{internal} diff --git a/man/cool_overlaps.Rd b/man/cool_overlaps.Rd index dc7b9d4..44099e9 100644 --- a/man/cool_overlaps.Rd +++ b/man/cool_overlaps.Rd @@ -96,3 +96,4 @@ overlap <- cool_overlaps( ) } +\keyword{internal} diff --git a/man/create_bed_data.Rd b/man/create_bed_data.Rd index 537a835..97aa626 100644 --- a/man/create_bed_data.Rd +++ b/man/create_bed_data.Rd @@ -15,16 +15,22 @@ create_bed_data( \arguments{ \item{bed_df}{A data frame containing the BED data.} -\item{genome_build}{A string specifying the genome build ("grch37" or "hg38"). -If NULL, the function will try to infer the genome build from the object name.} +\item{genome_build}{A string specifying the genome build +("grch37" or "hg38"). +If NULL, the function will try to infer the genome build +from the object name.} -\item{fix_names}{Either NULL (the default), or one of "chrom_start_end" or "concat". -If not NULL and duplicate names are detected, the function will apply the chosen fix.} +\item{fix_names}{Either NULL (the default), or one of "chrom_start_end" +or "concat". +If not NULL and duplicate names are detected, the function will +apply the chosen fix.} -\item{concat_cols}{When \code{fix_names = "concat"}, a character vector specifying which columns +\item{concat_cols}{When \code{fix_names = "concat"}, a character vector +specifying which columns from the original data to merge.} -\item{sep}{The separator to use when concatenating columns if fix_names = "concat". +\item{sep}{The separator to use when concatenating columns if +fix_names = "concat". Defaults to "" (no separator).} } \value{ @@ -63,15 +69,13 @@ ashm_bed = create_bed_data(GAMBLR.data::grch37_ashm_regions, concat_cols = c("gene","region"), sep="-") # the build is automatically inferred if it is in the variable name -# get_genome_build(ashm_bed) -# [1] "grch37" - +get_genome_build(ashm_bed) +print(ashm_bed) another_bed = create_bed_data(somatic_hypermutation_locations_GRCh37_v_latest, fix_names = "concat", concat_cols = c("chr_name","hg19_start","hg19_end")) -# get_genome_build(another_bed) -# [1] "grch37" +get_genome_build(another_bed) # get a bed_data object for all gene regions and combine several columns to make a unique name gene_regions <- create_bed_data(hg38_gene_coordinates, @@ -79,8 +83,6 @@ gene_regions <- create_bed_data(hg38_gene_coordinates, sep="-", concat_cols = c("chromosome","start","end","gene_name")) -#get_genome_build(gene_regions) -# [1] "hg38" - +get_genome_build(gene_regions) } diff --git a/man/create_seg_data.Rd b/man/create_seg_data.Rd index af31259..e261bea 100644 --- a/man/create_seg_data.Rd +++ b/man/create_seg_data.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_cn_segments.R +% Please edit documentation in R/genomic_data.R \name{create_seg_data} \alias{create_seg_data} \title{Create Segmented Data} @@ -9,7 +9,8 @@ create_seg_data(seg_df, genome_build) \arguments{ \item{seg_df}{A data frame containing the segmented data.} -\item{genome_build}{A string specifying the genome build ("grch37" or "hg38").} +\item{genome_build}{Required character vector specifying the genome build +currently supported: "grch37" or "hg38".} } \value{ A data frame with class attributes for segmented data. diff --git a/man/cytobands_grch37.Rd b/man/cytobands_grch37.Rd index 83dd684..65003f1 100644 --- a/man/cytobands_grch37.Rd +++ b/man/cytobands_grch37.Rd @@ -23,4 +23,4 @@ cytobands_grch37 \description{ A data frame in bed format with coordinates of cytobands relative to grch37. } -\keyword{datasets} +\keyword{internal} diff --git a/man/cytobands_hg38.Rd b/man/cytobands_hg38.Rd index 0607a39..7882096 100644 --- a/man/cytobands_hg38.Rd +++ b/man/cytobands_hg38.Rd @@ -23,4 +23,4 @@ cytobands_hg38 \description{ A data frame in bed format with coordinates of cytobands relative to hg38. } -\keyword{datasets} +\keyword{internal} diff --git a/man/dhitsig_genes_with_weights.Rd b/man/dhitsig_genes_with_weights.Rd index 5dbab0f..95e7c9d 100644 --- a/man/dhitsig_genes_with_weights.Rd +++ b/man/dhitsig_genes_with_weights.Rd @@ -21,4 +21,4 @@ dhitsig_genes_with_weights \description{ A data frame with double hit signature genes (both as ensembl IDs and Hugo symbols) and importance scores. } -\keyword{datasets} +\keyword{internal} diff --git a/man/dlbcl90_genes.Rd b/man/dlbcl90_genes.Rd index 1317117..3a0f22d 100644 --- a/man/dlbcl90_genes.Rd +++ b/man/dlbcl90_genes.Rd @@ -24,4 +24,4 @@ dlbcl90_genes \description{ A data frame with genes and their weights for DLBCL90. } -\keyword{datasets} +\keyword{internal} diff --git a/man/gambl_metadata.Rd b/man/gambl_metadata.Rd index 0bd0e95..03d44c1 100644 --- a/man/gambl_metadata.Rd +++ b/man/gambl_metadata.Rd @@ -45,4 +45,4 @@ gambl_metadata \description{ A data frame with metadata for a collection of GAMBL samples. } -\keyword{datasets} +\keyword{internal} diff --git a/man/gene_blacklist.Rd b/man/gene_blacklist.Rd index 09b67a9..5edb1d1 100644 --- a/man/gene_blacklist.Rd +++ b/man/gene_blacklist.Rd @@ -19,4 +19,4 @@ gene_blacklist \description{ A tibble with gene symbols (Hugo) that falls within blacklisted regions of the genome. } -\keyword{datasets} +\keyword{internal} diff --git a/man/get_cn_segments.Rd b/man/get_cn_segments.Rd index 369bce9..9fd58b7 100644 --- a/man/get_cn_segments.Rd +++ b/man/get_cn_segments.Rd @@ -12,11 +12,14 @@ get_cn_segments( ) } \arguments{ -\item{these_samples_metadata}{User must provide a metadata table to restrict the data to the samples in your table. -The metadata also ensures the proper handling of duplicate sample_id across seq_types and ensures the -seq_type in the metadata faithfully represents the seq_type of the data} +\item{these_samples_metadata}{User must provide a metadata table to +restrict the data to the samples in your table. +The metadata also ensures the proper handling of duplicate sample_id +across seq_types and ensures the seq_type in the metadata faithfully +represents the seq_type of the data} -\item{projection}{Desired genome coordinate system for returned CN segments. Default is "grch37".} +\item{projection}{Desired genome coordinate system for returned CN segments. +Default is "grch37".} \item{this_seq_type}{Deprecated.} @@ -29,13 +32,14 @@ A data frame with CN segments for the specified region. Retrieve all copy number segments from the GAMBL outputs } \details{ -This function merely loads and returns all the seg_data available for a projection (genome build) +This function merely loads and returns all the seg_data +available for a projection (genome build) } \examples{ # Example for the capture samples: -genome_metadata = GAMBLR.data::get_gambl_metadata(seq_type_filter="genome") - +genome_metadata = get_gambl_metadata(seq_type_filter="genome") + genome_segments_hg38 = get_cn_segments( these_samples_metadata = genome_metadata, projection="hg38") diff --git a/man/get_genome_build.Rd b/man/get_genome_build.Rd index 6a40f4e..85e5910 100644 --- a/man/get_genome_build.Rd +++ b/man/get_genome_build.Rd @@ -15,3 +15,4 @@ A string specifying the genome build. \description{ This function retrieves the genome build attribute from the data. } +\keyword{internal} diff --git a/man/get_ssm_by_patients.Rd b/man/get_ssm_by_patients.Rd index 2acd48a..6ef0cf7 100644 --- a/man/get_ssm_by_patients.Rd +++ b/man/get_ssm_by_patients.Rd @@ -53,10 +53,10 @@ or an already subset metadata table (\code{these_samples_metadata}). \examples{ # Lets find which patient_id occur more than once in the metadata first -my_ids = get_gambl_metadata(seq_type_filter = c("genome","capture")) \%>\% - dplyr::group_by(patient_id) \%>\% - dplyr::tally() \%>\% - dplyr::filter(n>1) \%>\% +my_ids = get_gambl_metadata(seq_type_filter = c("genome","capture")) \%>\% + dplyr::group_by(patient_id) \%>\% + dplyr::tally() \%>\% + dplyr::filter(n>1) \%>\% dplyr::pull(patient_id) #now let's get every SSM for all samples from these patients diff --git a/man/get_ssm_by_samples.Rd b/man/get_ssm_by_samples.Rd index d975111..66f0e2c 100644 --- a/man/get_ssm_by_samples.Rd +++ b/man/get_ssm_by_samples.Rd @@ -15,16 +15,22 @@ get_ssm_by_samples( ) } \arguments{ -\item{these_sample_ids}{A vector of one or more sample IDs that you want results for.} +\item{these_sample_ids}{A vector of one or more sample IDs that you +want results for.} -\item{these_samples_metadata}{Optional, a metadata table (with sample IDs in a column) to auto-subset the data to samples in that table before returning. -If not provided and these_sample_ids is also not provided, the function will return SSM for all samples from the specified seq_type in the bundled metadata.} +\item{these_samples_metadata}{Optional, a metadata table (with sample_id +column) to auto-subset the data to samples in that table before returning. +If not provided and these_sample_ids is also not provided, the function will +return SSM for all samples from the specified seq_type in the bundled +metadata.} \item{this_seq_type}{Default is genome.} \item{projection}{The projection genome build. Supports hg38 and grch37.} -\item{tool_name}{Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers.} +\item{tool_name}{Optionally specify which tool to report variant from. +The default is slms-3, also supports "publication" to return the exact +variants as reported in the original papers.} \item{verbose}{Enable for debugging/noisier output.} @@ -34,21 +40,28 @@ If not provided and these_sample_ids is also not provided, the function will ret data frame in MAF format. } \description{ -Get the SSMs (i.e. load MAF) for a single sample or a collection of samples. +Get the SSMs (i.e. load MAF) for a single sample or a +collection of samples. } \details{ Retrieve a maf for a specific sample or a set of samples. Either specify the sample IDs of interest with \code{these_sample_ids}. -Or a metadata table subset to the sample IDs of interest with \code{these_samples_metadata}. +Or a metadata table subset to the sample IDs of interest with +\code{these_samples_metadata}. } \examples{ -#load a common dependency -library(dplyr) #Get genome-wide set of mutations from all DLBCL cell lines -cell_line_meta = get_gambl_metadata() \%>\% + +# 1. get our metadata for the DLBCL cell lines +cell_line_meta = get_gambl_metadata() \%>\% dplyr::filter(cohort == "DLBCL_cell_lines") +# 2. get the SSMs for the DLBCL cell lines dlbcl_maf = get_ssm_by_samples(these_samples_metadata = cell_line_meta) +# 3. have a look: +dlbcl_maf \%>\% dplyr::group_by(Tumor_Sample_Barcode) \%>\% + dplyr::count() + } diff --git a/man/grch37_all_gene_coordinates.Rd b/man/grch37_all_gene_coordinates.Rd index bf65a0a..9e7a24c 100644 --- a/man/grch37_all_gene_coordinates.Rd +++ b/man/grch37_all_gene_coordinates.Rd @@ -24,4 +24,4 @@ grch37_all_gene_coordinates \description{ All gene coordinates in respect to grch37. } -\keyword{datasets} +\keyword{internal} diff --git a/man/grch37_gene_coordinates.Rd b/man/grch37_gene_coordinates.Rd index 4077132..e8bc59e 100644 --- a/man/grch37_gene_coordinates.Rd +++ b/man/grch37_gene_coordinates.Rd @@ -24,4 +24,4 @@ grch37_gene_coordinates \description{ All gene coordinates in respect to grch37. } -\keyword{datasets} +\keyword{internal} diff --git a/man/grch37_lymphoma_genes_bed.Rd b/man/grch37_lymphoma_genes_bed.Rd index 23e0fa0..5a2d355 100644 --- a/man/grch37_lymphoma_genes_bed.Rd +++ b/man/grch37_lymphoma_genes_bed.Rd @@ -22,4 +22,4 @@ grch37_lymphoma_genes_bed \description{ Lymphoma associated genes in respect to grch37. } -\keyword{datasets} +\keyword{internal} diff --git a/man/grch37_partners.Rd b/man/grch37_partners.Rd index cfd6ac2..ac52b4a 100644 --- a/man/grch37_partners.Rd +++ b/man/grch37_partners.Rd @@ -23,4 +23,4 @@ grch37_partners \description{ Translocation partners for oncogenes in with coordinates in respect to grch37. } -\keyword{datasets} +\keyword{internal} diff --git a/man/hg38_gene_coordinates.Rd b/man/hg38_gene_coordinates.Rd index 44fb3c1..f6ad2b8 100644 --- a/man/hg38_gene_coordinates.Rd +++ b/man/hg38_gene_coordinates.Rd @@ -24,4 +24,4 @@ hg38_gene_coordinates \description{ All gene coordinates in respect to hg38. } -\keyword{datasets} +\keyword{internal} diff --git a/man/hg38_lymphoma_genes_bed.Rd b/man/hg38_lymphoma_genes_bed.Rd index 45dfe9b..063c700 100644 --- a/man/hg38_lymphoma_genes_bed.Rd +++ b/man/hg38_lymphoma_genes_bed.Rd @@ -22,4 +22,4 @@ hg38_lymphoma_genes_bed \description{ Lymphoma associated genes in respect to hg38. } -\keyword{datasets} +\keyword{internal} diff --git a/man/hg38_partners.Rd b/man/hg38_partners.Rd index 5ee2e08..a939b8f 100644 --- a/man/hg38_partners.Rd +++ b/man/hg38_partners.Rd @@ -23,4 +23,4 @@ hg38_partners \description{ Translocation partners for oncogenes in with coordinates in respect to hg38. } -\keyword{datasets} +\keyword{internal} diff --git a/man/hgnc2pfam.df.Rd b/man/hgnc2pfam.df.Rd index 5705a2f..ece97ba 100644 --- a/man/hgnc2pfam.df.Rd +++ b/man/hgnc2pfam.df.Rd @@ -31,4 +31,4 @@ https://github.com/morinlab/g3viz/tree/master/data \examples{ hgnc2pfam.df } -\keyword{datasets} +\keyword{internal} diff --git a/man/hotspot_regions_grch37.Rd b/man/hotspot_regions_grch37.Rd index 634b931..309d0ae 100644 --- a/man/hotspot_regions_grch37.Rd +++ b/man/hotspot_regions_grch37.Rd @@ -21,4 +21,4 @@ hotspot_regions_grch37 \description{ Mutation hotspot regions in respect to grch37. } -\keyword{datasets} +\keyword{internal} diff --git a/man/hotspot_regions_hg38.Rd b/man/hotspot_regions_hg38.Rd index c0ca099..2b6dfb2 100644 --- a/man/hotspot_regions_hg38.Rd +++ b/man/hotspot_regions_hg38.Rd @@ -21,4 +21,4 @@ hotspot_regions_hg38 \description{ Mutation hotspot regions in respect to hg38. } -\keyword{datasets} +\keyword{internal} diff --git a/man/hotspots_annotations.Rd b/man/hotspots_annotations.Rd index b10e632..413b73b 100644 --- a/man/hotspots_annotations.Rd +++ b/man/hotspots_annotations.Rd @@ -24,4 +24,4 @@ A data frame with high-quality positions of ssm hotspots in selected genes. This resource is based on GAMBL data and was used for hotspot annotation in cFL/dFL classifier. } -\keyword{datasets} +\keyword{internal} diff --git a/man/lymphoma_genes_bl_v0.1.Rd b/man/lymphoma_genes_bl_v0.1.Rd index b98990b..6de2cae 100644 --- a/man/lymphoma_genes_bl_v0.1.Rd +++ b/man/lymphoma_genes_bl_v0.1.Rd @@ -29,4 +29,4 @@ lymphoma_genes_bl_v0.1 \description{ Genes frequently associated with Burkitt Lymphoma (BL). This is version 0.1. } -\keyword{datasets} +\keyword{internal} diff --git a/man/lymphoma_genes_bl_v0.2.Rd b/man/lymphoma_genes_bl_v0.2.Rd index 062c2f6..e366a6f 100644 --- a/man/lymphoma_genes_bl_v0.2.Rd +++ b/man/lymphoma_genes_bl_v0.2.Rd @@ -27,4 +27,4 @@ lymphoma_genes_bl_v0.2 \description{ Genes frequently associated with Burkitt Lymphoma (BL). This is version 0.2. } -\keyword{datasets} +\keyword{internal} diff --git a/man/lymphoma_genes_comprehensive.Rd b/man/lymphoma_genes_comprehensive.Rd index 99a39cd..727dc2a 100644 --- a/man/lymphoma_genes_comprehensive.Rd +++ b/man/lymphoma_genes_comprehensive.Rd @@ -27,4 +27,4 @@ lymphoma_genes_comprehensive \description{ A detailed data frame with lymphoma genes, annotated with evidence from literature and aSHM. } -\keyword{datasets} +\keyword{internal} diff --git a/man/lymphoma_genes_dlbcl_v0.1.Rd b/man/lymphoma_genes_dlbcl_v0.1.Rd index d5d4096..594eacb 100644 --- a/man/lymphoma_genes_dlbcl_v0.1.Rd +++ b/man/lymphoma_genes_dlbcl_v0.1.Rd @@ -31,4 +31,4 @@ lymphoma_genes_dlbcl_v0.1 \description{ Genes frequently associated with Diffuse large B cell lymphoma (DLBCL). This is version 0.1. } -\keyword{datasets} +\keyword{internal} diff --git a/man/lymphoma_genes_dlbcl_v0.2.Rd b/man/lymphoma_genes_dlbcl_v0.2.Rd index 706a36c..cb7ba46 100644 --- a/man/lymphoma_genes_dlbcl_v0.2.Rd +++ b/man/lymphoma_genes_dlbcl_v0.2.Rd @@ -29,4 +29,4 @@ lymphoma_genes_dlbcl_v0.2 \description{ Genes frequently associated with Diffuse large B cell lymphoma (DLBCL). This is version 0.2. } -\keyword{datasets} +\keyword{internal} diff --git a/man/lymphoma_genes_lymphoma_genes_v0.0.Rd b/man/lymphoma_genes_lymphoma_genes_v0.0.Rd index b1fd8e4..8c182d0 100644 --- a/man/lymphoma_genes_lymphoma_genes_v0.0.Rd +++ b/man/lymphoma_genes_lymphoma_genes_v0.0.Rd @@ -31,4 +31,4 @@ lymphoma_genes_lymphoma_genes_v0.0 \description{ A comprehenssive resource of genes associated with different types of lymphomas. } -\keyword{datasets} +\keyword{internal} diff --git a/man/lymphoma_genes_mcl_v0.1.Rd b/man/lymphoma_genes_mcl_v0.1.Rd index 55e14db..432879b 100644 --- a/man/lymphoma_genes_mcl_v0.1.Rd +++ b/man/lymphoma_genes_mcl_v0.1.Rd @@ -31,4 +31,4 @@ lymphoma_genes_mcl_v0.1 \description{ Genes frequently associated with Mantle cell lymphoma (MCL). This is version 0.1. } -\keyword{datasets} +\keyword{internal} diff --git a/man/lymphoma_genes_mcl_v0.2.Rd b/man/lymphoma_genes_mcl_v0.2.Rd index 01cad11..7bdf987 100644 --- a/man/lymphoma_genes_mcl_v0.2.Rd +++ b/man/lymphoma_genes_mcl_v0.2.Rd @@ -35,4 +35,4 @@ lymphoma_genes_mcl_v0.2 \description{ Genes frequently associated with Mantle cell lymphoma (MCL). This is version 0.2. } -\keyword{datasets} +\keyword{internal} diff --git a/man/mutation.table.df.Rd b/man/mutation.table.df.Rd index fa87eb7..f2b8d52 100644 --- a/man/mutation.table.df.Rd +++ b/man/mutation.table.df.Rd @@ -23,4 +23,4 @@ https://github.com/morinlab/g3viz/tree/master/data \examples{ mutation.table.df } -\keyword{datasets} +\keyword{internal} diff --git a/man/preserve_genomic_attributes.Rd b/man/preserve_genomic_attributes.Rd index 45ed20b..69e4d56 100644 --- a/man/preserve_genomic_attributes.Rd +++ b/man/preserve_genomic_attributes.Rd @@ -17,3 +17,4 @@ A data frame with preserved genomic attributes. \description{ This function preserves the genomic attributes and class after dplyr operations. } +\keyword{internal} diff --git a/man/process_regions.Rd b/man/process_regions.Rd index 18c727d..4dc42e3 100644 --- a/man/process_regions.Rd +++ b/man/process_regions.Rd @@ -10,7 +10,8 @@ process_regions( region_padding = 0, skip_regions = NULL, only_regions = NULL, - projection = "grch37" + projection = "grch37", + sort = FALSE ) } \arguments{ @@ -25,6 +26,8 @@ process_regions( \item{only_regions}{Character vector of genes to include from GAMBLR aSHM regions.} \item{projection}{Specify which genome build projection to use. The default is "grch37", also accepts "hg38".} + +\item{sort}{Set to TRUE to force regions_bed to be ordered on chromosome and coordinate} } \value{ A list with two objects, regions as a vector and in bed format. diff --git a/man/protein_domains.Rd b/man/protein_domains.Rd index 0436d7a..452f76b 100644 --- a/man/protein_domains.Rd +++ b/man/protein_domains.Rd @@ -31,4 +31,4 @@ protein_domains A data frame with high-quality positions of amino acid positions in their corresponding domains. } -\keyword{datasets} +\keyword{internal} diff --git a/man/reddy_genes.Rd b/man/reddy_genes.Rd index 6226291..3664342 100644 --- a/man/reddy_genes.Rd +++ b/man/reddy_genes.Rd @@ -22,4 +22,4 @@ reddy_genes \description{ Genes identified as significantly mutated in DLBCL by the study of Reddy et al. } -\keyword{datasets} +\keyword{internal} diff --git a/man/review_hotspots.Rd b/man/review_hotspots.Rd index 3308781..f0f253e 100644 --- a/man/review_hotspots.Rd +++ b/man/review_hotspots.Rd @@ -7,7 +7,7 @@ review_hotspots( annotated_maf, genes_of_interest = c("FOXO1", "MYD88", "CREBBP", "NOTCH1", "NOTCH2", "CD79B", "EZH2"), - genome_build = "grch37" + genome_build ) } \arguments{ diff --git a/man/sample_data.Rd b/man/sample_data.Rd index 0ec9ea4..5f67941 100644 --- a/man/sample_data.Rd +++ b/man/sample_data.Rd @@ -23,4 +23,4 @@ sample_data Sample data bundled as a list of 3 elements. Metadata (data frame) and sample data from two projections (grch37 and hg38), Each projection is organized as a list of 3 elements; maf, seg, and bedpe (all data frames). } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh37_v0.0.Rd b/man/somatic_hypermutation_locations_GRCh37_v0.0.Rd index 2132682..2ee2139 100644 --- a/man/somatic_hypermutation_locations_GRCh37_v0.0.Rd +++ b/man/somatic_hypermutation_locations_GRCh37_v0.0.Rd @@ -24,4 +24,4 @@ somatic_hypermutation_locations_GRCh37_v0.0 \description{ A data frame with somatic hypermutation locations in respect to GRCh37, version 0.0. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh37_v0.1.Rd b/man/somatic_hypermutation_locations_GRCh37_v0.1.Rd index 2624577..c6ef1f8 100644 --- a/man/somatic_hypermutation_locations_GRCh37_v0.1.Rd +++ b/man/somatic_hypermutation_locations_GRCh37_v0.1.Rd @@ -24,4 +24,4 @@ somatic_hypermutation_locations_GRCh37_v0.1 \description{ A data frame with somatic hypermutation locations in respect to GRCh37, version 0.1. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh37_v0.2.Rd b/man/somatic_hypermutation_locations_GRCh37_v0.2.Rd index 39e05fa..052058c 100644 --- a/man/somatic_hypermutation_locations_GRCh37_v0.2.Rd +++ b/man/somatic_hypermutation_locations_GRCh37_v0.2.Rd @@ -24,4 +24,4 @@ somatic_hypermutation_locations_GRCh37_v0.2 \description{ A data frame with somatic hypermutation locations in respect to GRCh37, version 0.2. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh37_v0.3.Rd b/man/somatic_hypermutation_locations_GRCh37_v0.3.Rd index bf62855..4fec04a 100644 --- a/man/somatic_hypermutation_locations_GRCh37_v0.3.Rd +++ b/man/somatic_hypermutation_locations_GRCh37_v0.3.Rd @@ -24,4 +24,4 @@ somatic_hypermutation_locations_GRCh37_v0.3 \description{ A data frame with somatic hypermutation locations in respect to GRCh37, version 0.3. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh37_v0.4.Rd b/man/somatic_hypermutation_locations_GRCh37_v0.4.Rd index 89228e5..abaa3c5 100644 --- a/man/somatic_hypermutation_locations_GRCh37_v0.4.Rd +++ b/man/somatic_hypermutation_locations_GRCh37_v0.4.Rd @@ -24,4 +24,4 @@ somatic_hypermutation_locations_GRCh37_v0.4 \description{ A data frame with somatic hypermutation locations in respect to GRCh37, version 0.4. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh37_v0.5.Rd b/man/somatic_hypermutation_locations_GRCh37_v0.5.Rd index ce6651a..4e008ba 100644 --- a/man/somatic_hypermutation_locations_GRCh37_v0.5.Rd +++ b/man/somatic_hypermutation_locations_GRCh37_v0.5.Rd @@ -24,4 +24,4 @@ somatic_hypermutation_locations_GRCh37_v0.5 \description{ A data frame with somatic hypermutation locations in respect to GRCh37, version 0.5. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh37_v_latest.Rd b/man/somatic_hypermutation_locations_GRCh37_v_latest.Rd index 77cd2a4..d765745 100644 --- a/man/somatic_hypermutation_locations_GRCh37_v_latest.Rd +++ b/man/somatic_hypermutation_locations_GRCh37_v_latest.Rd @@ -24,4 +24,4 @@ somatic_hypermutation_locations_GRCh37_v_latest \description{ A data frame with somatic hypermutation locations in respect to GRCh37, the latest version. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh38_v0.0.Rd b/man/somatic_hypermutation_locations_GRCh38_v0.0.Rd index 6c10894..9c12c02 100644 --- a/man/somatic_hypermutation_locations_GRCh38_v0.0.Rd +++ b/man/somatic_hypermutation_locations_GRCh38_v0.0.Rd @@ -25,4 +25,4 @@ somatic_hypermutation_locations_GRCh38_v0.0 \description{ A data frame with somatic hypermutation locations in respect to GRCh38, version 0.0. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh38_v0.1.Rd b/man/somatic_hypermutation_locations_GRCh38_v0.1.Rd index 1f6bf78..37fb605 100644 --- a/man/somatic_hypermutation_locations_GRCh38_v0.1.Rd +++ b/man/somatic_hypermutation_locations_GRCh38_v0.1.Rd @@ -25,4 +25,4 @@ somatic_hypermutation_locations_GRCh38_v0.1 \description{ A data frame with somatic hypermutation locations in respect to GRCh38, version 0.1. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh38_v0.2.Rd b/man/somatic_hypermutation_locations_GRCh38_v0.2.Rd index 752fd13..d3bd3ef 100644 --- a/man/somatic_hypermutation_locations_GRCh38_v0.2.Rd +++ b/man/somatic_hypermutation_locations_GRCh38_v0.2.Rd @@ -24,4 +24,4 @@ somatic_hypermutation_locations_GRCh38_v0.2 \description{ A data frame with somatic hypermutation locations in respect to GRCh38, version 0.2. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh38_v0.3.Rd b/man/somatic_hypermutation_locations_GRCh38_v0.3.Rd index 3dab717..160751e 100644 --- a/man/somatic_hypermutation_locations_GRCh38_v0.3.Rd +++ b/man/somatic_hypermutation_locations_GRCh38_v0.3.Rd @@ -24,4 +24,4 @@ somatic_hypermutation_locations_GRCh38_v0.3 \description{ A data frame with somatic hypermutation locations in respect to GRCh38, version 0.3. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh38_v0.4.Rd b/man/somatic_hypermutation_locations_GRCh38_v0.4.Rd index cac19ef..fbe57a7 100644 --- a/man/somatic_hypermutation_locations_GRCh38_v0.4.Rd +++ b/man/somatic_hypermutation_locations_GRCh38_v0.4.Rd @@ -24,4 +24,4 @@ somatic_hypermutation_locations_GRCh38_v0.4 \description{ A data frame with somatic hypermutation locations in respect to GRCh38, version 0.4. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh38_v0.5.Rd b/man/somatic_hypermutation_locations_GRCh38_v0.5.Rd index 95688b9..cedd5dc 100644 --- a/man/somatic_hypermutation_locations_GRCh38_v0.5.Rd +++ b/man/somatic_hypermutation_locations_GRCh38_v0.5.Rd @@ -24,4 +24,4 @@ somatic_hypermutation_locations_GRCh38_v0.5 \description{ A data frame with somatic hypermutation locations in respect to GRCh38, version 0.5. } -\keyword{datasets} +\keyword{internal} diff --git a/man/somatic_hypermutation_locations_GRCh38_v_latest.Rd b/man/somatic_hypermutation_locations_GRCh38_v_latest.Rd index c1a589e..b94c72d 100644 --- a/man/somatic_hypermutation_locations_GRCh38_v_latest.Rd +++ b/man/somatic_hypermutation_locations_GRCh38_v_latest.Rd @@ -24,4 +24,4 @@ somatic_hypermutation_locations_GRCh38_v_latest \description{ A data frame with somatic hypermutation locations in respect to GRCh38, the latest version. } -\keyword{datasets} +\keyword{internal} diff --git a/man/strip_genomic_classes.Rd b/man/strip_genomic_classes.Rd index 5c08846..508463f 100644 --- a/man/strip_genomic_classes.Rd +++ b/man/strip_genomic_classes.Rd @@ -21,3 +21,4 @@ This function removes custom classes associated with genomic data objects of an object. This can be useful when you want to revert an S3 object to its underlying data.frame (or data.table) classes without converting the object. } +\keyword{internal} diff --git a/man/target_regions_grch37.Rd b/man/target_regions_grch37.Rd index 4f13f15..612731b 100644 --- a/man/target_regions_grch37.Rd +++ b/man/target_regions_grch37.Rd @@ -21,4 +21,4 @@ target_regions_grch37 \description{ Target regions in respect to grch37. } -\keyword{datasets} +\keyword{internal} diff --git a/man/target_regions_hg38.Rd b/man/target_regions_hg38.Rd index 676e4dd..7705f97 100644 --- a/man/target_regions_hg38.Rd +++ b/man/target_regions_hg38.Rd @@ -21,4 +21,4 @@ target_regions_hg38 \description{ Target regions in respect to hg38. } -\keyword{datasets} +\keyword{internal} diff --git a/man/wright_genes_with_weights.Rd b/man/wright_genes_with_weights.Rd index ea2c1f0..f7e7fd5 100644 --- a/man/wright_genes_with_weights.Rd +++ b/man/wright_genes_with_weights.Rd @@ -21,4 +21,4 @@ wright_genes_with_weights \description{ Description. } -\keyword{datasets} +\keyword{internal} From 7ac090dac5c3afc8a0ea6a53b4558b047cac9d5e Mon Sep 17 00:00:00 2001 From: Ryan Morin Date: Fri, 7 Feb 2025 17:59:09 -0800 Subject: [PATCH 11/19] fix examples --- R/calc_mutation_frequency_bin_regions.R | 32 +++++++++++++------------ 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/R/calc_mutation_frequency_bin_regions.R b/R/calc_mutation_frequency_bin_regions.R index 540a13d..52c436e 100644 --- a/R/calc_mutation_frequency_bin_regions.R +++ b/R/calc_mutation_frequency_bin_regions.R @@ -41,26 +41,28 @@ #' #' @return A table of mutation counts for sliding windows across one or more regions. May be long or wide. #' -#' @import dplyr tidyr tibble +#' @import dplyr tidyr tibble parallel #' @export #' #' @examples -#' #get some regions -#' these_regions <- process_regions(only_regions = c("MYC", "BCL2", "BCL6")) -#' reg_vec <- these_regions$regions_list -#' reg_bed <- these_regions$regions_bed -#' -#' # use a set of user defined regions (from genes) and -#' # calculate mut frequency across all available samples -#' mult_freq_all = calc_mutation_frequency_bin_regions(regions_list = reg_vec) -#' mult_freq_all = calc_mutation_frequency_bin_regions(regions_bed = reg_bed) + +#' #load metadata. +#' my_meta = get_gambl_metadata() +#' dlbcl_bl_meta = dplyr::filter(my_meta, pathology %in% c("DLBCL", "BL")) #' -#' #restrict the analysis to specific samples using the metadata -#' my_meta = get_gambl_metadata() %>% -#' dplyr::filter(pathology %in% c("DLBCL","FL")) -#' mult_reg_freq_fl_dlbcl = calc_mutation_frequency_bin_regions(regions_list = reg_vec, -#' these_sample_ids = "DOHH-2") #' +#' #get ashm regions +#' some_regions = create_bed_data(grch37_ashm_regions, +#' fix_names = "concat", +#' concat_cols = c("gene","region"), +#' sep="-") +#' print(some_regions) +#' mut_count_matrix <- calc_mutation_frequency_bin_regions( +#' these_samples_metadata = dlbcl_bl_meta, +#' regions_bed = some_regions +#' ) +#' dim(mut_count_matrix) +#' tail(mut_count_matrix[,c(1:10)]) calc_mutation_frequency_bin_regions <- function(regions_list = NULL, regions_bed = NULL, these_samples_metadata = NULL, From 814003cabc9168e5f97509205724fd84bcffb36e Mon Sep 17 00:00:00 2001 From: Ryan Morin Date: Fri, 7 Feb 2025 17:59:18 -0800 Subject: [PATCH 12/19] hide from docs --- R/check_excess_params.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/check_excess_params.R b/R/check_excess_params.R index 68eea07..4350dd4 100644 --- a/R/check_excess_params.R +++ b/R/check_excess_params.R @@ -9,9 +9,9 @@ #' @param ... Parameters to check. #' #' @return Nothing -#' -#' @export #' +#' @export +#' @keywords internal check_excess_params = function(...){ callingFun = as.list(sys.call(-1))[[1]] arguments <- list(...) From da280795e293147eede549fdd2a5bd25289d232c Mon Sep 17 00:00:00 2001 From: Ryan Morin Date: Fri, 7 Feb 2025 18:12:04 -0800 Subject: [PATCH 13/19] clean up code --- R/calc_mutation_frequency_bin_region.R | 3 - R/get_ashm_count_matrix.R | 99 +++++++++++------------ man/calc_mutation_frequency_bin_region.Rd | 6 -- 3 files changed, 48 insertions(+), 60 deletions(-) diff --git a/R/calc_mutation_frequency_bin_region.R b/R/calc_mutation_frequency_bin_region.R index 1bf9592..936e046 100644 --- a/R/calc_mutation_frequency_bin_region.R +++ b/R/calc_mutation_frequency_bin_region.R @@ -13,9 +13,6 @@ #' #' @param region A string describing a genomic region in the "chrom:start-end" format. #' The region must be specified in this format OR as separate chromosome, start_pos, end_pos arguments. -#' @param chromosome Chromosome name in region. -#' @param start_pos Start coordinate of region. -#' @param end_pos End coordinate of region. #' @param these_samples_metadata Optional data frame containing a sample_id column. #' If not providing a maf file, seq_type is also a required column. #' @param these_sample_ids Optional vector of sample IDs. Output will be subset diff --git a/R/get_ashm_count_matrix.R b/R/get_ashm_count_matrix.R index 6f4fd81..10b7d81 100644 --- a/R/get_ashm_count_matrix.R +++ b/R/get_ashm_count_matrix.R @@ -30,75 +30,72 @@ #' this_seq_type = "genome" #' ) #' -#' #this example intentionally fails -#' matrix <- get_ashm_count_matrix(regions_bed=regions_bed,this_seq_type = "genome", +#' #this example should fail because the regions_bed is not hg38 +#' matrix <- get_ashm_count_matrix(regions_bed=regions_bed, +#' this_seq_type = "genome", #' these_samples_metadata = my_meta, #' projection = "hg38") #' # Error in get_ashm_count_matrix( #' # Your projection argument does not match the genome_build of regions_bed -#' -#' # format the name column to include the chromosome coordinates instead of the gene +#' +#' # format the name column to include the coordinates instead of the gene #' regions_bed = create_bed_data(GAMBLR.data::hg38_ashm_regions, #' fix_names="concat", #' concat_cols=c("chr_name","hg38_start","hg38_end"), #' sep="-") -#' -#' matrix_hg38 <- get_ashm_count_matrix(regions_bed=regions_bed,this_seq_type = "genome", -#' these_samples_metadata = my_meta, -#' projection = "hg38") #' +#' matrix_hg38 <- get_ashm_count_matrix(regions_bed=regions_bed, +#' this_seq_type = "genome", +#' these_samples_metadata = my_meta, +#' projection = "hg38") +#' print(dim(matrix_hg38)) +#' print(head(matrix_hg38[,c(1:8)])) get_ashm_count_matrix = function( - regions_bed, - these_samples_metadata, - this_seq_type, - projection = "grch37" - ){ - if(missing(this_seq_type)){ - if(missing(these_samples_metadata)){ - stop( - "Please supply either the this_seq_type or a metadata from which it can be retrieved" - ) - } - this_seq_type <- these_samples_metadata %>% - pull(seq_type) %>% - unique() + regions_bed, + these_samples_metadata, + this_seq_type, + projection = "grch37" +) { + if (missing(this_seq_type)) { + if (missing(these_samples_metadata)) { + stop("Please supply either the this_seq_type or a + metadata from which it can be retrieved") } + this_seq_type <- these_samples_metadata %>% + pull(seq_type) %>% + unique() + } - if(missing(regions_bed)){ - message( - "Using aSHM regions in grch37 genome_build as regions_bed" - ) - if(projection=="grch37"){ - regions_bed <- GAMBLR.data::grch37_ashm_regions %>% - mutate(name = paste(gene, region, sep = "_")) %>% - create_bed_data(genome_build = projection) - }else if(projection=="hg38"){ - regions_bed <- GAMBLR.data::hg38_ashm_regions %>% - mutate(name = paste(gene, region, sep = "_")) %>% - create_bed_data(genome_build = projection) - }else{ - stop(paste("unsupported genome build",projection)) - } - + if (missing(regions_bed)){ + message( + "Using aSHM regions in grch37 genome_build as regions_bed" + ) + if (projection=="grch37"){ + regions_bed <- GAMBLR.data::grch37_ashm_regions %>% + mutate(name = paste(gene, region, sep = "_")) %>% + create_bed_data(genome_build = projection) + } else if(projection=="hg38") { + regions_bed <- GAMBLR.data::hg38_ashm_regions %>% + mutate(name = paste(gene, region, sep = "_")) %>% + create_bed_data(genome_build = projection) }else{ - if("bed_data" %in% class(regions_bed)){ - if(!get_genome_build(regions_bed)==projection){ - stop(paste("Your genome_build argument does not match the genome_build of regions_bed",get_genome_build(regions_bed),genome_build)) - } + stop(paste("unsupported genome build",projection)) + } + }else { + if ("bed_data" %in% class(regions_bed)) { + if(!get_genome_build(regions_bed)==projection) { + stop(paste("Your genome_build argument does not match the genome_build of regions_bed",get_genome_build(regions_bed),genome_build)) } } + } - - - if(missing(these_samples_metadata)){ - all_meta <- get_gambl_metadata( - seq_type_filter=this_seq_type - ) %>% - dplyr::select(sample_id) - }else{ + if (missing(these_samples_metadata)){ + all_meta <- get_gambl_metadata(seq_type_filter=this_seq_type) %>% + dplyr::select(sample_id) + }else { all_meta <- these_samples_metadata %>% dplyr::select(sample_id) - } + } ashm_maf <- get_ssm_by_regions( regions_bed = regions_bed, diff --git a/man/calc_mutation_frequency_bin_region.Rd b/man/calc_mutation_frequency_bin_region.Rd index dec76e3..bb6d06c 100644 --- a/man/calc_mutation_frequency_bin_region.Rd +++ b/man/calc_mutation_frequency_bin_region.Rd @@ -60,12 +60,6 @@ or binary mutated/unmutated status (FALSE). Default is TRUE.} Only effective with "long" return format.} \item{...}{Any additional parameters.} - -\item{chromosome}{Chromosome name in region.} - -\item{start_pos}{Start coordinate of region.} - -\item{end_pos}{End coordinate of region.} } \value{ Either a matrix or a long tidy table of counts per window. From ddb5328c0bc28c4edd28f10a7d8b7ccd9ef94f54 Mon Sep 17 00:00:00 2001 From: Ryan Morin Date: Fri, 7 Feb 2025 18:12:48 -0800 Subject: [PATCH 14/19] rebuild docs --- man/get_ashm_count_matrix.Rd | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/man/get_ashm_count_matrix.Rd b/man/get_ashm_count_matrix.Rd index eb9943e..1c95999 100644 --- a/man/get_ashm_count_matrix.Rd +++ b/man/get_ashm_count_matrix.Rd @@ -45,21 +45,24 @@ matrix <- get_ashm_count_matrix( this_seq_type = "genome" ) -#this example intentionally fails - matrix <- get_ashm_count_matrix(regions_bed=regions_bed,this_seq_type = "genome", +#this example should fail because the regions_bed is not hg38 + matrix <- get_ashm_count_matrix(regions_bed=regions_bed, + this_seq_type = "genome", these_samples_metadata = my_meta, projection = "hg38") # Error in get_ashm_count_matrix( # Your projection argument does not match the genome_build of regions_bed -# format the name column to include the chromosome coordinates instead of the gene +# format the name column to include the coordinates instead of the gene regions_bed = create_bed_data(GAMBLR.data::hg38_ashm_regions, fix_names="concat", concat_cols=c("chr_name","hg38_start","hg38_end"), sep="-") - - matrix_hg38 <- get_ashm_count_matrix(regions_bed=regions_bed,this_seq_type = "genome", - these_samples_metadata = my_meta, - projection = "hg38") + matrix_hg38 <- get_ashm_count_matrix(regions_bed=regions_bed, + this_seq_type = "genome", + these_samples_metadata = my_meta, + projection = "hg38") +print(dim(matrix_hg38)) +print(head(matrix_hg38[,c(1:8)])) } From 90c70aaabdf049d98bffae0252d647a896f4e5e5 Mon Sep 17 00:00:00 2001 From: Ryan Morin Date: Fri, 7 Feb 2025 18:45:33 -0800 Subject: [PATCH 15/19] suppress verbose warnings with global vars --- R/annotate_hotspots.R | 46 --- R/assign_cn_to_ssm.R | 190 --------- R/calc_mutation_frequency_bin_region.R | 284 ------------- R/calc_mutation_frequency_bin_regions.R | 149 ------- R/check_excess_params.R | 26 -- R/data-vars.R | 18 + R/genomic_data.R | 526 ------------------------ R/get_ashm_count_matrix.R | 133 ------ R/get_cn_segments.R | 73 ---- R/get_manta_sv.R | 164 -------- R/get_ssm_by_patients.R | 88 ---- R/get_ssm_by_region.R | 138 ------- R/get_ssm_by_samples.R | 102 ----- 13 files changed, 18 insertions(+), 1919 deletions(-) delete mode 100644 R/annotate_hotspots.R delete mode 100644 R/assign_cn_to_ssm.R delete mode 100644 R/calc_mutation_frequency_bin_region.R delete mode 100644 R/calc_mutation_frequency_bin_regions.R delete mode 100644 R/check_excess_params.R delete mode 100644 R/genomic_data.R delete mode 100644 R/get_ashm_count_matrix.R delete mode 100644 R/get_cn_segments.R delete mode 100644 R/get_manta_sv.R delete mode 100644 R/get_ssm_by_patients.R delete mode 100644 R/get_ssm_by_region.R delete mode 100644 R/get_ssm_by_samples.R diff --git a/R/annotate_hotspots.R b/R/annotate_hotspots.R deleted file mode 100644 index 1da616d..0000000 --- a/R/annotate_hotspots.R +++ /dev/null @@ -1,46 +0,0 @@ -#' @title Annotate Hotspots. -#' -#' @description Annotate MAF-like data frome with a hot_spot column indicating -#' recurrent mutations. -#' -#' @details This function takes an already loaded MAF data frame with the -#' `mutation_maf` parameter. -#' -#' @param mutation_maf A data frame in MAF format. -#' @param ... Any other parameter. These parameters will be ignored. -#' -#' @return The same data frame with one additional column "hot_spot". -#' -#' @import dplyr -#' @export -#' -#' @examples -#' my_metadata = get_gambl_metadata() -#' all_coding_ssm = get_coding_ssm(these_samples_metadata = my_metadata, -#' projection = "grch37", -#' this_seq_type = "genome") %>% -#' dplyr::filter(Hugo_Symbol %in% c("EZH2", -#' "MEF2B","MYD88","KMT2D")) %>% -#' dplyr::arrange(Hugo_Symbol) -#' -#' hot_ssms = annotate_hotspots(all_coding_ssm) -#' hot_ssms %>% dplyr::filter(!is.na(hot_spot)) %>% -#' dplyr::select(1:5,37,hot_spot) -#' -annotate_hotspots = function( - mutation_maf, - ... -) { - - # check if any invalid parameters are provided - check_excess_params(...) - - filled_coords <- GAMBLR.data::hotspots_annotations - # just the ssms that match these coordinates! - hot_ssms <- left_join( - mutation_maf, - filled_coords, - by = c("Chromosome", "Start_Position") - ) - return(hot_ssms) -} diff --git a/R/assign_cn_to_ssm.R b/R/assign_cn_to_ssm.R deleted file mode 100644 index 35e2884..0000000 --- a/R/assign_cn_to_ssm.R +++ /dev/null @@ -1,190 +0,0 @@ -#' @title Assign CN to SSM. -#' -#' @description Annotate mutations with their copy number information. -#' -#' @details This function takes a metadata table and returns all mutations -#' for the samples in that metadata. Each mutation is annotated with the -#' local copy number state of each mutated site. The user can specify if -#' only coding mutations are of interest. To do so, -#' set `coding_only = TRUE`. When necessary, this function relies on -#' `get_ssm_by_samples` and `get_cn_segments` to obtain the required data. -#' @param these_samples_metadata Metadata table with one or more rows to specify -#' the samples to process. -#' @param maf_data A data frame of mutations in MAF format or maf_data object -#' (e.g. from `get_coding_ssm` or `get_ssm_by_sample`). -#' @param seg_data A data frame of segmented copy number data or seg_data object -#' @param projection Specified genome projection that returned data is relative to. -#' This is only required when it cannot be inferred from maf_df or seg_df -#' (or they are not provided). -#' @param coding_only Optional. Set to TRUE to restrict to only variants in coding space -#' Default is to work with genome-wide variants. -#' @param assume_diploid Optional, this parameter annotates every mutation as -#' copy neutral. Default is FALSE. -#' @param include_silent Logical parameter indicating whether to include silent -#' mutations in coding space. Default is FALSE. This parameter only -#' makes sense if `coding_only` is set to TRUE. -#' @param ... Any additional parameters. -#' -#' @return A list containing a data frame (MAF-like format) with three extra -#' columns: -#' - log.ratio is the log ratio from the seg file (NA when no overlap). -#' - LOH -#' - CN (the rounded absolute copy number estimate of the region based on -#' log.ratio, NA when no overlap was found). -#' -#' @import dplyr -#' @export -#' -#' @examples -#' # long-handed way -#' # 1. get some metadata for a collection of samples -#' some_meta = get_gambl_metadata() %>% -#' dplyr::filter(cohort=="FL_Dreval", -#' grepl("SP",sample_id)) -#' # 2. Get the SSMs for these samples -#' -#' ssm_genomes_grch37 = get_coding_ssm(projection = "grch37", -#' these_samples_metadata = some_meta) -#' # peek at the results -#' ssm_genomes_grch37 %>% dplyr::select(1:8) -#' -#' # 3. Lazily let this function obtain the corresponding seg_data for the right genome_build -#' cn_list = assign_cn_to_ssm(some_meta,ssm_genomes_grch37) -#' -#' cn_list$maf %>% dplyr::select(1:8,log.ratio,CN) -#' -#' # This won't work because the hg38 seg_data is not bundled -#' ssm_genomes_hg38 = get_coding_ssm(projection = "hg38", -#' these_samples_metadata = some_meta) -#' cn_list = assign_cn_to_ssm(some_meta,ssm_genomes_hg38) -#' -#' # Easiest/laziest way: -#' cn_list = assign_cn_to_ssm(projection = "grch37") -#' -#' -#' cn_list$maf %>% dplyr::group_by(Tumor_Sample_Barcode,CN) %>% -#' dplyr::count() -#' -assign_cn_to_ssm = function( - these_samples_metadata, - maf_data, - seg_data, - projection, - coding_only = FALSE, - assume_diploid = FALSE, - include_silent = FALSE, - ... -){ - if(missing(these_samples_metadata)){ - stop("No metadata provided. these_samples_metadata is required") - } - #check if any invalid parameters are provided - check_excess_params(...) - genomic_data = list() - if(!missing(maf_data)){ - genomic_data[["maf_data"]] = maf_data - } - if(!missing(seg_data)){ - genomic_data[["seg_data"]] = seg_data - } - - projection <- check_get_projection(genomic_data, suggested = projection) - - if(missing(seg_data)){ - seg_sample = get_cn_segments( - these_samples_metadata = these_samples_metadata, - projection = projection - ) - missing_from_seg = dplyr::filter(these_samples_metadata, - !sample_id %in% seg_sample$ID) %>% - pull(sample_id) %>% - unique() - if(length(missing_from_seg) == length(unique(these_samples_metadata$sample_id))){ - stop(paste("No seg_data could be found for ANY of the samples provided for",projection)) - } - if(length(missing_from_seg)){ - warning(paste("missing seg_data for",length(missing_from_seg),"samples")) - } - }else{ - seg_sample = seg_data - } - - if(missing(maf_data)){ - #get maf - maf_sample = get_ssm_by_samples( - these_samples_metadata = these_samples_metadata, - projection = projection, - ) - missing_from_maf = dplyr::filter(these_samples_metadata, - !sample_id %in% maf_sample$Tumor_Sample_Barcode) %>% - pull(sample_id) %>% - unique() - if(length(missing_from_maf) == length(unique(these_samples_metadata$sample_id))){ - stop(paste("No mutation could be found for ANY of the samples provided for",projection)) - } - if(length(missing_from_maf)){ - warning(paste("missing mutation for",length(missing_from_maf),"samples")) - } - }else{ - maf_sample = maf_data - } - - #maf filtering - #silent mutations - if(!include_silent){ - coding_class = coding_class[coding_class != "Silent"] - } - - #coding mutations - if(coding_only){ - maf_sample = dplyr::filter( - maf_sample, - Variant_Classification %in% coding_class - ) - } - - - - #annotate all CN segments as copy number neutral - if(assume_diploid){ - diploid = dplyr::mutate(maf_sample, CN = 2) - return(list(maf = diploid)) - } - - #wrangle the seg file - seg_sample = seg_sample %>% - dplyr::filter(end - start > 100) %>% - rename( - Chromosome = chrom, - Start_Position = start, - End_Position = end, - LOH = LOH_flag, - Tumor_Sample_Barcode = ID - ) %>% - mutate(across(LOH, as.factor)) - - #perform an overlap join and add CN columns from the seg file and subset - # MAF to basic columns (first 45) - maf_tmp = cool_overlaps(maf_sample, seg_sample, - type = "any", - columns1=c("Chromosome","Start_Position","End_Position","Tumor_Sample_Barcode"), - columns2=c("Chromosome","Start_Position","End_Position","Tumor_Sample_Barcode")) - - #rename and change order of columns to match expected format - maf_with_segs = maf_tmp %>% - rename( - Start_Position = Start_Position.x, - End_Position = End_Position.x - ) %>% - dplyr::select( - colnames(maf_sample), - LOH, log.ratio, CN - ) - - return( - list( - maf = maf_with_segs, - seg = seg_sample - ) - ) -} diff --git a/R/calc_mutation_frequency_bin_region.R b/R/calc_mutation_frequency_bin_region.R deleted file mode 100644 index 936e046..0000000 --- a/R/calc_mutation_frequency_bin_region.R +++ /dev/null @@ -1,284 +0,0 @@ -#' @title Calculate Mutation Frequency By Sliding Window. -#' -#' @description Count the number of mutations in a sliding window across a -#' region for all samples. -#' -#' @details This function is called to return the mutation frequency for a given -#' region, either from a provided input maf data frame or from the GAMBL maf data. -#' Regions are specified with the `region` parameter. Alternatively, the region of -#' interest can also be specified by calling the function with `chromosome`, -#' `start_pos`, and `end_pos` parameters. This function operates on a single region. -#' To return a matrix of sliding window counts over multiple regions, -#' see `calc_mutation_frequency_bin_regions`. -#' -#' @param region A string describing a genomic region in the "chrom:start-end" format. -#' The region must be specified in this format OR as separate chromosome, start_pos, end_pos arguments. -#' @param these_samples_metadata Optional data frame containing a sample_id column. -#' If not providing a maf file, seq_type is also a required column. -#' @param these_sample_ids Optional vector of sample IDs. Output will be subset -#' to IDs present in this vector. -#' @param this_seq_type Optional vector of seq_types to include in heatmap. -#' Default is "genome". Uses default seq_type priority for samples -#' with >1 seq_type. -#' @param maf_data Optional maf data frame. Will be subset to rows where -#' Tumor_Sample_Barcode matches provided sample IDs or metadata table. -#' If not provided, maf data will be obtained with get_ssm_by_regions(). -#' @param projection Specify which genome build to use. Required. Default grch37. -#' @param slide_by Slide size for sliding window. Default 100. -#' @param window_size Size of sliding window. Default 1000. -#' @param return_format Return format of mutations. Accepted inputs are "long" -#' and "wide". Long returns a data frame of one sample ID/window per row. -#' Wide returns a matrix with one sample ID per row and one window per column. -#' Using the "wide" format will retain all samples and windows regardless of -#' the drop_unmutated or min_count_per_bin parameters. -#' @param min_count_per_bin Minimum counts per bin, default is 0. Setting this -#' greater than 0 will drop unmutated windows only when return_format is long. -#' @param return_count Boolean statement to return mutation count per window (TRUE) -#' or binary mutated/unmutated status (FALSE). Default is TRUE. -#' @param drop_unmutated Boolean for whether to drop windows with 0 mutations. -#' Only effective with "long" return format. -#' @param ... Any additional parameters. -#' -#' @return Either a matrix or a long tidy table of counts per window. -#' -#' @import dplyr tidyr -#' @export -#' -#' @examples -#' myc_region = "8:128747680-128753674" -#' myc_mut_freq = calc_mutation_frequency_bin_region(region = myc_region, -#' slide_by = 10, -#' window_size = 10000) -#' dplyr::arrange(myc_mut_freq,desc(mutation_count)) -#' -calc_mutation_frequency_bin_region <- function(region, - these_samples_metadata = NULL, - these_sample_ids = NULL, - this_seq_type = "genome", - maf_data = NULL, - projection = "grch37", - slide_by = 100, - window_size = 1000, - return_format = "long", - min_count_per_bin = 0, - return_count = TRUE, - drop_unmutated = FALSE, - ...) { - - #check if any invalid parameters are provided - check_excess_params(...) - - # Create objects to describe region both as string and individual objects - try(if (missing(region)) { - stop("No region information provided. Please provide a region as a string in the chrom:start-end format") - }) - - if ((drop_unmutated | min_count_per_bin > 0) & return_format == "wide") { - message("To return a wide table, all samples and windows must be kept. Ignoring drop_unmutated and min_count_per_bin arguments. ") - } - - - chunks <- region_to_chunks(region) - chromosome <- chunks$chromosome - start_pos <- as.numeric(chunks$start) - end_pos <- as.numeric(chunks$end) - - # Harmonize metadata and sample IDs - metadata <- id_ease( - these_samples_metadata, - these_sample_ids, - this_seq_type - ) - these_sample_ids <- metadata$sample_id - - - if ( - (grepl("chr", chromosome) & projection == "grch37") | - (!grepl("chr", chromosome) & projection == "hg38") - ) { - stop("chr prefixing status of region and specified projection don't match. ") - } - - - # Check region size and compare to max region size - # Is this really needed? - max_region <- 5e+06 - - region_size <- end_pos - start_pos - if (region_size < max_region) { - message(paste( - "processing bins of size", window_size, - "across", region_size, "bp region" - )) - } else { - message(paste("CAUTION!\n", region_size, "exceeds maximum size recommended by this function.")) - } - - # Split region into windows - windows <- data.frame( - chrom = chromosome, - window_start = seq(start_pos, end_pos, by = slide_by) - ) %>% - dplyr::mutate(window_end = window_start + window_size - 1) %>% - dplyr::select(chrom, window_start, window_end) - - # Option to return full region count instead of sliding window - if (window_size == 0) { - windows <- data.frame( - chrom = chromosome, - window_start = start_pos, - window_end = end_pos - ) - } - - # Obtain SSM coordinates from GAMBL if no maf_data was provided - if (is.null(maf_data)) { - try( - if (!"seq_type" %in% colnames(metadata)) { - stop("seq_type must be present in metadata for compatibility with get_ssm_by_samples") - } - ) - message("Using GAMBLR.data::get_ssm_by_region...") - region_ssm <- list() - for (st in unique(metadata$seq_type)) { - this_seq_type <- get_ssm_by_region( - region = region, - projection = projection, - streamlined = FALSE, - this_seq_type = st - ) %>% - dplyr::mutate(end = Start_Position + 1) %>% - dplyr::select( - chrom = Chromosome, - start = Start_Position, - end, - sample_id = Tumor_Sample_Barcode - ) %>% - dplyr::mutate(mutated = 1, seq_type = st) %>% - dplyr::filter(sample_id %in% these_sample_ids) - region_ssm[[st]] <- data.frame(metadata) %>% - dplyr::select(sample_id, seq_type) %>% - dplyr::filter(seq_type == st) %>% - dplyr::left_join(this_seq_type, by = c("sample_id", "seq_type")) %>% - dplyr::filter(!is.na(mutated)) %>% - dplyr::select(-seq_type) - } - region_ssm <- dplyr::bind_rows(region_ssm) - } else { - # Subset provided maf to specified region - message("Using provided maf...") - region_bed <- data.frame( - "chrom" = as.character(chromosome), - "start" = as.numeric(start_pos), - "end" = as.numeric(end_pos) - ) - region_ssm <- cool_overlaps( - maf_data, region_bed, - columns2 = c("chrom", "start", "end") - ) %>% - dplyr::filter(!is.na(Start_Position)) %>% - dplyr::mutate(end = Start_Position - 1) %>% - dplyr::select( - chrom = Chromosome, - start = Start_Position, - end, - sample_id = Tumor_Sample_Barcode - ) %>% - dplyr::mutate(mutated = 1) - - region_ssm <- data.frame(metadata) %>% - dplyr::select(sample_id) %>% - dplyr::left_join(region_ssm) %>% - dplyr::filter(!is.na(mutated)) - } - - # Check if the region is empty. - # If yes return NULL so that running this function with lapply will allow bind_rows to run on the output. - if (nrow(region_ssm) == 0 & (drop_unmutated | min_count_per_bin > 0)) { - message(paste0("No mutations found in region ", region, " for this sample set. ")) - return(NULL) - } - - # Count mutations per window - windows_tallied <- dplyr::inner_join( - windows, - region_ssm, - by = "chrom" - ) %>% - dplyr::filter( - start >= window_start, - start <= window_end - ) %>% - dplyr::group_by( - sample_id, - window_start - ) %>% - dplyr::tally() %>% - dplyr::ungroup() %>% - dplyr::full_join(select(metadata, sample_id)) %>% - dplyr::arrange(sample_id) %>% - dplyr::full_join(select(windows, window_start)) %>% - dplyr::distinct() %>% - tidyr::pivot_wider( - names_from = window_start, - values_from = n, - values_fill = 0 - ) %>% - dplyr::select(-matches("^NA$")) %>% - tidyr::pivot_longer( - -c(sample_id), - names_to = "window_start", - values_to = "n" - ) %>% - dplyr::distinct() %>% - dplyr::filter(!is.na(sample_id)) - - # Remove unmutated windows if requested - if (drop_unmutated | min_count_per_bin > 0) { - windows_tallied <- windows_tallied %>% - dplyr::filter(n >= min_count_per_bin) - if (drop_unmutated & min_count_per_bin == 0) { - windows_tallied %>% - dplyr::filter(n > 0) - } - } - - # Create requested data output format - if (return_count) { - # Return table of mutation counts per bin - windows_tallied_final <- mutate( - windows_tallied, - bin = paste0(chromosome, "_", window_start) - ) %>% - dplyr::mutate(mutation_count = n) %>% - dplyr::select( - sample_id, - bin, - mutation_count - ) - } else { - # Return table of binary mutated/unmutated status per bin - windows_tallied_final <- mutate( - windows_tallied, - bin = paste0(chromosome, "_", window_start) - ) %>% - dplyr::mutate(mutated = ifelse(n > 0, 1, 0)) %>% - dplyr::select( - sample_id, - bin, - mutated - ) - } - - if (return_format == "wide") { - widened <- windows_tallied_final %>% - tidyr::pivot_wider( - names_from = bin, - values_from = matches("mutat"), - values_fill = 0 - ) - return(widened) - } else { - return(windows_tallied_final) - } -} diff --git a/R/calc_mutation_frequency_bin_regions.R b/R/calc_mutation_frequency_bin_regions.R deleted file mode 100644 index 52c436e..0000000 --- a/R/calc_mutation_frequency_bin_regions.R +++ /dev/null @@ -1,149 +0,0 @@ -#' @title Mutation counts across sliding windows for multiple regions. -#' -#' @description Obtain a long tidy or wide matrix of mutation counts across -#' sliding windows for multiple regions. -#' -#' @details This function takes a metadata table with `these_samples_metadata` -#' parameter and internally calls `calc_mutation_frequency_bin_region` -#' (that internally calls `get_ssm_by_regions`). -#' to retrieve mutation counts for sliding windows across one or more regions. -#' May optionally provide any combination of a maf data frame, existing metadata, -#' or a regions data frame or named vector. -#' -#' @param regions_list Named vector of regions in the format -#' c(name1 = "chr:start-end", name2 = "chr:start-end"). If neither `regions` nor -#' `regions_bed` is specified, the function will use GAMBLR aSHM region information. -#' @param regions_bed Data frame of regions with four columns (chrom, start, end, name). -#' @param these_samples_metadata Metadata with at least sample_id column. -#' If not providing a maf data frame, seq_type is also required. -#' @param these_sample_ids Vector of sample IDs. Metadata will be subset to -#' sample IDs present in this vector. -#' @param this_seq_type Optional vector of seq_types to include in heatmap. -#' Default "genome". Uses default seq_type priority for samples with >1 seq_type. -#' @param maf_data Optional maf data frame. Will be subset to rows where -#' Tumor_Sample_Barcode matches provided sample IDs or metadata table. -#' If not provided, maf data will be obtained with get_ssm_by_regions(). -#' @param region_padding Amount to pad the start and end coordinates by. Default 1000. -#' @param projection Genome build the function will operate in. Ensure this -#' matches your provided regions and maf data for correct chr prefix handling. Default "grch37". -#' @param drop_unmutated Whether to drop bins with 0 mutations. If returning a -#' matrix format, this will only drop bins with no mutations in any samples. -#' @param skip_regions Optional character vector of genes to exclude from the default aSHM regions. -#' @param only_regions Optional character vector of genes to include from the default aSHM regions. -#' @param slide_by Slide size for sliding window. Default 100. -#' @param window_size Size of sliding window. Default 500. -#' @param return_format Return format of mutations. Accepted inputs are "long" and -#' "wide". Long returns a data frame of one sample ID/window per row. Wide returns -#' a matrix with one sample ID per row and one window per column. Using the "wide" -#' format will retain all samples and windows regardless of the drop_unmutated or -#' min_count_per_bin parameters. Default wide. -#' @param ... Any additional parameters. -#' -#' @return A table of mutation counts for sliding windows across one or more regions. May be long or wide. -#' -#' @import dplyr tidyr tibble parallel -#' @export -#' -#' @examples - -#' #load metadata. -#' my_meta = get_gambl_metadata() -#' dlbcl_bl_meta = dplyr::filter(my_meta, pathology %in% c("DLBCL", "BL")) -#' -#' -#' #get ashm regions -#' some_regions = create_bed_data(grch37_ashm_regions, -#' fix_names = "concat", -#' concat_cols = c("gene","region"), -#' sep="-") -#' print(some_regions) -#' mut_count_matrix <- calc_mutation_frequency_bin_regions( -#' these_samples_metadata = dlbcl_bl_meta, -#' regions_bed = some_regions -#' ) -#' dim(mut_count_matrix) -#' tail(mut_count_matrix[,c(1:10)]) -calc_mutation_frequency_bin_regions <- function(regions_list = NULL, - regions_bed = NULL, - these_samples_metadata = NULL, - these_sample_ids = NULL, - this_seq_type = "genome", - maf_data = NULL, - projection = "grch37", - region_padding = 1000, - drop_unmutated = FALSE, - skip_regions = NULL, - only_regions = NULL, - slide_by = 100, - window_size = 500, - return_format = "wide", - ...){ - - #check if any invalid parameters are provided - check_excess_params(...) - - regions <- process_regions(regions_list = regions_list, - regions_bed = regions_bed, - region_padding = region_padding, - skip_regions = skip_regions, - only_regions = only_regions) - - regions_bed <- regions$regions_bed - regions <- regions$regions_list - - if ( - (grepl("chr", regions_bed$chrom[1]) & projection == "grch37") | - (!grepl("chr", regions_bed$chrom[1]) & projection == "hg38") - ) { - stop("chr prefixing status of provided regions and specified projection don't match. ") - } - # Harmonize metadata and sample IDs - metadata <- id_ease( - these_samples_metadata, - these_sample_ids, - this_seq_type - ) - - these_sample_ids <- metadata$sample_id - - # Obtain sliding window mutation frequencies for all regions - dfs <- mclapply(names(regions), function(x) { - df <- calc_mutation_frequency_bin_region( - region = regions[x], - these_samples_metadata = metadata, - maf_data = maf_data, - projection = projection, - drop_unmutated = drop_unmutated, - slide_by = slide_by, - window_size = window_size, - min_count_per_bin = 0, - return_count = TRUE, - ... - ) %>% - dplyr::mutate(name = x) - return(df) - }) - - all <- dplyr::bind_rows(dfs) %>% - dplyr::distinct(bin, sample_id, .keep_all = TRUE) - - # If none of the samples are mutated, return the mutation frequency df and exit. - if (max(all$mutation_count) == 0) { - message("No mutations found in specified regions for specified samples. Exiting. ") - return(all) - } - - if (return_format == "wide") { - # Convert mutation frequency table to a matrix - all_wide <- all %>% - dplyr::select(sample_id, mutation_count, bin) %>% - pivot_wider( - names_from = bin, - values_from = mutation_count, - values_fill = 0 - ) - return(all_wide) - } else { - return(all) - } -} diff --git a/R/check_excess_params.R b/R/check_excess_params.R deleted file mode 100644 index 4350dd4..0000000 --- a/R/check_excess_params.R +++ /dev/null @@ -1,26 +0,0 @@ -#' @title Check Excess Params -#' -#' @description Function for checking excessive parameter names. -#' This function will notify the user if any unavailable parameters are called for any given given function. -#' This function is designed to work as internal function-call in already available GAMBLR functions. -#' -#' @details Catch function calls containing unsupported arguments. -#' -#' @param ... Parameters to check. -#' -#' @return Nothing -#' -#' @export -#' @keywords internal -check_excess_params = function(...){ - callingFun = as.list(sys.call(-1))[[1]] - arguments <- list(...) - extraneous = names(arguments) - if(length(arguments)>0){ - k <- gettextf("Warning: You have given one or more unsupported or deprecated arguments to %s and they are going to be ignored. Please check the documentation and spelling of your arguments.\nIgnored argument(s): %s.", - as.character(callingFun), - paste(extraneous, collapse = ", ")) - message(k) - } - -} diff --git a/R/data-vars.R b/R/data-vars.R index d137888..11cf5ab 100644 --- a/R/data-vars.R +++ b/R/data-vars.R @@ -10,3 +10,21 @@ coding_class = c("Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Nonsense_Mutation", "Nonstop_Mutation", "Silent", "Splice_Region", "Splice_Site", "Targeted_Region", "Translation_Start_Site") + +if (getRversion() >= "2.15.1") { + utils::globalVariables(c( + ".", ":=", "CHROM_A", "CHROM_B", "CN", "Chromosome", + "End_Position", "End_Position.x", "FILTER", "Gene", + "HGVSp_Short", "Hugo_Symbol", "ID", "LOH", "LOH_flag", + "SCORE", "START_A", "START_B", "Start_Position", "Start_Position.x", + "Tumor_Sample_Barcode", "VAF_tumour", "Variant_Classification", + "bin", "category", "chrom", "cohort", "colour", "curated", "end", + "ensembl_gene_id", "gambl_metadata", "gene", "genome_build", + "grch37_ashm_regions", "group", "head", "hg38_ashm_regions", + "hot_spot", "hotspot_regions_grch37", "hotspot_regions_hg38", + "is_alias", "log.ratio", "mutated", "mutation_count", "n_mut", + "name", "pair_status", "pathology", "patient_id", "region", + "row_id", "sample_id", "seq_type", "start", "t_alt_count", + "tumour_sample_id", "window_end", "window_start" + )) +} diff --git a/R/genomic_data.R b/R/genomic_data.R deleted file mode 100644 index d56e5a0..0000000 --- a/R/genomic_data.R +++ /dev/null @@ -1,526 +0,0 @@ -# functions for creating and working with S3 objects - -#' Check and set the genome_build/projection -#' -#' This helper function checks the genome build of each genomic data object in -#' \code{genomic_data_list} (using \code{get_genome_build()}) and ensures -#' they are consistent. If all objects share a single, unique genome build, -#' that value is returned. If a user-specified genome build (\code{suggested}) -#' is provided, it is compared to the inferred build and must match; otherwise, -#' an error is raised. If the genomic data objects have conflicting genome -#' builds or if no genome build can be inferred and no \code{suggested} -#' value is provided, the function stops with an error. -#' -#' @param genomic_data_list A list of genomic data objects. Each object should -#' have a genome build that can be retrieved by \code{get_genome_build()}. -#' @param suggested An optional character string specifying a genome build -#' (projection) to be used. If provided, it must match the genome build inferred -#' from the data objects. -#' -#' @return A character string representing the genome build to be used. -#' @export -#' @keywords internal -#' @examples -#' # Example 1: When genomic data objects all have the same genome build. -#' # Assuming maf_data and seg_data both have a genome build of "hg38": -#' genomic_data <- list(maf_data = maf_data, seg_data = seg_data) -#' projection <- check_get_projection(genomic_data, suggested = "hg38") -#' -#' # Example 2: When the genomic data objects conflict or no genome build -#' # is available. -#' # This will raise an error: -#' genomic_data <- list(maf_data = maf_data, -#' seg_data = seg_data_with_different_build) -#' projection <- check_get_projection(genomic_data, suggested = "hg38") -#' -check_get_projection <- function(genomic_data_list, suggested) { - # Extract genome builds from each genomic data object - builds <- sapply(genomic_data_list, get_genome_build) - uniq_builds <- unique(builds) - - if (length(uniq_builds) == 1) { - # A single, consistent genome build was inferred. - if (!missing(suggested) && suggested != uniq_builds) { - stop("Mismatch between user-specified genome_build and - the genome_build inferred from objects.") - } - return(uniq_builds) - } - - if (length(uniq_builds) > 1) { - # Conflicting genome builds among the objects. - stop("Conflicting genome_build values found: ", - paste(uniq_builds, collapse = ", ")) - } - - # No genome build could be inferred. - if (missing(suggested)) { - stop("No projection provided and genome_build - cannot be inferred from the inputs.") - } - - return(suggested) -} - -## GAMBLR.data -#' Create Segmented Data -#' -#' This function creates segmented data from the given input. -#' -#' @param seg_df A data frame containing the segmented data. -#' @param genome_build Required character vector specifying the genome build -#' currently supported: "grch37" or "hg38". -#' @return A data frame with class attributes for segmented data. -#' @export -#' @examples -#' seg_df <- data.frame(...) -#' create_seg_data(seg_df, "grch37") -create_seg_data <- function(seg_df, genome_build) { - if (!inherits(seg_df, "data.frame")) stop("data must be a data frame") - if (!genome_build %in% c("grch37", "hg38")) stop("Invalid genome build") - structure(seg_df, - class = c("seg_data", class(seg_df)), - genome_build = genome_build) -} - - -#' Create MAF Data -#' -#' This function creates MAF (Mutation Annotation Format) data from the given input. -#' -#' @param maf_df A data frame containing the MAF data. -#' @param genome_build A string specifying the genome build ("grch37" or "hg38"). -#' @return A data frame with class attributes for MAF data. -#' @export -create_maf_data <- function(maf_df, genome_build) { - if (!inherits(maf_df, "data.frame")) stop("data must be a data frame") - if (!genome_build %in% c("grch37", "hg38")) stop("Invalid genome build") - - structure(maf_df, - class = c("maf_data", "genomic_data", class(maf_df)), # "genomic_data" for generic methods - genome_build = genome_build) -} - -#' @export -#' @keywords internal -print.maf_data <- function(x, ...) { - cat("MAF Data Object\n") - cat("Genome Build:", attr(x, "genome_build"), "\n") - cat("Showing first 10 rows:\n") - # Convert to a plain data.frame (if not already) so that printing uses the default - # data.frame print method rather than printing as a list. - print(utils::head(as.data.frame(x), 10)) -} - - -#' Get Genome Build -#' -#' This function retrieves the genome build attribute from the data. -#' -#' @param data A data frame with genome build attribute. -#' @return A string specifying the genome build. -#' @export -#' @keywords internal -get_genome_build <- function(data) { - attr(data, "genome_build") -} - -#' Preserve Genomic Attributes -#' -#' This function preserves the genomic attributes and class after dplyr operations. -#' -#' @param new_data A data frame resulting from dplyr operations. -#' @param old_data The original data frame with genomic attributes. -#' @return A data frame with preserved genomic attributes. -#' @export -#' @keywords internal -preserve_genomic_attributes <- function(new_data, old_data) { - # Preserve the genome_build attribute - attr(new_data, "genome_build") <- attr(old_data, "genome_build") - - # Combine the new data’s classes with the genomic classes - new_data_classes <- class(new_data) - # Ensure the genomic classes are at the front - new_classes <- unique(c("maf_data", "genomic_data", new_data_classes)) - class(new_data) <- new_classes - - return(new_data) -} - -#' Strip Genomic Data Classes -#' -#' This function removes custom classes associated with genomic data objects -#' (by default, "genomic_data", "maf_data", and "bed_data") from the class attribute -#' of an object. This can be useful when you want to revert an S3 object to its -#' underlying data.frame (or data.table) classes without converting the object. -#' -#' @param x An object, such as one of your genomic data objects. -#' @param classes A character vector of class names to remove. The default is -#' c("genomic_data", "maf_data", "bed_data"). -#' @return The object with the specified classes removed. -#' @export -#' @keywords internal -strip_genomic_classes <- function(x, classes = c("genomic_data", "maf_data", "bed_data")) { - current_classes <- class(x) - new_classes <- setdiff(current_classes, classes) - class(x) <- new_classes - return(x) -} - - -# S3 methods for genomic_data class -#' @export -#' @keywords internal -mutate.genomic_data <- function(.data, ...) { - new_data <- dplyr::mutate(as.data.frame(.data), ...) - preserve_genomic_attributes(new_data, .data) -} -#' @export -#' @keywords internal -filter.genomic_data <- function(.data, ...) { - new_data <- dplyr::filter(as.data.frame(.data), ...) - preserve_genomic_attributes(new_data, .data) -} -#' @export -#' @keywords internal -select.genomic_data <- function(.data, ...) { - new_data <- dplyr::select(as.data.frame(.data), ...) - preserve_genomic_attributes(new_data, .data) -} -#' @export -#' @keywords internal -rename.genomic_data <- function(.data, ...) { - new_data <- dplyr::rename(as.data.frame(.data), ...) - preserve_genomic_attributes(new_data, .data) -} -#' @export -#' @keywords internal -arrange.genomic_data <- function(.data, ...) { - new_data <- dplyr::arrange(as.data.frame(.data), ...) - preserve_genomic_attributes(new_data, .data) -} -#' @export -#' @keywords internal -group_by.genomic_data <- function(.data, ..., .add = FALSE) { - new_data <- dplyr::group_by(as.data.frame(.data), ..., .add = .add) - preserve_genomic_attributes(new_data, .data) -} -#' @export -ungroup.genomic_data <- function(x, ...) { - new_data <- dplyr::ungroup(as.data.frame(x), ...) - preserve_genomic_attributes(new_data, x) -} - -#' Bind maf or other genomic data together -#' -#' @description Combine multiple maf_data objects and retain metadata such as genome_build. -#' This function will not allow you to combine maf_data objects that have different genome_build values. -#' An error will also be thrown if the same sample id is found in more than one of the inputs (if check_id is TRUE). -#' -#' @param ... All maf_data or seg_data objects to be combined. -#' @param check_id Logical. If TRUE (the default), the function will check for the presence of the expected ID column -#' and for duplicate sample IDs across the inputs. Set to FALSE to skip this check. -#' -#' @return data.frame with combined data and preserved genome_build metadata. -#' @export -#' -#' @examples -#' -#' merged_maf = bind_genomic_data(maf1, maf2,check_id=FALSE) -#' -#' @keywords internal -bind_genomic_data <- function(..., check_id = TRUE) { - - in_list <- list(...) - - if ("maf_data" %in% class(in_list[[1]])) { - # MAF format, ID column is Tumor_Sample_Barcode - id_col <- "Tumor_Sample_Barcode" - } else if ("seg_data" %in% class(in_list[[1]])) { - # SEG format, ID column is ID - id_col <- "ID" - } else { - stop(paste("Unsure how to merge:", class(in_list[[1]]))) - } - - # Ensure all inputs are either maf_data or seg_data objects - if (!all(sapply(in_list, inherits, "maf_data")) && - !all(sapply(in_list, inherits, "seg_data"))) { - stop("All inputs must be maf_data objects or seg_data objects.") - } - - # Extract genome builds - genome_builds <- unique(sapply(in_list, get_genome_build)) - - if (length(genome_builds) > 1) { - stop("Cannot bind seg_data or maf_data objects - with different genome builds: ", - paste(genome_builds, collapse = ", ")) - } - - # If check_id is TRUE, verify that the expected ID column exists and - # that IDs are unique. - if (check_id) { - # Collect unique sample IDs from each dataset - id_sets <- lapply(in_list, function(df) { - if (!(id_col %in% colnames(df))) { - stop("ID column '", id_col, "' not found in input data.") - } - unique(df[[id_col]]) - }) - - # Flatten the list and count occurrences of each ID - all_ids <- unlist(id_sets) - duplicate_ids <- names(table(all_ids)[table(all_ids) > 1]) - - # If any ID is found in multiple datasets, throw an error - if (length(duplicate_ids) > 0) { - stop("Duplicate IDs found in multiple input data frames: ", - paste(duplicate_ids, collapse = ", ")) - } - } - - combined <- dplyr::bind_rows(in_list) - attr(combined, "genome_build") <- genome_builds[1] -# Assign the common genome build - - if (!"maf_data" %in% class(combined)) { - class(combined) <- c("maf_data", "genomic_data", class(combined)) -# Preserve class - } - - return(combined) -} - - - -#' Create BED Data -#' -#' This function creates BED (Browser Extensible Data) objects from the given input. -#' It assumes that the BED data should have columns corresponding to chromosome, start, -#' and end. If the second and third columns are not numeric (as expected for start and end), -#' the function will attempt to identify the proper columns by matching column names. -#' -#' In the output, the first three columns will be renamed to "chrom", "start", and "end". -#' If a fourth column exists, it is renamed to "name" (and any additional columns are preserved). -#' -#' Additionally, if a "name" column exists and its values are not unique, the function -#' will warn the user. The user can optionally supply a method to automatically fix the -#' names via the `fix_names` argument: -#' -#' - If `fix_names = "chrom_start_end"`, the new name will be built as "chrom:start-end". -#' -#' - If `fix_names = "concat"`, then the columns specified by `concat_cols` (using the -#' original column names in the input data) will be concatenated to form the new name. -#' By default, no separator is used, but a separator can be specified via the `sep` -#' argument. -#' -#' After applying the fix, the function checks if the new names are unique. If they are not, -#' a warning is issued that includes up to five examples of duplicate names and the row numbers -#' where they occur. -#' -#' @param bed_df A data frame containing the BED data. -#' @param genome_build A string specifying the genome build -#' ("grch37" or "hg38"). -#' If NULL, the function will try to infer the genome build -#' from the object name. -#' @param fix_names Either NULL (the default), or one of "chrom_start_end" -#' or "concat". -#' If not NULL and duplicate names are detected, the function will -#' apply the chosen fix. -#' @param concat_cols When `fix_names = "concat"`, a character vector -#' specifying which columns -#' from the original data to merge. -#' @param sep The separator to use when concatenating columns if -#' fix_names = "concat". -#' Defaults to "" (no separator). -#' @return A data frame with class attributes for BED data. -#' -#' @export -#' -#' @examples -#' -#' # get a abed_data object for all aSHM regions -#' ashm_bed = create_bed_data(GAMBLR.data::grch37_ashm_regions, -#' fix_names = "concat", -#' concat_cols = c("gene","region"), -#' sep="-") -#' # the build is automatically inferred if it is in the variable name -#' get_genome_build(ashm_bed) -#' print(ashm_bed) -#' another_bed = create_bed_data(somatic_hypermutation_locations_GRCh37_v_latest, -#' fix_names = "concat", -#' concat_cols = c("chr_name","hg19_start","hg19_end")) -#' -#' get_genome_build(another_bed) -#' -#' # get a bed_data object for all gene regions and combine several columns to make a unique name -#' gene_regions <- create_bed_data(hg38_gene_coordinates, -#' fix_names = "concat", -#' sep="-", -#' concat_cols = c("chromosome","start","end","gene_name")) -#' -#' get_genome_build(gene_regions) -#' -create_bed_data <- function(bed_df, - genome_build = NULL, - fix_names = NULL, - concat_cols = NULL, - sep = "") { - # Check that input is a data frame. - if (!inherits(bed_df, "data.frame")) { - stop("Input data must be a data frame") - } - - # Capture the original data and column names (before any reordering or renaming) - orig_df <- bed_df - orig_names <- names(bed_df) - - # If genome_build is not provided, attempt to infer it from the object name. - if (is.null(genome_build)) { - object_name <- deparse(substitute(bed_df)) - possible_builds <- character(0) - - if (grepl("grch37", object_name, ignore.case = TRUE)) { - possible_builds <- c(possible_builds, "grch37") - } - if (grepl("hg38", object_name, ignore.case = TRUE)) { - possible_builds <- c(possible_builds, "hg38") - } - - if (length(possible_builds) == 1) { - genome_build <- possible_builds - } else if (length(possible_builds) == 0) { - stop("Could not determine genome build from object name; please supply genome_build argument.") - } else { - stop("Ambiguous genome build in object name; please supply genome_build argument explicitly.") - } - } - - # Validate genome build. - if (!genome_build %in% c("grch37", "hg38")) { - stop("Invalid genome build. Please choose either 'grch37' or 'hg38'.") - } - - # Helper function to force column naming for the BED data. - force_bed_column_names <- function(df) { - new_names <- names(df) - # Force first three columns to be "chrom", "start", "end" - new_names[1:3] <- c("chrom", "start", "end") - # If there's a fourth column, force it to "name" - if (ncol(df) >= 4) { - new_names[4] <- "name" - } - names(df) <- new_names - return(df) - } - - # Check if the first three columns (as supplied) are in the expected form. - # We expect columns 2 and 3 (start and end) to be numeric. - if (ncol(bed_df) >= 3 && is.numeric(bed_df[[2]]) && is.numeric(bed_df[[3]])) { - # The data is assumed to be in the correct order. - bed_df <- force_bed_column_names(bed_df) - } else { - # Attempt to guess the proper columns based on names. - names_lower <- tolower(names(bed_df)) - - chrom_idx <- which(names_lower %in% c("chrom", "chromosome")) - start_idx <- which(names_lower %in% c("start", "start_position", "startpos")) - end_idx <- which(names_lower %in% c("end", "end_position", "endpos")) - - if (length(chrom_idx) != 1 || length(start_idx) != 1 || length(end_idx) != 1) { - stop("Columns 2 and 3 (start and end) are not numeric and the chromosome/start/end columns ", - "cannot be unambiguously identified from the column names.") - } - - # Reorder the data frame so that the candidate columns come first. - remaining_idx <- setdiff(seq_len(ncol(bed_df)), c(chrom_idx, start_idx, end_idx)) - new_order <- c(chrom_idx, start_idx, end_idx, remaining_idx) - bed_df <- bed_df[, new_order, drop = FALSE] - - # After reordering, check that the new second and third columns are numeric. - if (!is.numeric(bed_df[[2]]) || !is.numeric(bed_df[[3]])) { - stop("After reordering based on column names, the start and end columns are not numeric.") - } - - # Force the first three (and optionally the fourth) column names. - bed_df <- force_bed_column_names(bed_df) - } - - # If a "name" column exists, check that its values are unique. - if (ncol(bed_df) >= 4) { - if (anyDuplicated(bed_df[[4]]) > 0) { - # If no fix is provided, issue a generic warning. - if (is.null(fix_names)) { - warning("The values in the 'name' column are not unique.") - } else { - # Apply the requested fix. - if (fix_names == "chrom_start_end") { - new_names_vec <- paste0(bed_df$chrom, ":", bed_df$start, "-", bed_df$end) - bed_df[[4]] <- new_names_vec - if (length(unique(new_names_vec)) != nrow(bed_df)) { - # Identify duplicate examples. - dup_idx <- which(duplicated(new_names_vec) | duplicated(new_names_vec, fromLast = TRUE)) - dup_names <- unique(new_names_vec[dup_idx]) - dup_info <- sapply(dup_names, function(nm) { - rows <- which(new_names_vec == nm) - paste0(nm, " (rows: ", paste(rows, collapse = ", "), ")") - }) - warning("The 'chrom_start_end' fix did not result in a unique set of names. Examples: ", - paste(dup_info[1:min(5, length(dup_info))], collapse = "; "), - ". Please review your data or consider an alternative fix.") - } - } else if (fix_names == "concat") { - if (is.null(concat_cols)) { - stop("For fix_names = 'concat', you must supply concat_cols indicating which columns to merge.") - } - if (!is.character(concat_cols)) { - stop("For fix_names = 'concat', concat_cols must be a character vector referring to the original column names.") - } - if (!all(concat_cols %in% orig_names)) { - stop("One or more column names specified in concat_cols do not exist in the original data.") - } - # Build new names using the original data. - # Use paste with the specified separator. - new_names_vec <- do.call(paste, c(orig_df[, concat_cols, drop = FALSE], sep = sep)) - bed_df[[4]] <- new_names_vec - if (length(unique(new_names_vec)) != nrow(bed_df)) { - dup_idx <- which(duplicated(new_names_vec) | duplicated(new_names_vec, fromLast = TRUE)) - dup_names <- unique(new_names_vec[dup_idx]) - dup_info <- sapply(dup_names, function(nm) { - rows <- which(new_names_vec == nm) - paste0(nm, " (rows: ", paste(rows, collapse = ", "), ")") - }) - warning("The 'concat' fix did not result in a unique set of names. Examples: ", - paste(dup_info[1:min(5, length(dup_info))], collapse = "; "), - ". Please review your data or consider an alternative fix.") - } - } else { - stop("Invalid value for fix_names. Use 'chrom_start_end' or 'concat'.") - } - } - } - } - # enforce strict matching of chr prefixing - if(genome_build == "grch37"){ - if(any(grepl("chr",bed_df$chrom))){ - bed_df = mutate(bed_df,chrom = gsub("chr", "", chrom)) - } - } - # Create the S3 object with additional class attributes and genome_build attribute. - structure(bed_df, - class = c("bed_data", "genomic_data", class(bed_df)), - genome_build = genome_build) -} - -#' @export -print.bed_data <- function(x, ...) { - cat("BED Data Object\n") - cat("Genome Build:", attr(x, "genome_build"), "\n") - cat("Showing first 10 rows:\n") - # Convert to a plain data.frame (if not already) so that printing uses the default - # data.frame print method rather than printing as a list. - print(utils::head(as.data.frame(x), 10)) -} - - diff --git a/R/get_ashm_count_matrix.R b/R/get_ashm_count_matrix.R deleted file mode 100644 index 10b7d81..0000000 --- a/R/get_ashm_count_matrix.R +++ /dev/null @@ -1,133 +0,0 @@ -#' @title Get ASHM Count Matrix. -#' -#' @description Prepare a matrix with one row per sample and one column per -#' region using a set of hypermutated regions. -#' -#' @details Values are the number of mutations in that patient in the region. -#' -#' @param regions_bed A bed file with one row for each region. -#' @param these_samples_metadata This is used to complete your matrix. All GAMBL -#' samples will be used by default. Provide a data frame with at least -#' sample_id for all samples if you are using non-GAMBL data. -#' @param this_seq_type The seq type to return results for. Only used if no -#' metadata is provided with these_samples_metadata. -#' @param projection Which genome build to use for the mutations -#' (must match the coordinate system your regions to avoid a nonsense result) -#' -#' @return matrix -#' -#' @import dplyr tibble -#' @export -#' -#' @examples -#' regions_bed = create_bed_data(GAMBLR.data::grch37_ashm_regions, -#' fix_names="concat", -#' concat_cols=c("gene","region"), -#' sep="-") -#' my_meta = get_gambl_metadata() %>% dplyr::filter(pathology=="DLBCL") -#' matrix <- get_ashm_count_matrix( -#' regions_bed = regions_bed, -#' this_seq_type = "genome" -#' ) -#' -#' #this example should fail because the regions_bed is not hg38 -#' matrix <- get_ashm_count_matrix(regions_bed=regions_bed, -#' this_seq_type = "genome", -#' these_samples_metadata = my_meta, -#' projection = "hg38") -#' # Error in get_ashm_count_matrix( -#' # Your projection argument does not match the genome_build of regions_bed -#' -#' # format the name column to include the coordinates instead of the gene -#' regions_bed = create_bed_data(GAMBLR.data::hg38_ashm_regions, -#' fix_names="concat", -#' concat_cols=c("chr_name","hg38_start","hg38_end"), -#' sep="-") -#' -#' matrix_hg38 <- get_ashm_count_matrix(regions_bed=regions_bed, -#' this_seq_type = "genome", -#' these_samples_metadata = my_meta, -#' projection = "hg38") -#' print(dim(matrix_hg38)) -#' print(head(matrix_hg38[,c(1:8)])) -get_ashm_count_matrix = function( - regions_bed, - these_samples_metadata, - this_seq_type, - projection = "grch37" -) { - if (missing(this_seq_type)) { - if (missing(these_samples_metadata)) { - stop("Please supply either the this_seq_type or a - metadata from which it can be retrieved") - } - this_seq_type <- these_samples_metadata %>% - pull(seq_type) %>% - unique() - } - - if (missing(regions_bed)){ - message( - "Using aSHM regions in grch37 genome_build as regions_bed" - ) - if (projection=="grch37"){ - regions_bed <- GAMBLR.data::grch37_ashm_regions %>% - mutate(name = paste(gene, region, sep = "_")) %>% - create_bed_data(genome_build = projection) - } else if(projection=="hg38") { - regions_bed <- GAMBLR.data::hg38_ashm_regions %>% - mutate(name = paste(gene, region, sep = "_")) %>% - create_bed_data(genome_build = projection) - }else{ - stop(paste("unsupported genome build",projection)) - } - }else { - if ("bed_data" %in% class(regions_bed)) { - if(!get_genome_build(regions_bed)==projection) { - stop(paste("Your genome_build argument does not match the genome_build of regions_bed",get_genome_build(regions_bed),genome_build)) - } - } - } - - if (missing(these_samples_metadata)){ - all_meta <- get_gambl_metadata(seq_type_filter=this_seq_type) %>% - dplyr::select(sample_id) - }else { - all_meta <- these_samples_metadata %>% - dplyr::select(sample_id) - } - - ashm_maf <- get_ssm_by_regions( - regions_bed = regions_bed, - streamlined = TRUE, - these_samples_metadata = these_samples_metadata, - use_name_column = TRUE, - projection = projection - ) - # Not sure why this was necessary. Possibly because it's also a data.table? - ashm_maf = strip_genomic_classes(ashm_maf) - - ashm_counted <- ashm_maf %>% - group_by(sample_id, region) %>% - tally() - - - #fill out all combinations so we can get the cases with zero mutations - eg <- expand_grid( - sample_id = pull(all_meta, sample_id), - region = unique(ashm_counted$region) - ) - all_counts <- left_join(eg, ashm_counted) %>% - mutate(n = replace_na(n, 0)) %>% - unique() #not sure where the duplicates are coming from but its annoying - - all_counts_wide <- pivot_wider( - all_counts, - id_cols = sample_id, - names_from = region, - values_from = n - ) %>% - column_to_rownames(var = "sample_id") - - return(all_counts_wide) -} diff --git a/R/get_cn_segments.R b/R/get_cn_segments.R deleted file mode 100644 index 18174e9..0000000 --- a/R/get_cn_segments.R +++ /dev/null @@ -1,73 +0,0 @@ -#' @title Get CN Segments. -#' -#' @description Retrieve all copy number segments from the GAMBL outputs -#' -#' @details This function merely loads and returns all the seg_data -#' available for a projection (genome build) -#' @param these_samples_metadata User must provide a metadata table to -#' restrict the data to the samples in your table. -#' The metadata also ensures the proper handling of duplicate sample_id -#' across seq_types and ensures the seq_type in the metadata faithfully -#' represents the seq_type of the data -#' @param projection Desired genome coordinate system for returned CN segments. -#' Default is "grch37". -#' @param this_seq_type Deprecated. -#' @param ... Additional parameters to be passed to the function. -#' -#' @return A data frame with CN segments for the specified region. -#' -#' @import dplyr -#' @export -#' -#' @examples -#' # Example for the capture samples: -#' -#' genome_metadata = get_gambl_metadata(seq_type_filter="genome") -#' -#' genome_segments_hg38 = get_cn_segments( -#' these_samples_metadata = genome_metadata, -#' projection="hg38") -#' -#' -get_cn_segments = function(these_samples_metadata, - projection = "grch37", - this_seq_type, ...) { - #warn/notify the user what version of this function they are using - message("Using the bundled CN segments (.seg) calls in GAMBLR.data...") - - #check if any invalid parameters are provided - check_excess_params(...) - - #get valid projections - valid_projections = grep("meta", names(GAMBLR.data::sample_data), - value = TRUE, invert = TRUE) - - metadata = these_samples_metadata - - sample_ids = metadata$sample_id - #return CN segments based on the selected projection - if (projection %in% valid_projections) { - all_segs = GAMBLR.data::sample_data[[projection]]$seg %>% - dplyr::filter(ID %in% sample_ids) - }else { - stop(paste("please provide a valid projection.", - paste(valid_projections, collapse = ", "))) - } - - #ensure chr prefixes are there when necessary - if(projection == "grch37") { - if(grepl("chr", all_segs$chrom[1])) { - all_segs = all_segs %>% - dplyr::mutate(chrom = gsub("chr", "", chrom)) - } - }else { - if (!grepl("chr",all_segs$chrom[1])) { - all_segs = all_segs %>% - dplyr::mutate(chrom = paste0("chr", chrom)) - } - } - - #return S3 class with CN segments and genome_build - all_segs = create_seg_data(all_segs, projection) - return(all_segs) -} diff --git a/R/get_manta_sv.R b/R/get_manta_sv.R deleted file mode 100644 index dd9f697..0000000 --- a/R/get_manta_sv.R +++ /dev/null @@ -1,164 +0,0 @@ -#' @title Get Manta SVs -#' -#' @description Convenience function for retrieving Manta Structural Variants (SVs) from the bundled data [GAMBLR.data::sample_data]. -#' -#' @details To obtain SV calls for multiple samples, give `these_sample_ids` a vector of sample IDs. -#' Alternatively, the user can also provide the `these_samples_metadata` parameter to make use of an already subset metadata table. -#' In this case, the returned SVs will be restricted to the sample_ids within that data frame. -#' This function internally calls [GAMBLR.data::id_ease] to streamline sample ID/metadata parameters. -#' This function can also restrict the returned calls to any genomic regions specified within `chromosome`, `qstart`, `qend`, -#' or the complete region specified under `region` (in chr:start-end format), note that chromosome can be either prefixed or not prefixed. -#' Useful filtering parameters are also available, use `min_vaf` to set the minimum tumour VAF for a SV to be returned and `min_score` -#' to set the lowest Manta somatic score for a SV to be returned. `pair_status` can be used to return variants from either matched or unmatched samples. -#' In addition, the user can chose to return all variants, even the ones not passing the filter criteria. To do so, set `pass = FALSE` (default is TRUE). -#' -#' @param these_sample_ids Optional, a vector of multiple sample_id (or a single sample ID as a string) that you want results for. -#' @param these_samples_metadata Optional, a metadata table (with sample IDs in a column) to subset the return to. -#' If not provided (and if `these_sample_ids` is not provided), the function will return all samples from the specified seq_type in the metadata. -#' @param projection The projection genome build. Default is grch37. -#' @param this_seq_type The this_seq_type you want back, default is genome. -#' @param chromosome Optional, the chromosome you are restricting to (can be prefixed or not prefixed). -#' @param qstart Optional, query start coordinate of the range you are restricting to. -#' @param qend Optional, query end coordinate of the range you are restricting to. -#' @param region Optional, region formatted like chrX:1234-5678 (chromosome can be prefixed or not prefixed) instead of specifying chromosome, start and end separately. -#' @param pairing_status Use to restrict results (if desired) to matched or unmatched results (default is to return all). This parameter takes the filtering condition as a string ("matched" or "unmatched"). -#' @param min_vaf The minimum tumour VAF for a SV to be returned. Default is 0.1. -#' @param min_score The lowest Manta somatic score for a SV to be returned. Default is 40. -#' @param pass If TRUE (default) only return SVs that are annotated with PASS in the FILTER column. Set to FALSE to keep all variants, regardless if they PASS the filters. -#' @param verbose Set to FALSE to minimize the output to console. Default is TRUE. This parameter also dictates the verbosity of any helper function internally called inside the main function. -#' @param ... Any additional parameters. -#' -#' @export -#' -#' @import dplyr -#' -#' @examples -#' #load packages -#' library(dplyr) -#' -#' #lazily get every SV in the table with default quality filters -#' all_sv = get_manta_sv() -#' -#' #get all SVs DLBCL cell line samples -#' cell_line_meta = GAMBLR.data::sample_data$meta %>% -#' dplyr::filter(cohort == "DLBCL_cell_lines") -#' -#' dlbcl_sv = get_manta_sv(these_samples_metadata = cell_line_meta) -#' -#' #get the SVs in a region around MYC -#' myc_locus_sv = get_manta_sv(region = "8:128723128-128774067") -#' -get_manta_sv = function(these_sample_ids = NULL, - these_samples_metadata = NULL, - projection = "grch37", - this_seq_type = "genome", - chromosome, - qstart, - qend, - region, - pairing_status, - min_vaf = 0.1, - min_score = 40, - pass = TRUE, - verbose = FALSE, - ...){ - - #warn/notify the user what version of this function they are using - message("Using the bundled Manta SV (.bedpe) calls in GAMBLR.data...") - - #check if any invalid parameters are provided - check_excess_params(...) - - #get valid projections - valid_projections = grep("meta", names(GAMBLR.data::sample_data), value = TRUE, invert = TRUE) - - #get samples with the dedicated helper function - metadata = id_ease(these_samples_metadata = these_samples_metadata, - these_sample_ids = these_sample_ids, - verbose = verbose, - this_seq_type = this_seq_type) - - sample_ids = metadata$sample_id - - #return manta SV based on the selected projection - if(projection %in% valid_projections){ - manta_sv = GAMBLR.data::sample_data[[projection]]$bedpe %>% - dplyr::filter(tumour_sample_id %in% sample_ids) - }else{ - stop(paste("please provide a valid projection. The following are available:", - paste(valid_projections,collapse=", "))) - } - - if(!missing(region)){ - region = gsub(",", "", region) - split_chunks = unlist(strsplit(region, ":")) - chromosome = split_chunks[1] - startend = unlist(strsplit(split_chunks[2], "-")) - qstart = startend[1] - qend = startend[2] - } - - manta_sv = manta_sv %>% - dplyr::filter(VAF_tumour >= min_vaf, - SCORE >= min_score) - - if(verbose){ - no_manta = setdiff(metadata$sample_id, manta_sv$tumour_sample_id) - - if(length(no_manta) > 0){ - message(paste0("No Manta results found for ", length(no_manta), " samples...")) - print(no_manta) - } - } - - #deal with chr prefixes based on the selected projection (if return is to be subset to regions...) - if(!missing(region) || !missing(chromosome)){ - if(projection == "grch37"){ - if(grepl("chr", chromosome)){ - chromosome = gsub("chr", "", chromosome) - } - }else if(projection == "hg38"){ - if(!grepl("chr", chromosome)){ - chromosome = paste0("chr", chromosome) - } - } - - manta_sv = manta_sv %>% - dplyr::filter((CHROM_A == chromosome & START_A >= qstart & START_A <= qend) | (CHROM_B == chromosome & START_B >= qstart & START_B <= qend)) - } - - if(verbose){ - message("\nThe following VCF filters are applied;") - message(paste0(" Minimum VAF: ", min_vaf)) - message(paste0(" Minimum Score: ", min_score)) - message(paste0(" Only keep variants passing the quality filter: ", pass)) - } - - #PASS filter - if(pass){ - manta_sv = manta_sv %>% - dplyr::filter(FILTER == "PASS") - } - - #pairing status filter - if(!missing(pairing_status)){ - if(verbose){ - message(paste0(" Pairing status: ", pairing_status)) - } - - manta_sv = manta_sv %>% - dplyr::filter(pair_status == pairing_status) - } - - #convert to data frame and print some metrics - manta_sv = as.data.frame(manta_sv) - - if(verbose){ - n_variants = nrow(manta_sv) - unique_samples = unique(manta_sv$tumour_sample_id) - message(paste0("\nReturning ", n_variants, " variants from ", length(unique_samples), " sample(s)")) - message("\nDone!") - } - - return(manta_sv) -} diff --git a/R/get_ssm_by_patients.R b/R/get_ssm_by_patients.R deleted file mode 100644 index 817e1fe..0000000 --- a/R/get_ssm_by_patients.R +++ /dev/null @@ -1,88 +0,0 @@ -#' @title Get SSM By Patients. -#' -#' @description Get MAF-format data frame for more than one patient. -#' -#' @details This function returns variants from a set of patients. -#' This function internally calls [GAMBLR.data::get_ssm_by_samples]. -#' Thus, the main contents of this function is to wrangle the provided patient IDs, -#' so that the corresponding sample IDs can be provided to the internal call of `get_ssm_by_samples`. -#' This function expects either a vector of patient IDs (`these_patients_ids`) -#' or an already subset metadata table (`these_samples_metadata`). -#' -#' @param these_patient_ids A vector of patient IDs that you want results for. -#' The user can also use a metadata table that has been subset to the patient IDs of interest (see `these_samples_metadata`). -#' @param these_samples_metadata A metadata subset to contain the rows corresponding to the patients of interest. -#' If the vector of patient IDs is missing (`these_patient_ids`), this function will default to all patient IDs in the metadata table given to this parameter. -#' @param projection Obtain variants projected to this reference (one of grch37 or hg38). Default is grch37. -#' @param this_seq_type The seq type you want results for. Default is "genome". -#' @param tool_name Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers. -#' @param this_study Optionally specify first name of the author for the paper -#' from which the variants should be returned for. -#' This parameter can either be a vector of indexes (integer) or a vector of characters (matching columns in MAF). -#' @param verbose Set to FALSE to minimize the output to console. Default is TRUE. This parameter also dictates the verbosity of any helper function internally called inside the main function. -#' @param ... Any additional parameters. -#' -#' @return A data frame with SSM calls for the selected patients in MAF format. -#' -#' @import dplyr -#' -#' @export -#' -#' @examples -#' -#' # Lets find which patient_id occur more than once in the metadata first -#' my_ids = get_gambl_metadata(seq_type_filter = c("genome","capture")) %>% -#' dplyr::group_by(patient_id) %>% -#' dplyr::tally() %>% -#' dplyr::filter(n>1) %>% -#' dplyr::pull(patient_id) -#' -#' #now let's get every SSM for all samples from these patients -#' patient_maf = get_ssm_by_patients(these_patient_ids = my_ids) -#' patient_maf %>% dplyr::group_by(Tumor_Sample_Barcode) %>% -#' dplyr::count() %>% head() -#' -get_ssm_by_patients = function(these_patient_ids, - these_samples_metadata, - projection = "grch37", - this_seq_type = "genome", - tool_name = "slms-3", - this_study, - verbose = FALSE, - ...) { - - #check if any invalid parameters are provided - check_excess_params(...) - - #figure out what patients the user wants - if(missing(these_patient_ids)) { - if(missing(these_samples_metadata)) { - stop("You must provide patient IDs (`these_patient_ids`)or a metadata - table with the patient IDs of interest (`these_samples_metadata`)...") - }else{ - message("No patient IDs were provided, this function will resort to - all available patient IDs in the provided metadata.") - } - }else{ - if(missing(these_samples_metadata)){ - these_samples_metadata = GAMBLR.data::get_gambl_metadata(seq_type_filter = - this_seq_type) - } - message("Patient IDs and metadata were provided, this function will resort to all available patient IDs in the provided metadata.") - these_samples_metadata = these_samples_metadata %>% - dplyr::filter(patient_id %in% these_patient_ids) - } - - #run get_ssm_by_samples with these_samples_metadata parameter - samples_ssm = get_ssm_by_samples(these_samples_metadata = these_samples_metadata, - projection = projection, - this_seq_type = this_seq_type, - tool_name = tool_name, - verbose = verbose, - ...) - - samples_ssm = create_maf_data(samples_ssm,projection) - # use S3-safe version of dplyr function - - samples_ssm = mutate.genomic_data(samples_ssm,maf_seq_type = this_seq_type) -} diff --git a/R/get_ssm_by_region.R b/R/get_ssm_by_region.R deleted file mode 100644 index 34a6f65..0000000 --- a/R/get_ssm_by_region.R +++ /dev/null @@ -1,138 +0,0 @@ -#' @title Get SSM By Region. -#' -#' @description Retrieve all SSMs from the GAMBL database within a single genomic coordinate range. -#' -#' @details This function lets the user specify a region of interest for returning SSM calls within that region. -#' There are multiple ways a region can be specified. For example, the user can provide the full region in a "region" format (chr:start-end) to the `region` parameter. -#' Or, the user can provide chromosome, start and end coordinates individually with `chr`, `start`, and `end` parameters. -#' -#' @param these_sample_ids Optional, a vector of multiple sample_id (or a single sample ID as a string) that you want results for. -#' @param these_samples_metadata Optional, a metadata table (with sample IDs in a column) to subset the return to. -#' If not provided (and if `these_sample_ids` is not provided), the function will return all samples from the specified seq_type in the metadata. -#' @param maf_data Optional data frame with mutations in MAF format. -#' If user provides a maf, the function trusts that the user has already subset this to samples of interest, correct seq_type. -#' i.e the following parameters are ignored; `these_samples_metadata`, `these_sample_ids`, and `this_seq_type` -#' @param chromosome The chromosome you are restricting to (with or without a chr prefix). -#' @param qstart Query start coordinate of the range you are restricting to. -#' @param qend Query end coordinate of the range you are restricting to. -#' @param region Region formatted like chrX:1234-5678 instead of specifying chromosome, start and end separately. -#' @param streamlined Return Start_Position and Tumor_Smaple_Barcode as the only two MAF columns. Default is FALSE. -#' @param projection Obtain variants projected to this reference (one of grch37 or hg38). -#' @param this_seq_type The seq_type you want back, default is genome. -#' @param tool_name Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers. -#' @param this_study Optionally specify first name of the author for the paper -#' from which the variants should be returned for. -#' @param verbose Set to FALSE to prevent ANY message to be printed. -#' In most cases, this parameter should be left to TRUE. -#' The parameter was added to accommodate for noisy output -#' when running this function in a loop for retrieving SSM -#' for multiple regions [GAMBLR.data::get_ssm_by_regions]. -#' @param ... Any additional parameters. -#' -#' @return A data frame containing all mutations (MAF) in the specified region. -#' -#' @import dplyr -#' -#' @examples -#' my_mutations = get_ssm_by_region(region = "chr8:128,723,128-128,774,067") -#' -#' #specifying chromosome, start and end individually -#' my_mutations = get_ssm_by_region(chromosome = "8", -#' qstart = 128723128, -#' qend = 128774067) -#' -get_ssm_by_region = function(these_sample_ids = NULL, - these_samples_metadata = NULL, - maf_data, - chromosome, - qstart, - qend, - region = "", - streamlined = FALSE, - projection = "grch37", - this_seq_type = "genome", - tool_name = "slms-3", - this_study, - verbose = FALSE, - ...){ - - if(verbose){ - if(missing(maf_data)){ - #warn/notify the user what version of this function they are using - message("Using the bundled SSM calls (.maf) calls in GAMBLR.data...") - } - } - - #check if any invalid parameters are provided - check_excess_params(...) - - #get samples with the dedicated helper function - metadata = id_ease(these_samples_metadata = these_samples_metadata, - these_sample_ids = these_sample_ids, - verbose = verbose, - this_seq_type = this_seq_type) - - sample_ids = metadata$sample_id - - - - # Optionally return variants from a particular study - if(!missing(this_study)){ - this_maf <- this_maf %>% - dplyr::filter((!!sym("Study")) == this_study) - } - - #split region into chunks (chr, start, end) and deal with chr prefixes based on the selected projection - if(length(region) > 1){ - stop("You are providing more than one region, please refer to get_ssm_by_regions for multiple regions...") - } - - if(!region == ""){ - region = gsub(",", "", region) - split_chunks = unlist(strsplit(region, ":")) - - chromosome = split_chunks[1] - startend = unlist(strsplit(split_chunks[2], "-")) - qstart = as.numeric(startend[1]) - qend = as.numeric(startend[2]) - }else{ - if(projection =="grch37"){ - chromosome = gsub("chr", "", chromosome) - } - region = paste0(chromosome, ":", qstart, "-", qend) - } - - if(projection == "grch37"){ - chromosome = gsub("chr", "", chromosome) - } - - #return SSMs based on the selected projection - if(missing(maf_data)){ - # Filter by position on-the-fly to avoid wastefully building the same large MAF each time - this_maf = GAMBLR.data::sample_data[[projection]]$maf %>% - dplyr::filter(Chromosome == chromosome & Start_Position > qstart & Start_Position < qend) %>% - dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter((tolower(!!sym("Pipeline")) == tool_name)) - muts_region <- GAMBLR.data::sample_data[[projection]]$ashm %>% - dplyr::filter(Chromosome == chromosome & Start_Position > qstart & Start_Position < qend) %>% - dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter((tolower(!!sym("Pipeline")) == tool_name)) %>% - bind_rows(this_maf, .) - }else{ - muts_region = dplyr::filter(maf_data, Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter(Chromosome == chromosome & Start_Position > qstart & Start_Position < qend) - } - - # Handle possible duplicates - muts_region <- muts_region %>% - distinct(Tumor_Sample_Barcode, Chromosome, Start_Position, End_Position, .keep_all = TRUE) - - if(streamlined){ - muts_region = muts_region %>% - dplyr::select(Start_Position, Tumor_Sample_Barcode) - } - muts_region = create_maf_data(muts_region,projection) - # use S3-safe version of dplyr function - muts_region = mutate.genomic_data(muts_region,maf_seq_type = this_seq_type) - return(muts_region) -} diff --git a/R/get_ssm_by_samples.R b/R/get_ssm_by_samples.R deleted file mode 100644 index 2aeb5fb..0000000 --- a/R/get_ssm_by_samples.R +++ /dev/null @@ -1,102 +0,0 @@ -#' @title Get SSM By Samples. -#' -#' @description Get the SSMs (i.e. load MAF) for a single sample or a -#' collection of samples. -#' -#' @details Retrieve a maf for a specific sample or a set of samples. -#' Either specify the sample IDs of interest with `these_sample_ids`. -#' Or a metadata table subset to the sample IDs of interest with -#' `these_samples_metadata`. -#' -#' @param these_sample_ids A vector of one or more sample IDs that you -#' want results for. -#' @param these_samples_metadata Optional, a metadata table (with sample_id -#' column) to auto-subset the data to samples in that table before returning. -#' If not provided and these_sample_ids is also not provided, the function will -#' return SSM for all samples from the specified seq_type in the bundled -#' metadata. -#' @param this_seq_type Default is genome. -#' @param projection The projection genome build. Supports hg38 and grch37. -#' @param tool_name Optionally specify which tool to report variant from. -#' The default is slms-3, also supports "publication" to return the exact -#' variants as reported in the original papers. -#' @param verbose Enable for debugging/noisier output. -#' @param ... Any additional parameters. -#' -#' @return data frame in MAF format. -#' -#' @import dplyr -#' -#' @export -#' -#' @examples -#' -#' #Get genome-wide set of mutations from all DLBCL cell lines -#' -#' # 1. get our metadata for the DLBCL cell lines -#' cell_line_meta = get_gambl_metadata() %>% -#' dplyr::filter(cohort == "DLBCL_cell_lines") -#' -#' # 2. get the SSMs for the DLBCL cell lines -#' dlbcl_maf = get_ssm_by_samples(these_samples_metadata = cell_line_meta) -#' -#' # 3. have a look: -#' dlbcl_maf %>% dplyr::group_by(Tumor_Sample_Barcode) %>% -#' dplyr::count() -#' -get_ssm_by_samples <- function(these_sample_ids = NULL, - these_samples_metadata = NULL, - this_seq_type = "genome", - projection = "grch37", - tool_name = "slms-3", - verbose = FALSE, - ...) { - - #warn/notify the user what version of this function they are using - message("Using the bundled SSM calls (.maf) calls in GAMBLR.data...") - - #check if any invalid parameters are provided - check_excess_params(...) - - #get samples with the dedicated helper function - metadata = id_ease(these_samples_metadata = these_samples_metadata, - these_sample_ids = these_sample_ids, - verbose = verbose, - this_seq_type = this_seq_type) - - sample_ids = metadata$sample_id - - #get valid projections - valid_projections = grep("meta", names(GAMBLR.data::sample_data), - value = TRUE, invert = TRUE) - - #return SSMs based on the selected projection - if(projection %in% valid_projections) { - sample_ssm = GAMBLR.data::sample_data[[projection]]$maf %>% - dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter((tolower(!!sym("Pipeline")) == tool_name)) - sample_ssm <- bind_rows(sample_ssm, - GAMBLR.data::sample_data[[projection]]$ashm %>% - dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter((tolower(!!sym("Pipeline")) == tool_name)) - ) - - }else { - stop(paste("please provide a valid projection. Available options:", - paste(valid_projections,collapse=", "))) - } - - - # Handle possible duplicates - sample_ssm <- sample_ssm %>% - distinct(Tumor_Sample_Barcode, - Chromosome, - Start_Position, - End_Position, - .keep_all = TRUE) - # bundle genome_build with the maf_data - sample_ssm = create_maf_data(sample_ssm,projection) - # use S3-safe version of dplyr function - sample_ssm = mutate.genomic_data(sample_ssm,maf_seq_type = this_seq_type) - return(sample_ssm) -} From 68958dd3b300a3d2fe023764d1e5d137fa145b14 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Fri, 7 Feb 2025 22:42:03 -0800 Subject: [PATCH 16/19] document --- NAMESPACE | 28 ------ man/annotate_hotspots.Rd | 38 -------- man/assign_cn_to_ssm.Rd | 92 ------------------- man/bind_genomic_data.Rd | 28 ------ man/calc_mutation_frequency_bin_region.Rd | 87 ------------------ man/calc_mutation_frequency_bin_regions.Rd | 102 --------------------- man/check_excess_params.Rd | 23 ----- man/check_get_projection.Rd | 44 --------- man/create_bed_data.Rd | 88 ------------------ man/create_maf_data.Rd | 19 ---- man/create_seg_data.Rd | 24 ----- man/get_ashm_count_matrix.Rd | 68 -------------- man/get_cn_segments.Rd | 48 ---------- man/get_genome_build.Rd | 18 ---- man/get_manta_sv.Rd | 84 ----------------- man/get_ssm_by_patients.Rd | 67 -------------- man/get_ssm_by_region.Rd | 80 ---------------- man/get_ssm_by_samples.Rd | 67 -------------- man/preserve_genomic_attributes.Rd | 20 ---- man/strip_genomic_classes.Rd | 24 ----- 20 files changed, 1049 deletions(-) delete mode 100644 man/annotate_hotspots.Rd delete mode 100644 man/assign_cn_to_ssm.Rd delete mode 100644 man/bind_genomic_data.Rd delete mode 100644 man/calc_mutation_frequency_bin_region.Rd delete mode 100644 man/calc_mutation_frequency_bin_regions.Rd delete mode 100644 man/check_excess_params.Rd delete mode 100644 man/check_get_projection.Rd delete mode 100644 man/create_bed_data.Rd delete mode 100644 man/create_maf_data.Rd delete mode 100644 man/create_seg_data.Rd delete mode 100644 man/get_ashm_count_matrix.Rd delete mode 100644 man/get_cn_segments.Rd delete mode 100644 man/get_genome_build.Rd delete mode 100644 man/get_manta_sv.Rd delete mode 100644 man/get_ssm_by_patients.Rd delete mode 100644 man/get_ssm_by_region.Rd delete mode 100644 man/get_ssm_by_samples.Rd delete mode 100644 man/preserve_genomic_attributes.Rd delete mode 100644 man/strip_genomic_classes.Rd diff --git a/NAMESPACE b/NAMESPACE index 9c0a5a9..ed7b4cb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,49 +1,21 @@ # Generated by roxygen2: do not edit by hand -S3method(arrange,genomic_data) -S3method(filter,genomic_data) -S3method(group_by,genomic_data) -S3method(mutate,genomic_data) -S3method(print,bed_data) -S3method(print,maf_data) -S3method(rename,genomic_data) -S3method(select,genomic_data) -S3method(ungroup,genomic_data) export("%>%") -export(annotate_hotspots) -export(assign_cn_to_ssm) -export(bind_genomic_data) -export(calc_mutation_frequency_bin_region) -export(calc_mutation_frequency_bin_regions) -export(check_excess_params) -export(check_get_projection) export(collate_results) export(cool_overlaps) -export(create_bed_data) -export(create_maf_data) -export(create_seg_data) -export(get_ashm_count_matrix) -export(get_cn_segments) export(get_coding_ssm) export(get_coding_ssm_status) export(get_colours) export(get_gambl_metadata) export(get_genes) -export(get_genome_build) -export(get_manta_sv) export(get_mapped_colours) -export(get_ssm_by_patients) export(get_ssm_by_regions) -export(get_ssm_by_samples) export(id_ease) -export(preserve_genomic_attributes) export(process_regions) export(region_to_chunks) export(review_hotspots) -export(strip_genomic_classes) import(dplyr) import(ggplot2) -import(parallel) import(purrr) import(tibble) import(tidyr) diff --git a/man/annotate_hotspots.Rd b/man/annotate_hotspots.Rd deleted file mode 100644 index 04936a9..0000000 --- a/man/annotate_hotspots.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/annotate_hotspots.R -\name{annotate_hotspots} -\alias{annotate_hotspots} -\title{Annotate Hotspots.} -\usage{ -annotate_hotspots(mutation_maf, ...) -} -\arguments{ -\item{mutation_maf}{A data frame in MAF format.} - -\item{...}{Any other parameter. These parameters will be ignored.} -} -\value{ -The same data frame with one additional column "hot_spot". -} -\description{ -Annotate MAF-like data frome with a hot_spot column indicating -recurrent mutations. -} -\details{ -This function takes an already loaded MAF data frame with the -\code{mutation_maf} parameter. -} -\examples{ -my_metadata = get_gambl_metadata() -all_coding_ssm = get_coding_ssm(these_samples_metadata = my_metadata, - projection = "grch37", - this_seq_type = "genome") \%>\% - dplyr::filter(Hugo_Symbol \%in\% c("EZH2", - "MEF2B","MYD88","KMT2D")) \%>\% - dplyr::arrange(Hugo_Symbol) - -hot_ssms = annotate_hotspots(all_coding_ssm) -hot_ssms \%>\% dplyr::filter(!is.na(hot_spot)) \%>\% - dplyr::select(1:5,37,hot_spot) - -} diff --git a/man/assign_cn_to_ssm.Rd b/man/assign_cn_to_ssm.Rd deleted file mode 100644 index 2c44aed..0000000 --- a/man/assign_cn_to_ssm.Rd +++ /dev/null @@ -1,92 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/assign_cn_to_ssm.R -\name{assign_cn_to_ssm} -\alias{assign_cn_to_ssm} -\title{Assign CN to SSM.} -\usage{ -assign_cn_to_ssm( - these_samples_metadata, - maf_data, - seg_data, - projection, - coding_only = FALSE, - assume_diploid = FALSE, - include_silent = FALSE, - ... -) -} -\arguments{ -\item{these_samples_metadata}{Metadata table with one or more rows to specify -the samples to process.} - -\item{maf_data}{A data frame of mutations in MAF format or maf_data object -(e.g. from \code{get_coding_ssm} or \code{get_ssm_by_sample}).} - -\item{seg_data}{A data frame of segmented copy number data or seg_data object} - -\item{projection}{Specified genome projection that returned data is relative to. -This is only required when it cannot be inferred from maf_df or seg_df -(or they are not provided).} - -\item{coding_only}{Optional. Set to TRUE to restrict to only variants in coding space -Default is to work with genome-wide variants.} - -\item{assume_diploid}{Optional, this parameter annotates every mutation as -copy neutral. Default is FALSE.} - -\item{include_silent}{Logical parameter indicating whether to include silent -mutations in coding space. Default is FALSE. This parameter only -makes sense if \code{coding_only} is set to TRUE.} - -\item{...}{Any additional parameters.} -} -\value{ -A list containing a data frame (MAF-like format) with three extra -columns: -- log.ratio is the log ratio from the seg file (NA when no overlap). -- LOH -- CN (the rounded absolute copy number estimate of the region based on -log.ratio, NA when no overlap was found). -} -\description{ -Annotate mutations with their copy number information. -} -\details{ -This function takes a metadata table and returns all mutations -for the samples in that metadata. Each mutation is annotated with the -local copy number state of each mutated site. The user can specify if -only coding mutations are of interest. To do so, -set \code{coding_only = TRUE}. When necessary, this function relies on -\code{get_ssm_by_samples} and \code{get_cn_segments} to obtain the required data. -} -\examples{ -# long-handed way -# 1. get some metadata for a collection of samples -some_meta = get_gambl_metadata() \%>\% - dplyr::filter(cohort=="FL_Dreval", - grepl("SP",sample_id)) -# 2. Get the SSMs for these samples - -ssm_genomes_grch37 = get_coding_ssm(projection = "grch37", - these_samples_metadata = some_meta) -# peek at the results -ssm_genomes_grch37 \%>\% dplyr::select(1:8) - -# 3. Lazily let this function obtain the corresponding seg_data for the right genome_build -cn_list = assign_cn_to_ssm(some_meta,ssm_genomes_grch37) - -cn_list$maf \%>\% dplyr::select(1:8,log.ratio,CN) - -# This won't work because the hg38 seg_data is not bundled -ssm_genomes_hg38 = get_coding_ssm(projection = "hg38", - these_samples_metadata = some_meta) -cn_list = assign_cn_to_ssm(some_meta,ssm_genomes_hg38) - -# Easiest/laziest way: -cn_list = assign_cn_to_ssm(projection = "grch37") - - -cn_list$maf \%>\% dplyr::group_by(Tumor_Sample_Barcode,CN) \%>\% - dplyr::count() - -} diff --git a/man/bind_genomic_data.Rd b/man/bind_genomic_data.Rd deleted file mode 100644 index bca456b..0000000 --- a/man/bind_genomic_data.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{bind_genomic_data} -\alias{bind_genomic_data} -\title{Bind maf or other genomic data together} -\usage{ -bind_genomic_data(..., check_id = TRUE) -} -\arguments{ -\item{...}{All maf_data or seg_data objects to be combined.} - -\item{check_id}{Logical. If TRUE (the default), the function will check for the presence of the expected ID column -and for duplicate sample IDs across the inputs. Set to FALSE to skip this check.} -} -\value{ -data.frame with combined data and preserved genome_build metadata. -} -\description{ -Combine multiple maf_data objects and retain metadata such as genome_build. -This function will not allow you to combine maf_data objects that have different genome_build values. -An error will also be thrown if the same sample id is found in more than one of the inputs (if check_id is TRUE). -} -\examples{ - -merged_maf = bind_genomic_data(maf1, maf2,check_id=FALSE) - -} -\keyword{internal} diff --git a/man/calc_mutation_frequency_bin_region.Rd b/man/calc_mutation_frequency_bin_region.Rd deleted file mode 100644 index bb6d06c..0000000 --- a/man/calc_mutation_frequency_bin_region.Rd +++ /dev/null @@ -1,87 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/calc_mutation_frequency_bin_region.R -\name{calc_mutation_frequency_bin_region} -\alias{calc_mutation_frequency_bin_region} -\title{Calculate Mutation Frequency By Sliding Window.} -\usage{ -calc_mutation_frequency_bin_region( - region, - these_samples_metadata = NULL, - these_sample_ids = NULL, - this_seq_type = "genome", - maf_data = NULL, - projection = "grch37", - slide_by = 100, - window_size = 1000, - return_format = "long", - min_count_per_bin = 0, - return_count = TRUE, - drop_unmutated = FALSE, - ... -) -} -\arguments{ -\item{region}{A string describing a genomic region in the "chrom:start-end" format. -The region must be specified in this format OR as separate chromosome, start_pos, end_pos arguments.} - -\item{these_samples_metadata}{Optional data frame containing a sample_id column. -If not providing a maf file, seq_type is also a required column.} - -\item{these_sample_ids}{Optional vector of sample IDs. Output will be subset -to IDs present in this vector.} - -\item{this_seq_type}{Optional vector of seq_types to include in heatmap. -Default is "genome". Uses default seq_type priority for samples -with >1 seq_type.} - -\item{maf_data}{Optional maf data frame. Will be subset to rows where -Tumor_Sample_Barcode matches provided sample IDs or metadata table. -If not provided, maf data will be obtained with get_ssm_by_regions().} - -\item{projection}{Specify which genome build to use. Required. Default grch37.} - -\item{slide_by}{Slide size for sliding window. Default 100.} - -\item{window_size}{Size of sliding window. Default 1000.} - -\item{return_format}{Return format of mutations. Accepted inputs are "long" -and "wide". Long returns a data frame of one sample ID/window per row. -Wide returns a matrix with one sample ID per row and one window per column. -Using the "wide" format will retain all samples and windows regardless of -the drop_unmutated or min_count_per_bin parameters.} - -\item{min_count_per_bin}{Minimum counts per bin, default is 0. Setting this -greater than 0 will drop unmutated windows only when return_format is long.} - -\item{return_count}{Boolean statement to return mutation count per window (TRUE) -or binary mutated/unmutated status (FALSE). Default is TRUE.} - -\item{drop_unmutated}{Boolean for whether to drop windows with 0 mutations. -Only effective with "long" return format.} - -\item{...}{Any additional parameters.} -} -\value{ -Either a matrix or a long tidy table of counts per window. -} -\description{ -Count the number of mutations in a sliding window across a -region for all samples. -} -\details{ -This function is called to return the mutation frequency for a given -region, either from a provided input maf data frame or from the GAMBL maf data. -Regions are specified with the \code{region} parameter. Alternatively, the region of -interest can also be specified by calling the function with \code{chromosome}, -\code{start_pos}, and \code{end_pos} parameters. This function operates on a single region. -To return a matrix of sliding window counts over multiple regions, -see \code{calc_mutation_frequency_bin_regions}. -} -\examples{ -myc_region = "8:128747680-128753674" -myc_mut_freq = calc_mutation_frequency_bin_region(region = myc_region, - slide_by = 10, - window_size = 10000) -dplyr::arrange(myc_mut_freq,desc(mutation_count)) - -} diff --git a/man/calc_mutation_frequency_bin_regions.Rd b/man/calc_mutation_frequency_bin_regions.Rd deleted file mode 100644 index 1c091c7..0000000 --- a/man/calc_mutation_frequency_bin_regions.Rd +++ /dev/null @@ -1,102 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/calc_mutation_frequency_bin_regions.R -\name{calc_mutation_frequency_bin_regions} -\alias{calc_mutation_frequency_bin_regions} -\title{Mutation counts across sliding windows for multiple regions.} -\usage{ -calc_mutation_frequency_bin_regions( - regions_list = NULL, - regions_bed = NULL, - these_samples_metadata = NULL, - these_sample_ids = NULL, - this_seq_type = "genome", - maf_data = NULL, - projection = "grch37", - region_padding = 1000, - drop_unmutated = FALSE, - skip_regions = NULL, - only_regions = NULL, - slide_by = 100, - window_size = 500, - return_format = "wide", - ... -) -} -\arguments{ -\item{regions_list}{Named vector of regions in the format -c(name1 = "chr:start-end", name2 = "chr:start-end"). If neither \code{regions} nor -\code{regions_bed} is specified, the function will use GAMBLR aSHM region information.} - -\item{regions_bed}{Data frame of regions with four columns (chrom, start, end, name).} - -\item{these_samples_metadata}{Metadata with at least sample_id column. -If not providing a maf data frame, seq_type is also required.} - -\item{these_sample_ids}{Vector of sample IDs. Metadata will be subset to -sample IDs present in this vector.} - -\item{this_seq_type}{Optional vector of seq_types to include in heatmap. -Default "genome". Uses default seq_type priority for samples with >1 seq_type.} - -\item{maf_data}{Optional maf data frame. Will be subset to rows where -Tumor_Sample_Barcode matches provided sample IDs or metadata table. -If not provided, maf data will be obtained with get_ssm_by_regions().} - -\item{projection}{Genome build the function will operate in. Ensure this -matches your provided regions and maf data for correct chr prefix handling. Default "grch37".} - -\item{region_padding}{Amount to pad the start and end coordinates by. Default 1000.} - -\item{drop_unmutated}{Whether to drop bins with 0 mutations. If returning a -matrix format, this will only drop bins with no mutations in any samples.} - -\item{skip_regions}{Optional character vector of genes to exclude from the default aSHM regions.} - -\item{only_regions}{Optional character vector of genes to include from the default aSHM regions.} - -\item{slide_by}{Slide size for sliding window. Default 100.} - -\item{window_size}{Size of sliding window. Default 500.} - -\item{return_format}{Return format of mutations. Accepted inputs are "long" and -"wide". Long returns a data frame of one sample ID/window per row. Wide returns -a matrix with one sample ID per row and one window per column. Using the "wide" -format will retain all samples and windows regardless of the drop_unmutated or -min_count_per_bin parameters. Default wide.} - -\item{...}{Any additional parameters.} -} -\value{ -A table of mutation counts for sliding windows across one or more regions. May be long or wide. -} -\description{ -Obtain a long tidy or wide matrix of mutation counts across -sliding windows for multiple regions. -} -\details{ -This function takes a metadata table with \code{these_samples_metadata} -parameter and internally calls \code{calc_mutation_frequency_bin_region} -(that internally calls \code{get_ssm_by_regions}). -to retrieve mutation counts for sliding windows across one or more regions. -May optionally provide any combination of a maf data frame, existing metadata, -or a regions data frame or named vector. -} -\examples{ - #load metadata. - my_meta = get_gambl_metadata() - dlbcl_bl_meta = dplyr::filter(my_meta, pathology \%in\% c("DLBCL", "BL")) - - - #get ashm regions - some_regions = create_bed_data(grch37_ashm_regions, - fix_names = "concat", - concat_cols = c("gene","region"), - sep="-") - print(some_regions) - mut_count_matrix <- calc_mutation_frequency_bin_regions( - these_samples_metadata = dlbcl_bl_meta, - regions_bed = some_regions - ) -dim(mut_count_matrix) -tail(mut_count_matrix[,c(1:10)]) -} diff --git a/man/check_excess_params.Rd b/man/check_excess_params.Rd deleted file mode 100644 index 07ebe45..0000000 --- a/man/check_excess_params.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check_excess_params.R -\name{check_excess_params} -\alias{check_excess_params} -\title{Check Excess Params} -\usage{ -check_excess_params(...) -} -\arguments{ -\item{...}{Parameters to check.} -} -\value{ -Nothing -} -\description{ -Function for checking excessive parameter names. -This function will notify the user if any unavailable parameters are called for any given given function. -This function is designed to work as internal function-call in already available GAMBLR functions. -} -\details{ -Catch function calls containing unsupported arguments. -} -\keyword{internal} diff --git a/man/check_get_projection.Rd b/man/check_get_projection.Rd deleted file mode 100644 index 6afbbd6..0000000 --- a/man/check_get_projection.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{check_get_projection} -\alias{check_get_projection} -\title{Check and set the genome_build/projection} -\usage{ -check_get_projection(genomic_data_list, suggested) -} -\arguments{ -\item{genomic_data_list}{A list of genomic data objects. Each object should -have a genome build that can be retrieved by \code{get_genome_build()}.} - -\item{suggested}{An optional character string specifying a genome build -(projection) to be used. If provided, it must match the genome build inferred -from the data objects.} -} -\value{ -A character string representing the genome build to be used. -} -\description{ -This helper function checks the genome build of each genomic data object in -\code{genomic_data_list} (using \code{get_genome_build()}) and ensures -they are consistent. If all objects share a single, unique genome build, -that value is returned. If a user-specified genome build (\code{suggested}) -is provided, it is compared to the inferred build and must match; otherwise, -an error is raised. If the genomic data objects have conflicting genome -builds or if no genome build can be inferred and no \code{suggested} -value is provided, the function stops with an error. -} -\examples{ -# Example 1: When genomic data objects all have the same genome build. -# Assuming maf_data and seg_data both have a genome build of "hg38": -genomic_data <- list(maf_data = maf_data, seg_data = seg_data) -projection <- check_get_projection(genomic_data, suggested = "hg38") - -# Example 2: When the genomic data objects conflict or no genome build -# is available. -# This will raise an error: -genomic_data <- list(maf_data = maf_data, - seg_data = seg_data_with_different_build) -projection <- check_get_projection(genomic_data, suggested = "hg38") - -} -\keyword{internal} diff --git a/man/create_bed_data.Rd b/man/create_bed_data.Rd deleted file mode 100644 index 97aa626..0000000 --- a/man/create_bed_data.Rd +++ /dev/null @@ -1,88 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{create_bed_data} -\alias{create_bed_data} -\title{Create BED Data} -\usage{ -create_bed_data( - bed_df, - genome_build = NULL, - fix_names = NULL, - concat_cols = NULL, - sep = "" -) -} -\arguments{ -\item{bed_df}{A data frame containing the BED data.} - -\item{genome_build}{A string specifying the genome build -("grch37" or "hg38"). -If NULL, the function will try to infer the genome build -from the object name.} - -\item{fix_names}{Either NULL (the default), or one of "chrom_start_end" -or "concat". -If not NULL and duplicate names are detected, the function will -apply the chosen fix.} - -\item{concat_cols}{When \code{fix_names = "concat"}, a character vector -specifying which columns -from the original data to merge.} - -\item{sep}{The separator to use when concatenating columns if -fix_names = "concat". -Defaults to "" (no separator).} -} -\value{ -A data frame with class attributes for BED data. -} -\description{ -This function creates BED (Browser Extensible Data) objects from the given input. -It assumes that the BED data should have columns corresponding to chromosome, start, -and end. If the second and third columns are not numeric (as expected for start and end), -the function will attempt to identify the proper columns by matching column names. -} -\details{ -In the output, the first three columns will be renamed to "chrom", "start", and "end". -If a fourth column exists, it is renamed to "name" (and any additional columns are preserved). - -Additionally, if a "name" column exists and its values are not unique, the function -will warn the user. The user can optionally supply a method to automatically fix the -names via the \code{fix_names} argument: -\itemize{ -\item If \code{fix_names = "chrom_start_end"}, the new name will be built as "chrom:start-end". -\item If \code{fix_names = "concat"}, then the columns specified by \code{concat_cols} (using the -original column names in the input data) will be concatenated to form the new name. -By default, no separator is used, but a separator can be specified via the \code{sep} -argument. -} - -After applying the fix, the function checks if the new names are unique. If they are not, -a warning is issued that includes up to five examples of duplicate names and the row numbers -where they occur. -} -\examples{ - -# get a abed_data object for all aSHM regions -ashm_bed = create_bed_data(GAMBLR.data::grch37_ashm_regions, - fix_names = "concat", - concat_cols = c("gene","region"), - sep="-") -# the build is automatically inferred if it is in the variable name -get_genome_build(ashm_bed) -print(ashm_bed) -another_bed = create_bed_data(somatic_hypermutation_locations_GRCh37_v_latest, - fix_names = "concat", - concat_cols = c("chr_name","hg19_start","hg19_end")) - -get_genome_build(another_bed) - -# get a bed_data object for all gene regions and combine several columns to make a unique name -gene_regions <- create_bed_data(hg38_gene_coordinates, - fix_names = "concat", - sep="-", - concat_cols = c("chromosome","start","end","gene_name")) - -get_genome_build(gene_regions) - -} diff --git a/man/create_maf_data.Rd b/man/create_maf_data.Rd deleted file mode 100644 index a6e8445..0000000 --- a/man/create_maf_data.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{create_maf_data} -\alias{create_maf_data} -\title{Create MAF Data} -\usage{ -create_maf_data(maf_df, genome_build) -} -\arguments{ -\item{maf_df}{A data frame containing the MAF data.} - -\item{genome_build}{A string specifying the genome build ("grch37" or "hg38").} -} -\value{ -A data frame with class attributes for MAF data. -} -\description{ -This function creates MAF (Mutation Annotation Format) data from the given input. -} diff --git a/man/create_seg_data.Rd b/man/create_seg_data.Rd deleted file mode 100644 index e261bea..0000000 --- a/man/create_seg_data.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{create_seg_data} -\alias{create_seg_data} -\title{Create Segmented Data} -\usage{ -create_seg_data(seg_df, genome_build) -} -\arguments{ -\item{seg_df}{A data frame containing the segmented data.} - -\item{genome_build}{Required character vector specifying the genome build -currently supported: "grch37" or "hg38".} -} -\value{ -A data frame with class attributes for segmented data. -} -\description{ -This function creates segmented data from the given input. -} -\examples{ -seg_df <- data.frame(...) -create_seg_data(seg_df, "grch37") -} diff --git a/man/get_ashm_count_matrix.Rd b/man/get_ashm_count_matrix.Rd deleted file mode 100644 index 1c95999..0000000 --- a/man/get_ashm_count_matrix.Rd +++ /dev/null @@ -1,68 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_ashm_count_matrix.R -\name{get_ashm_count_matrix} -\alias{get_ashm_count_matrix} -\title{Get ASHM Count Matrix.} -\usage{ -get_ashm_count_matrix( - regions_bed, - these_samples_metadata, - this_seq_type, - projection = "grch37" -) -} -\arguments{ -\item{regions_bed}{A bed file with one row for each region.} - -\item{these_samples_metadata}{This is used to complete your matrix. All GAMBL -samples will be used by default. Provide a data frame with at least -sample_id for all samples if you are using non-GAMBL data.} - -\item{this_seq_type}{The seq type to return results for. Only used if no -metadata is provided with these_samples_metadata.} - -\item{projection}{Which genome build to use for the mutations -(must match the coordinate system your regions to avoid a nonsense result)} -} -\value{ -matrix -} -\description{ -Prepare a matrix with one row per sample and one column per -region using a set of hypermutated regions. -} -\details{ -Values are the number of mutations in that patient in the region. -} -\examples{ -regions_bed = create_bed_data(GAMBLR.data::grch37_ashm_regions, - fix_names="concat", - concat_cols=c("gene","region"), - sep="-") -my_meta = get_gambl_metadata() \%>\% dplyr::filter(pathology=="DLBCL") -matrix <- get_ashm_count_matrix( - regions_bed = regions_bed, - this_seq_type = "genome" -) - -#this example should fail because the regions_bed is not hg38 - matrix <- get_ashm_count_matrix(regions_bed=regions_bed, - this_seq_type = "genome", - these_samples_metadata = my_meta, - projection = "hg38") -# Error in get_ashm_count_matrix( -# Your projection argument does not match the genome_build of regions_bed - -# format the name column to include the coordinates instead of the gene -regions_bed = create_bed_data(GAMBLR.data::hg38_ashm_regions, - fix_names="concat", - concat_cols=c("chr_name","hg38_start","hg38_end"), - sep="-") - - matrix_hg38 <- get_ashm_count_matrix(regions_bed=regions_bed, - this_seq_type = "genome", - these_samples_metadata = my_meta, - projection = "hg38") -print(dim(matrix_hg38)) -print(head(matrix_hg38[,c(1:8)])) -} diff --git a/man/get_cn_segments.Rd b/man/get_cn_segments.Rd deleted file mode 100644 index 9fd58b7..0000000 --- a/man/get_cn_segments.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_cn_segments.R -\name{get_cn_segments} -\alias{get_cn_segments} -\title{Get CN Segments.} -\usage{ -get_cn_segments( - these_samples_metadata, - projection = "grch37", - this_seq_type, - ... -) -} -\arguments{ -\item{these_samples_metadata}{User must provide a metadata table to -restrict the data to the samples in your table. -The metadata also ensures the proper handling of duplicate sample_id -across seq_types and ensures the seq_type in the metadata faithfully -represents the seq_type of the data} - -\item{projection}{Desired genome coordinate system for returned CN segments. -Default is "grch37".} - -\item{this_seq_type}{Deprecated.} - -\item{...}{Additional parameters to be passed to the function.} -} -\value{ -A data frame with CN segments for the specified region. -} -\description{ -Retrieve all copy number segments from the GAMBL outputs -} -\details{ -This function merely loads and returns all the seg_data -available for a projection (genome build) -} -\examples{ -# Example for the capture samples: - -genome_metadata = get_gambl_metadata(seq_type_filter="genome") - -genome_segments_hg38 = get_cn_segments( - these_samples_metadata = genome_metadata, - projection="hg38") - - -} diff --git a/man/get_genome_build.Rd b/man/get_genome_build.Rd deleted file mode 100644 index 85e5910..0000000 --- a/man/get_genome_build.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{get_genome_build} -\alias{get_genome_build} -\title{Get Genome Build} -\usage{ -get_genome_build(data) -} -\arguments{ -\item{data}{A data frame with genome build attribute.} -} -\value{ -A string specifying the genome build. -} -\description{ -This function retrieves the genome build attribute from the data. -} -\keyword{internal} diff --git a/man/get_manta_sv.Rd b/man/get_manta_sv.Rd deleted file mode 100644 index 1f71895..0000000 --- a/man/get_manta_sv.Rd +++ /dev/null @@ -1,84 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_manta_sv.R -\name{get_manta_sv} -\alias{get_manta_sv} -\title{Get Manta SVs} -\usage{ -get_manta_sv( - these_sample_ids = NULL, - these_samples_metadata = NULL, - projection = "grch37", - this_seq_type = "genome", - chromosome, - qstart, - qend, - region, - pairing_status, - min_vaf = 0.1, - min_score = 40, - pass = TRUE, - verbose = FALSE, - ... -) -} -\arguments{ -\item{these_sample_ids}{Optional, a vector of multiple sample_id (or a single sample ID as a string) that you want results for.} - -\item{these_samples_metadata}{Optional, a metadata table (with sample IDs in a column) to subset the return to. -If not provided (and if \code{these_sample_ids} is not provided), the function will return all samples from the specified seq_type in the metadata.} - -\item{projection}{The projection genome build. Default is grch37.} - -\item{this_seq_type}{The this_seq_type you want back, default is genome.} - -\item{chromosome}{Optional, the chromosome you are restricting to (can be prefixed or not prefixed).} - -\item{qstart}{Optional, query start coordinate of the range you are restricting to.} - -\item{qend}{Optional, query end coordinate of the range you are restricting to.} - -\item{region}{Optional, region formatted like chrX:1234-5678 (chromosome can be prefixed or not prefixed) instead of specifying chromosome, start and end separately.} - -\item{pairing_status}{Use to restrict results (if desired) to matched or unmatched results (default is to return all). This parameter takes the filtering condition as a string ("matched" or "unmatched").} - -\item{min_vaf}{The minimum tumour VAF for a SV to be returned. Default is 0.1.} - -\item{min_score}{The lowest Manta somatic score for a SV to be returned. Default is 40.} - -\item{pass}{If TRUE (default) only return SVs that are annotated with PASS in the FILTER column. Set to FALSE to keep all variants, regardless if they PASS the filters.} - -\item{verbose}{Set to FALSE to minimize the output to console. Default is TRUE. This parameter also dictates the verbosity of any helper function internally called inside the main function.} - -\item{...}{Any additional parameters.} -} -\description{ -Convenience function for retrieving Manta Structural Variants (SVs) from the bundled data \link{sample_data}. -} -\details{ -To obtain SV calls for multiple samples, give \code{these_sample_ids} a vector of sample IDs. -Alternatively, the user can also provide the \code{these_samples_metadata} parameter to make use of an already subset metadata table. -In this case, the returned SVs will be restricted to the sample_ids within that data frame. -This function internally calls \link{id_ease} to streamline sample ID/metadata parameters. -This function can also restrict the returned calls to any genomic regions specified within \code{chromosome}, \code{qstart}, \code{qend}, -or the complete region specified under \code{region} (in chr:start-end format), note that chromosome can be either prefixed or not prefixed. -Useful filtering parameters are also available, use \code{min_vaf} to set the minimum tumour VAF for a SV to be returned and \code{min_score} -to set the lowest Manta somatic score for a SV to be returned. \code{pair_status} can be used to return variants from either matched or unmatched samples. -In addition, the user can chose to return all variants, even the ones not passing the filter criteria. To do so, set \code{pass = FALSE} (default is TRUE). -} -\examples{ -#load packages -library(dplyr) - -#lazily get every SV in the table with default quality filters -all_sv = get_manta_sv() - -#get all SVs DLBCL cell line samples -cell_line_meta = GAMBLR.data::sample_data$meta \%>\% - dplyr::filter(cohort == "DLBCL_cell_lines") - -dlbcl_sv = get_manta_sv(these_samples_metadata = cell_line_meta) - -#get the SVs in a region around MYC -myc_locus_sv = get_manta_sv(region = "8:128723128-128774067") - -} diff --git a/man/get_ssm_by_patients.Rd b/man/get_ssm_by_patients.Rd deleted file mode 100644 index 6ef0cf7..0000000 --- a/man/get_ssm_by_patients.Rd +++ /dev/null @@ -1,67 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_ssm_by_patients.R -\name{get_ssm_by_patients} -\alias{get_ssm_by_patients} -\title{Get SSM By Patients.} -\usage{ -get_ssm_by_patients( - these_patient_ids, - these_samples_metadata, - projection = "grch37", - this_seq_type = "genome", - tool_name = "slms-3", - this_study, - verbose = FALSE, - ... -) -} -\arguments{ -\item{these_patient_ids}{A vector of patient IDs that you want results for. -The user can also use a metadata table that has been subset to the patient IDs of interest (see \code{these_samples_metadata}).} - -\item{these_samples_metadata}{A metadata subset to contain the rows corresponding to the patients of interest. -If the vector of patient IDs is missing (\code{these_patient_ids}), this function will default to all patient IDs in the metadata table given to this parameter.} - -\item{projection}{Obtain variants projected to this reference (one of grch37 or hg38). Default is grch37.} - -\item{this_seq_type}{The seq type you want results for. Default is "genome".} - -\item{tool_name}{Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers.} - -\item{this_study}{Optionally specify first name of the author for the paper -from which the variants should be returned for. -This parameter can either be a vector of indexes (integer) or a vector of characters (matching columns in MAF).} - -\item{verbose}{Set to FALSE to minimize the output to console. Default is TRUE. This parameter also dictates the verbosity of any helper function internally called inside the main function.} - -\item{...}{Any additional parameters.} -} -\value{ -A data frame with SSM calls for the selected patients in MAF format. -} -\description{ -Get MAF-format data frame for more than one patient. -} -\details{ -This function returns variants from a set of patients. -This function internally calls \link{get_ssm_by_samples}. -Thus, the main contents of this function is to wrangle the provided patient IDs, -so that the corresponding sample IDs can be provided to the internal call of \code{get_ssm_by_samples}. -This function expects either a vector of patient IDs (\code{these_patients_ids}) -or an already subset metadata table (\code{these_samples_metadata}). -} -\examples{ - -# Lets find which patient_id occur more than once in the metadata first -my_ids = get_gambl_metadata(seq_type_filter = c("genome","capture")) \%>\% - dplyr::group_by(patient_id) \%>\% - dplyr::tally() \%>\% - dplyr::filter(n>1) \%>\% - dplyr::pull(patient_id) - -#now let's get every SSM for all samples from these patients -patient_maf = get_ssm_by_patients(these_patient_ids = my_ids) -patient_maf \%>\% dplyr::group_by(Tumor_Sample_Barcode) \%>\% - dplyr::count() \%>\% head() - -} diff --git a/man/get_ssm_by_region.Rd b/man/get_ssm_by_region.Rd deleted file mode 100644 index e25a627..0000000 --- a/man/get_ssm_by_region.Rd +++ /dev/null @@ -1,80 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_ssm_by_region.R -\name{get_ssm_by_region} -\alias{get_ssm_by_region} -\title{Get SSM By Region.} -\usage{ -get_ssm_by_region( - these_sample_ids = NULL, - these_samples_metadata = NULL, - maf_data, - chromosome, - qstart, - qend, - region = "", - streamlined = FALSE, - projection = "grch37", - this_seq_type = "genome", - tool_name = "slms-3", - this_study, - verbose = FALSE, - ... -) -} -\arguments{ -\item{these_sample_ids}{Optional, a vector of multiple sample_id (or a single sample ID as a string) that you want results for.} - -\item{these_samples_metadata}{Optional, a metadata table (with sample IDs in a column) to subset the return to. -If not provided (and if \code{these_sample_ids} is not provided), the function will return all samples from the specified seq_type in the metadata.} - -\item{maf_data}{Optional data frame with mutations in MAF format. -If user provides a maf, the function trusts that the user has already subset this to samples of interest, correct seq_type. -i.e the following parameters are ignored; \code{these_samples_metadata}, \code{these_sample_ids}, and \code{this_seq_type}} - -\item{chromosome}{The chromosome you are restricting to (with or without a chr prefix).} - -\item{qstart}{Query start coordinate of the range you are restricting to.} - -\item{qend}{Query end coordinate of the range you are restricting to.} - -\item{region}{Region formatted like chrX:1234-5678 instead of specifying chromosome, start and end separately.} - -\item{streamlined}{Return Start_Position and Tumor_Smaple_Barcode as the only two MAF columns. Default is FALSE.} - -\item{projection}{Obtain variants projected to this reference (one of grch37 or hg38).} - -\item{this_seq_type}{The seq_type you want back, default is genome.} - -\item{tool_name}{Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers.} - -\item{this_study}{Optionally specify first name of the author for the paper -from which the variants should be returned for.} - -\item{verbose}{Set to FALSE to prevent ANY message to be printed. -In most cases, this parameter should be left to TRUE. -The parameter was added to accommodate for noisy output -when running this function in a loop for retrieving SSM -for multiple regions \link{get_ssm_by_regions}.} - -\item{...}{Any additional parameters.} -} -\value{ -A data frame containing all mutations (MAF) in the specified region. -} -\description{ -Retrieve all SSMs from the GAMBL database within a single genomic coordinate range. -} -\details{ -This function lets the user specify a region of interest for returning SSM calls within that region. -There are multiple ways a region can be specified. For example, the user can provide the full region in a "region" format (chr:start-end) to the \code{region} parameter. -Or, the user can provide chromosome, start and end coordinates individually with \code{chr}, \code{start}, and \code{end} parameters. -} -\examples{ -my_mutations = get_ssm_by_region(region = "chr8:128,723,128-128,774,067") - -#specifying chromosome, start and end individually -my_mutations = get_ssm_by_region(chromosome = "8", - qstart = 128723128, - qend = 128774067) - -} diff --git a/man/get_ssm_by_samples.Rd b/man/get_ssm_by_samples.Rd deleted file mode 100644 index 66f0e2c..0000000 --- a/man/get_ssm_by_samples.Rd +++ /dev/null @@ -1,67 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_ssm_by_samples.R -\name{get_ssm_by_samples} -\alias{get_ssm_by_samples} -\title{Get SSM By Samples.} -\usage{ -get_ssm_by_samples( - these_sample_ids = NULL, - these_samples_metadata = NULL, - this_seq_type = "genome", - projection = "grch37", - tool_name = "slms-3", - verbose = FALSE, - ... -) -} -\arguments{ -\item{these_sample_ids}{A vector of one or more sample IDs that you -want results for.} - -\item{these_samples_metadata}{Optional, a metadata table (with sample_id -column) to auto-subset the data to samples in that table before returning. -If not provided and these_sample_ids is also not provided, the function will -return SSM for all samples from the specified seq_type in the bundled -metadata.} - -\item{this_seq_type}{Default is genome.} - -\item{projection}{The projection genome build. Supports hg38 and grch37.} - -\item{tool_name}{Optionally specify which tool to report variant from. -The default is slms-3, also supports "publication" to return the exact -variants as reported in the original papers.} - -\item{verbose}{Enable for debugging/noisier output.} - -\item{...}{Any additional parameters.} -} -\value{ -data frame in MAF format. -} -\description{ -Get the SSMs (i.e. load MAF) for a single sample or a -collection of samples. -} -\details{ -Retrieve a maf for a specific sample or a set of samples. -Either specify the sample IDs of interest with \code{these_sample_ids}. -Or a metadata table subset to the sample IDs of interest with -\code{these_samples_metadata}. -} -\examples{ - -#Get genome-wide set of mutations from all DLBCL cell lines - -# 1. get our metadata for the DLBCL cell lines -cell_line_meta = get_gambl_metadata() \%>\% - dplyr::filter(cohort == "DLBCL_cell_lines") - -# 2. get the SSMs for the DLBCL cell lines -dlbcl_maf = get_ssm_by_samples(these_samples_metadata = cell_line_meta) - -# 3. have a look: -dlbcl_maf \%>\% dplyr::group_by(Tumor_Sample_Barcode) \%>\% - dplyr::count() - -} diff --git a/man/preserve_genomic_attributes.Rd b/man/preserve_genomic_attributes.Rd deleted file mode 100644 index 69e4d56..0000000 --- a/man/preserve_genomic_attributes.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{preserve_genomic_attributes} -\alias{preserve_genomic_attributes} -\title{Preserve Genomic Attributes} -\usage{ -preserve_genomic_attributes(new_data, old_data) -} -\arguments{ -\item{new_data}{A data frame resulting from dplyr operations.} - -\item{old_data}{The original data frame with genomic attributes.} -} -\value{ -A data frame with preserved genomic attributes. -} -\description{ -This function preserves the genomic attributes and class after dplyr operations. -} -\keyword{internal} diff --git a/man/strip_genomic_classes.Rd b/man/strip_genomic_classes.Rd deleted file mode 100644 index 508463f..0000000 --- a/man/strip_genomic_classes.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{strip_genomic_classes} -\alias{strip_genomic_classes} -\title{Strip Genomic Data Classes} -\usage{ -strip_genomic_classes(x, classes = c("genomic_data", "maf_data", "bed_data")) -} -\arguments{ -\item{x}{An object, such as one of your genomic data objects.} - -\item{classes}{A character vector of class names to remove. The default is -c("genomic_data", "maf_data", "bed_data").} -} -\value{ -The object with the specified classes removed. -} -\description{ -This function removes custom classes associated with genomic data objects -(by default, "genomic_data", "maf_data", and "bed_data") from the class attribute -of an object. This can be useful when you want to revert an S3 object to its -underlying data.frame (or data.table) classes without converting the object. -} -\keyword{internal} From 8e048af29d74bdcb169ec859a2846bf7edc8a6c5 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Fri, 7 Feb 2025 22:46:55 -0800 Subject: [PATCH 17/19] rehome cool overlaps to helpers --- NAMESPACE | 1 - R/cool_overlaps.R | 232 ------------------------------------------- man/cool_overlaps.Rd | 99 ------------------ 3 files changed, 332 deletions(-) delete mode 100644 R/cool_overlaps.R delete mode 100644 man/cool_overlaps.Rd diff --git a/NAMESPACE b/NAMESPACE index ed7b4cb..dfbfea5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,7 +2,6 @@ export("%>%") export(collate_results) -export(cool_overlaps) export(get_coding_ssm) export(get_coding_ssm_status) export(get_colours) diff --git a/R/cool_overlaps.R b/R/cool_overlaps.R deleted file mode 100644 index e20b1c3..0000000 --- a/R/cool_overlaps.R +++ /dev/null @@ -1,232 +0,0 @@ -#' @title Cool overlap of data frames. -#' -#' @description This function implements overlap of 2 data frames that contain -#' regions of coordinates similar to what data.table::foverlaps does. Unlike -#' foverlaps, this function takes as input data frame class objects, and relies -#' on dplyr solution rather than data.table handling, therefore allowing usage -#' of data frames with virtually unlimited dimensions without crashing. This -#' implementation uses same logic of different types of overlaps as the original -#' foverlaps solution ("any", "start", "end", "within", "equal"). The type "any" -#' is default and allows for any overlapping solution between 2 regions. The -#' type "start" only considers regions with exact same start position as -#' overlap; similarly type "end" considers regions overlapped when the end -#' positions are exact matches. Type "within" means that regions are overlapped -#' when one is contained in another and neither start nor end positions match. -#' Finally, type "equal" only considers overlap when both start and end -#' positions match for both regions. For any type, the presence of any -#' additional column not directly specifying regions (for example, Chromosome) -#' will serve similar to a grouping variable. -#' The generated output of this function will contain the overlapping regions -#' and all columns present in the data frame data1, as well as any columns from -#' the data frame supplied with data2 argument, except for those columns present -#' in data2 that are used for overlap. When the same columns are present in both -#' data1 and data2, the output data frame will have ".x" and ".y" suffixes to -#' indicate which original input data they are coming from. -#' -#' @param data1 Data frame with data to overlap. Required parameter. The minimal -#' required columns are those supplied with the argument columns1. Will -#' dictate the naming of the columns used for overlap in the output. -#' @param data2 Data frame with data to overlap. Required parameter. The minimal -#' required columns are those supplied with the argument columns2. -#' @param columns1 The list of columns from data frame data1 to be used to find -#' overlapping regions. -#' @param columns2 The list of columns from data frame data2 to be used to find -#' overlapping regions. -#' @param type Character specifying the way to find overlaps. Accepted values -#' are "any" (used as default), "start", "end", "within", and "equal". -#' Please see function description for more details of different types. -#' @param nomatch Whether the rows from data1 that do not have overlap in data2 -#' should be returned or not. The default is FALSE (rows without overlap -#' are not returned). If TRUE is specified, the row order in the output -#' data will match the exact order of rows in the input data1. -#' -#' @return data frame -#' @keywords internal -#' -#' @examples -#' # obtain maf data -#' maf1 <- get_coding_ssm( -#' these_sample_ids = "DOHH-2" -#' ) -#' -#' maf2 <- get_coding_ssm( -#' these_sample_ids = "SU-DHL-4" -#' ) -#' -#' # The same mutations are not expected to be present in different samples -#' # so this overlap will produce 0 matching rows -#' overlap <- cool_overlaps( -#' maf1, -#' maf1, -#' type = "equal" -#' ) -#' -#' # To demonstrate functionality we can supply the same maf to the data2 -#' overlap <- cool_overlaps( -#' maf1, -#' maf1 %>% head -#' ) -#' -#' # We can also overlap different formats, for example -#' seg1 <- get_sample_cn_segments(these_sample_ids = "DOHH-2") -#' overlap <- cool_overlaps( -#' data1 = maf1, -#' data2 = seg1, -#' columns2 = c("chrom", "start", "end") -#' ) -#' -#' @import dplyr tidyr -#' @export -#' -cool_overlaps <- function( - data1, - data2, - columns1 = c("Chromosome", "Start_Position", "End_Position"), - columns2 = c("Chromosome", "Start_Position", "End_Position"), - type = "any", - nomatch = FALSE -){ - - # Ensure all columns provided for overlap are present in the data frame - if(! length(columns1) == length(intersect(columns1, colnames(data1)))){ - stop( - "Not all of the requested columns for overlap in data1 are present." - ) - } - - if(! length(columns2) == length(intersect(columns2, colnames(data2)))){ - stop( - "Not all of the requested columns for overlap in data2 are present." - ) - } - - # What is the name of the column in columns1 that specifies start and end? - start1 <- columns1[grepl("start", columns1, ignore.case = TRUE)] - end1 <- columns1[grepl("end", columns1, ignore.case = TRUE)] - - # What is the name of the column in columns1 that specifies start and end? - start2 <- columns2[grepl("start", columns2, ignore.case = TRUE)] - end2 <- columns2[grepl("end", columns2, ignore.case = TRUE)] - - # What are the other columns to be used in overlap? - columns1 <- columns1[!columns1 %in% c(start1, end1)] - columns2 <- columns2[!columns2 %in% c(start2, end2)] - - # When the same columns are provided they will become .x and .y - original_start1 <- start1 - original_end1 <- end1 - if(start1 == start2) { - start1 <- paste0(start1, ".x") - start2 <- paste0(start2, ".y") - - } - if(end1 == end2) { - end1 <- paste0(end1, ".x") - end2 <- paste0(end2, ".y") - - } - - - # Prepare for overlap - overlap <- dplyr::inner_join( - data1, - data2, - by = structure(names = columns1, .Data = columns2), - relationship = "many-to-many" - ) - - # Return matches based on mode - if(type == "any"){ - message( - "Running in default mode of any..." - ) - overlap <- overlap %>% - dplyr::filter( - !!sym(start2) >= !!sym(start1) & !!sym(end2) <= !!sym(end1) | - !!sym(start1) >= !!sym(start2) & !!sym(end1) <= !!sym(end2) - ) - } else if (type == "start"){ - message( - "Running in the mode start..." - ) - overlap <- overlap %>% - dplyr::filter( - !!sym(start1) == !!sym(start2) - ) - } else if (type == "end"){ - message( - "Running in the mode end..." - ) - overlap <- overlap %>% - dplyr::filter( - !!sym(end1) == !!sym(end2) - ) - } else if (type == "within"){ - message( - "Running in the mode within..." - ) - overlap <- overlap %>% - dplyr::filter( - (!!sym(start1) >= !!sym(start2)) & (!!sym(end1) <= !!sym(end2)) | - (!!sym(start2) >= !!sym(start1)) & (!!sym(end2) <= !!sym(end1)) - ) - } else if (type == "equal"){ - message( - "Running in the mode equal..." - ) - overlap <- overlap %>% - dplyr::filter( - (!!sym(start1) == !!sym(start2)) & (!!sym(end1) == !!sym(end2)) - ) - } else { - message( - "You have requested mode that is not supported." - ) - stop( - "Please supply one of any, start, end, within, or equal with type." - ) - } - - # This will ensure that features from data1 that don't have match in data2 - # will be returned with NA annotation - if(nomatch){ - no_annotation <- suppressMessages( - anti_join( - data1, - overlap - ) - ) - if(original_start1 %in% colnames(no_annotation)){ - colnames(no_annotation) = gsub( - original_start1, - start1, - colnames(no_annotation) - ) - } - if(original_end1 %in% colnames(no_annotation)){ - colnames(no_annotation) = gsub( - original_end1, - end1, - colnames(no_annotation) - ) - } - overlap <- bind_rows( - overlap, - no_annotation - ) - - # Ensure order is consistent between input data and the output after - # overlap is found since we used bind_rows - data1 <- data1 %>% - tidyr::unite("row_id", 1:ncol(data1), remove = FALSE) - - colnames(overlap) <- gsub("\\.x$", "", colnames(overlap)) - overlap <- overlap %>% - tidyr::unite("row_id", 1:(ncol(data1)-1), remove = FALSE) %>% - dplyr::arrange(match(row_id, data1$row_id)) %>% - dplyr::select(-row_id) - - } - - return(overlap) -} diff --git a/man/cool_overlaps.Rd b/man/cool_overlaps.Rd deleted file mode 100644 index 44099e9..0000000 --- a/man/cool_overlaps.Rd +++ /dev/null @@ -1,99 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cool_overlaps.R -\name{cool_overlaps} -\alias{cool_overlaps} -\title{Cool overlap of data frames.} -\usage{ -cool_overlaps( - data1, - data2, - columns1 = c("Chromosome", "Start_Position", "End_Position"), - columns2 = c("Chromosome", "Start_Position", "End_Position"), - type = "any", - nomatch = FALSE -) -} -\arguments{ -\item{data1}{Data frame with data to overlap. Required parameter. The minimal -required columns are those supplied with the argument columns1. Will -dictate the naming of the columns used for overlap in the output.} - -\item{data2}{Data frame with data to overlap. Required parameter. The minimal -required columns are those supplied with the argument columns2.} - -\item{columns1}{The list of columns from data frame data1 to be used to find -overlapping regions.} - -\item{columns2}{The list of columns from data frame data2 to be used to find -overlapping regions.} - -\item{type}{Character specifying the way to find overlaps. Accepted values -are "any" (used as default), "start", "end", "within", and "equal". -Please see function description for more details of different types.} - -\item{nomatch}{Whether the rows from data1 that do not have overlap in data2 -should be returned or not. The default is FALSE (rows without overlap -are not returned). If TRUE is specified, the row order in the output -data will match the exact order of rows in the input data1.} -} -\value{ -data frame -} -\description{ -This function implements overlap of 2 data frames that contain -regions of coordinates similar to what data.table::foverlaps does. Unlike -foverlaps, this function takes as input data frame class objects, and relies -on dplyr solution rather than data.table handling, therefore allowing usage -of data frames with virtually unlimited dimensions without crashing. This -implementation uses same logic of different types of overlaps as the original -foverlaps solution ("any", "start", "end", "within", "equal"). The type "any" -is default and allows for any overlapping solution between 2 regions. The -type "start" only considers regions with exact same start position as -overlap; similarly type "end" considers regions overlapped when the end -positions are exact matches. Type "within" means that regions are overlapped -when one is contained in another and neither start nor end positions match. -Finally, type "equal" only considers overlap when both start and end -positions match for both regions. For any type, the presence of any -additional column not directly specifying regions (for example, Chromosome) -will serve similar to a grouping variable. -The generated output of this function will contain the overlapping regions -and all columns present in the data frame data1, as well as any columns from -the data frame supplied with data2 argument, except for those columns present -in data2 that are used for overlap. When the same columns are present in both -data1 and data2, the output data frame will have ".x" and ".y" suffixes to -indicate which original input data they are coming from. -} -\examples{ -# obtain maf data -maf1 <- get_coding_ssm( - these_sample_ids = "DOHH-2" -) - -maf2 <- get_coding_ssm( - these_sample_ids = "SU-DHL-4" -) - -# The same mutations are not expected to be present in different samples -# so this overlap will produce 0 matching rows -overlap <- cool_overlaps( - maf1, - maf1, - type = "equal" -) - -# To demonstrate functionality we can supply the same maf to the data2 -overlap <- cool_overlaps( - maf1, - maf1 \%>\% head -) - -# We can also overlap different formats, for example -seg1 <- get_sample_cn_segments(these_sample_ids = "DOHH-2") -overlap <- cool_overlaps( - data1 = maf1, - data2 = seg1, - columns2 = c("chrom", "start", "end") -) - -} -\keyword{internal} From fe04e48ed6359f07cd214ca504d82ca98deed26c Mon Sep 17 00:00:00 2001 From: Kdreval Date: Fri, 7 Feb 2025 22:58:12 -0800 Subject: [PATCH 18/19] pipe --- DESCRIPTION | 3 --- NAMESPACE | 2 -- man/GAMBLR.data-package.Rd | 21 --------------------- man/pipe.Rd | 20 -------------------- 4 files changed, 46 deletions(-) delete mode 100644 man/GAMBLR.data-package.Rd delete mode 100644 man/pipe.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 2bfc439..d31bfd8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,9 +32,6 @@ LazyData: true Imports: dplyr, ggplot2, - magrittr, - purrr, - tibble, tidyr LazyDataCompression: xz Suggests: diff --git a/NAMESPACE b/NAMESPACE index 4420231..964d2eb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,10 +1,8 @@ # Generated by roxygen2: do not edit by hand -export("%>%") export(get_colours) export(get_genes) export(get_mapped_colours) import(dplyr) import(ggplot2) import(tidyr) -importFrom(magrittr,"%>%") diff --git a/man/GAMBLR.data-package.Rd b/man/GAMBLR.data-package.Rd deleted file mode 100644 index e4a78c4..0000000 --- a/man/GAMBLR.data-package.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/GAMBLR.data-package.R -\docType{package} -\name{GAMBLR.data-package} -\alias{GAMBLR.data} -\alias{GAMBLR.data-package} -\title{GAMBLR.data: Collection of Curated Data for Genomic Analysis of Mature B-cell Lymphomas in R} -\description{ -The package contains manually curated data for the genomic Analysis of mature B-cell lymphomas in R, such as regions of somatic hypermutation, lymphoma genes, etc. -} -\author{ -\strong{Maintainer}: Kostiantyn Dreval \email{kdreval@sfu.ca} (\href{https://orcid.org/0000-0002-6214-2843}{ORCID}) - -Authors: -\itemize{ - \item Ryan Morin \email{rdmorin@sfu.ca} (\href{https://orcid.org/0000-0003-2932-7800}{ORCID}) - \item Adam Mattsson \email{cmattsson@bcgsc.ca} (\href{https://orcid.org/0000-0002-6318-7912}{ORCID}) -} - -} -\keyword{internal} diff --git a/man/pipe.Rd b/man/pipe.Rd deleted file mode 100644 index a648c29..0000000 --- a/man/pipe.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-pipe.R -\name{\%>\%} -\alias{\%>\%} -\title{Pipe operator} -\usage{ -lhs \%>\% rhs -} -\arguments{ -\item{lhs}{A value or the magrittr placeholder.} - -\item{rhs}{A function call using the magrittr semantics.} -} -\value{ -The result of calling \code{rhs(lhs)}. -} -\description{ -See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. -} -\keyword{internal} From 28906c365d5de4073b280a6f47f12082c9e3e560 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Fri, 7 Feb 2025 23:12:19 -0800 Subject: [PATCH 19/19] version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index d31bfd8..dde8533 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: GAMBLR.data Title: Collection of Curated Data for Genomic Analysis of Mature B-cell Lymphomas in R -Version: 1.2 +Version: 1.3 Authors@R: c( person(given = "Ryan",