diff --git a/DESCRIPTION b/DESCRIPTION index fcc05d1..be9ceb3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,8 +32,6 @@ LazyData: true Imports: dplyr, ggplot2, - purrr, - tibble, tidyr LazyDataCompression: xz Suggests: diff --git a/NAMESPACE b/NAMESPACE index 3678176..964d2eb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,47 +1,8 @@ # Generated by roxygen2: do not edit by hand -S3method(arrange,genomic_data) -S3method(filter,genomic_data) -S3method(group_by,genomic_data) -S3method(mutate,genomic_data) -S3method(print,bed_data) -S3method(print,maf_data) -S3method(rename,genomic_data) -S3method(select,genomic_data) -S3method(ungroup,genomic_data) -export(annotate_hotspots) -export(assign_cn_to_ssm) -export(bind_genomic_data) -export(calc_mutation_frequency_bin_region) -export(calc_mutation_frequency_bin_regions) -export(check_excess_params) -export(collate_results) -export(cool_overlaps) -export(create_bed_data) -export(create_maf_data) -export(create_seg_data) -export(get_ashm_count_matrix) -export(get_cn_segments) -export(get_coding_ssm) -export(get_coding_ssm_status) export(get_colours) -export(get_gambl_metadata) export(get_genes) -export(get_genome_build) -export(get_manta_sv) export(get_mapped_colours) -export(get_sample_cn_segments) -export(get_ssm_by_patients) -export(get_ssm_by_regions) -export(get_ssm_by_samples) -export(id_ease) -export(preserve_genomic_attributes) -export(process_regions) -export(region_to_chunks) -export(review_hotspots) -export(strip_genomic_classes) import(dplyr) import(ggplot2) -import(purrr) -import(tibble) import(tidyr) diff --git a/R/annotate_hotspots.R b/R/annotate_hotspots.R deleted file mode 100644 index 4f64000..0000000 --- a/R/annotate_hotspots.R +++ /dev/null @@ -1,39 +0,0 @@ -#' @title Annotate Hotspots. -#' -#' @description Annotate MAF-like data frome with a hot_spot column indicating recurrent mutations. -#' -#' @details This function takes an already loaded MAF data frame with the `mutation_maf` parameter. 
-#' -#' @param mutation_maf A data frame in MAF format. -#' @param ... Any other parameter. These parameters will be ignored. -#' -#' @return The same data frame with one additional column "hot_spot". -#' -#' @import dplyr -#' @export -#' -#' @examples -#' my_metadata = get_gambl_metadata() -#' all_coding_ssm = get_coding_ssm(these_samples_metadata = my_metadata, -#' projection = "grch37", -#' this_seq_type = "genome") -#' -#' hot_ssms = annotate_hotspots(all_coding_ssm) -#' -annotate_hotspots = function( - mutation_maf, - ... -){ - - # check if any invalid parameters are provided - check_excess_params(...) - - filled_coords <- GAMBLR.data::hotspots_annotations - # just the ssms that match these coordinates! - hot_ssms <- left_join( - mutation_maf, - filled_coords, - by = c("Chromosome", "Start_Position") - ) - return(hot_ssms) -} diff --git a/R/assign_cn_to_ssm.R b/R/assign_cn_to_ssm.R deleted file mode 100644 index 8faa091..0000000 --- a/R/assign_cn_to_ssm.R +++ /dev/null @@ -1,143 +0,0 @@ -#' @title Assign CN to SSM. -#' -#' @description Annotate mutations with their copy number information. -#' -#' @details This function takes a sample ID with the `this_sample_id` parameter -#' and annotates mutations with copy number information. A variety of -#' parameters are at hand for a customized workflow. For example, -#' the user can specify if only coding mutations are of interest. To do so, -#' set `coding_only = TRUE`. This function internally calls -#' `get_ssm_by_samples` and `get_sample_cn_segments`. This function can -#' also take a vector with genes of interest (`genes`) that the returned -#' data frame will be restricted to. -#' -#' @param this_sample_id Sample ID of the sample you want to annotate. -#' @param genes A vector of characters with gene symbols (Hugo). -#' @param this_seq_type Specified seq type for returned data. Default is genome. -#' @param projection Specified genome projection that returned data is in -#' reference to. Default is grch37. 
-#' @param coding_only Optional. Set to TRUE to restrict to only coding variants -#' (ssm). Deafult is FALSE. -#' @param assume_diploid Optional, this parameter annotates every mutation as -#' copy neutral. Default is FALSE. -#' @param include_silent Logical parameter indicating whether to include silent -#' mutations into coding mutations. Default is FALSE. This parameter only -#' makes sense if `coding_only` is set to TRUE. -#' @param ... Any additional parameters. -#' -#' @return A list containing a data frame (MAF-like format) with three extra -#' columns: -#' - log.ratio is the log ratio from the seg file (NA when no overlap). -#' - LOH -#' - CN (the rounded absolute copy number estimate of the region based on -#' log.ratio, NA when no overlap was found). -#' -#' @import dplyr -#' @export -#' -#' @examples -#' cn_list = assign_cn_to_ssm( -#' this_sample_id = "DOHH-2", -#' coding_only = TRUE -#' ) -#' -assign_cn_to_ssm = function( - this_sample_id, - genes, - this_seq_type = "genome", - projection = "grch37", - coding_only = FALSE, - assume_diploid = FALSE, - include_silent = FALSE, - ... -){ - - #warn/notify the user what version of this function they are using - message("Using the bundled CN segments (.seg) calls in GAMBLR.data...") - - #check if any invalid parameters are provided - check_excess_params(...) - - #ensure only one sample ID is provided - if(length(this_sample_id) > 1){ - stop( - "This function only supports queries of 1 sample ID at the time..." 
- ) - } - - #get maf - maf_sample = get_ssm_by_sample( - this_sample_id = this_sample_id, - projection = projection, - this_seq_type = this_seq_type - ) - - #maf filtering - #silent mutations - if(!include_silent){ - coding_class = coding_class[coding_class != "Silent"] - } - - #coding mutations - if(coding_only){ - maf_sample = dplyr::filter( - maf_sample, - Variant_Classification %in% coding_class - ) - } - - #subset to genes of interest - if(!missing(genes)){ - maf_sample = dplyr::filter(maf_sample, Hugo_Symbol %in% genes) - if(nrow(maf_sample) == 0){ - stop("No variants left after filtering on the provided genes...") - } - } - - #get seg - seg_sample = get_sample_cn_segments( - these_sample_ids = this_sample_id, - projection = projection, - this_seq_type = this_seq_type - ) - - #annotate all CN segments as copy number neutral - if(assume_diploid){ - diploid = dplyr::mutate(maf_sample, CN = 2) - return(list(maf = diploid)) - } - - #wrangle the seg file - seg_sample = seg_sample %>% - dplyr::filter(end - start > 100) %>% - mutate(chrom = gsub("chr", "", chrom)) %>% - rename( - Chromosome = chrom, - Start_Position = start, - End_Position = end, - LOH = LOH_flag - ) %>% - mutate(across(LOH, as.factor)) - - #perform an overlap join and add CN columns from the seg file and subset - # MAF to basic columns (first 45) - maf_tmp = cool_overlaps(maf_sample, seg_sample, type = "any") - - #rename and change order of columns to match expected format - maf_with_segs = maf_tmp %>% - rename( - Start_Position = Start_Position.x, - End_Position = End_Position.x - ) %>% - dplyr::select( - colnames(maf_sample), - LOH, log.ratio, CN - ) - - return( - list( - maf = maf_with_segs, - seg = seg_sample - ) - ) -} diff --git a/R/calc_mutation_frequency_bin_region.R b/R/calc_mutation_frequency_bin_region.R deleted file mode 100644 index 1082a68..0000000 --- a/R/calc_mutation_frequency_bin_region.R +++ /dev/null @@ -1,294 +0,0 @@ -#' @title Calculate Mutation Frequency By Sliding Window. 
-#' -#' @description Count the number of mutations in a sliding window across a -#' region for all samples. -#' -#' @details This function is called to return the mutation frequency for a given -#' region, either from a provided input maf data frame or from the GAMBL maf data. -#' Regions are specified with the `region` parameter. Alternatively, the region of -#' interest can also be specified by calling the function with `chromosome`, -#' `start_pos`, and `end_pos` parameters. This function operates on a single region. -#' To return a matrix of sliding window counts over multiple regions, -#' see `calc_mutation_frequency_bin_regions`. -#' -#' @param region A string describing a genomic region in the "chrom:start-end" format. -#' The region must be specified in this format OR as separate chromosome, start_pos, end_pos arguments. -#' @param chromosome Chromosome name in region. -#' @param start_pos Start coordinate of region. -#' @param end_pos End coordinate of region. -#' @param these_samples_metadata Optional data frame containing a sample_id column. -#' If not providing a maf file, seq_type is also a required column. -#' @param these_sample_ids Optional vector of sample IDs. Output will be subset -#' to IDs present in this vector. -#' @param this_seq_type Optional vector of seq_types to include in heatmap. -#' Default is "genome". Uses default seq_type priority for samples -#' with >1 seq_type. -#' @param maf_data Optional maf data frame. Will be subset to rows where -#' Tumor_Sample_Barcode matches provided sample IDs or metadata table. -#' If not provided, maf data will be obtained with get_ssm_by_regions(). -#' @param projection Specify which genome build to use. Required. Default grch37. -#' @param slide_by Slide size for sliding window. Default 100. -#' @param window_size Size of sliding window. Default 1000. -#' @param return_format Return format of mutations. Accepted inputs are "long" -#' and "wide". 
Long returns a data frame of one sample ID/window per row. -#' Wide returns a matrix with one sample ID per row and one window per column. -#' Using the "wide" format will retain all samples and windows regardless of -#' the drop_unmutated or min_count_per_bin parameters. -#' @param min_count_per_bin Minimum counts per bin, default is 0. Setting this -#' greater than 0 will drop unmutated windows only when return_format is long. -#' @param return_count Boolean statement to return mutation count per window (TRUE) -#' or binary mutated/unmutated status (FALSE). Default is TRUE. -#' @param drop_unmutated Boolean for whether to drop windows with 0 mutations. -#' Only effective with "long" return format. -#' @param ... Any additional parameters. -#' -#' @return Either a matrix or a long tidy table of counts per window. -#' -#' @import dplyr tidyr -#' @export -#' -#' @examples -#' myc_mut_freq = calc_mutation_frequency_bin_region(region = "8:128747680-128753674", -#' slide_by = 10, -#' window_size = 10000) -#' -calc_mutation_frequency_bin_region <- function(region, - chromosome, - start_pos, - end_pos, - these_samples_metadata = NULL, - these_sample_ids = NULL, - this_seq_type = "genome", - maf_data = NULL, - projection = "grch37", - slide_by = 100, - window_size = 1000, - return_format = "long", - min_count_per_bin = 0, - return_count = TRUE, - drop_unmutated = FALSE, - ...) { - - #check if any invalid parameters are provided - check_excess_params(...) - - # Create objects to describe region both as string and individual objects - try(if (missing(region) & missing(chromosome)) { - stop("No region information provided. Please provide a region as a string in the chrom:start-end format, or as individual arguments. ") - }) - - if ((drop_unmutated | min_count_per_bin > 0) & return_format == "wide") { - message("To return a wide table, all samples and windows must be kept. Ignoring drop_unmutated and min_count_per_bin arguments. 
") - } - - if (missing(region)) { - region <- paste0( - chromosome, ":", start_pos, "-", - end_pos - ) - } else { - chunks <- region_to_chunks(region) - chromosome <- chunks$chromosome - start_pos <- as.numeric(chunks$start) - end_pos <- as.numeric(chunks$end) - } - - # Harmonize metadata and sample IDs - metadata <- id_ease( - these_samples_metadata, - these_sample_ids, - this_seq_type - ) - these_sample_ids <- metadata$sample_id - - - if ( - (grepl("chr", chromosome) & projection == "grch37") | - (!grepl("chr", chromosome) & projection == "hg38") - ) { - stop("chr prefixing status of region and specified projection don't match. ") - } - - - # Check region size and compare to max region size - # Is this really needed? - max_region <- 5e+06 - - region_size <- end_pos - start_pos - if (region_size < max_region) { - message(paste( - "processing bins of size", window_size, - "across", region_size, "bp region" - )) - } else { - message(paste("CAUTION!\n", region_size, "exceeds maximum size recommended by this function.")) - } - - # Split region into windows - windows <- data.frame( - chrom = chromosome, - window_start = seq(start_pos, end_pos, by = slide_by) - ) %>% - dplyr::mutate(window_end = window_start + window_size - 1) %>% - dplyr::select(chrom, window_start, window_end) - - # Option to return full region count instead of sliding window - if (window_size == 0) { - windows <- data.frame( - chrom = chromosome, - window_start = start_pos, - window_end = end_pos - ) - } - - # Obtain SSM coordinates from GAMBL if no maf_data was provided - if (is.null(maf_data)) { - try( - if (!"seq_type" %in% colnames(metadata)) { - stop("seq_type must be present in metadata for compatibility with get_ssm_by_samples") - } - ) - message("Using GAMBLR.data::get_ssm_by_region...") - region_ssm <- list() - for (st in unique(metadata$seq_type)) { - this_seq_type <- get_ssm_by_region( - region = region, - projection = projection, - streamlined = FALSE, - this_seq_type = st - ) %>% - 
dplyr::mutate(end = Start_Position + 1) %>% - dplyr::select( - chrom = Chromosome, - start = Start_Position, - end, - sample_id = Tumor_Sample_Barcode - ) %>% - dplyr::mutate(mutated = 1, seq_type = st) %>% - dplyr::filter(sample_id %in% these_sample_ids) - region_ssm[[st]] <- data.frame(metadata) %>% - dplyr::select(sample_id, seq_type) %>% - dplyr::filter(seq_type == st) %>% - dplyr::left_join(this_seq_type, by = c("sample_id", "seq_type")) %>% - dplyr::filter(!is.na(mutated)) %>% - dplyr::select(-seq_type) - } - region_ssm <- dplyr::bind_rows(region_ssm) - } else { - # Subset provided maf to specified region - message("Using provided maf...") - region_bed <- data.frame( - "chrom" = as.character(chromosome), - "start" = as.numeric(start_pos), - "end" = as.numeric(end_pos) - ) - region_ssm <- cool_overlaps( - maf_data, region_bed, - columns2 = c("chrom", "start", "end") - ) %>% - dplyr::filter(!is.na(Start_Position)) %>% - dplyr::mutate(end = Start_Position - 1) %>% - dplyr::select( - chrom = Chromosome, - start = Start_Position, - end, - sample_id = Tumor_Sample_Barcode - ) %>% - dplyr::mutate(mutated = 1) - - region_ssm <- data.frame(metadata) %>% - dplyr::select(sample_id) %>% - dplyr::left_join(region_ssm) %>% - dplyr::filter(!is.na(mutated)) - } - - # Check if the region is empty. - # If yes return NULL so that running this function with lapply will allow bind_rows to run on the output. - if (nrow(region_ssm) == 0 & (drop_unmutated | min_count_per_bin > 0)) { - message(paste0("No mutations found in region ", region, " for this sample set. 
")) - return(NULL) - } - - # Count mutations per window - windows_tallied <- dplyr::inner_join( - windows, - region_ssm, - by = "chrom" - ) %>% - dplyr::filter( - start >= window_start, - start <= window_end - ) %>% - dplyr::group_by( - sample_id, - window_start - ) %>% - dplyr::tally() %>% - dplyr::ungroup() %>% - dplyr::full_join(select(metadata, sample_id)) %>% - dplyr::arrange(sample_id) %>% - dplyr::full_join(select(windows, window_start)) %>% - dplyr::distinct() %>% - tidyr::pivot_wider( - names_from = window_start, - values_from = n, - values_fill = 0 - ) %>% - dplyr::select(-matches("^NA$")) %>% - tidyr::pivot_longer( - -c(sample_id), - names_to = "window_start", - values_to = "n" - ) %>% - dplyr::distinct() %>% - dplyr::filter(!is.na(sample_id)) - - # Remove unmutated windows if requested - if (drop_unmutated | min_count_per_bin > 0) { - windows_tallied <- windows_tallied %>% - dplyr::filter(n >= min_count_per_bin) - if (drop_unmutated & min_count_per_bin == 0) { - windows_tallied %>% - dplyr::filter(n > 0) - } - } - - # Create requested data output format - if (return_count) { - # Return table of mutation counts per bin - windows_tallied_final <- mutate( - windows_tallied, - bin = paste0(chromosome, "_", window_start) - ) %>% - dplyr::mutate(mutation_count = n) %>% - dplyr::select( - sample_id, - bin, - mutation_count - ) - } else { - # Return table of binary mutated/unmutated status per bin - windows_tallied_final <- mutate( - windows_tallied, - bin = paste0(chromosome, "_", window_start) - ) %>% - dplyr::mutate(mutated = ifelse(n > 0, 1, 0)) %>% - dplyr::select( - sample_id, - bin, - mutated - ) - } - - if (return_format == "wide") { - widened <- windows_tallied_final %>% - tidyr::pivot_wider( - names_from = bin, - values_from = matches("mutat"), - values_fill = 0 - ) - return(widened) - } else { - return(windows_tallied_final) - } -} diff --git a/R/calc_mutation_frequency_bin_regions.R b/R/calc_mutation_frequency_bin_regions.R deleted file mode 100644 
index 540a13d..0000000 --- a/R/calc_mutation_frequency_bin_regions.R +++ /dev/null @@ -1,147 +0,0 @@ -#' @title Mutation counts across sliding windows for multiple regions. -#' -#' @description Obtain a long tidy or wide matrix of mutation counts across -#' sliding windows for multiple regions. -#' -#' @details This function takes a metadata table with `these_samples_metadata` -#' parameter and internally calls `calc_mutation_frequency_bin_region` -#' (that internally calls `get_ssm_by_regions`). -#' to retrieve mutation counts for sliding windows across one or more regions. -#' May optionally provide any combination of a maf data frame, existing metadata, -#' or a regions data frame or named vector. -#' -#' @param regions_list Named vector of regions in the format -#' c(name1 = "chr:start-end", name2 = "chr:start-end"). If neither `regions` nor -#' `regions_bed` is specified, the function will use GAMBLR aSHM region information. -#' @param regions_bed Data frame of regions with four columns (chrom, start, end, name). -#' @param these_samples_metadata Metadata with at least sample_id column. -#' If not providing a maf data frame, seq_type is also required. -#' @param these_sample_ids Vector of sample IDs. Metadata will be subset to -#' sample IDs present in this vector. -#' @param this_seq_type Optional vector of seq_types to include in heatmap. -#' Default "genome". Uses default seq_type priority for samples with >1 seq_type. -#' @param maf_data Optional maf data frame. Will be subset to rows where -#' Tumor_Sample_Barcode matches provided sample IDs or metadata table. -#' If not provided, maf data will be obtained with get_ssm_by_regions(). -#' @param region_padding Amount to pad the start and end coordinates by. Default 1000. -#' @param projection Genome build the function will operate in. Ensure this -#' matches your provided regions and maf data for correct chr prefix handling. Default "grch37". -#' @param drop_unmutated Whether to drop bins with 0 mutations. 
If returning a -#' matrix format, this will only drop bins with no mutations in any samples. -#' @param skip_regions Optional character vector of genes to exclude from the default aSHM regions. -#' @param only_regions Optional character vector of genes to include from the default aSHM regions. -#' @param slide_by Slide size for sliding window. Default 100. -#' @param window_size Size of sliding window. Default 500. -#' @param return_format Return format of mutations. Accepted inputs are "long" and -#' "wide". Long returns a data frame of one sample ID/window per row. Wide returns -#' a matrix with one sample ID per row and one window per column. Using the "wide" -#' format will retain all samples and windows regardless of the drop_unmutated or -#' min_count_per_bin parameters. Default wide. -#' @param ... Any additional parameters. -#' -#' @return A table of mutation counts for sliding windows across one or more regions. May be long or wide. -#' -#' @import dplyr tidyr tibble -#' @export -#' -#' @examples -#' #get some regions -#' these_regions <- process_regions(only_regions = c("MYC", "BCL2", "BCL6")) -#' reg_vec <- these_regions$regions_list -#' reg_bed <- these_regions$regions_bed -#' -#' # use a set of user defined regions (from genes) and -#' # calculate mut frequency across all available samples -#' mult_freq_all = calc_mutation_frequency_bin_regions(regions_list = reg_vec) -#' mult_freq_all = calc_mutation_frequency_bin_regions(regions_bed = reg_bed) -#' -#' #restrict the analysis to specific samples using the metadata -#' my_meta = get_gambl_metadata() %>% -#' dplyr::filter(pathology %in% c("DLBCL","FL")) -#' mult_reg_freq_fl_dlbcl = calc_mutation_frequency_bin_regions(regions_list = reg_vec, -#' these_sample_ids = "DOHH-2") -#' -calc_mutation_frequency_bin_regions <- function(regions_list = NULL, - regions_bed = NULL, - these_samples_metadata = NULL, - these_sample_ids = NULL, - this_seq_type = "genome", - maf_data = NULL, - projection = "grch37", - 
region_padding = 1000, - drop_unmutated = FALSE, - skip_regions = NULL, - only_regions = NULL, - slide_by = 100, - window_size = 500, - return_format = "wide", - ...){ - - #check if any invalid parameters are provided - check_excess_params(...) - - regions <- process_regions(regions_list = regions_list, - regions_bed = regions_bed, - region_padding = region_padding, - skip_regions = skip_regions, - only_regions = only_regions) - - regions_bed <- regions$regions_bed - regions <- regions$regions_list - - if ( - (grepl("chr", regions_bed$chrom[1]) & projection == "grch37") | - (!grepl("chr", regions_bed$chrom[1]) & projection == "hg38") - ) { - stop("chr prefixing status of provided regions and specified projection don't match. ") - } - # Harmonize metadata and sample IDs - metadata <- id_ease( - these_samples_metadata, - these_sample_ids, - this_seq_type - ) - - these_sample_ids <- metadata$sample_id - - # Obtain sliding window mutation frequencies for all regions - dfs <- mclapply(names(regions), function(x) { - df <- calc_mutation_frequency_bin_region( - region = regions[x], - these_samples_metadata = metadata, - maf_data = maf_data, - projection = projection, - drop_unmutated = drop_unmutated, - slide_by = slide_by, - window_size = window_size, - min_count_per_bin = 0, - return_count = TRUE, - ... - ) %>% - dplyr::mutate(name = x) - return(df) - }) - - all <- dplyr::bind_rows(dfs) %>% - dplyr::distinct(bin, sample_id, .keep_all = TRUE) - - # If none of the samples are mutated, return the mutation frequency df and exit. - if (max(all$mutation_count) == 0) { - message("No mutations found in specified regions for specified samples. Exiting. 
") - return(all) - } - - if (return_format == "wide") { - # Convert mutation frequency table to a matrix - all_wide <- all %>% - dplyr::select(sample_id, mutation_count, bin) %>% - pivot_wider( - names_from = bin, - values_from = mutation_count, - values_fill = 0 - ) - return(all_wide) - } else { - return(all) - } -} diff --git a/R/check_excess_params.R b/R/check_excess_params.R deleted file mode 100644 index 68eea07..0000000 --- a/R/check_excess_params.R +++ /dev/null @@ -1,26 +0,0 @@ -#' @title Check Excess Params -#' -#' @description Function for checking excessive parameter names. -#' This function will notify the user if any unavailable parameters are called for any given given function. -#' This function is designed to work as internal function-call in already available GAMBLR functions. -#' -#' @details Catch function calls containing unsupported arguments. -#' -#' @param ... Parameters to check. -#' -#' @return Nothing -#' -#' @export -#' -check_excess_params = function(...){ - callingFun = as.list(sys.call(-1))[[1]] - arguments <- list(...) - extraneous = names(arguments) - if(length(arguments)>0){ - k <- gettextf("Warning: You have given one or more unsupported or deprecated arguments to %s and they are going to be ignored. Please check the documentation and spelling of your arguments.\nIgnored argument(s): %s.", - as.character(callingFun), - paste(extraneous, collapse = ", ")) - message(k) - } - -} diff --git a/R/collate_results.R b/R/collate_results.R deleted file mode 100644 index 8459ccb..0000000 --- a/R/collate_results.R +++ /dev/null @@ -1,83 +0,0 @@ -#' @title Collate Results -#' -#' @description Bring together collated results for a selection of gambl samples. -#' -#' @details Currently, this function only gathers QC metrics (`mirage_metrics`) as the only collated result. -#' Potentially, in the future, additional collated results can be added by this function as well. 
-#' -#' @param sample_table A vector of characters with sample IDs, or a data frame with sample IDs in a column (sample_id). -#' If provided, this will overwrite any sample subsets provided these_samples_metadata. -#' @param these_samples_metadata A metadata table with sample IDs of interest. -#' If not provided, the function will get metadata for all available samples. -#' This parameter is intended to use in combination with `join_with_full_metadata`. -#' @param join_with_full_metadata Set to TRUE to horizontally expand metadata with QC results. -#' Default is FALSE. If `these_samples_metadata` is provided, collated resutls will be added to this metadata table. -#' If not provided, the function will join collated results with all available metadata in the specified seq_type (`seq_type_filter`). -#' @param seq_type_filter Filtering criteria for `get_gambl_metadata` if `these_samples_metadata` is not provided, default is genomes and captures. -#' @param ... Any additional parameters. -#' -#' @return A data frame with collated results. 
-#' -#' @import dplyr -#' -#' @export -#' -#' @examples -#' #load packages -#' library(dplyr) -#' -#' #return collated results for all available samples -#' all_collated = collate_results() -#' -#' #return available collated results for a metadata subset -#' fl_collated = collate_results( -#' these_samples_metadata = get_gambl_metadata( -#' seq_type_filter = "genome") %>% -#' dplyr::filter(pathology == "FL")) -#' -#' #horizontally expand a metadata subset with collated results -#' fl_meta_collated = collate_results( -#' join_with_full_metadata = TRUE, -#' these_samples_metadata = get_gambl_metadata( -#' seq_type_filter = "genome") %>% -#' dplyr::filter(pathology == "FL")) -#' -#' #horizontally expand all available metadata with collated results -#' all_meta_collated = collate_results(join_with_full_metadata = TRUE) -#' -collate_results = function(sample_table, - these_samples_metadata, - join_with_full_metadata = FALSE, - seq_type_filter = c("genome", "capture"), - ...){ - - #check if any invalid parameters are provided - check_excess_params(...) 
- - #warn/notify the user what version of this function they are using - message("Using the bundled collated results in GAMBLR.data...") - - if(missing(these_samples_metadata)){ - these_samples_metadata = get_gambl_metadata(seq_type_filter = seq_type_filter) - } - - if(missing(sample_table)){ - sample_table = these_samples_metadata %>% - pull(sample_id) - }else{ - if(is.data.frame(sample_table)){ - sample_table = sample_table$sample_id - } - } - - #read mirage metrics and subset to the sample IDs (in sample_table) we have QC data for - collated = GAMBLR.data::mirage_metrics %>% - dplyr::filter(sample_id %in% sample_table) - - #horizontally expand the provided metadata with QC results - if(join_with_full_metadata){ - full_table = left_join(these_samples_metadata, collated) - return(full_table) - } - return(collated) -} diff --git a/R/cool_overlaps.R b/R/cool_overlaps.R deleted file mode 100644 index 12d8289..0000000 --- a/R/cool_overlaps.R +++ /dev/null @@ -1,231 +0,0 @@ -#' @title Cool overlap of data frames. -#' -#' @description This function implements overlap of 2 data frames that contain -#' regions of coordinates similar to what data.table::foverlaps does. Unlike -#' foverlaps, this function takes as input data frame class objects, and relies -#' on dplyr solution rather than data.table handling, therefore allowing usage -#' of data frames with virtually unlimited dimensions without crashing. This -#' implementation uses same logic of different types of overlaps as the original -#' foverlaps solution ("any", "start", "end", "within", "equal"). The type "any" -#' is default and allows for any overlapping solution between 2 regions. The -#' type "start" only considers regions with exact same start position as -#' overlap; similarly type "end" considers regions overlapped when the end -#' positions are exact matches. Type "within" means that regions are overlapped -#' when one is contained in another and neither start nor end positions match. 
-#' Finally, type "equal" only considers overlap when both start and end -#' positions match for both regions. For any type, the presence of any -#' additional column not directly specifying regions (for example, Chromosome) -#' will serve similar to a grouping variable. -#' The generated output of this function will contain the overlapping regions -#' and all columns present in the data frame data1, as well as any columns from -#' the data frame supplied with data2 argument, except for those columns present -#' in data2 that are used for overlap. When the same columns are present in both -#' data1 and data2, the output data frame will have ".x" and ".y" suffixes to -#' indicate which original input data they are coming from. -#' -#' @param data1 Data frame with data to overlap. Required parameter. The minimal -#' required columns are those supplied with the argument columns1. Will -#' dictate the naming of the columns used for overlap in the output. -#' @param data2 Data frame with data to overlap. Required parameter. The minimal -#' required columns are those supplied with the argument columns2. -#' @param columns1 The list of columns from data frame data1 to be used to find -#' overlapping regions. -#' @param columns2 The list of columns from data frame data2 to be used to find -#' overlapping regions. -#' @param type Character specifying the way to find overlaps. Accepted values -#' are "any" (used as default), "start", "end", "within", and "equal". -#' Please see function description for more details of different types. -#' @param nomatch Whether the rows from data1 that do not have overlap in data2 -#' should be returned or not. The default is FALSE (rows without overlap -#' are not returned). If TRUE is specified, the row order in the output -#' data will match the exact order of rows in the input data1. 
-#' -#' @return data frame -#' -#' @examples -#' # obtain maf data -#' maf1 <- get_coding_ssm( -#' these_sample_ids = "DOHH-2" -#' ) -#' -#' maf2 <- get_coding_ssm( -#' these_sample_ids = "SU-DHL-4" -#' ) -#' -#' # The same mutations are not expected to be present in different samples -#' # so this overlap will produce 0 matching rows -#' overlap <- cool_overlaps( -#' maf1, -#' maf1, -#' type = "equal" -#' ) -#' -#' # To demonstrate functionality we can supply the same maf to the data2 -#' overlap <- cool_overlaps( -#' maf1, -#' maf1 %>% head -#' ) -#' -#' # We can also overlap different formats, for example -#' seg1 <- get_sample_cn_segments(these_sample_ids = "DOHH-2") -#' overlap <- cool_overlaps( -#' data1 = maf1, -#' data2 = seg1, -#' columns2 = c("chrom", "start", "end") -#' ) -#' -#' @import dplyr tidyr -#' @export -#' -cool_overlaps <- function( - data1, - data2, - columns1 = c("Chromosome", "Start_Position", "End_Position"), - columns2 = c("Chromosome", "Start_Position", "End_Position"), - type = "any", - nomatch = FALSE -){ - - # Ensure all columns provided for overlap are present in the data frame - if(! length(columns1) == length(intersect(columns1, colnames(data1)))){ - stop( - "Not all of the requested columns for overlap in data1 are present." - ) - } - - if(! length(columns2) == length(intersect(columns2, colnames(data2)))){ - stop( - "Not all of the requested columns for overlap in data2 are present." - ) - } - - # What is the name of the column in columns1 that specifies start and end? - start1 <- columns1[grepl("start", columns1, ignore.case = TRUE)] - end1 <- columns1[grepl("end", columns1, ignore.case = TRUE)] - - # What is the name of the column in columns1 that specifies start and end? - start2 <- columns2[grepl("start", columns2, ignore.case = TRUE)] - end2 <- columns2[grepl("end", columns2, ignore.case = TRUE)] - - # What are the other columns to be used in overlap? 
- columns1 <- columns1[!columns1 %in% c(start1, end1)] - columns2 <- columns2[!columns2 %in% c(start2, end2)] - - # When the same columns are provided they will become .x and .y - original_start1 <- start1 - original_end1 <- end1 - if(start1 == start2) { - start1 <- paste0(start1, ".x") - start2 <- paste0(start2, ".y") - - } - if(end1 == end2) { - end1 <- paste0(end1, ".x") - end2 <- paste0(end2, ".y") - - } - - - # Prepare for overlap - overlap <- dplyr::inner_join( - data1, - data2, - by = structure(names = columns1, .Data = columns2), - relationship = "many-to-many" - ) - - # Return matches based on mode - if(type == "any"){ - message( - "Running in default mode of any..." - ) - overlap <- overlap %>% - dplyr::filter( - !!sym(start2) >= !!sym(start1) & !!sym(end2) <= !!sym(end1) | - !!sym(start1) >= !!sym(start2) & !!sym(end1) <= !!sym(end2) - ) - } else if (type == "start"){ - message( - "Running in the mode start..." - ) - overlap <- overlap %>% - dplyr::filter( - !!sym(start1) == !!sym(start2) - ) - } else if (type == "end"){ - message( - "Running in the mode end..." - ) - overlap <- overlap %>% - dplyr::filter( - !!sym(end1) == !!sym(end2) - ) - } else if (type == "within"){ - message( - "Running in the mode within..." - ) - overlap <- overlap %>% - dplyr::filter( - (!!sym(start1) >= !!sym(start2)) & (!!sym(end1) <= !!sym(end2)) | - (!!sym(start2) >= !!sym(start1)) & (!!sym(end2) <= !!sym(end1)) - ) - } else if (type == "equal"){ - message( - "Running in the mode equal..." - ) - overlap <- overlap %>% - dplyr::filter( - (!!sym(start1) == !!sym(start2)) & (!!sym(end1) == !!sym(end2)) - ) - } else { - message( - "You have requested mode that is not supported." - ) - stop( - "Please supply one of any, start, end, within, or equal with type." 
- ) - } - - # This will ensure that features from data1 that don't have match in data2 - # will be returned with NA annotation - if(nomatch){ - no_annotation <- suppressMessages( - anti_join( - data1, - overlap - ) - ) - if(original_start1 %in% colnames(no_annotation)){ - colnames(no_annotation) = gsub( - original_start1, - start1, - colnames(no_annotation) - ) - } - if(original_end1 %in% colnames(no_annotation)){ - colnames(no_annotation) = gsub( - original_end1, - end1, - colnames(no_annotation) - ) - } - overlap <- bind_rows( - overlap, - no_annotation - ) - - # Ensure order is consistent between input data and the output after - # overlap is found since we used bind_rows - data1 <- data1 %>% - tidyr::unite("row_id", 1:ncol(data1), remove = FALSE) - - colnames(overlap) <- gsub("\\.x$", "", colnames(overlap)) - overlap <- overlap %>% - tidyr::unite("row_id", 1:(ncol(data1)-1), remove = FALSE) %>% - dplyr::arrange(match(row_id, data1$row_id)) %>% - dplyr::select(-row_id) - - } - - return(overlap) -} diff --git a/R/data_comp.R b/R/data_comp.R index c6126bf..c21a25e 100644 --- a/R/data_comp.R +++ b/R/data_comp.R @@ -20,7 +20,7 @@ #' #' @return A character vector of gene symbol or Ensembl IDs or a data frame. #' -#' @import dplyr +#' @import dplyr tidyr #' @export #' #' @examples @@ -223,7 +223,7 @@ get_genes <- function( #' #' @return Either a vector or list of Hex codes. #' -#' @import dplyr +#' @import dplyr tidyr #' @export #' #' @examples @@ -321,7 +321,7 @@ get_mapped_colours <- function( #' #' @return A data frame or named character vector of colour Hex codes. 
#' -#' @import dplyr ggplot2 tibble +#' @import dplyr ggplot2 tidyr #' @export #' #' @examples diff --git a/R/genomic_data.R b/R/genomic_data.R deleted file mode 100644 index e92b7c7..0000000 --- a/R/genomic_data.R +++ /dev/null @@ -1,417 +0,0 @@ -# functions for creating and working with S3 objects - - -#' Create MAF Data -#' -#' This function creates MAF (Mutation Annotation Format) data from the given input. -#' -#' @param maf_df A data frame containing the MAF data. -#' @param genome_build A string specifying the genome build ("grch37" or "hg38"). -#' @return A data frame with class attributes for MAF data. -#' @export -create_maf_data <- function(maf_df, genome_build) { - if (!inherits(maf_df, "data.frame")) stop("data must be a data frame") - if (!genome_build %in% c("grch37", "hg38")) stop("Invalid genome build") - - structure(maf_df, - class = c("maf_data", "genomic_data", class(maf_df)), # "genomic_data" for generic methods - genome_build = genome_build) -} - -#' @export -print.maf_data <- function(x, ...) { - cat("MAF Data Object\n") - cat("Genome Build:", attr(x, "genome_build"), "\n") - cat("Showing first 10 rows:\n") - # Convert to a plain data.frame (if not already) so that printing uses the default - # data.frame print method rather than printing as a list. - print(utils::head(as.data.frame(x), 10)) -} - - -#' Get Genome Build -#' -#' This function retrieves the genome build attribute from the data. -#' -#' @param data A data frame with genome build attribute. -#' @return A string specifying the genome build. -#' @export -get_genome_build <- function(data) { - attr(data, "genome_build") -} - -#' Preserve Genomic Attributes -#' -#' This function preserves the genomic attributes and class after dplyr operations. -#' -#' @param new_data A data frame resulting from dplyr operations. -#' @param old_data The original data frame with genomic attributes. -#' @return A data frame with preserved genomic attributes. 
-#' @export -preserve_genomic_attributes <- function(new_data, old_data) { - attr(new_data, "genome_build") <- attr(old_data, "genome_build") - class(new_data) <- class(old_data) - return(new_data) -} - -#' Strip Genomic Data Classes -#' -#' This function removes custom classes associated with genomic data objects -#' (by default, "genomic_data", "maf_data", and "bed_data") from the class attribute -#' of an object. This can be useful when you want to revert an S3 object to its -#' underlying data.frame (or data.table) classes without converting the object. -#' -#' @param x An object, such as one of your genomic data objects. -#' @param classes A character vector of class names to remove. The default is -#' c("genomic_data", "maf_data", "bed_data"). -#' @return The object with the specified classes removed. -#' @export -strip_genomic_classes <- function(x, classes = c("genomic_data", "maf_data", "bed_data")) { - current_classes <- class(x) - new_classes <- setdiff(current_classes, classes) - class(x) <- new_classes - return(x) -} - - -# S3 methods for genomic_data class -#' @export -mutate.genomic_data <- function(.data, ...) { - new_data <- dplyr::mutate(as.data.frame(.data), ...) - preserve_genomic_attributes(new_data, .data) -} -#' @export -filter.genomic_data <- function(.data, ...) { - new_data <- dplyr::filter(as.data.frame(.data), ...) - preserve_genomic_attributes(new_data, .data) -} -#' @export -select.genomic_data <- function(.data, ...) { - new_data <- dplyr::select(as.data.frame(.data), ...) - preserve_genomic_attributes(new_data, .data) -} -#' @export -rename.genomic_data <- function(.data, ...) { - new_data <- dplyr::rename(as.data.frame(.data), ...) - preserve_genomic_attributes(new_data, .data) -} -#' @export -arrange.genomic_data <- function(.data, ...) { - new_data <- dplyr::arrange(as.data.frame(.data), ...) 
- preserve_genomic_attributes(new_data, .data) -} -#' @export -group_by.genomic_data <- function(.data, ..., .add = FALSE) { - new_data <- dplyr::group_by(as.data.frame(.data), ..., .add = .add) - preserve_genomic_attributes(new_data, .data) -} -#' @export -ungroup.genomic_data <- function(x, ...) { - new_data <- dplyr::ungroup(as.data.frame(x), ...) - preserve_genomic_attributes(new_data, x) -} - -#' Bind maf or other genomic data together -#' -#' @description Combine multiple maf_data objects and retain metadata such as genome_build. -#' This function will not allow you to combine maf_data objects that have different genome_build values. -#' An error will also be thrown if the same sample id is found in more than one of the inputs (if check_id is TRUE). -#' -#' @param ... All maf_data or seg_data objects to be combined. -#' @param check_id Logical. If TRUE (the default), the function will check for the presence of the expected ID column -#' and for duplicate sample IDs across the inputs. Set to FALSE to skip this check. -#' -#' @return data.frame with combined data and preserved genome_build metadata. -#' @export -#' -#' @examples -#' -#' merged_maf = bind_genomic_data(maf1, maf2,check_id=FALSE) -#' -bind_genomic_data <- function(..., check_id = TRUE) { - - in_list <- list(...) 
- - if ("maf_data" %in% class(in_list[[1]])) { - # MAF format, ID column is Tumor_Sample_Barcode - id_col <- "Tumor_Sample_Barcode" - } else if ("seg_data" %in% class(in_list[[1]])) { - # SEG format, ID column is ID - id_col <- "ID" - } else { - stop(paste("Unsure how to merge:", class(in_list[[1]]))) - } - - # Ensure all inputs are either maf_data or seg_data objects - if (!all(sapply(in_list, inherits, "maf_data")) && !all(sapply(in_list, inherits, "seg_data"))) { - stop("All inputs must be maf_data objects or seg_data objects.") - } - - # Extract genome builds - genome_builds <- unique(sapply(in_list, get_genome_build)) - - if (length(genome_builds) > 1) { - stop("Cannot bind seg_data or maf_data objects with different genome builds: ", - paste(genome_builds, collapse = ", ")) - } - - # If check_id is TRUE, verify that the expected ID column exists and that IDs are unique. - if (check_id) { - # Collect unique sample IDs from each dataset - id_sets <- lapply(in_list, function(df) { - if (!(id_col %in% colnames(df))) { - stop("ID column '", id_col, "' not found in input data.") - } - unique(df[[id_col]]) - }) - - # Flatten the list and count occurrences of each ID - all_ids <- unlist(id_sets) - duplicate_ids <- names(table(all_ids)[table(all_ids) > 1]) - - # If any ID is found in multiple datasets, throw an error - if (length(duplicate_ids) > 0) { - stop("Duplicate IDs found in multiple input data frames: ", paste(duplicate_ids, collapse = ", ")) - } - } - - combined <- dplyr::bind_rows(in_list) - attr(combined, "genome_build") <- genome_builds[1] # Assign the common genome build - - if (!"maf_data" %in% class(combined)) { - class(combined) <- c("maf_data", "genomic_data", class(combined)) # Preserve class - } - - return(combined) -} - - - -#' Create BED Data -#' -#' This function creates BED (Browser Extensible Data) objects from the given input. -#' It assumes that the BED data should have columns corresponding to chromosome, start, -#' and end. 
If the second and third columns are not numeric (as expected for start and end), -#' the function will attempt to identify the proper columns by matching column names. -#' -#' In the output, the first three columns will be renamed to "chrom", "start", and "end". -#' If a fourth column exists, it is renamed to "name" (and any additional columns are preserved). -#' -#' Additionally, if a "name" column exists and its values are not unique, the function -#' will warn the user. The user can optionally supply a method to automatically fix the -#' names via the `fix_names` argument: -#' -#' - If `fix_names = "chrom_start_end"`, the new name will be built as "chrom:start-end". -#' -#' - If `fix_names = "concat"`, then the columns specified by `concat_cols` (using the -#' original column names in the input data) will be concatenated to form the new name. -#' By default, no separator is used, but a separator can be specified via the `sep` -#' argument. -#' -#' After applying the fix, the function checks if the new names are unique. If they are not, -#' a warning is issued that includes up to five examples of duplicate names and the row numbers -#' where they occur. -#' -#' @param bed_df A data frame containing the BED data. -#' @param genome_build A string specifying the genome build ("grch37" or "hg38"). -#' If NULL, the function will try to infer the genome build from the object name. -#' @param fix_names Either NULL (the default), or one of "chrom_start_end" or "concat". -#' If not NULL and duplicate names are detected, the function will apply the chosen fix. -#' @param concat_cols When `fix_names = "concat"`, a character vector specifying which columns -#' from the original data to merge. -#' @param sep The separator to use when concatenating columns if fix_names = "concat". -#' Defaults to "" (no separator). -#' @return A data frame with class attributes for BED data. 
-#' -#' @export -#' -#' @examples -#' -#' # get a abed_data object for all aSHM regions -#' ashm_bed = create_bed_data(GAMBLR.data::grch37_ashm_regions, -#' fix_names = "concat", -#' concat_cols = c("gene","region"), -#' sep="-") -#' # the build is automatically inferred if it is in the variable name -#' # get_genome_build(ashm_bed) -#' # [1] "grch37" -#' -#' another_bed = create_bed_data(somatic_hypermutation_locations_GRCh37_v_latest, -#' fix_names = "concat", -#' concat_cols = c("chr_name","hg19_start","hg19_end")) -#' -#' # get_genome_build(another_bed) -#' # [1] "grch37" -#' -#' # get a bed_data object for all gene regions and combine several columns to make a unique name -#' gene_regions <- create_bed_data(hg38_gene_coordinates, -#' fix_names = "concat", -#' sep="-", -#' concat_cols = c("chromosome","start","end","gene_name")) -#' -#' #get_genome_build(gene_regions) -#' # [1] "hg38" -#' -#' -create_bed_data <- function(bed_df, - genome_build = NULL, - fix_names = NULL, - concat_cols = NULL, - sep = "") { - # Check that input is a data frame. - if (!inherits(bed_df, "data.frame")) { - stop("Input data must be a data frame") - } - - # Capture the original data and column names (before any reordering or renaming) - orig_df <- bed_df - orig_names <- names(bed_df) - - # If genome_build is not provided, attempt to infer it from the object name. 
- if (is.null(genome_build)) { - object_name <- deparse(substitute(bed_df)) - possible_builds <- character(0) - - if (grepl("grch37", object_name, ignore.case = TRUE)) { - possible_builds <- c(possible_builds, "grch37") - } - if (grepl("hg38", object_name, ignore.case = TRUE)) { - possible_builds <- c(possible_builds, "hg38") - } - - if (length(possible_builds) == 1) { - genome_build <- possible_builds - } else if (length(possible_builds) == 0) { - stop("Could not determine genome build from object name; please supply genome_build argument.") - } else { - stop("Ambiguous genome build in object name; please supply genome_build argument explicitly.") - } - } - - # Validate genome build. - if (!genome_build %in% c("grch37", "hg38")) { - stop("Invalid genome build. Please choose either 'grch37' or 'hg38'.") - } - - # Helper function to force column naming for the BED data. - force_bed_column_names <- function(df) { - new_names <- names(df) - # Force first three columns to be "chrom", "start", "end" - new_names[1:3] <- c("chrom", "start", "end") - # If there's a fourth column, force it to "name" - if (ncol(df) >= 4) { - new_names[4] <- "name" - } - names(df) <- new_names - return(df) - } - - # Check if the first three columns (as supplied) are in the expected form. - # We expect columns 2 and 3 (start and end) to be numeric. - if (ncol(bed_df) >= 3 && is.numeric(bed_df[[2]]) && is.numeric(bed_df[[3]])) { - # The data is assumed to be in the correct order. - bed_df <- force_bed_column_names(bed_df) - } else { - # Attempt to guess the proper columns based on names. 
- names_lower <- tolower(names(bed_df)) - - chrom_idx <- which(names_lower %in% c("chrom", "chromosome")) - start_idx <- which(names_lower %in% c("start", "start_position", "startpos")) - end_idx <- which(names_lower %in% c("end", "end_position", "endpos")) - - if (length(chrom_idx) != 1 || length(start_idx) != 1 || length(end_idx) != 1) { - stop("Columns 2 and 3 (start and end) are not numeric and the chromosome/start/end columns ", - "cannot be unambiguously identified from the column names.") - } - - # Reorder the data frame so that the candidate columns come first. - remaining_idx <- setdiff(seq_len(ncol(bed_df)), c(chrom_idx, start_idx, end_idx)) - new_order <- c(chrom_idx, start_idx, end_idx, remaining_idx) - bed_df <- bed_df[, new_order, drop = FALSE] - - # After reordering, check that the new second and third columns are numeric. - if (!is.numeric(bed_df[[2]]) || !is.numeric(bed_df[[3]])) { - stop("After reordering based on column names, the start and end columns are not numeric.") - } - - # Force the first three (and optionally the fourth) column names. - bed_df <- force_bed_column_names(bed_df) - } - - # If a "name" column exists, check that its values are unique. - if (ncol(bed_df) >= 4) { - if (anyDuplicated(bed_df[[4]]) > 0) { - # If no fix is provided, issue a generic warning. - if (is.null(fix_names)) { - warning("The values in the 'name' column are not unique.") - } else { - # Apply the requested fix. - if (fix_names == "chrom_start_end") { - new_names_vec <- paste0(bed_df$chrom, ":", bed_df$start, "-", bed_df$end) - bed_df[[4]] <- new_names_vec - if (length(unique(new_names_vec)) != nrow(bed_df)) { - # Identify duplicate examples. 
- dup_idx <- which(duplicated(new_names_vec) | duplicated(new_names_vec, fromLast = TRUE)) - dup_names <- unique(new_names_vec[dup_idx]) - dup_info <- sapply(dup_names, function(nm) { - rows <- which(new_names_vec == nm) - paste0(nm, " (rows: ", paste(rows, collapse = ", "), ")") - }) - warning("The 'chrom_start_end' fix did not result in a unique set of names. Examples: ", - paste(dup_info[1:min(5, length(dup_info))], collapse = "; "), - ". Please review your data or consider an alternative fix.") - } - } else if (fix_names == "concat") { - if (is.null(concat_cols)) { - stop("For fix_names = 'concat', you must supply concat_cols indicating which columns to merge.") - } - if (!is.character(concat_cols)) { - stop("For fix_names = 'concat', concat_cols must be a character vector referring to the original column names.") - } - if (!all(concat_cols %in% orig_names)) { - stop("One or more column names specified in concat_cols do not exist in the original data.") - } - # Build new names using the original data. - # Use paste with the specified separator. - new_names_vec <- do.call(paste, c(orig_df[, concat_cols, drop = FALSE], sep = sep)) - bed_df[[4]] <- new_names_vec - if (length(unique(new_names_vec)) != nrow(bed_df)) { - dup_idx <- which(duplicated(new_names_vec) | duplicated(new_names_vec, fromLast = TRUE)) - dup_names <- unique(new_names_vec[dup_idx]) - dup_info <- sapply(dup_names, function(nm) { - rows <- which(new_names_vec == nm) - paste0(nm, " (rows: ", paste(rows, collapse = ", "), ")") - }) - warning("The 'concat' fix did not result in a unique set of names. Examples: ", - paste(dup_info[1:min(5, length(dup_info))], collapse = "; "), - ". Please review your data or consider an alternative fix.") - } - } else { - stop("Invalid value for fix_names. 
Use 'chrom_start_end' or 'concat'.") - } - } - } - } - # enforce strict matching of chr prefixing - if(genome_build == "grch37"){ - if(any(grepl("chr",bed_df$chrom))){ - bed_df = mutate(bed_df,chrom = gsub("chr", "", chrom)) - } - } - # Create the S3 object with additional class attributes and genome_build attribute. - structure(bed_df, - class = c("bed_data", "genomic_data", class(bed_df)), - genome_build = genome_build) -} - -#' @export -print.bed_data <- function(x, ...) { - cat("BED Data Object\n") - cat("Genome Build:", attr(x, "genome_build"), "\n") - cat("Showing first 10 rows:\n") - # Convert to a plain data.frame (if not already) so that printing uses the default - # data.frame print method rather than printing as a list. - print(utils::head(as.data.frame(x), 10)) -} - - diff --git a/R/get_ashm_count_matrix.R b/R/get_ashm_count_matrix.R deleted file mode 100644 index 6f4fd81..0000000 --- a/R/get_ashm_count_matrix.R +++ /dev/null @@ -1,136 +0,0 @@ -#' @title Get ASHM Count Matrix. -#' -#' @description Prepare a matrix with one row per sample and one column per -#' region using a set of hypermutated regions. -#' -#' @details Values are the number of mutations in that patient in the region. -#' -#' @param regions_bed A bed file with one row for each region. -#' @param these_samples_metadata This is used to complete your matrix. All GAMBL -#' samples will be used by default. Provide a data frame with at least -#' sample_id for all samples if you are using non-GAMBL data. -#' @param this_seq_type The seq type to return results for. Only used if no -#' metadata is provided with these_samples_metadata. 
-#' @param projection Which genome build to use for the mutations -#' (must match the coordinate system your regions to avoid a nonsense result) -#' -#' @return matrix -#' -#' @import dplyr tibble -#' @export -#' -#' @examples -#' regions_bed = create_bed_data(GAMBLR.data::grch37_ashm_regions, -#' fix_names="concat", -#' concat_cols=c("gene","region"), -#' sep="-") -#' my_meta = get_gambl_metadata() %>% dplyr::filter(pathology=="DLBCL") -#' matrix <- get_ashm_count_matrix( -#' regions_bed = regions_bed, -#' this_seq_type = "genome" -#' ) -#' -#' #this example intentionally fails -#' matrix <- get_ashm_count_matrix(regions_bed=regions_bed,this_seq_type = "genome", -#' these_samples_metadata = my_meta, -#' projection = "hg38") -#' # Error in get_ashm_count_matrix( -#' # Your projection argument does not match the genome_build of regions_bed -#' -#' # format the name column to include the chromosome coordinates instead of the gene -#' regions_bed = create_bed_data(GAMBLR.data::hg38_ashm_regions, -#' fix_names="concat", -#' concat_cols=c("chr_name","hg38_start","hg38_end"), -#' sep="-") -#' -#' matrix_hg38 <- get_ashm_count_matrix(regions_bed=regions_bed,this_seq_type = "genome", -#' these_samples_metadata = my_meta, -#' projection = "hg38") -#' -get_ashm_count_matrix = function( - regions_bed, - these_samples_metadata, - this_seq_type, - projection = "grch37" - ){ - if(missing(this_seq_type)){ - if(missing(these_samples_metadata)){ - stop( - "Please supply either the this_seq_type or a metadata from which it can be retrieved" - ) - } - this_seq_type <- these_samples_metadata %>% - pull(seq_type) %>% - unique() - } - - if(missing(regions_bed)){ - message( - "Using aSHM regions in grch37 genome_build as regions_bed" - ) - if(projection=="grch37"){ - regions_bed <- GAMBLR.data::grch37_ashm_regions %>% - mutate(name = paste(gene, region, sep = "_")) %>% - create_bed_data(genome_build = projection) - }else if(projection=="hg38"){ - regions_bed <- 
GAMBLR.data::hg38_ashm_regions %>% - mutate(name = paste(gene, region, sep = "_")) %>% - create_bed_data(genome_build = projection) - }else{ - stop(paste("unsupported genome build",projection)) - } - - }else{ - if("bed_data" %in% class(regions_bed)){ - if(!get_genome_build(regions_bed)==projection){ - stop(paste("Your genome_build argument does not match the genome_build of regions_bed",get_genome_build(regions_bed),genome_build)) - } - } - } - - - - if(missing(these_samples_metadata)){ - all_meta <- get_gambl_metadata( - seq_type_filter=this_seq_type - ) %>% - dplyr::select(sample_id) - }else{ - all_meta <- these_samples_metadata %>% - dplyr::select(sample_id) - } - - ashm_maf <- get_ssm_by_regions( - regions_bed = regions_bed, - streamlined = TRUE, - these_samples_metadata = these_samples_metadata, - use_name_column = TRUE, - projection = projection - ) - # Not sure why this was necessary. Possibly because it's also a data.table? - ashm_maf = strip_genomic_classes(ashm_maf) - - ashm_counted <- ashm_maf %>% - group_by(sample_id, region) %>% - tally() - - - #fill out all combinations so we can get the cases with zero mutations - eg <- expand_grid( - sample_id = pull(all_meta, sample_id), - region = unique(ashm_counted$region) - ) - all_counts <- left_join(eg, ashm_counted) %>% - mutate(n = replace_na(n, 0)) %>% - unique() #not sure where the duplicates are coming from but its annoying - - all_counts_wide <- pivot_wider( - all_counts, - id_cols = sample_id, - names_from = region, - values_from = n - ) %>% - column_to_rownames(var = "sample_id") - - return(all_counts_wide) -} diff --git a/R/get_cn_segments.R b/R/get_cn_segments.R deleted file mode 100644 index c3cecb0..0000000 --- a/R/get_cn_segments.R +++ /dev/null @@ -1,89 +0,0 @@ -## GAMBLR.data -#' Create Segmented Data -#' -#' This function creates segmented data from the given input. -#' -#' @param seg_df A data frame containing the segmented data. 
-#' @param genome_build A string specifying the genome build ("grch37" or "hg38"). -#' @return A data frame with class attributes for segmented data. -#' @export -#' @examples -#' seg_df <- data.frame(...) -#' create_seg_data(seg_df, "grch37") -create_seg_data <- function(seg_df, genome_build) { - if (!inherits(seg_df, "data.frame")) stop("data must be a data frame") - if (!genome_build %in% c("grch37", "hg38")) stop("Invalid genome build") - - structure(seg_df, - class = c("seg_data", class(seg_df)), - genome_build = genome_build) -} - -#' @title Get CN Segments. -#' -#' @description Retrieve all copy number segments from the GAMBL outputs -#' -#' @details This function merely loads and returns all the seg_data available for a projection (genome build) -#' @param these_samples_metadata User must provide a metadata table to restrict the data to the samples in your table. -#' The metadata also ensures the proper handling of duplicate sample_id across seq_types and ensures the -#' seq_type in the metadata faithfully represents the seq_type of the data -#' @param projection Desired genome coordinate system for returned CN segments. Default is "grch37". -#' @param this_seq_type Deprecated. -#' @param ... Additional parameters to be passed to the function. -#' -#' @return A data frame with CN segments for the specified region. -#' -#' @import dplyr -#' @export -#' -#' @examples -#' # Example for the capture samples: -#' -#' genome_metadata = GAMBLR.data::get_gambl_metadata(seq_type_filter="genome") -#' -#' genome_segments_hg38 = get_cn_segments( -#' these_samples_metadata = genome_metadata, -#' projection="hg38") -#' -#' -get_cn_segments = function(these_samples_metadata, - projection = "grch37", - this_seq_type,...){ - #warn/notify the user what version of this function they are using - message("Using the bundled CN segments (.seg) calls in GAMBLR.data...") - - #check if any invalid parameters are provided - check_excess_params(...) 
- - #get valid projections - valid_projections = grep("meta", names(GAMBLR.data::sample_data), value = TRUE, invert = TRUE) - - metadata = these_samples_metadata - - sample_ids = metadata$sample_id - #return CN segments based on the selected projection - if(projection %in% valid_projections){ - all_segs = GAMBLR.data::sample_data[[projection]]$seg %>% - dplyr::filter(ID %in% sample_ids) - }else{ - stop(paste("please provide a valid projection. The following are available:", - paste(valid_projections,collapse=", "))) - } - - #ensure chr prefixes are there when necessary - if(projection=="grch37"){ - if(grepl("chr",all_segs$chrom[1])){ - all_segs = all_segs %>% - dplyr::mutate(chrom = gsub("chr", "", chrom)) - } - }else{ - if(!grepl("chr",all_segs$chrom[1])){ - all_segs = all_segs %>% - dplyr::mutate(chrom = paste0("chr", chrom)) - } - } - - #return S3 class with CN segments and genome_build - all_segs = create_seg_data(all_segs,projection) - return(all_segs) -} diff --git a/R/get_coding_ssm.R b/R/get_coding_ssm.R deleted file mode 100644 index 306ff4f..0000000 --- a/R/get_coding_ssm.R +++ /dev/null @@ -1,134 +0,0 @@ - -#' @title Get Coding SSMs -#' -#' @description Convenience function for loading coding Simple Somatic Mutations -#' (SSM) from the bundled data [GAMBLR.data::sample_data]. -#' -#' @details This "bare bones" function was developed to retrieve coding SSM -#' calls for non-GSC-users. Effectively retrieve coding SSM calls. Multiple -#' filtering parameters are available for this function. For more -#' information on how to implement the filtering parameters, refer to the -#' parameter descriptions as well as examples in the vignettes. This -#' function depends on the bundled sample data in this package. -#' -#' @param these_sample_ids Optional, a vector of multiple sample_id (or a single -#' sample ID as a string) that you want results for. 
-#' @param these_samples_metadata Optional, a metadata table (with sample IDs in -#' a column) to subset the return to. If not provided (and if -#' `these_sample_ids` is not provided), the function will return all -#' samples from the specified seq_type in the metadata. -#' @param projection Reference genome build for the coordinates in the MAF file. -#' The default is grch37. -#' @param this_seq_type The this_seq_type you want back, default is genome. -#' @param min_read_support Only returns variants with at least this many reads -#' in t_alt_count. -#' @param include_silent Logical parameter indicating whether to include silent -#' mutations into coding mutations. Default is TRUE. -#' @param verbose Set to FALSE to minimize the output to console. Default is -#' TRUE. This parameter also dictates the verbosity of any helper function -#' internally called inside the main function. -#' @param tool_name Optionally specify which tool to report variant from. The -#' default is slms-3, also supports "publication" to return the exact -#' variants as reported in the original papers. -#' @param ... Any additional parameters. -#' -#' @return data frame -#' -#' @import dplyr -#' -#' @export -#' -#' @examples -#' -#' # Get mutations from exome data originally aligned to grch37 -#' ssm_exomes_grch37 = get_coding_ssm(projection = "grch37",this_seq_type = "capture") -#' -#' # Get mutations from genome data, hg38 build -#' ssm_genomes_hg38 = get_coding_ssm(projection = "hg38",this_seq_type = "genome") -#' -#' -#' -#' -get_coding_ssm = function( - these_sample_ids = NULL, - these_samples_metadata = NULL, - projection = "grch37", - this_seq_type = "genome", - tool_name = "slms-3", - min_read_support = 3, - include_silent = TRUE, - verbose = FALSE, - ... -){ - - # Warn/notify the user what version of this function they are using - message("Using the bundled SSM calls (.maf) calls in GAMBLR.data...") - - #check if any invalid parameters are provided - check_excess_params(...) 
- - # Get valid projections - valid_projections = grep( - "meta", - names(GAMBLR.data::sample_data), - value = TRUE, - invert = TRUE - ) - - #get samples with the dedicated helper function - metadata = id_ease( - these_samples_metadata = these_samples_metadata, - these_sample_ids = these_sample_ids, - verbose = verbose, - this_seq_type = this_seq_type - ) - - sample_ids = metadata$sample_id - - - if(!projection %in% valid_projections){ - stop( - paste( - "Provide a valid projection. The following are available:", - paste( - valid_projections, - collapse = ", " - ) - ) - ) - } - - #return SSMs based on the selected projection - muts = GAMBLR.data::sample_data[[projection]]$maf %>% - dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter((tolower(!!sym("Pipeline")) == tool_name)) - - if(!include_silent){ - coding_class = coding_class[coding_class != "Silent"] - } - - sample_ids = pull(metadata, sample_id) - - # Drop variants with low read support (default is 3), - # enforce sample IDs and keep only coding variants - muts = dplyr::filter(muts, t_alt_count >= min_read_support) %>% - dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter(Variant_Classification %in% coding_class) - - # Filter maf on selected sample ids - muts = muts %>% - dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) - - mutated_samples = length(unique(muts$Tumor_Sample_Barcode)) - message( - paste( - "after linking with metadata, we have mutations from", - mutated_samples, - "samples" - ) - ) - muts = create_maf_data(muts,projection) - # use S3-safe version of dplyr function - muts = mutate.genomic_data(muts,maf_seq_type = this_seq_type) - return(muts) -} diff --git a/R/get_coding_ssm_status.R b/R/get_coding_ssm_status.R deleted file mode 100644 index 979c1d4..0000000 --- a/R/get_coding_ssm_status.R +++ /dev/null @@ -1,311 +0,0 @@ -#' @title Get Coding SSM Status. -#' -#' @description Tabulate mutation status (SSM) for a set of genes. 
-#' -#' @details This function takes a data frame (in MAF-like format) and converts -#' it to a binary one-hot encoded matrix of mutation status for either a set of -#' user-specified genes (via gene_symbols) or, if no genes are provided, default -#' to all lymphoma genes. The default behaviour is to assign each gene/sample_id -#' combination as mutated only if there is a protein coding mutation for that -#' sample in the MAF but this can be configured to use synonymous variants in -#' some (via include_silent_genes) or all (via include_silent) genes. -#' This function also has other filtering and convenience parameters giving -#' the user full control of the return. For more information, refer to the -#' parameter descriptions and examples. -#' Currently only the grch37 genome build is supported for hotspot annotation -#' and review for this version of the function. -#' -#' @param gene_symbols A vector of gene symbols for which the mutation status -#' will be tabulated. If not provided, lymphoma genes will be returned -#' by default. -#' @param these_samples_metadata The metadata for samples of interest to be -#' included in the returned matrix. Only the column "sample_id" is -#' required. If not provided, the example metadata is used as default. -#' @param maf_data data frame in maf format. Must be in the grch37 projection. -#' @param include_hotspots Logical parameter indicating whether hotspots object -#' should also be tabulated. Default is TRUE. -#' @param keep_multihit_hotspot Logical parameter indicating whether to keep the -#' gene annotation as mutated when the gene has both hot spot and -#' non-hotspot mutation. Default is FALSE. If set to TRUE, will report the -#' number of non-hotspot mutations instead of tabulating for just mutation -#' presence. -#' @param review_hotspots Logical parameter indicating whether hotspots object -#' should be reviewed to include functionally relevant mutations or rare -#' lymphoma-related genes. Default is TRUE. 
-#' @param genes_of_interest A vector of genes for hotspot review. Currently only -#' FOXO1, MYD88, and CREBBP are supported. -#' @param genome_build Reference genome build for the coordinates in the MAF -#' file. The default is inferred from maf_data. -#' @param include_silent Logical parameter indicating whether to include silent -#' mutations into coding mutations. Default is FALSE. -#' @param include_silent_genes Optionally, provide a list of genes for which the -#' Silent variants to be considered. If provided, the Silent variants for -#' these genes will be included regardless of the include_silent argument. -#' @param ... Any other parameter. These parameters will be ignored. -#' -#' @return A data frame with tabulated mutation status. -#' -#' @import dplyr tidyr -#' @export -#' -#' @examples -#' coding_tabulated_df = get_coding_ssm_status( -#' maf_data = get_coding_ssm(), -#' gene_symbols = c("EZH2","KMT2D","CREBBP","MYC") -#' ) -#' -#' -#' -#' #all lymphoma genes from bundled NHL gene list -#' coding_tabulated_df = get_coding_ssm_status() -#' -#' #this example will fail because hg38 is not supported by this function (yet) -#' coding_tabulated_df = get_coding_ssm_status(maf_data= -#' get_coding_ssm(projection = "hg38")) -#' # Error in get_coding_ssm_status(maf_data = get_coding_ssm(projection = "hg38")) : -#' # Currently only grch37 projection (hg19 genome build) is supported. -#' -get_coding_ssm_status = function( - gene_symbols, - these_samples_metadata, - maf_data, - include_hotspots = TRUE, - keep_multihit_hotspot = FALSE, - review_hotspots = TRUE, - genes_of_interest = c("FOXO1", "MYD88", "CREBBP"), - genome_build, - include_silent = FALSE, - include_silent_genes, - ... - ){ - if(missing(maf_data)){ - stop("maf_data is required") - } - # check if any invalid parameters are provided - check_excess_params(...) 
- if("maf_data" %in% class(maf_data)){ - if(missing(genome_build)){ - genome_build = get_genome_build(maf_data) - }else{ - if(!genome_build == get_genome_build(maf_data)){ - stop("you have specified a genome_build that doesn't match the genome_build attached to maf_data") - } - } - } - # check the projection - if(!genome_build == "grch37"){ - stop( - "Currently only grch37 projection (hg19 genome build) is supported." - ) - } - - if(missing(gene_symbols)){ - message( - "No gene_symbols provided, defaulting to all lymphoma genes." - ) - gene_symbols <- GAMBLR.data::lymphoma_genes$Gene - } - - if(!missing(include_silent_genes)){ - message( - strwrap( - prefix = " ", - initial = "", - "Output will include all genes specified in gene_symbols - and include_silent_genes parameters." - ) - ) - gene_symbols <- c( - gene_symbols, - include_silent_genes - ) %>% - unique() - } - - if(missing(these_samples_metadata)){ - these_samples_metadata <- get_gambl_metadata() - } - - coding_var <- c( - "Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", - "In_Frame_Ins", "Missense_Mutation", "Nonsense_Mutation", - "Nonstop_Mutation", "Splice_Region", "Splice_Site", - "Targeted_Region", "Translation_Start_Site" - ) - - if(include_silent){ - message("Including Synonymous variants for all genes...") - coding_var <- c(coding_var, "Silent") - } - - if(missing(include_silent_genes)){ - coding_ssm <- maf_data %>% - dplyr::filter( - Variant_Classification %in% coding_var - ) - } else { - message( - strwrap( - prefix = " ", - initial = "", - "You have provided gene list with argument include_silent_genes. - The Silent variants will be included even if the include_silent - argument is set to FALSE. 
- " - ) - ) - coding_ssm <- maf_data %>% - dplyr::filter( - Variant_Classification %in% coding_var | - ( - Hugo_Symbol %in% include_silent_genes & - Variant_Classification == "Silent" - ) - ) - } - - coding <- coding_ssm %>% - dplyr::filter( - Hugo_Symbol %in% gene_symbols - ) %>% - dplyr::select(Tumor_Sample_Barcode, Hugo_Symbol) %>% - dplyr::rename( - "sample_id" = "Tumor_Sample_Barcode", - "gene" = "Hugo_Symbol" - ) %>% - unique() %>% - dplyr::mutate(mutated = 1) - - samples_table <- dplyr::select( - these_samples_metadata, - sample_id - ) - wide_coding <- pivot_wider( - coding, - names_from = "gene", - values_from = "mutated", - values_fill = 0 - ) - all_tabulated <- left_join( - samples_table, - wide_coding - ) - all_tabulated <- all_tabulated %>% - replace(is.na(.), 0) - - # include hotspots if user chooses to do so - if(include_hotspots){ - # first annotate - annotated <- GAMBLR.data::annotate_hotspots( - coding_ssm - ) - - # review for the supported genes - if(review_hotspots){ - annotated = review_hotspots( - annotated, - genes_of_interest = genes_of_interest, - genome_build = genome_build - ) - } - - message("annotating hotspots") - - hotspots <- annotated %>% - dplyr::filter(Hugo_Symbol %in% genes_of_interest) %>% - dplyr::select(Tumor_Sample_Barcode, Hugo_Symbol, hot_spot) %>% - dplyr::rename( - "sample_id" = "Tumor_Sample_Barcode", - "gene" = "Hugo_Symbol" - ) %>% - dplyr::mutate(gene = paste0(gene, "HOTSPOT")) %>% - unique() %>% - dplyr::mutate(mutated = ifelse(hot_spot == "TRUE", 1, 0)) %>% - dplyr::filter(mutated == 1) %>% - dplyr::select(-hot_spot) - - # long to wide hotspots, samples are tabulated with 0 if no hotspot is detected - wide_hotspots <- pivot_wider( - hotspots, - names_from = "gene", - values_from = "mutated", - values_fill = 0 - ) - # join with the ssm object - all_tabulated <- left_join( - all_tabulated, - wide_hotspots - ) - all_tabulated <- all_tabulated %>% - replace(is.na(.), 0) - - all_tabulated <- all_tabulated %>% - 
dplyr::select(where(~ any(. != 0))) - - all_tabulated <- as.data.frame(all_tabulated) - # make SSM and hotspots non-redundant by giving priority - # to hotspot feature and setting SSM to 0 - for (hotspot_site in colnames(wide_hotspots)[grepl("HOTSPOT", colnames(wide_hotspots))]){ - message(hotspot_site) - this_gene <- gsub("HOTSPOT", "", hotspot_site) - redundant_features <- all_tabulated %>% - dplyr::select(starts_with(this_gene)) - - # if not both the gene and the hotspot are present, go to - # the next iteration - if(ncol(redundant_features)!= 2) next - message("OK") - # if both gene and it's hotspot are in the matrix, give priority to hotspot feature - all_tabulated[(all_tabulated[, this_gene] >0 & all_tabulated[, paste0(this_gene, "HOTSPOT")] == 1),][,c(this_gene, paste0(this_gene, "HOTSPOT"))][, this_gene] = 0 - - # in case gene has both hotspot and another mutation in the same gene, - # keep both tabulated as multihits - if(keep_multihit_hotspot){ - # determine which samples have hot spot and another mutation in same gene - multihits <- annotated %>% - dplyr::filter(Hugo_Symbol == this_gene) %>% - group_by(Tumor_Sample_Barcode) %>% - dplyr::mutate(n_mut = n()) %>% - dplyr::filter( - n_mut > 1 - ) %>% - dplyr::distinct(Tumor_Sample_Barcode, n_mut, hot_spot) %>% - # account for cases with both hotspot and not hotspot to avoid - # double-counting the number of mutations - mutate_at(vars(hot_spot), ~replace_na(., "FALSE")) %>% - dplyr::mutate( - n_mut = ifelse( - hot_spot == "TRUE", - n_mut - 1, - n_mut - ) - ) %>% - group_by(Tumor_Sample_Barcode) %>% - dplyr::arrange(n_mut) %>% - slice_head() %>% - ungroup %>% - select(-hot_spot) - - # Return the annotation of this gene to mutated in these samples - all_tabulated <- all_tabulated %>% - left_join( - ., - multihits, - by = c("sample_id" = "Tumor_Sample_Barcode") - ) %>% - dplyr::mutate( - {{this_gene}} := ifelse( - !is.na(n_mut), - n_mut, - !!!syms(this_gene) - ) - ) %>% - select(- n_mut) - } - - } - - } - 
return(all_tabulated) - -} \ No newline at end of file diff --git a/R/get_gambl_metadata.R b/R/get_gambl_metadata.R deleted file mode 100644 index a2471d2..0000000 --- a/R/get_gambl_metadata.R +++ /dev/null @@ -1,145 +0,0 @@ -#' @title Get GAMBL Metadata. -#' -#' @description Convenience function for loading the sample metadata. -#' -#' @details This bare bones function was developed to retrieve metadata for -#' non-GSC-users. Specify the seq type (`seq_type_filter`) for the samples you -#' want returned as the only argument. -#' It relies on the bundled metadata in this package. -#' Specify `case_set` argument to retreive samples from particular study. -#' Currently supported case_sets are: FL_Dreval (FL samples from Dreval et al), -#' DLBCL_Dreval (DLBCL samples from Dreval et al), FL-DLBCL-study (all samples -#' from Dreval et al), DLBCL_Arthur (all samples from Arthur et al study), -#' DLBCL_Hilton (all samples from Hilton et al DLBCL Trios study), -#' DLBCL_cell_lines (5 DLBCL cell lines), DLBCL_Chapuy (all samples from Chapuy -#' et al study), DLBCL_Schmitz (all samples from Schmitz et al study), -#' DLBCL_Reddy (all samples from Reddy et al study), DLBCL_Thomas (HTMCP DLBCLs -#' from Thomas et al study), BL_Thomas (BL samples from Thomas et al study) -#' -#' @param seq_type_filter Specify the seq type you want to return metadata for. -#' Default is "genome". -#' @param case_set Optionally specify study details to return samples from a -#' particular case set. See function description for supported case sets. -#' @param ... Any additional parameters. -#' -#' @return A data frame with metadata, tailored for user without GSC access. -#' -#' \describe{ -#' \item{compression}{Format of the original data used as input for our analysis pipelines (cram, bam or fastq)} -#' \item{bam_available}{Whether or not this file was available when last checked.} -#' \item{patient_id}{The anonymized unique identifier for this patient. 
For BC samples, this will be Res ID.} -#' \item{sample_id}{A unique identifier for the sample analyzed.} -#' \item{seq_type}{The assay type used to produce this data (one of "genome","capture, "mrna", "promethION")} -#' \item{genome_build}{The name of the genome reference the data were aligned to.} -#' \item{cohort}{Name for a group of samples that were added together (usually from a single study), often in the format {pathology}_{cohort_descriptor}.} -#' \item{pathology}{The diagnosis or pathology for the sample} -#' \item{time_point}{Timing of biopsy in increasing alphabetical order (A = diagnosis, B = first relapse etc)} -#' \item{ffpe_or_frozen}{Whether the nucleic acids were extracted from a frozen or FFPE sample} -#' \item{COO_consensus}{Consensus call of COO between different sources.} -#' \item{DHITsig_consensus}{Consensus call of DHIT signature status between different sources.} -#' \item{EBV_status_inf}{Inferred EBV status of the tumor} -#' \item{lymphgen_no_cnv}{LymphGen label using model without CNV} -#' \item{lymphgen_with_cnv}{LymphGen label using model with CNV} -#' \item{lymphgen_cnv_noA53}{LymphGen label using model with CNV but excluding A53 class} -#' \item{lymphgen_wright}{The LymphGen call for this sample from Wright et all (if applicable)} -#' \item{fl_grade}{Grade of FL samples} -#' \item{normal_sample_id}{Sample id for normal tissue used in the analysis} -#' \item{pairing_status}{Matching status of the sample} -#' \item{lymphgen}{LymphGen label} -#' \item{molecular_BL}{label of the sample according to the molecular BL classifier} -#' \item{Tumor_Sample_Barcode}{Duplicate of sample_id for simplifying joins to MAF data frames} -#' \item{pathology_rank}{Numeric rank for consistent ordering of samples by pathology} -#' \item{hiv_status}{HIV status of the sample} -#' \item{age_group}{Adult_BL or Pediatric_BL or Other, specific to the BLGSP study} -#' \item{sex}{The biological sex of the patient, if available. 
Allowable options: M, F, NA} -#' } -#' -#' @import dplyr purrr -#' -#' @export -#' -#' @examples -#' #return metadata for genome samples -#' genome_meta = get_gambl_metadata(seq_type_filter = "genome") -#' -#' #return metadata for capture samples -#' capture_meta = get_gambl_metadata(seq_type_filter = "capture") -#' -#' #return metadata for genome and capture samples -#' all_meta = get_gambl_metadata(seq_type_filter = c("genome", "capture")) -#' -get_gambl_metadata = function( - seq_type_filter = "genome", - case_set, - ... -){ - - #check if any invalid parameters are provided - check_excess_params(...) - - message("Using the bundled metadata in GAMBLR.data...") - metadata <- GAMBLR.data::sample_data$meta %>% - dplyr::filter(seq_type %in% seq_type_filter) - - - if(!missing(case_set)){ - - # pre-defined case sets - if(case_set == "FL_Dreval"){ - metadata <- metadata %>% - dplyr::filter(cohort == "FL_Dreval", pathology == "FL") - }else if(case_set == "DLBCL_Dreval"){ - metadata <- metadata %>% - dplyr::filter(cohort == "FL_Dreval", pathology == "DLBCL") - }else if(case_set == "FL-DLBCL-study"){ - metadata <- metadata %>% - dplyr::filter(cohort == "FL_Dreval") - }else if(case_set == "DLBCL_Arthur"){ - metadata <- metadata %>% - dplyr::filter(cohort == "DLBCL_Arthur") - }else if(case_set == "DLBCL_Hilton"){ - metadata <- metadata %>% - dplyr::filter(cohort == "DLBCL_Hilton") - }else if(case_set == "DLBCL_cell_lines"){ - metadata <- metadata %>% - dplyr::filter(cohort == "DLBCL_cell_lines") - }else if(case_set == "DLBCL_Chapuy"){ - metadata <- metadata %>% - dplyr::filter(cohort == "dlbcl_chapuy") - }else if(case_set == "DLBCL_Schmitz"){ - metadata <- metadata %>% - dplyr::filter(cohort == "dlbcl_schmitz") - }else if(case_set == "DLBCL_Reddy"){ - metadata <- metadata %>% - dplyr::filter(cohort == "dlbcl_reddy") - }else if(case_set == "BL_Thomas"){ - metadata <- metadata %>% - dplyr::filter(cohort == "BL_Thomas") - }else if(case_set == "DLBCL_Thomas"){ - metadata <- 
metadata %>% - dplyr::filter(cohort == "DLBCL_Thomas") - }else{ - message(paste("case set", case_set, "not available")) - return() - } - } - - metadata <- metadata %>% - dplyr::left_join( - gambl_metadata, - by = "sample_id", - suffix = c(".X", ".Y") - ) %>% - split.default(gsub('.[XY]', '', names(.))) %>% - purrr::map_dfc( ~ if (ncol(.x) == 1) - .x - else - dplyr::mutate(.x,!!sym(gsub('.X', '', names( - .x - )[1])) := dplyr::coalesce(!!!syms(names( - .x - ))))) %>% - dplyr::select(!contains(".")) - #ensure only unique rows are returned - return(unique(metadata)) -} diff --git a/R/get_manta_sv.R b/R/get_manta_sv.R deleted file mode 100644 index dd9f697..0000000 --- a/R/get_manta_sv.R +++ /dev/null @@ -1,164 +0,0 @@ -#' @title Get Manta SVs -#' -#' @description Convenience function for retrieving Manta Structural Variants (SVs) from the bundled data [GAMBLR.data::sample_data]. -#' -#' @details To obtain SV calls for multiple samples, give `these_sample_ids` a vector of sample IDs. -#' Alternatively, the user can also provide the `these_samples_metadata` parameter to make use of an already subset metadata table. -#' In this case, the returned SVs will be restricted to the sample_ids within that data frame. -#' This function internally calls [GAMBLR.data::id_ease] to streamline sample ID/metadata parameters. -#' This function can also restrict the returned calls to any genomic regions specified within `chromosome`, `qstart`, `qend`, -#' or the complete region specified under `region` (in chr:start-end format), note that chromosome can be either prefixed or not prefixed. -#' Useful filtering parameters are also available, use `min_vaf` to set the minimum tumour VAF for a SV to be returned and `min_score` -#' to set the lowest Manta somatic score for a SV to be returned. `pair_status` can be used to return variants from either matched or unmatched samples. -#' In addition, the user can chose to return all variants, even the ones not passing the filter criteria. 
To do so, set `pass = FALSE` (default is TRUE). -#' -#' @param these_sample_ids Optional, a vector of multiple sample_id (or a single sample ID as a string) that you want results for. -#' @param these_samples_metadata Optional, a metadata table (with sample IDs in a column) to subset the return to. -#' If not provided (and if `these_sample_ids` is not provided), the function will return all samples from the specified seq_type in the metadata. -#' @param projection The projection genome build. Default is grch37. -#' @param this_seq_type The this_seq_type you want back, default is genome. -#' @param chromosome Optional, the chromosome you are restricting to (can be prefixed or not prefixed). -#' @param qstart Optional, query start coordinate of the range you are restricting to. -#' @param qend Optional, query end coordinate of the range you are restricting to. -#' @param region Optional, region formatted like chrX:1234-5678 (chromosome can be prefixed or not prefixed) instead of specifying chromosome, start and end separately. -#' @param pairing_status Use to restrict results (if desired) to matched or unmatched results (default is to return all). This parameter takes the filtering condition as a string ("matched" or "unmatched"). -#' @param min_vaf The minimum tumour VAF for a SV to be returned. Default is 0.1. -#' @param min_score The lowest Manta somatic score for a SV to be returned. Default is 40. -#' @param pass If TRUE (default) only return SVs that are annotated with PASS in the FILTER column. Set to FALSE to keep all variants, regardless if they PASS the filters. -#' @param verbose Set to FALSE to minimize the output to console. Default is TRUE. This parameter also dictates the verbosity of any helper function internally called inside the main function. -#' @param ... Any additional parameters. 
-#' -#' @export -#' -#' @import dplyr -#' -#' @examples -#' #load packages -#' library(dplyr) -#' -#' #lazily get every SV in the table with default quality filters -#' all_sv = get_manta_sv() -#' -#' #get all SVs DLBCL cell line samples -#' cell_line_meta = GAMBLR.data::sample_data$meta %>% -#' dplyr::filter(cohort == "DLBCL_cell_lines") -#' -#' dlbcl_sv = get_manta_sv(these_samples_metadata = cell_line_meta) -#' -#' #get the SVs in a region around MYC -#' myc_locus_sv = get_manta_sv(region = "8:128723128-128774067") -#' -get_manta_sv = function(these_sample_ids = NULL, - these_samples_metadata = NULL, - projection = "grch37", - this_seq_type = "genome", - chromosome, - qstart, - qend, - region, - pairing_status, - min_vaf = 0.1, - min_score = 40, - pass = TRUE, - verbose = FALSE, - ...){ - - #warn/notify the user what version of this function they are using - message("Using the bundled Manta SV (.bedpe) calls in GAMBLR.data...") - - #check if any invalid parameters are provided - check_excess_params(...) - - #get valid projections - valid_projections = grep("meta", names(GAMBLR.data::sample_data), value = TRUE, invert = TRUE) - - #get samples with the dedicated helper function - metadata = id_ease(these_samples_metadata = these_samples_metadata, - these_sample_ids = these_sample_ids, - verbose = verbose, - this_seq_type = this_seq_type) - - sample_ids = metadata$sample_id - - #return manta SV based on the selected projection - if(projection %in% valid_projections){ - manta_sv = GAMBLR.data::sample_data[[projection]]$bedpe %>% - dplyr::filter(tumour_sample_id %in% sample_ids) - }else{ - stop(paste("please provide a valid projection. 
The following are available:", - paste(valid_projections,collapse=", "))) - } - - if(!missing(region)){ - region = gsub(",", "", region) - split_chunks = unlist(strsplit(region, ":")) - chromosome = split_chunks[1] - startend = unlist(strsplit(split_chunks[2], "-")) - qstart = startend[1] - qend = startend[2] - } - - manta_sv = manta_sv %>% - dplyr::filter(VAF_tumour >= min_vaf, - SCORE >= min_score) - - if(verbose){ - no_manta = setdiff(metadata$sample_id, manta_sv$tumour_sample_id) - - if(length(no_manta) > 0){ - message(paste0("No Manta results found for ", length(no_manta), " samples...")) - print(no_manta) - } - } - - #deal with chr prefixes based on the selected projection (if return is to be subset to regions...) - if(!missing(region) || !missing(chromosome)){ - if(projection == "grch37"){ - if(grepl("chr", chromosome)){ - chromosome = gsub("chr", "", chromosome) - } - }else if(projection == "hg38"){ - if(!grepl("chr", chromosome)){ - chromosome = paste0("chr", chromosome) - } - } - - manta_sv = manta_sv %>% - dplyr::filter((CHROM_A == chromosome & START_A >= qstart & START_A <= qend) | (CHROM_B == chromosome & START_B >= qstart & START_B <= qend)) - } - - if(verbose){ - message("\nThe following VCF filters are applied;") - message(paste0(" Minimum VAF: ", min_vaf)) - message(paste0(" Minimum Score: ", min_score)) - message(paste0(" Only keep variants passing the quality filter: ", pass)) - } - - #PASS filter - if(pass){ - manta_sv = manta_sv %>% - dplyr::filter(FILTER == "PASS") - } - - #pairing status filter - if(!missing(pairing_status)){ - if(verbose){ - message(paste0(" Pairing status: ", pairing_status)) - } - - manta_sv = manta_sv %>% - dplyr::filter(pair_status == pairing_status) - } - - #convert to data frame and print some metrics - manta_sv = as.data.frame(manta_sv) - - if(verbose){ - n_variants = nrow(manta_sv) - unique_samples = unique(manta_sv$tumour_sample_id) - message(paste0("\nReturning ", n_variants, " variants from ", 
length(unique_samples), " sample(s)")) - message("\nDone!") - } - - return(manta_sv) -} diff --git a/R/get_sample_cn_segments.R b/R/get_sample_cn_segments.R deleted file mode 100644 index 17388ea..0000000 --- a/R/get_sample_cn_segments.R +++ /dev/null @@ -1,90 +0,0 @@ -#' @title Get Sample CN Segments. -#' -#' @description Get all segments for a single (or multiple) sample_id(s). -#' -#' @details This function returns CN segments. This works for single sample or multiple samples. -#' Specify the sample IDs you are interested in with `these_sample_ids` (as a vector of characters), -#' Or call this function with `these_samples_metadata` if you already have a metadata table subset to the sample IDs of interest. -#' If none of the above parameters are specified, the function will return CN segments for available samples (from get_gambl_metadata). -#' Note, this. function internally calls [GAMBLR.data::id_ease] for dealing with sample IDs and metadata tables. -#' -#' @param these_sample_ids Optional, a vector of multiple sample_id (or a single sample ID as a string) that you want results for. -#' @param these_samples_metadata Optional, a metadata table (with sample IDs in a column) to subset the return to. -#' If not provided (and if `these_sample_ids` is not provided), the function will return all samples from the specified seq_type in the metadata. -#' @param projection Selected genome projection for returned CN segments. Default is "grch37". -#' @param this_seq_type Seq type for returned CN segments. Default is genome. -#' @param with_chr_prefix Set to TRUE to add a chr prefix to chromosome names. Default is FALSE. -#' @param streamlined Return a minimal output rather than full details. Default is FALSE. -#' @param verbose Set to FALSE to minimize the output to console. Default is TRUE. This parameter also dictates the verbosity of any helper function internally called inside the main function. -#' @param ... Any additional parameters. 
-#' -#' @return A data frame of segments for a specific or multiple sample ID(s). -#' -#' @import dplyr -#' @export -#' -#' @examples -#' #load pacakges -#' library(dplyr) -#' -#' #get CN segments for one sample -#' dohh2_segs = get_sample_cn_segments(these_sample_ids = "DOHH-2", -#' projection = "hg38", -#' streamlined = TRUE) -#' -#' #get CN segments for DLBCL cell line -#' cell_line_meta = GAMBLR.data::sample_data$meta %>% -#' dplyr::filter(cohort == "DLBCL_cell_lines") -#' -#' dlbcl_segs = get_sample_cn_segments(these_samples_metadata = cell_line_meta, -#' streamlined = TRUE) -#' -get_sample_cn_segments = function(these_sample_ids = NULL, - these_samples_metadata = NULL, - projection = "grch37", - this_seq_type = "genome", - with_chr_prefix = FALSE, - streamlined = FALSE, - verbose = FALSE, - ...){ - - #warn/notify the user what version of this function they are using - message("Using the bundled CN segments (.seg) calls in GAMBLR.data...") - - #check if any invalid parameters are provided - check_excess_params(...) - - #get samples with the dedicated helper function - metadata = id_ease(these_samples_metadata = these_samples_metadata, - these_sample_ids = these_sample_ids, - verbose = verbose, - this_seq_type = this_seq_type) - - sample_ids = metadata$sample_id - - #get valid projections - valid_projections = grep("meta", names(GAMBLR.data::sample_data), value = TRUE, invert = TRUE) - - #return CN segments based on the selected projection - if(projection %in% valid_projections){ - all_segs = GAMBLR.data::sample_data[[projection]]$seg %>% - dplyr::filter(ID %in% sample_ids) - }else{ - stop(paste("please provide a valid projection. 
The following are available:", - paste(valid_projections,collapse=", "))) - } - - #deal with chr prefixes - if(!with_chr_prefix){ - all_segs = all_segs %>% - dplyr::mutate(chrom = gsub("chr", "", chrom)) - }else{ - if(!grepl("chr", all_segs$chrom[1])){ - all_segs$chrom = paste0("chr", all_segs$chrom) - } - } - - if(streamlined){all_segs = dplyr::select(all_segs, ID, CN)} - - return(all_segs) -} diff --git a/R/get_ssm_by_patients.R b/R/get_ssm_by_patients.R deleted file mode 100644 index 6f33702..0000000 --- a/R/get_ssm_by_patients.R +++ /dev/null @@ -1,83 +0,0 @@ -#' @title Get SSM By Patients. -#' -#' @description Get MAF-format data frame for more than one patient. -#' -#' @details This function returns variants from a set of patients. -#' This function internally calls [GAMBLR.data::get_ssm_by_samples]. -#' Thus, the main contents of this function is to wrangle the provided patient IDs, -#' so that the corresponding sample IDs can be provided to the internal call of `get_ssm_by_samples`. -#' This function expects either a vector of patient IDs (`these_patients_ids`) -#' or an already subset metadata table (`these_samples_metadata`). -#' -#' @param these_patient_ids A vector of patient IDs that you want results for. -#' The user can also use a metadata table that has been subset to the patient IDs of interest (see `these_samples_metadata`). -#' @param these_samples_metadata A metadata subset to contain the rows corresponding to the patients of interest. -#' If the vector of patient IDs is missing (`these_patient_ids`), this function will default to all patient IDs in the metadata table given to this parameter. -#' @param projection Obtain variants projected to this reference (one of grch37 or hg38). Default is grch37. -#' @param this_seq_type The seq type you want results for. Default is "genome". -#' @param tool_name Optionally specify which tool to report variant from. 
The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers. -#' @param this_study Optionally specify first name of the author for the paper -#' from which the variants should be returned for. -#' This parameter can either be a vector of indexes (integer) or a vector of characters (matching columns in MAF). -#' @param verbose Set to FALSE to minimize the output to console. Default is TRUE. This parameter also dictates the verbosity of any helper function internally called inside the main function. -#' @param ... Any additional parameters. -#' -#' @return A data frame with SSM calls for the selected patients in MAF format. -#' -#' @import dplyr -#' -#' @export -#' -#' @examples -#' #load packages -#' library(dplyr) -#' -#' #basic usage, these_patient_ids -#' my_patient = get_ssm_by_patients(these_patient_ids = "DOHH-2") -#' -#' #using a subset metadata tablee to retreive patient SSMs -#' cell_line_meta = GAMBLR.data::sample_data$meta %>% -#' dplyr::filter(cohort == "DLBCL_cell_lines") -#' -#' patient_maf = get_ssm_by_patients(these_samples_metadata = cell_line_meta, -#' this_seq_type = "genome") -#' -get_ssm_by_patients = function(these_patient_ids, - these_samples_metadata, - projection = "grch37", - this_seq_type = "genome", - tool_name = "slms-3", - this_study, - verbose = FALSE, - ...){ - - #check if any invalid parameters are provided - check_excess_params(...) 
- - #figure out what patients the user wants - if(missing(these_patient_ids)){ - if(missing(these_samples_metadata)){ - stop("You must provide either patient IDs (`these_patient_ids`) or a metadata table with the patient IDs of interest (`these_samples_metadata`)...") - }else{ - message("No patient IDs were provided, this function will resort to all available patient IDs in the provided metadata.") - } - }else{ - if(missing(these_samples_metadata)){ - these_samples_metadata = GAMBLR.data::get_gambl_metadata(seq_type_filter = this_seq_type) - } - message("Patient IDs and metadata were provided, this function will resort to all available patient IDs in the provided metadata.") - these_samples_metadata = these_samples_metadata %>% - dplyr::filter(patient_id %in% these_patient_ids) - } - - #run get_ssm_by_samples with these_samples_metadata parameter - samples_ssm = GAMBLR.data::get_ssm_by_samples(these_samples_metadata = these_samples_metadata, - projection = projection, - this_seq_type = this_seq_type, - tool_name = tool_name, - verbose = verbose, - ...) - samples_ssm = create_maf_data(samples_ssm,projection) - # use S3-safe version of dplyr function - samples_ssm = mutate.genomic_data(samples_ssm,maf_seq_type = this_seq_type) -} diff --git a/R/get_ssm_by_region.R b/R/get_ssm_by_region.R deleted file mode 100644 index 34a6f65..0000000 --- a/R/get_ssm_by_region.R +++ /dev/null @@ -1,138 +0,0 @@ -#' @title Get SSM By Region. -#' -#' @description Retrieve all SSMs from the GAMBL database within a single genomic coordinate range. -#' -#' @details This function lets the user specify a region of interest for returning SSM calls within that region. -#' There are multiple ways a region can be specified. For example, the user can provide the full region in a "region" format (chr:start-end) to the `region` parameter. -#' Or, the user can provide chromosome, start and end coordinates individually with `chr`, `start`, and `end` parameters. 
-#' -#' @param these_sample_ids Optional, a vector of multiple sample_id (or a single sample ID as a string) that you want results for. -#' @param these_samples_metadata Optional, a metadata table (with sample IDs in a column) to subset the return to. -#' If not provided (and if `these_sample_ids` is not provided), the function will return all samples from the specified seq_type in the metadata. -#' @param maf_data Optional data frame with mutations in MAF format. -#' If user provides a maf, the function trusts that the user has already subset this to samples of interest, correct seq_type. -#' i.e the following parameters are ignored; `these_samples_metadata`, `these_sample_ids`, and `this_seq_type` -#' @param chromosome The chromosome you are restricting to (with or without a chr prefix). -#' @param qstart Query start coordinate of the range you are restricting to. -#' @param qend Query end coordinate of the range you are restricting to. -#' @param region Region formatted like chrX:1234-5678 instead of specifying chromosome, start and end separately. -#' @param streamlined Return Start_Position and Tumor_Smaple_Barcode as the only two MAF columns. Default is FALSE. -#' @param projection Obtain variants projected to this reference (one of grch37 or hg38). -#' @param this_seq_type The seq_type you want back, default is genome. -#' @param tool_name Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers. -#' @param this_study Optionally specify first name of the author for the paper -#' from which the variants should be returned for. -#' @param verbose Set to FALSE to prevent ANY message to be printed. -#' In most cases, this parameter should be left to TRUE. -#' The parameter was added to accommodate for noisy output -#' when running this function in a loop for retrieving SSM -#' for multiple regions [GAMBLR.data::get_ssm_by_regions]. -#' @param ... 
Any additional parameters. -#' -#' @return A data frame containing all mutations (MAF) in the specified region. -#' -#' @import dplyr -#' -#' @examples -#' my_mutations = get_ssm_by_region(region = "chr8:128,723,128-128,774,067") -#' -#' #specifying chromosome, start and end individually -#' my_mutations = get_ssm_by_region(chromosome = "8", -#' qstart = 128723128, -#' qend = 128774067) -#' -get_ssm_by_region = function(these_sample_ids = NULL, - these_samples_metadata = NULL, - maf_data, - chromosome, - qstart, - qend, - region = "", - streamlined = FALSE, - projection = "grch37", - this_seq_type = "genome", - tool_name = "slms-3", - this_study, - verbose = FALSE, - ...){ - - if(verbose){ - if(missing(maf_data)){ - #warn/notify the user what version of this function they are using - message("Using the bundled SSM calls (.maf) calls in GAMBLR.data...") - } - } - - #check if any invalid parameters are provided - check_excess_params(...) - - #get samples with the dedicated helper function - metadata = id_ease(these_samples_metadata = these_samples_metadata, - these_sample_ids = these_sample_ids, - verbose = verbose, - this_seq_type = this_seq_type) - - sample_ids = metadata$sample_id - - - - # Optionally return variants from a particular study - if(!missing(this_study)){ - this_maf <- this_maf %>% - dplyr::filter((!!sym("Study")) == this_study) - } - - #split region into chunks (chr, start, end) and deal with chr prefixes based on the selected projection - if(length(region) > 1){ - stop("You are providing more than one region, please refer to get_ssm_by_regions for multiple regions...") - } - - if(!region == ""){ - region = gsub(",", "", region) - split_chunks = unlist(strsplit(region, ":")) - - chromosome = split_chunks[1] - startend = unlist(strsplit(split_chunks[2], "-")) - qstart = as.numeric(startend[1]) - qend = as.numeric(startend[2]) - }else{ - if(projection =="grch37"){ - chromosome = gsub("chr", "", chromosome) - } - region = paste0(chromosome, ":", qstart, 
"-", qend) - } - - if(projection == "grch37"){ - chromosome = gsub("chr", "", chromosome) - } - - #return SSMs based on the selected projection - if(missing(maf_data)){ - # Filter by position on-the-fly to avoid wastefully building the same large MAF each time - this_maf = GAMBLR.data::sample_data[[projection]]$maf %>% - dplyr::filter(Chromosome == chromosome & Start_Position > qstart & Start_Position < qend) %>% - dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter((tolower(!!sym("Pipeline")) == tool_name)) - muts_region <- GAMBLR.data::sample_data[[projection]]$ashm %>% - dplyr::filter(Chromosome == chromosome & Start_Position > qstart & Start_Position < qend) %>% - dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter((tolower(!!sym("Pipeline")) == tool_name)) %>% - bind_rows(this_maf, .) - }else{ - muts_region = dplyr::filter(maf_data, Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter(Chromosome == chromosome & Start_Position > qstart & Start_Position < qend) - } - - # Handle possible duplicates - muts_region <- muts_region %>% - distinct(Tumor_Sample_Barcode, Chromosome, Start_Position, End_Position, .keep_all = TRUE) - - if(streamlined){ - muts_region = muts_region %>% - dplyr::select(Start_Position, Tumor_Sample_Barcode) - } - muts_region = create_maf_data(muts_region,projection) - # use S3-safe version of dplyr function - muts_region = mutate.genomic_data(muts_region,maf_seq_type = this_seq_type) - return(muts_region) -} diff --git a/R/get_ssm_by_regions.R b/R/get_ssm_by_regions.R deleted file mode 100644 index bda6930..0000000 --- a/R/get_ssm_by_regions.R +++ /dev/null @@ -1,143 +0,0 @@ -#' @title Get SSM By Regions. -#' -#' @description Efficiently retrieve all mutations across a range of genomic regions. -#' -#' @details This function internally calls get_ssm_by_region to retrieve SSM calls for the specified regions. 
-#' -#' @param these_samples_metadata Optional, a metadata table (with sample IDs in a column) to subset the return to. -#' @param this_seq_type The this_seq_type you want back, default is genome. -#' @param tool_name Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers. -#' @param regions_list A vector of regions in the chr:start-end format to restrict the returned SSM calls to. -#' @param regions_bed A data frame in BED format with the coordinates you want to retrieve (recommended). -#' This parameter can also accept an additional column with region names that will be added to the return if `use_name_column = TRUE` -#' @param streamlined If set to TRUE (default) only 3 columns will be kept in the returned data frame (start, sample_id and region_name). -#' @param projection Obtain variants projected to this reference (one of grch37 or hg38), default is grch37. -#' @param verbose Set to TRUE to maximize the output to console. Default is TRUE. -#' This parameter also dictates the verbosity of any helper function internally called inside the main function. -#' @param ... Any additional parameters. -#' -#' @return Returns a data frame of variants in MAF-like format. 
-#' -#' @import tibble dplyr tidyr -#' -#' @export -#' -#' @examples -#' #basic usage, adding custom names from bundled ashm data frame -#' regions_bed = create_bed_data( GAMBLR.data::grch37_ashm_regions, -#' fix_names = "concat", -#' concat_cols = c("gene","region"), -#' sep="-") -#' -#' my_meta = get_gambl_metadata() -#' # get a full MAF-format data frame for all aSHM regions on grch37 coordinates -#' ashm_maf = get_ssm_by_regions(regions_bed = regions_bed, -#' these_samples_metadata = my_meta, -#' streamlined = FALSE) -#' -#' # This example intentionally fails -#' ashm_maf = get_ssm_by_regions(regions_bed = regions_bed, -#' these_samples_metadata = my_meta, -#' projection="hg38") -#' # Error in get_ssm_by_regions(regions_bed = regions_bed, these_samples_metadata = my_meta, : -#' # requested projection: hg38 and genome_build of regions_bed: grch37 don't match -#' -get_ssm_by_regions <- function(these_samples_metadata, - regions_list, - regions_bed, - this_seq_type = "genome", - streamlined = TRUE, - projection = "grch37", - verbose = FALSE, - tool_name = "slms-3", - ...) { - - # check provided projection - # first, get valid projections - valid_projections = grep("meta", names(GAMBLR.data::sample_data), - value = TRUE, invert = TRUE) - if (!projection %in% valid_projections) { - stop("Please provide a valid projection. The following are available: ", - paste(valid_projections, collapse = ", "), ".") - } - - # check if any invalid parameters are provided - check_excess_params(...) 
- - bed2region = function(x) { - paste0(x[1], ":", as.numeric(x[2]), "-", as.numeric(x[3])) - } - - if (missing(regions_list)) { - if (!missing(regions_bed)) { - if("bed_data" %in% class(regions_bed)){ - #confirm the genome builds match - if(!get_genome_build(regions_bed)==projection){ - stop(paste("requested projection:",projection,"and genome_build of regions_bed:", get_genome_build(regions_bed), "don't match")) - } - } - regions = apply(regions_bed, 1, bed2region) - } else { - stop("You must supply either regions_list or regions_bed") - } - } else { - regions = regions_list - } - - # Get samples with the dedicated helper function - metadata = id_ease(these_samples_metadata = these_samples_metadata, - verbose = verbose, - this_seq_type = this_seq_type) - - - # Warn/notify the user what version of this function they are using - message("Using the bundled SSM calls (.maf) calls in GAMBLR.data...") - if (verbose) { - print("Using the non-default engine for efficiency...") - } - - sample_maf <- get_ssm_by_samples( - these_samples_metadata = these_samples_metadata, - this_seq_type = this_seq_type, - projection = projection, - tool_name = tool_name - ) - if(!missing(regions_bed) & "bed_data" %in% class(regions_bed)){ - regions_df = dplyr::select(regions_bed,1:4) %>% - dplyr::rename(c("Chromosome"="chrom", - "Start_Position"="start", - "End_Position"="end", - "region"="name")) - }else{ - regions_df <- as.data.frame(regions) %>% - `names<-`("regions") %>% - separate( - regions, - c("Chromosome", "Start_Position", "End_Position"), - ":|-" - ) %>% - mutate( - Start_Position = as.numeric(Start_Position), - End_Position = as.numeric(End_Position), - region = row_number() - ) - } - - - region_mafs <- cool_overlaps( - sample_maf, - regions_df - ) %>% - dplyr::rename_with(~ gsub(".x", "", .x, fixed = TRUE)) %>% - dplyr::select(all_of(c(names(sample_maf), "region"))) %>% - dplyr::group_split(region) - maf_df = do.call(bind_rows, region_mafs) - - if(streamlined){ - maf_df = 
dplyr::select(maf_df,Start_Position,Tumor_Sample_Barcode,region) %>% - dplyr::rename(c("sample_id"="Tumor_Sample_Barcode")) - } - return(maf_df) - - -} \ No newline at end of file diff --git a/R/get_ssm_by_samples.R b/R/get_ssm_by_samples.R deleted file mode 100644 index 5c93669..0000000 --- a/R/get_ssm_by_samples.R +++ /dev/null @@ -1,85 +0,0 @@ -#' @title Get SSM By Samples. -#' -#' @description Get the SSMs (i.e. load MAF) for a single sample or a collection of samples. -#' -#' @details Retrieve a maf for a specific sample or a set of samples. -#' Either specify the sample IDs of interest with `these_sample_ids`. -#' Or a metadata table subset to the sample IDs of interest with `these_samples_metadata`. -#' -#' @param these_sample_ids A vector of one or more sample IDs that you want results for. -#' @param these_samples_metadata Optional, a metadata table (with sample IDs in a column) to auto-subset the data to samples in that table before returning. -#' If not provided and these_sample_ids is also not provided, the function will return SSM for all samples from the specified seq_type in the bundled metadata. -#' @param this_seq_type Default is genome. -#' @param projection The projection genome build. Supports hg38 and grch37. -#' @param tool_name Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers. -#' @param verbose Enable for debugging/noisier output. -#' @param ... Any additional parameters. -#' -#' @return data frame in MAF format. 
-#' -#' @import dplyr -#' -#' @export -#' -#' @examples -#' #load a common dependency -#' library(dplyr) -#' -#' #Get genome-wide set of mutations from all DLBCL cell lines -#' cell_line_meta = get_gambl_metadata() %>% -#' dplyr::filter(cohort == "DLBCL_cell_lines") -#' -#' dlbcl_maf = get_ssm_by_samples(these_samples_metadata = cell_line_meta) -#' -get_ssm_by_samples <- function(these_sample_ids = NULL, - these_samples_metadata = NULL, - this_seq_type = "genome", - projection = "grch37", - tool_name = "slms-3", - verbose = FALSE, - ...){ - - #warn/notify the user what version of this function they are using - message("Using the bundled SSM calls (.maf) calls in GAMBLR.data...") - - #check if any invalid parameters are provided - check_excess_params(...) - - #get samples with the dedicated helper function - metadata = id_ease(these_samples_metadata = these_samples_metadata, - these_sample_ids = these_sample_ids, - verbose = verbose, - this_seq_type = this_seq_type) - - sample_ids = metadata$sample_id - - #get valid projections - valid_projections = grep("meta", names(GAMBLR.data::sample_data), value = TRUE, invert = TRUE) - - #return SSMs based on the selected projection - if(projection %in% valid_projections){ - sample_ssm = GAMBLR.data::sample_data[[projection]]$maf %>% - dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter((tolower(!!sym("Pipeline")) == tool_name)) - sample_ssm <- bind_rows( - sample_ssm, - GAMBLR.data::sample_data[[projection]]$ashm %>% - dplyr::filter(Tumor_Sample_Barcode %in% sample_ids) %>% - dplyr::filter((tolower(!!sym("Pipeline")) == tool_name)) - ) - - }else{ - stop(paste("please provide a valid projection. 
The following are available:", - paste(valid_projections,collapse=", "))) - } - - - # Handle possible duplicates - sample_ssm <- sample_ssm %>% - distinct(Tumor_Sample_Barcode, Chromosome, Start_Position, End_Position, .keep_all = TRUE) - # bundle genome_build with the maf_data - sample_ssm = create_maf_data(sample_ssm,projection) - # use S3-safe version of dplyr function - sample_ssm = mutate.genomic_data(sample_ssm,maf_seq_type = this_seq_type) - return(sample_ssm) -} diff --git a/R/id_ease.R b/R/id_ease.R deleted file mode 100644 index 0a1dd9e..0000000 --- a/R/id_ease.R +++ /dev/null @@ -1,109 +0,0 @@ -#' @title ID Ease -#' -#' @aliases id_ease, id ease -#' -#' @description Internal convenience function that standardize the way GAMBLR functions deals with sample IDs (these_sample_ids) -#' and metadata (these_samples_metadata). -#' -#' @details This function can take sample IDs as a vector of characters, or a metadata table in data frame format. -#' If no sample IDs are provided to the function, the function will operate on all gambl sample IDs available for the given seq type. -#' It is highly recommended to run this function with `verbose = TRUE`. -#' Since this will not only improve the overall logic on how the function operates. -#' But also might help with debugging functions that are internally calling this function. -#' The function also performs sanity checks and notifies the user if any of the requested sample IDs are not found in the metadata. -#' In addition, the function also notifies the dimensions of the returned object, providing further insight to what is returned. -#' As with all GAMBLR functions, providing a curated metadata table to any GAMBLR function (as opposed to a vector of IDs) is the safest way to ensure you get the expected result. -#' -#' @param these_samples_metadata An optional data frame with metadata, subset to sample IDs of interest. -#' If not provided will retrieve GAMBL metadata for all available samples. 
-#' @param these_sample_ids Optional character vector of GAMBL sample IDs. -#' @param this_seq_type The seq type of interest. Default is both genome and exome, with priority for genome when a sample has >1 seq_type. -#' @param verbose Set to FALSE to limit the information that gets printed to the console. Default is FALSE. -#' -#' @export -#' -#' @return Metadata (data frame). -#' -#' @examples -#' #load packages -#' library(dplyr) -#' -#' #give the function nothing (i.e return all sample IDs in the metadata for the default seq type) -#' #return metadata for all samples in the default seq type -#' all_meta = id_ease() -#' -#' #return metadata based on a sample ID -#' sample_meta = id_ease(these_sample_ids = "94-15772_tumorA") -#' -#' #return sample IDs based on an already filtered metadata -#' this_metadata = get_gambl_metadata(seq_type_filter = "genome") %>% -#' head(5) -#' -#' these_ids = id_ease(these_samples_metadata = this_metadata) -#' -id_ease = function(these_samples_metadata = NULL, - these_sample_ids = NULL, - this_seq_type = c("genome", "capture"), - verbose = FALSE){ - - #check for provided metadata, else use GAMBL metadata - if(is.null(these_samples_metadata)){ - if(verbose){ - message("id_ease: No metadata provided, the helper function will fetch metadata for all gambl samples in the selected seq type...") - } - metadata = GAMBLR.data::get_gambl_metadata(seq_type_filter = this_seq_type) - }else{ - if(verbose){ - message("id_ease: Metadata is provided and samples of the selected seq type are kept...") - } - metadata = dplyr::filter(these_samples_metadata, seq_type %in% this_seq_type) - not_seq_type = setdiff(these_samples_metadata$sample_id, metadata$sample_id) - if(length(not_seq_type) > 0){ - not_seq_type_msg = gettextf("id_ease: WARNING! 
%i samples in the provided metadata were removed because their seq types are not the same as in the `set_type` argument.", - length(not_seq_type)) - if(verbose){ - max_to_show <- 100 - if( length(not_seq_type) > max_to_show ){ - not_seq_type_msg = gettextf("%s Their first %i IDs are:", not_seq_type_msg, - max_to_show) - not_seq_type = head(not_seq_type, max_to_show) - }else{ - not_seq_type_msg = gettextf("%s Their IDs are:", not_seq_type_msg) - } - message(not_seq_type_msg) - print(not_seq_type) - }else{ - not_seq_type_msg = gettextf("%s Use `verbose = TRUE` to see their IDs.", not_seq_type_msg) - message(not_seq_type_msg) - } - } - } - - #ensure metadata is subset to specified sample IDs - if(!is.null(these_sample_ids)){ - if(verbose){ - message("id_ease: Sample IDs are provided, filtering the metadata for selected sample IDs...") - } - metadata = dplyr::filter(metadata, sample_id %in% these_sample_ids) - - #check if metadata is empty - if(nrow(metadata) == 0){ - stop("No samples in the metadata, try a different sample ID...") - } - #check the existence of provided sample IDs in the metadata - not_in_meta = setdiff(these_sample_ids, metadata$sample_id) - if(length(not_in_meta) > 0){ - message("id_ease: WARNING! The following sample IDs were not found in the metadata:") - print(not_in_meta) - } - }else{ - if(verbose){ - message("id_ease: No sample IDs provided, all sample IDs in the metadata will be kept...") - } - } - if(verbose){ - unique_samples = unique(metadata$sample_id) - message(paste0("id_ease: Returning metadata for ", length(unique_samples), " samples..." )) - } - return(metadata) -} diff --git a/R/process_regions.R b/R/process_regions.R deleted file mode 100644 index 4b10d9c..0000000 --- a/R/process_regions.R +++ /dev/null @@ -1,147 +0,0 @@ -#' @title Process Regions objects. -#' -#' @description INTERNAL FUNCTION to harmonize genomic regions specified as character vectors or data frames. 
-#' -#' @details INTERNAL FUNCTION to harmonize genomic regions specified as character vectors or data frames. -#' -#' @param regions_list Character vector of genomic regions. If neither regions nor regions_df is specified, will use GAMBLR aSHM regions -#' @param regions_bed Data frame of genomic regions with column names "chrom", "start", "end", "name" -#' @param region_padding Amount to pad the start and end coordinates by. The default is 0 (no padding). -#' @param skip_regions Character vector of genes to drop from GAMBLR aSHM regions. -#' @param only_regions Character vector of genes to include from GAMBLR aSHM regions. -#' @param projection Specify which genome build projection to use. The default is "grch37", also accepts "hg38". -#' @param sort Set to TRUE to force regions_bed to be ordered on chromosome and coordinate -#' -#' @return A list with two objects, regions as a vector and in bed format. -#' -#' @export -#' -#' @examples -#' library(dplyr) -#' -#' regions <- setNames( -#' c("chr1:10000-15000", "chr1:100000000-100005000"), -#' c("one_region", "another_region") -#' ) -#' process_regions(regions_list = regions) -#' -#' reg_bed = GAMBLR.data::grch37_ashm_regions %>% -#' dplyr::filter(chr_name == "chr17") %>% -#' mutate(name = region, chrom = chr_name, start = hg19_start, end = hg19_end) %>% -#' select(chrom, start, end, name) -#' -#' process_regions(regions_bed = reg_bed) -#' -process_regions <- function(regions_list = NULL, - regions_bed = NULL, - region_padding = 0, - skip_regions = NULL, - only_regions = NULL, - projection = "grch37", - sort = FALSE) { - - # Use default ashm region table if no regions are provided - if (is.null(regions_list)) { - if (is.null(regions_bed)) { - message("Using default GAMBLR aSHM regions. 
") - if (projection == "grch37") { - regions_bed <- create_bed_data(grch37_ashm_regions, - fix_names="concat", - concat_cols=c("gene","region"), - sep="_") - } else if(projection=="hg38") { - regions_bed <- create_bed_data(hg38_ashm_regions, - fix_names="concat", - concat_cols=c("gene","region"), - sep="_") - }else{ - stop("unsupported projection!") - } - - if (!is.null(skip_regions)) { - # drop user-specified regions - regions_bed <- regions_bed %>% - dplyr::filter(!gene %in% skip_regions) - } - if (!is.null(only_regions)) { - # keep only user-specified regions - regions_bed <- regions_bed %>% - dplyr::filter(gene %in% only_regions) - } - } - - required_cols <- c("chrom", "start", "end", "name") - if (min(required_cols %in% colnames(regions_bed)) == 0) { - stop("Provided regions_bed lacks required column names. Ensure columns chrom, start, end, and name are present. ") - } - - # gene column is required for later joins - if (!"gene" %in% colnames(regions_bed)) { - regions_bed <- mutate(regions_bed, gene = name) - } - } else { - # Convert character vector of regions to df - regions_bed <- bind_rows(lapply(regions_list, function(x) { - - chunks <- region_to_chunks(x) - if(projection=="grch37"){ - chunks$chromosome = gsub("chr","",chunks$chromosome) - }else if(projection=="hg38" && !any(grepl("chr",chunks$chromosome))){ - chunks$chromosome = paste0("chr",chunks$chromosome) - } - df <- data.frame( - chrom = chunks$chromosome, - start = as.numeric(chunks$start), - end = as.numeric(chunks$end) - ) - })) - if(sort){ - if(projection=="hg38"){ - chrom_order = c(paste0("chr",c(1:22)),"chrX","chrY") - }else{ - chrom_order = c(c(1:22),"X","Y") - } - - regions_bed = mutate(regions_bed, - chrom=factor(chrom,levels=chrom_order)) %>% - arrange(chrom,start) %>% - mutate(chrom = as.character(chrom)) - } - if (!is.null(names(regions_list))) { - regions_bed$name <- names(regions_list) - regions_bed$gene <- names(regions_list) - } else { - regions_bed = 
mutate(regions_bed,name=paste0(chrom,":",start,"-",end)) - } - } - - # Collapse regions with duplicate names - if (length(unique(regions_bed$name)) < length(regions_bed$name)) { - message("Warning: Multiple regions in the provided data frame have the same name. Merging these entries based on min(start) and max(end) per name value. ") - regions_bed <- regions_bed %>% - group_by(name) %>% - mutate( - start = min(start), - end = max(end) - ) %>% - ungroup() %>% - distinct() - } - - regions_list <- unlist(apply( - regions_bed, - 1, - function(x) { - # add specified padding around each region - paste0(x[1], ":", as.numeric(x[2]) - region_padding, "-", as.numeric(x[3]) + region_padding) - } - )) - names(regions_list) <- regions_bed$name - - return( - list( - regions_list = regions_list, - regions_bed = regions_bed - ) - ) -} diff --git a/R/region_to_chunks.R b/R/region_to_chunks.R deleted file mode 100644 index 2faad22..0000000 --- a/R/region_to_chunks.R +++ /dev/null @@ -1,26 +0,0 @@ -#' @title Separate a chromosome region into chunks -#' -#' @description `region_to_chunks` breaks the input string that stores a chromosome -#' region to create a list with chromosome number and start and end positions as -#' separated elements. -#' -#' @param region A single string that stores a chromosome region. Any format like -#' "chr1:100000-200000", "1:100000-200000", "chr1:100'000-200'000" is possible. -#' -#' @return A list with length 3 and names "chromosome", "start" and "end. 
-#' @export -#' -#' @examples -#' region_to_chunks(region = "chr1:100000-200000") -#' -region_to_chunks = function(region){ - region = unname(region) - region = gsub(",", "", region) - #format is chr6:37060224-37151701 - split_chunks = unlist(strsplit(region, ":")) - chromosome = split_chunks[1] - startend = unlist(strsplit(split_chunks[2], "-")) - qstart = startend[1] - qend = startend[2] - return(list(chromosome = chromosome, start = qstart, end = qend)) -} diff --git a/R/review_hotspots.R b/R/review_hotspots.R deleted file mode 100644 index 66d8d51..0000000 --- a/R/review_hotspots.R +++ /dev/null @@ -1,114 +0,0 @@ -#' @title Review Hotspots. -#' -#' @description Annotate MAF-like data frome with a hot_spot column indicating recurrent mutations. -#' -#' @details This function takes an annotated MAF (with [annotate_hotspots]) and updates an existing column, "hot_spot", in the same data frame. -#' Genes for hotspot review are supplied with the `genes_of_interest` parameter. -#' Currently only a few sets of genes are supported, see parameter description for more information and limitations. -#' The desired genome build can be specified with `genome_build` parameter. Should be the same as the incoming MAF. -#' -#' @param annotated_maf A data frame in MAF format that has hotspots annotated using the function annotate_hotspots(). -#' @param genes_of_interest A vector of genes for hotspot review. Currently only FOXO1, MYD88, CREBBP, NOTCH1, NOTCH2, CD79B and EZH2 are supported. -#' @param genome_build Reference genome build for the coordinates in the MAF file. The default is grch37 genome build. -#' -#' @return The same data frame (as given to the `annotated_maf` parameter) with the reviewed column "hot_spot". 
-#' -#' @import dplyr -#' @export -#' -#' @examples -#' hot_ssms = review_hotspots(annotate_hotspots(get_coding_ssm(this_seq_type = "genome")), -#' genes_of_interest = c("CREBBP")) -#' -review_hotspots = function(annotated_maf, - genes_of_interest = c("FOXO1", "MYD88", "CREBBP", "NOTCH1", "NOTCH2", "CD79B", "EZH2"), - genome_build){ - if(missing(genome_build)){ - if("maf_data" %in% class(annotated_maf)){ - genome_build = get_genome_build(annotated_maf) - #drop our S3 classes because these additional attributes seem to cause some problems when the data is subsequently munged. - annotated_maf = strip_genomic_classes(annotated_maf) - }else{ - stop("genome_build is required") - } - } - - # define the list of genes currently supported for review - supported_genes = c("FOXO1", "MYD88", "CREBBP", "NOTCH1", "NOTCH2", "CD79B", "EZH2") - - # check genome build because CREBBP coordinates are hg19-based or hg38-based - - if (genome_build %in% c("hg19", "grch37", "hs37d5", "GRCh37")){ - coordinates = hotspot_regions_grch37 - }else if(genome_build %in% c("hg38", "grch38", "GRCh38")){ - coordinates = hotspot_regions_hg38 - }else{ - stop("The genome build specified is not currently supported. Please provide MAF file in one of the following cordinates: hg19, grch37, hs37d5, GRCh37, hg38, grch38, or GRCh38") - } - # check that at least one of the currently supported genes are present - if (length(intersect(supported_genes, genes_of_interest))==0){ - stop(paste0("Currently only ", paste(supported_genes, collapse=", "), " are supported. Please specify one of these genes.")) - } - # notify user that there is limited number of genes currently supported - if (length(setdiff(genes_of_interest, supported_genes))>0){ - message(strwrap(paste0("Currently only ", paste(supported_genes, collapse=", "), - " are supported. By default only these genes from the supplied list will be reviewed. 
Reviewing hotspots for genes ", - paste(intersect(supported_genes, genes_of_interest), collapse = ", "), ", it will take a second ..."))) - } - if("FOXO1" %in% genes_of_interest){ - annotated_maf = annotated_maf %>% - dplyr::mutate(hot_spot = ifelse(Hugo_Symbol == "FOXO1" & - HGVSp_Short == "p.M1?", - "TRUE", hot_spot)) - } - - if("CREBBP" %in% genes_of_interest){ - annotated_maf = annotated_maf %>% - dplyr::mutate(hot_spot = ifelse(Hugo_Symbol == "CREBBP" & - Start_Position > coordinates["CREBBP", "start"] & - End_Position < coordinates["CREBBP", "end"] & - Variant_Classification == "Missense_Mutation", - "TRUE", hot_spot)) - } - if("EZH2" %in% genes_of_interest){ - annotated_maf = annotated_maf %>% - dplyr::mutate(hot_spot = ifelse(Hugo_Symbol == "EZH2" & - Start_Position > coordinates["EZH2", "start"] & - End_Position < coordinates["EZH2", "end"], - "TRUE", hot_spot)) - } - if("MYD88" %in% genes_of_interest){ - annotated_maf = annotated_maf %>% - dplyr::mutate(hot_spot = ifelse(Hugo_Symbol == "MYD88" & - HGVSp_Short %in% c("p.L273P", "p.L265P"), - "TRUE", hot_spot)) - } - if("NOTCH1" %in% genes_of_interest){ - annotated_maf = annotated_maf %>% - dplyr::mutate(hot_spot = ifelse(Hugo_Symbol == "NOTCH1" & - Start_Position < coordinates["NOTCH1", "start"], - "TRUE", hot_spot)) - } - if("NOTCH2" %in% genes_of_interest){ - annotated_maf = annotated_maf %>% - dplyr::mutate(hot_spot = ifelse(Hugo_Symbol == "NOTCH2" & - Start_Position < coordinates["NOTCH2", "start"], - "TRUE", hot_spot)) - } - - if("CD79B" %in% genes_of_interest){ - truncating_variants = c("Frame_Shift_Del", "Frame_Shift_Ins", "Nonsense_Mutation", "Splice_Region", "Splice_Site") - annotated_maf = annotated_maf %>% - dplyr::mutate(hot_spot = ifelse(Hugo_Symbol == "CD79B" & - Start_Position < coordinates["CD79B_trunc", "start"] & - Variant_Classification %in% truncating_variants, - "TRUE", hot_spot)) %>% - dplyr::mutate(hot_spot = ifelse(Hugo_Symbol == "CD79B" & - Start_Position < 
coordinates["CD79B_NONtrunc", "start"] & - ! Variant_Classification %in% truncating_variants, - "TRUE", hot_spot)) - } - annotated_maf = create_maf_data(annotated_maf,genome_build) - - return(annotated_maf) -} diff --git a/man/annotate_hotspots.Rd b/man/annotate_hotspots.Rd deleted file mode 100644 index 6040620..0000000 --- a/man/annotate_hotspots.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/annotate_hotspots.R -\name{annotate_hotspots} -\alias{annotate_hotspots} -\title{Annotate Hotspots.} -\usage{ -annotate_hotspots(mutation_maf, ...) -} -\arguments{ -\item{mutation_maf}{A data frame in MAF format.} - -\item{...}{Any other parameter. These parameters will be ignored.} -} -\value{ -The same data frame with one additional column "hot_spot". -} -\description{ -Annotate MAF-like data frome with a hot_spot column indicating recurrent mutations. -} -\details{ -This function takes an already loaded MAF data frame with the \code{mutation_maf} parameter. -} -\examples{ -my_metadata = get_gambl_metadata() -all_coding_ssm = get_coding_ssm(these_samples_metadata = my_metadata, - projection = "grch37", - this_seq_type = "genome") - -hot_ssms = annotate_hotspots(all_coding_ssm) - -} diff --git a/man/assign_cn_to_ssm.Rd b/man/assign_cn_to_ssm.Rd deleted file mode 100644 index e536b4b..0000000 --- a/man/assign_cn_to_ssm.Rd +++ /dev/null @@ -1,67 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/assign_cn_to_ssm.R -\name{assign_cn_to_ssm} -\alias{assign_cn_to_ssm} -\title{Assign CN to SSM.} -\usage{ -assign_cn_to_ssm( - this_sample_id, - genes, - this_seq_type = "genome", - projection = "grch37", - coding_only = FALSE, - assume_diploid = FALSE, - include_silent = FALSE, - ... 
-) -} -\arguments{ -\item{this_sample_id}{Sample ID of the sample you want to annotate.} - -\item{genes}{A vector of characters with gene symbols (Hugo).} - -\item{this_seq_type}{Specified seq type for returned data. Default is genome.} - -\item{projection}{Specified genome projection that returned data is in -reference to. Default is grch37.} - -\item{coding_only}{Optional. Set to TRUE to restrict to only coding variants -(ssm). Deafult is FALSE.} - -\item{assume_diploid}{Optional, this parameter annotates every mutation as -copy neutral. Default is FALSE.} - -\item{include_silent}{Logical parameter indicating whether to include silent -mutations into coding mutations. Default is FALSE. This parameter only -makes sense if \code{coding_only} is set to TRUE.} - -\item{...}{Any additional parameters.} -} -\value{ -A list containing a data frame (MAF-like format) with three extra -columns: -- log.ratio is the log ratio from the seg file (NA when no overlap). -- LOH -- CN (the rounded absolute copy number estimate of the region based on -log.ratio, NA when no overlap was found). -} -\description{ -Annotate mutations with their copy number information. -} -\details{ -This function takes a sample ID with the \code{this_sample_id} parameter -and annotates mutations with copy number information. A variety of -parameters are at hand for a customized workflow. For example, -the user can specify if only coding mutations are of interest. To do so, -set \code{coding_only = TRUE}. This function internally calls -\code{get_ssm_by_samples} and \code{get_sample_cn_segments}. This function can -also take a vector with genes of interest (\code{genes}) that the returned -data frame will be restricted to. 
-} -\examples{ -cn_list = assign_cn_to_ssm( - this_sample_id = "DOHH-2", - coding_only = TRUE -) - -} diff --git a/man/bind_genomic_data.Rd b/man/bind_genomic_data.Rd deleted file mode 100644 index 1bc28f6..0000000 --- a/man/bind_genomic_data.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{bind_genomic_data} -\alias{bind_genomic_data} -\title{Bind maf or other genomic data together} -\usage{ -bind_genomic_data(..., check_id = TRUE) -} -\arguments{ -\item{...}{All maf_data or seg_data objects to be combined.} - -\item{check_id}{Logical. If TRUE (the default), the function will check for the presence of the expected ID column -and for duplicate sample IDs across the inputs. Set to FALSE to skip this check.} -} -\value{ -data.frame with combined data and preserved genome_build metadata. -} -\description{ -Combine multiple maf_data objects and retain metadata such as genome_build. -This function will not allow you to combine maf_data objects that have different genome_build values. -An error will also be thrown if the same sample id is found in more than one of the inputs (if check_id is TRUE). 
-} -\examples{ - -merged_maf = bind_genomic_data(maf1, maf2,check_id=FALSE) - -} diff --git a/man/calc_mutation_frequency_bin_region.Rd b/man/calc_mutation_frequency_bin_region.Rd deleted file mode 100644 index 83a50c3..0000000 --- a/man/calc_mutation_frequency_bin_region.Rd +++ /dev/null @@ -1,94 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/calc_mutation_frequency_bin_region.R -\name{calc_mutation_frequency_bin_region} -\alias{calc_mutation_frequency_bin_region} -\title{Calculate Mutation Frequency By Sliding Window.} -\usage{ -calc_mutation_frequency_bin_region( - region, - chromosome, - start_pos, - end_pos, - these_samples_metadata = NULL, - these_sample_ids = NULL, - this_seq_type = "genome", - maf_data = NULL, - projection = "grch37", - slide_by = 100, - window_size = 1000, - return_format = "long", - min_count_per_bin = 0, - return_count = TRUE, - drop_unmutated = FALSE, - ... -) -} -\arguments{ -\item{region}{A string describing a genomic region in the "chrom:start-end" format. -The region must be specified in this format OR as separate chromosome, start_pos, end_pos arguments.} - -\item{chromosome}{Chromosome name in region.} - -\item{start_pos}{Start coordinate of region.} - -\item{end_pos}{End coordinate of region.} - -\item{these_samples_metadata}{Optional data frame containing a sample_id column. -If not providing a maf file, seq_type is also a required column.} - -\item{these_sample_ids}{Optional vector of sample IDs. Output will be subset -to IDs present in this vector.} - -\item{this_seq_type}{Optional vector of seq_types to include in heatmap. -Default is "genome". Uses default seq_type priority for samples -with >1 seq_type.} - -\item{maf_data}{Optional maf data frame. Will be subset to rows where -Tumor_Sample_Barcode matches provided sample IDs or metadata table. -If not provided, maf data will be obtained with get_ssm_by_regions().} - -\item{projection}{Specify which genome build to use. Required. 
Default grch37.} - -\item{slide_by}{Slide size for sliding window. Default 100.} - -\item{window_size}{Size of sliding window. Default 1000.} - -\item{return_format}{Return format of mutations. Accepted inputs are "long" -and "wide". Long returns a data frame of one sample ID/window per row. -Wide returns a matrix with one sample ID per row and one window per column. -Using the "wide" format will retain all samples and windows regardless of -the drop_unmutated or min_count_per_bin parameters.} - -\item{min_count_per_bin}{Minimum counts per bin, default is 0. Setting this -greater than 0 will drop unmutated windows only when return_format is long.} - -\item{return_count}{Boolean statement to return mutation count per window (TRUE) -or binary mutated/unmutated status (FALSE). Default is TRUE.} - -\item{drop_unmutated}{Boolean for whether to drop windows with 0 mutations. -Only effective with "long" return format.} - -\item{...}{Any additional parameters.} -} -\value{ -Either a matrix or a long tidy table of counts per window. -} -\description{ -Count the number of mutations in a sliding window across a -region for all samples. -} -\details{ -This function is called to return the mutation frequency for a given -region, either from a provided input maf data frame or from the GAMBL maf data. -Regions are specified with the \code{region} parameter. Alternatively, the region of -interest can also be specified by calling the function with \code{chromosome}, -\code{start_pos}, and \code{end_pos} parameters. This function operates on a single region. -To return a matrix of sliding window counts over multiple regions, -see \code{calc_mutation_frequency_bin_regions}. 
-} -\examples{ -myc_mut_freq = calc_mutation_frequency_bin_region(region = "8:128747680-128753674", - slide_by = 10, - window_size = 10000) - -} diff --git a/man/calc_mutation_frequency_bin_regions.Rd b/man/calc_mutation_frequency_bin_regions.Rd deleted file mode 100644 index 7ef307b..0000000 --- a/man/calc_mutation_frequency_bin_regions.Rd +++ /dev/null @@ -1,101 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/calc_mutation_frequency_bin_regions.R -\name{calc_mutation_frequency_bin_regions} -\alias{calc_mutation_frequency_bin_regions} -\title{Mutation counts across sliding windows for multiple regions.} -\usage{ -calc_mutation_frequency_bin_regions( - regions_list = NULL, - regions_bed = NULL, - these_samples_metadata = NULL, - these_sample_ids = NULL, - this_seq_type = "genome", - maf_data = NULL, - projection = "grch37", - region_padding = 1000, - drop_unmutated = FALSE, - skip_regions = NULL, - only_regions = NULL, - slide_by = 100, - window_size = 500, - return_format = "wide", - ... -) -} -\arguments{ -\item{regions_list}{Named vector of regions in the format -c(name1 = "chr:start-end", name2 = "chr:start-end"). If neither \code{regions} nor -\code{regions_bed} is specified, the function will use GAMBLR aSHM region information.} - -\item{regions_bed}{Data frame of regions with four columns (chrom, start, end, name).} - -\item{these_samples_metadata}{Metadata with at least sample_id column. -If not providing a maf data frame, seq_type is also required.} - -\item{these_sample_ids}{Vector of sample IDs. Metadata will be subset to -sample IDs present in this vector.} - -\item{this_seq_type}{Optional vector of seq_types to include in heatmap. -Default "genome". Uses default seq_type priority for samples with >1 seq_type.} - -\item{maf_data}{Optional maf data frame. Will be subset to rows where -Tumor_Sample_Barcode matches provided sample IDs or metadata table. 
-If not provided, maf data will be obtained with get_ssm_by_regions().} - -\item{projection}{Genome build the function will operate in. Ensure this -matches your provided regions and maf data for correct chr prefix handling. Default "grch37".} - -\item{region_padding}{Amount to pad the start and end coordinates by. Default 1000.} - -\item{drop_unmutated}{Whether to drop bins with 0 mutations. If returning a -matrix format, this will only drop bins with no mutations in any samples.} - -\item{skip_regions}{Optional character vector of genes to exclude from the default aSHM regions.} - -\item{only_regions}{Optional character vector of genes to include from the default aSHM regions.} - -\item{slide_by}{Slide size for sliding window. Default 100.} - -\item{window_size}{Size of sliding window. Default 500.} - -\item{return_format}{Return format of mutations. Accepted inputs are "long" and -"wide". Long returns a data frame of one sample ID/window per row. Wide returns -a matrix with one sample ID per row and one window per column. Using the "wide" -format will retain all samples and windows regardless of the drop_unmutated or -min_count_per_bin parameters. Default wide.} - -\item{...}{Any additional parameters.} -} -\value{ -A table of mutation counts for sliding windows across one or more regions. May be long or wide. -} -\description{ -Obtain a long tidy or wide matrix of mutation counts across -sliding windows for multiple regions. -} -\details{ -This function takes a metadata table with \code{these_samples_metadata} -parameter and internally calls \code{calc_mutation_frequency_bin_region} -(that internally calls \code{get_ssm_by_regions}). -to retrieve mutation counts for sliding windows across one or more regions. -May optionally provide any combination of a maf data frame, existing metadata, -or a regions data frame or named vector. 
-} -\examples{ -#get some regions -these_regions <- process_regions(only_regions = c("MYC", "BCL2", "BCL6")) -reg_vec <- these_regions$regions_list -reg_bed <- these_regions$regions_bed - -# use a set of user defined regions (from genes) and -# calculate mut frequency across all available samples -mult_freq_all = calc_mutation_frequency_bin_regions(regions_list = reg_vec) -mult_freq_all = calc_mutation_frequency_bin_regions(regions_bed = reg_bed) - -#restrict the analysis to specific samples using the metadata -my_meta = get_gambl_metadata() \%>\% - dplyr::filter(pathology \%in\% c("DLBCL","FL")) -mult_reg_freq_fl_dlbcl = calc_mutation_frequency_bin_regions(regions_list = reg_vec, - these_sample_ids = "DOHH-2") - -} diff --git a/man/check_excess_params.Rd b/man/check_excess_params.Rd deleted file mode 100644 index a4f5064..0000000 --- a/man/check_excess_params.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check_excess_params.R -\name{check_excess_params} -\alias{check_excess_params} -\title{Check Excess Params} -\usage{ -check_excess_params(...) -} -\arguments{ -\item{...}{Parameters to check.} -} -\value{ -Nothing -} -\description{ -Function for checking excessive parameter names. -This function will notify the user if any unavailable parameters are called for any given given function. -This function is designed to work as internal function-call in already available GAMBLR functions. -} -\details{ -Catch function calls containing unsupported arguments. 
-} diff --git a/man/collate_results.Rd b/man/collate_results.Rd deleted file mode 100644 index 4a0b13f..0000000 --- a/man/collate_results.Rd +++ /dev/null @@ -1,64 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/collate_results.R -\name{collate_results} -\alias{collate_results} -\title{Collate Results} -\usage{ -collate_results( - sample_table, - these_samples_metadata, - join_with_full_metadata = FALSE, - seq_type_filter = c("genome", "capture"), - ... -) -} -\arguments{ -\item{sample_table}{A vector of characters with sample IDs, or a data frame with sample IDs in a column (sample_id). -If provided, this will overwrite any sample subsets provided these_samples_metadata.} - -\item{these_samples_metadata}{A metadata table with sample IDs of interest. -If not provided, the function will get metadata for all available samples. -This parameter is intended to use in combination with \code{join_with_full_metadata}.} - -\item{join_with_full_metadata}{Set to TRUE to horizontally expand metadata with QC results. -Default is FALSE. If \code{these_samples_metadata} is provided, collated resutls will be added to this metadata table. -If not provided, the function will join collated results with all available metadata in the specified seq_type (\code{seq_type_filter}).} - -\item{seq_type_filter}{Filtering criteria for \code{get_gambl_metadata} if \code{these_samples_metadata} is not provided, default is genomes and captures.} - -\item{...}{Any additional parameters.} -} -\value{ -A data frame with collated results. -} -\description{ -Bring together collated results for a selection of gambl samples. -} -\details{ -Currently, this function only gathers QC metrics (\code{mirage_metrics}) as the only collated result. -Potentially, in the future, additional collated results can be added by this function as well. 
-} -\examples{ -#load packages -library(dplyr) - -#return collated results for all available samples -all_collated = collate_results() - -#return available collated results for a metadata subset -fl_collated = collate_results( - these_samples_metadata = get_gambl_metadata( - seq_type_filter = "genome") \%>\% - dplyr::filter(pathology == "FL")) - -#horizontally expand a metadata subset with collated results -fl_meta_collated = collate_results( - join_with_full_metadata = TRUE, - these_samples_metadata = get_gambl_metadata( - seq_type_filter = "genome") \%>\% - dplyr::filter(pathology == "FL")) - -#horizontally expand all available metadata with collated results -all_meta_collated = collate_results(join_with_full_metadata = TRUE) - -} diff --git a/man/cool_overlaps.Rd b/man/cool_overlaps.Rd deleted file mode 100644 index dc7b9d4..0000000 --- a/man/cool_overlaps.Rd +++ /dev/null @@ -1,98 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cool_overlaps.R -\name{cool_overlaps} -\alias{cool_overlaps} -\title{Cool overlap of data frames.} -\usage{ -cool_overlaps( - data1, - data2, - columns1 = c("Chromosome", "Start_Position", "End_Position"), - columns2 = c("Chromosome", "Start_Position", "End_Position"), - type = "any", - nomatch = FALSE -) -} -\arguments{ -\item{data1}{Data frame with data to overlap. Required parameter. The minimal -required columns are those supplied with the argument columns1. Will -dictate the naming of the columns used for overlap in the output.} - -\item{data2}{Data frame with data to overlap. Required parameter. The minimal -required columns are those supplied with the argument columns2.} - -\item{columns1}{The list of columns from data frame data1 to be used to find -overlapping regions.} - -\item{columns2}{The list of columns from data frame data2 to be used to find -overlapping regions.} - -\item{type}{Character specifying the way to find overlaps. 
Accepted values -are "any" (used as default), "start", "end", "within", and "equal". -Please see function description for more details of different types.} - -\item{nomatch}{Whether the rows from data1 that do not have overlap in data2 -should be returned or not. The default is FALSE (rows without overlap -are not returned). If TRUE is specified, the row order in the output -data will match the exact order of rows in the input data1.} -} -\value{ -data frame -} -\description{ -This function implements overlap of 2 data frames that contain -regions of coordinates similar to what data.table::foverlaps does. Unlike -foverlaps, this function takes as input data frame class objects, and relies -on dplyr solution rather than data.table handling, therefore allowing usage -of data frames with virtually unlimited dimensions without crashing. This -implementation uses same logic of different types of overlaps as the original -foverlaps solution ("any", "start", "end", "within", "equal"). The type "any" -is default and allows for any overlapping solution between 2 regions. The -type "start" only considers regions with exact same start position as -overlap; similarly type "end" considers regions overlapped when the end -positions are exact matches. Type "within" means that regions are overlapped -when one is contained in another and neither start nor end positions match. -Finally, type "equal" only considers overlap when both start and end -positions match for both regions. For any type, the presence of any -additional column not directly specifying regions (for example, Chromosome) -will serve similar to a grouping variable. -The generated output of this function will contain the overlapping regions -and all columns present in the data frame data1, as well as any columns from -the data frame supplied with data2 argument, except for those columns present -in data2 that are used for overlap. 
When the same columns are present in both -data1 and data2, the output data frame will have ".x" and ".y" suffixes to -indicate which original input data they are coming from. -} -\examples{ -# obtain maf data -maf1 <- get_coding_ssm( - these_sample_ids = "DOHH-2" -) - -maf2 <- get_coding_ssm( - these_sample_ids = "SU-DHL-4" -) - -# The same mutations are not expected to be present in different samples -# so this overlap will produce 0 matching rows -overlap <- cool_overlaps( - maf1, - maf1, - type = "equal" -) - -# To demonstrate functionality we can supply the same maf to the data2 -overlap <- cool_overlaps( - maf1, - maf1 \%>\% head -) - -# We can also overlap different formats, for example -seg1 <- get_sample_cn_segments(these_sample_ids = "DOHH-2") -overlap <- cool_overlaps( - data1 = maf1, - data2 = seg1, - columns2 = c("chrom", "start", "end") -) - -} diff --git a/man/create_bed_data.Rd b/man/create_bed_data.Rd deleted file mode 100644 index 537a835..0000000 --- a/man/create_bed_data.Rd +++ /dev/null @@ -1,86 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{create_bed_data} -\alias{create_bed_data} -\title{Create BED Data} -\usage{ -create_bed_data( - bed_df, - genome_build = NULL, - fix_names = NULL, - concat_cols = NULL, - sep = "" -) -} -\arguments{ -\item{bed_df}{A data frame containing the BED data.} - -\item{genome_build}{A string specifying the genome build ("grch37" or "hg38"). -If NULL, the function will try to infer the genome build from the object name.} - -\item{fix_names}{Either NULL (the default), or one of "chrom_start_end" or "concat". -If not NULL and duplicate names are detected, the function will apply the chosen fix.} - -\item{concat_cols}{When \code{fix_names = "concat"}, a character vector specifying which columns -from the original data to merge.} - -\item{sep}{The separator to use when concatenating columns if fix_names = "concat". 
-Defaults to "" (no separator).} -} -\value{ -A data frame with class attributes for BED data. -} -\description{ -This function creates BED (Browser Extensible Data) objects from the given input. -It assumes that the BED data should have columns corresponding to chromosome, start, -and end. If the second and third columns are not numeric (as expected for start and end), -the function will attempt to identify the proper columns by matching column names. -} -\details{ -In the output, the first three columns will be renamed to "chrom", "start", and "end". -If a fourth column exists, it is renamed to "name" (and any additional columns are preserved). - -Additionally, if a "name" column exists and its values are not unique, the function -will warn the user. The user can optionally supply a method to automatically fix the -names via the \code{fix_names} argument: -\itemize{ -\item If \code{fix_names = "chrom_start_end"}, the new name will be built as "chrom:start-end". -\item If \code{fix_names = "concat"}, then the columns specified by \code{concat_cols} (using the -original column names in the input data) will be concatenated to form the new name. -By default, no separator is used, but a separator can be specified via the \code{sep} -argument. -} - -After applying the fix, the function checks if the new names are unique. If they are not, -a warning is issued that includes up to five examples of duplicate names and the row numbers -where they occur. 
-} -\examples{ - -# get a abed_data object for all aSHM regions -ashm_bed = create_bed_data(GAMBLR.data::grch37_ashm_regions, - fix_names = "concat", - concat_cols = c("gene","region"), - sep="-") -# the build is automatically inferred if it is in the variable name -# get_genome_build(ashm_bed) -# [1] "grch37" - -another_bed = create_bed_data(somatic_hypermutation_locations_GRCh37_v_latest, - fix_names = "concat", - concat_cols = c("chr_name","hg19_start","hg19_end")) - -# get_genome_build(another_bed) -# [1] "grch37" - -# get a bed_data object for all gene regions and combine several columns to make a unique name -gene_regions <- create_bed_data(hg38_gene_coordinates, - fix_names = "concat", - sep="-", - concat_cols = c("chromosome","start","end","gene_name")) - -#get_genome_build(gene_regions) -# [1] "hg38" - - -} diff --git a/man/create_maf_data.Rd b/man/create_maf_data.Rd deleted file mode 100644 index a6e8445..0000000 --- a/man/create_maf_data.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{create_maf_data} -\alias{create_maf_data} -\title{Create MAF Data} -\usage{ -create_maf_data(maf_df, genome_build) -} -\arguments{ -\item{maf_df}{A data frame containing the MAF data.} - -\item{genome_build}{A string specifying the genome build ("grch37" or "hg38").} -} -\value{ -A data frame with class attributes for MAF data. -} -\description{ -This function creates MAF (Mutation Annotation Format) data from the given input. 
-} diff --git a/man/create_seg_data.Rd b/man/create_seg_data.Rd deleted file mode 100644 index af31259..0000000 --- a/man/create_seg_data.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_cn_segments.R -\name{create_seg_data} -\alias{create_seg_data} -\title{Create Segmented Data} -\usage{ -create_seg_data(seg_df, genome_build) -} -\arguments{ -\item{seg_df}{A data frame containing the segmented data.} - -\item{genome_build}{A string specifying the genome build ("grch37" or "hg38").} -} -\value{ -A data frame with class attributes for segmented data. -} -\description{ -This function creates segmented data from the given input. -} -\examples{ -seg_df <- data.frame(...) -create_seg_data(seg_df, "grch37") -} diff --git a/man/get_ashm_count_matrix.Rd b/man/get_ashm_count_matrix.Rd deleted file mode 100644 index eb9943e..0000000 --- a/man/get_ashm_count_matrix.Rd +++ /dev/null @@ -1,65 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_ashm_count_matrix.R -\name{get_ashm_count_matrix} -\alias{get_ashm_count_matrix} -\title{Get ASHM Count Matrix.} -\usage{ -get_ashm_count_matrix( - regions_bed, - these_samples_metadata, - this_seq_type, - projection = "grch37" -) -} -\arguments{ -\item{regions_bed}{A bed file with one row for each region.} - -\item{these_samples_metadata}{This is used to complete your matrix. All GAMBL -samples will be used by default. Provide a data frame with at least -sample_id for all samples if you are using non-GAMBL data.} - -\item{this_seq_type}{The seq type to return results for. Only used if no -metadata is provided with these_samples_metadata.} - -\item{projection}{Which genome build to use for the mutations -(must match the coordinate system your regions to avoid a nonsense result)} -} -\value{ -matrix -} -\description{ -Prepare a matrix with one row per sample and one column per -region using a set of hypermutated regions. 
-} -\details{ -Values are the number of mutations in that patient in the region. -} -\examples{ -regions_bed = create_bed_data(GAMBLR.data::grch37_ashm_regions, - fix_names="concat", - concat_cols=c("gene","region"), - sep="-") -my_meta = get_gambl_metadata() \%>\% dplyr::filter(pathology=="DLBCL") -matrix <- get_ashm_count_matrix( - regions_bed = regions_bed, - this_seq_type = "genome" -) - -#this example intentionally fails - matrix <- get_ashm_count_matrix(regions_bed=regions_bed,this_seq_type = "genome", - these_samples_metadata = my_meta, - projection = "hg38") -# Error in get_ashm_count_matrix( -# Your projection argument does not match the genome_build of regions_bed - -# format the name column to include the chromosome coordinates instead of the gene -regions_bed = create_bed_data(GAMBLR.data::hg38_ashm_regions, - fix_names="concat", - concat_cols=c("chr_name","hg38_start","hg38_end"), - sep="-") - - matrix_hg38 <- get_ashm_count_matrix(regions_bed=regions_bed,this_seq_type = "genome", - these_samples_metadata = my_meta, - projection = "hg38") - -} diff --git a/man/get_cn_segments.Rd b/man/get_cn_segments.Rd deleted file mode 100644 index 369bce9..0000000 --- a/man/get_cn_segments.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_cn_segments.R -\name{get_cn_segments} -\alias{get_cn_segments} -\title{Get CN Segments.} -\usage{ -get_cn_segments( - these_samples_metadata, - projection = "grch37", - this_seq_type, - ... -) -} -\arguments{ -\item{these_samples_metadata}{User must provide a metadata table to restrict the data to the samples in your table. -The metadata also ensures the proper handling of duplicate sample_id across seq_types and ensures the -seq_type in the metadata faithfully represents the seq_type of the data} - -\item{projection}{Desired genome coordinate system for returned CN segments. 
Default is "grch37".} - -\item{this_seq_type}{Deprecated.} - -\item{...}{Additional parameters to be passed to the function.} -} -\value{ -A data frame with CN segments for the specified region. -} -\description{ -Retrieve all copy number segments from the GAMBL outputs -} -\details{ -This function merely loads and returns all the seg_data available for a projection (genome build) -} -\examples{ -# Example for the capture samples: - -genome_metadata = GAMBLR.data::get_gambl_metadata(seq_type_filter="genome") - -genome_segments_hg38 = get_cn_segments( - these_samples_metadata = genome_metadata, - projection="hg38") - - -} diff --git a/man/get_coding_ssm.Rd b/man/get_coding_ssm.Rd deleted file mode 100644 index 9c0afdd..0000000 --- a/man/get_coding_ssm.Rd +++ /dev/null @@ -1,75 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_coding_ssm.R -\name{get_coding_ssm} -\alias{get_coding_ssm} -\title{Get Coding SSMs} -\usage{ -get_coding_ssm( - these_sample_ids = NULL, - these_samples_metadata = NULL, - projection = "grch37", - this_seq_type = "genome", - tool_name = "slms-3", - min_read_support = 3, - include_silent = TRUE, - verbose = FALSE, - ... -) -} -\arguments{ -\item{these_sample_ids}{Optional, a vector of multiple sample_id (or a single -sample ID as a string) that you want results for.} - -\item{these_samples_metadata}{Optional, a metadata table (with sample IDs in -a column) to subset the return to. If not provided (and if -\code{these_sample_ids} is not provided), the function will return all -samples from the specified seq_type in the metadata.} - -\item{projection}{Reference genome build for the coordinates in the MAF file. -The default is grch37.} - -\item{this_seq_type}{The this_seq_type you want back, default is genome.} - -\item{tool_name}{Optionally specify which tool to report variant from. 
The -default is slms-3, also supports "publication" to return the exact -variants as reported in the original papers.} - -\item{min_read_support}{Only returns variants with at least this many reads -in t_alt_count.} - -\item{include_silent}{Logical parameter indicating whether to include silent -mutations into coding mutations. Default is TRUE.} - -\item{verbose}{Set to FALSE to minimize the output to console. Default is -TRUE. This parameter also dictates the verbosity of any helper function -internally called inside the main function.} - -\item{...}{Any additional parameters.} -} -\value{ -data frame -} -\description{ -Convenience function for loading coding Simple Somatic Mutations -(SSM) from the bundled data \link{sample_data}. -} -\details{ -This "bare bones" function was developed to retrieve coding SSM -calls for non-GSC-users. Effectively retrieve coding SSM calls. Multiple -filtering parameters are available for this function. For more -information on how to implement the filtering parameters, refer to the -parameter descriptions as well as examples in the vignettes. This -function depends on the bundled sample data in this package. 
-} -\examples{ - - # Get mutations from exome data originally aligned to grch37 -ssm_exomes_grch37 = get_coding_ssm(projection = "grch37",this_seq_type = "capture") - -# Get mutations from genome data, hg38 build -ssm_genomes_hg38 = get_coding_ssm(projection = "hg38",this_seq_type = "genome") - - - - -} diff --git a/man/get_coding_ssm_status.Rd b/man/get_coding_ssm_status.Rd deleted file mode 100644 index 458c407..0000000 --- a/man/get_coding_ssm_status.Rd +++ /dev/null @@ -1,97 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_coding_ssm_status.R -\name{get_coding_ssm_status} -\alias{get_coding_ssm_status} -\title{Get Coding SSM Status.} -\usage{ -get_coding_ssm_status( - gene_symbols, - these_samples_metadata, - maf_data, - include_hotspots = TRUE, - keep_multihit_hotspot = FALSE, - review_hotspots = TRUE, - genes_of_interest = c("FOXO1", "MYD88", "CREBBP"), - genome_build, - include_silent = FALSE, - include_silent_genes, - ... -) -} -\arguments{ -\item{gene_symbols}{A vector of gene symbols for which the mutation status -will be tabulated. If not provided, lymphoma genes will be returned -by default.} - -\item{these_samples_metadata}{The metadata for samples of interest to be -included in the returned matrix. Only the column "sample_id" is -required. If not provided, the example metadata is used as default.} - -\item{maf_data}{data frame in maf format. Must be in the grch37 projection.} - -\item{include_hotspots}{Logical parameter indicating whether hotspots object -should also be tabulated. Default is TRUE.} - -\item{keep_multihit_hotspot}{Logical parameter indicating whether to keep the -gene annotation as mutated when the gene has both hot spot and -non-hotspot mutation. Default is FALSE. 
If set to TRUE, will report the -number of non-hotspot mutations instead of tabulating for just mutation -presence.} - -\item{review_hotspots}{Logical parameter indicating whether hotspots object -should be reviewed to include functionally relevant mutations or rare -lymphoma-related genes. Default is TRUE.} - -\item{genes_of_interest}{A vector of genes for hotspot review. Currently only -FOXO1, MYD88, and CREBBP are supported.} - -\item{genome_build}{Reference genome build for the coordinates in the MAF -file. The default is inferred from maf_data.} - -\item{include_silent}{Logical parameter indicating whether to include silent -mutations into coding mutations. Default is FALSE.} - -\item{include_silent_genes}{Optionally, provide a list of genes for which the -Silent variants to be considered. If provided, the Silent variants for -these genes will be included regardless of the include_silent argument.} - -\item{...}{Any other parameter. These parameters will be ignored.} -} -\value{ -A data frame with tabulated mutation status. -} -\description{ -Tabulate mutation status (SSM) for a set of genes. -} -\details{ -This function takes a data frame (in MAF-like format) and converts -it to a binary one-hot encoded matrix of mutation status for either a set of -user-specified genes (via gene_symbols) or, if no genes are provided, default -to all lymphoma genes. The default behaviour is to assign each gene/sample_id -combination as mutated only if there is a protein coding mutation for that -sample in the MAF but this can be configured to use synonymous variants in -some (via include_silent_genes) or all (via include_silent) genes. -This function also has other filtering and convenience parameters giving -the user full control of the return. For more information, refer to the -parameter descriptions and examples. -Currently only the grch37 genome build is supported for hotspot annotation -and review for this version of the function. 
-} -\examples{ -coding_tabulated_df = get_coding_ssm_status( - maf_data = get_coding_ssm(), - gene_symbols = c("EZH2","KMT2D","CREBBP","MYC") -) - - - -#all lymphoma genes from bundled NHL gene list -coding_tabulated_df = get_coding_ssm_status() - -#this example will fail because hg38 is not supported by this function (yet) -coding_tabulated_df = get_coding_ssm_status(maf_data= - get_coding_ssm(projection = "hg38")) -# Error in get_coding_ssm_status(maf_data = get_coding_ssm(projection = "hg38")) : -# Currently only grch37 projection (hg19 genome build) is supported. - -} diff --git a/man/get_gambl_metadata.Rd b/man/get_gambl_metadata.Rd deleted file mode 100644 index 664fac3..0000000 --- a/man/get_gambl_metadata.Rd +++ /dev/null @@ -1,79 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_gambl_metadata.R -\name{get_gambl_metadata} -\alias{get_gambl_metadata} -\title{Get GAMBL Metadata.} -\usage{ -get_gambl_metadata(seq_type_filter = "genome", case_set, ...) -} -\arguments{ -\item{seq_type_filter}{Specify the seq type you want to return metadata for. -Default is "genome".} - -\item{case_set}{Optionally specify study details to return samples from a -particular case set. See function description for supported case sets.} - -\item{...}{Any additional parameters.} -} -\value{ -A data frame with metadata, tailored for user without GSC access. - -\describe{ -\item{compression}{Format of the original data used as input for our analysis pipelines (cram, bam or fastq)} -\item{bam_available}{Whether or not this file was available when last checked.} -\item{patient_id}{The anonymized unique identifier for this patient. 
For BC samples, this will be Res ID.} -\item{sample_id}{A unique identifier for the sample analyzed.} -\item{seq_type}{The assay type used to produce this data (one of "genome","capture, "mrna", "promethION")} -\item{genome_build}{The name of the genome reference the data were aligned to.} -\item{cohort}{Name for a group of samples that were added together (usually from a single study), often in the format {pathology}_{cohort_descriptor}.} -\item{pathology}{The diagnosis or pathology for the sample} -\item{time_point}{Timing of biopsy in increasing alphabetical order (A = diagnosis, B = first relapse etc)} -\item{ffpe_or_frozen}{Whether the nucleic acids were extracted from a frozen or FFPE sample} -\item{COO_consensus}{Consensus call of COO between different sources.} -\item{DHITsig_consensus}{Consensus call of DHIT signature status between different sources.} -\item{EBV_status_inf}{Inferred EBV status of the tumor} -\item{lymphgen_no_cnv}{LymphGen label using model without CNV} -\item{lymphgen_with_cnv}{LymphGen label using model with CNV} -\item{lymphgen_cnv_noA53}{LymphGen label using model with CNV but excluding A53 class} -\item{lymphgen_wright}{The LymphGen call for this sample from Wright et all (if applicable)} -\item{fl_grade}{Grade of FL samples} -\item{normal_sample_id}{Sample id for normal tissue used in the analysis} -\item{pairing_status}{Matching status of the sample} -\item{lymphgen}{LymphGen label} -\item{molecular_BL}{label of the sample according to the molecular BL classifier} -\item{Tumor_Sample_Barcode}{Duplicate of sample_id for simplifying joins to MAF data frames} -\item{pathology_rank}{Numeric rank for consistent ordering of samples by pathology} -\item{hiv_status}{HIV status of the sample} -\item{age_group}{Adult_BL or Pediatric_BL or Other, specific to the BLGSP study} -\item{sex}{The biological sex of the patient, if available. Allowable options: M, F, NA} -} -} -\description{ -Convenience function for loading the sample metadata. 
-} -\details{ -This bare bones function was developed to retrieve metadata for -non-GSC-users. Specify the seq type (\code{seq_type_filter}) for the samples you -want returned as the only argument. -It relies on the bundled metadata in this package. -Specify the \code{case_set} argument to retrieve samples from a particular study. -Currently supported case_sets are: FL_Dreval (FL samples from Dreval et al), -DLBCL_Dreval (DLBCL samples from Dreval et al), FL-DLBCL-study (all samples -from Dreval et al), DLBCL_Arthur (all samples from Arthur et al study), -DLBCL_Hilton (all samples from Hilton et al DLBCL Trios study), -DLBCL_cell_lines (5 DLBCL cell lines), DLBCL_Chapuy (all samples from Chapuy -et al study), DLBCL_Schmitz (all samples from Schmitz et al study), -DLBCL_Reddy (all samples from Reddy et al study), DLBCL_Thomas (HTMCP DLBCLs -from Thomas et al study), BL_Thomas (BL samples from Thomas et al study) -} -\examples{ -#return metadata for genome samples -genome_meta = get_gambl_metadata(seq_type_filter = "genome") - -#return metadata for capture samples -capture_meta = get_gambl_metadata(seq_type_filter = "capture") - -#return metadata for genome and capture samples -all_meta = get_gambl_metadata(seq_type_filter = c("genome", "capture")) - -} diff --git a/man/get_genome_build.Rd b/man/get_genome_build.Rd deleted file mode 100644 index 6a40f4e..0000000 --- a/man/get_genome_build.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{get_genome_build} -\alias{get_genome_build} -\title{Get Genome Build} -\usage{ -get_genome_build(data) -} -\arguments{ -\item{data}{A data frame with genome build attribute.} -} -\value{ -A string specifying the genome build. -} -\description{ -This function retrieves the genome build attribute from the data.
-} diff --git a/man/get_manta_sv.Rd b/man/get_manta_sv.Rd deleted file mode 100644 index 1f71895..0000000 --- a/man/get_manta_sv.Rd +++ /dev/null @@ -1,84 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_manta_sv.R -\name{get_manta_sv} -\alias{get_manta_sv} -\title{Get Manta SVs} -\usage{ -get_manta_sv( - these_sample_ids = NULL, - these_samples_metadata = NULL, - projection = "grch37", - this_seq_type = "genome", - chromosome, - qstart, - qend, - region, - pairing_status, - min_vaf = 0.1, - min_score = 40, - pass = TRUE, - verbose = FALSE, - ... -) -} -\arguments{ -\item{these_sample_ids}{Optional, a vector of multiple sample_id (or a single sample ID as a string) that you want results for.} - -\item{these_samples_metadata}{Optional, a metadata table (with sample IDs in a column) to subset the return to. -If not provided (and if \code{these_sample_ids} is not provided), the function will return all samples from the specified seq_type in the metadata.} - -\item{projection}{The projection genome build. Default is grch37.} - -\item{this_seq_type}{The this_seq_type you want back, default is genome.} - -\item{chromosome}{Optional, the chromosome you are restricting to (can be prefixed or not prefixed).} - -\item{qstart}{Optional, query start coordinate of the range you are restricting to.} - -\item{qend}{Optional, query end coordinate of the range you are restricting to.} - -\item{region}{Optional, region formatted like chrX:1234-5678 (chromosome can be prefixed or not prefixed) instead of specifying chromosome, start and end separately.} - -\item{pairing_status}{Use to restrict results (if desired) to matched or unmatched results (default is to return all). This parameter takes the filtering condition as a string ("matched" or "unmatched").} - -\item{min_vaf}{The minimum tumour VAF for a SV to be returned. Default is 0.1.} - -\item{min_score}{The lowest Manta somatic score for a SV to be returned. 
Default is 40.} - -\item{pass}{If TRUE (default) only return SVs that are annotated with PASS in the FILTER column. Set to FALSE to keep all variants, regardless if they PASS the filters.} - -\item{verbose}{Set to FALSE to minimize the output to console. Default is FALSE. This parameter also dictates the verbosity of any helper function internally called inside the main function.} - -\item{...}{Any additional parameters.} -} -\description{ -Convenience function for retrieving Manta Structural Variants (SVs) from the bundled data \link{sample_data}. -} -\details{ -To obtain SV calls for multiple samples, give \code{these_sample_ids} a vector of sample IDs. -Alternatively, the user can also provide the \code{these_samples_metadata} parameter to make use of an already subset metadata table. -In this case, the returned SVs will be restricted to the sample_ids within that data frame. -This function internally calls \link{id_ease} to streamline sample ID/metadata parameters. -This function can also restrict the returned calls to any genomic regions specified within \code{chromosome}, \code{qstart}, \code{qend}, -or the complete region specified under \code{region} (in chr:start-end format), note that chromosome can be either prefixed or not prefixed. -Useful filtering parameters are also available, use \code{min_vaf} to set the minimum tumour VAF for a SV to be returned and \code{min_score} -to set the lowest Manta somatic score for a SV to be returned. \code{pairing_status} can be used to return variants from either matched or unmatched samples. -In addition, the user can choose to return all variants, even the ones not passing the filter criteria. To do so, set \code{pass = FALSE} (default is TRUE).
-} -\examples{ -#load packages -library(dplyr) - -#lazily get every SV in the table with default quality filters -all_sv = get_manta_sv() - -#get all SVs DLBCL cell line samples -cell_line_meta = GAMBLR.data::sample_data$meta \%>\% - dplyr::filter(cohort == "DLBCL_cell_lines") - -dlbcl_sv = get_manta_sv(these_samples_metadata = cell_line_meta) - -#get the SVs in a region around MYC -myc_locus_sv = get_manta_sv(region = "8:128723128-128774067") - -} diff --git a/man/get_sample_cn_segments.Rd b/man/get_sample_cn_segments.Rd deleted file mode 100644 index df0959c..0000000 --- a/man/get_sample_cn_segments.Rd +++ /dev/null @@ -1,65 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_sample_cn_segments.R -\name{get_sample_cn_segments} -\alias{get_sample_cn_segments} -\title{Get Sample CN Segments.} -\usage{ -get_sample_cn_segments( - these_sample_ids = NULL, - these_samples_metadata = NULL, - projection = "grch37", - this_seq_type = "genome", - with_chr_prefix = FALSE, - streamlined = FALSE, - verbose = FALSE, - ... -) -} -\arguments{ -\item{these_sample_ids}{Optional, a vector of multiple sample_id (or a single sample ID as a string) that you want results for.} - -\item{these_samples_metadata}{Optional, a metadata table (with sample IDs in a column) to subset the return to. -If not provided (and if \code{these_sample_ids} is not provided), the function will return all samples from the specified seq_type in the metadata.} - -\item{projection}{Selected genome projection for returned CN segments. Default is "grch37".} - -\item{this_seq_type}{Seq type for returned CN segments. Default is genome.} - -\item{with_chr_prefix}{Set to TRUE to add a chr prefix to chromosome names. Default is FALSE.} - -\item{streamlined}{Return a minimal output rather than full details. Default is FALSE.} - -\item{verbose}{Set to FALSE to minimize the output to console. Default is TRUE. 
This parameter also dictates the verbosity of any helper function internally called inside the main function.} - -\item{...}{Any additional parameters.} -} -\value{ -A data frame of segments for a specific or multiple sample ID(s). -} -\description{ -Get all segments for a single (or multiple) sample_id(s). -} -\details{ -This function returns CN segments. This works for single sample or multiple samples. -Specify the sample IDs you are interested in with \code{these_sample_ids} (as a vector of characters), -Or call this function with \code{these_samples_metadata} if you already have a metadata table subset to the sample IDs of interest. -If none of the above parameters are specified, the function will return CN segments for available samples (from get_gambl_metadata). -Note, this function internally calls \link{id_ease} for dealing with sample IDs and metadata tables. -} -\examples{ -#load packages -library(dplyr) - -#get CN segments for one sample -dohh2_segs = get_sample_cn_segments(these_sample_ids = "DOHH-2", - projection = "hg38", - streamlined = TRUE) - -#get CN segments for DLBCL cell line -cell_line_meta = GAMBLR.data::sample_data$meta \%>\% - dplyr::filter(cohort == "DLBCL_cell_lines") - -dlbcl_segs = get_sample_cn_segments(these_samples_metadata = cell_line_meta, - streamlined = TRUE) - -} diff --git a/man/get_ssm_by_patients.Rd b/man/get_ssm_by_patients.Rd deleted file mode 100644 index b24582c..0000000 --- a/man/get_ssm_by_patients.Rd +++ /dev/null @@ -1,67 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_ssm_by_patients.R -\name{get_ssm_by_patients} -\alias{get_ssm_by_patients} -\title{Get SSM By Patients.} -\usage{ -get_ssm_by_patients( - these_patient_ids, - these_samples_metadata, - projection = "grch37", - this_seq_type = "genome", - tool_name = "slms-3", - this_study, - verbose = FALSE, - ... -) -} -\arguments{ -\item{these_patient_ids}{A vector of patient IDs that you want results for.
-The user can also use a metadata table that has been subset to the patient IDs of interest (see \code{these_samples_metadata}).} - -\item{these_samples_metadata}{A metadata subset to contain the rows corresponding to the patients of interest. -If the vector of patient IDs is missing (\code{these_patient_ids}), this function will default to all patient IDs in the metadata table given to this parameter.} - -\item{projection}{Obtain variants projected to this reference (one of grch37 or hg38). Default is grch37.} - -\item{this_seq_type}{The seq type you want results for. Default is "genome".} - -\item{tool_name}{Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers.} - -\item{this_study}{Optionally specify first name of the author for the paper -from which the variants should be returned for. -This parameter can either be a vector of indexes (integer) or a vector of characters (matching columns in MAF).} - -\item{verbose}{Set to FALSE to minimize the output to console. Default is TRUE. This parameter also dictates the verbosity of any helper function internally called inside the main function.} - -\item{...}{Any additional parameters.} -} -\value{ -A data frame with SSM calls for the selected patients in MAF format. -} -\description{ -Get MAF-format data frame for more than one patient. -} -\details{ -This function returns variants from a set of patients. -This function internally calls \link{get_ssm_by_samples}. -Thus, the main contents of this function is to wrangle the provided patient IDs, -so that the corresponding sample IDs can be provided to the internal call of \code{get_ssm_by_samples}. -This function expects either a vector of patient IDs (\code{these_patients_ids}) -or an already subset metadata table (\code{these_samples_metadata}). 
-} -\examples{ -#load packages -library(dplyr) - -#basic usage, these_patient_ids -my_patient = get_ssm_by_patients(these_patient_ids = "DOHH-2") - -#using a subset metadata table to retrieve patient SSMs -cell_line_meta = GAMBLR.data::sample_data$meta \%>\% - dplyr::filter(cohort == "DLBCL_cell_lines") - -patient_maf = get_ssm_by_patients(these_samples_metadata = cell_line_meta, - this_seq_type = "genome") - -} diff --git a/man/get_ssm_by_region.Rd b/man/get_ssm_by_region.Rd deleted file mode 100644 index e25a627..0000000 --- a/man/get_ssm_by_region.Rd +++ /dev/null @@ -1,80 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_ssm_by_region.R -\name{get_ssm_by_region} -\alias{get_ssm_by_region} -\title{Get SSM By Region.} -\usage{ -get_ssm_by_region( - these_sample_ids = NULL, - these_samples_metadata = NULL, - maf_data, - chromosome, - qstart, - qend, - region = "", - streamlined = FALSE, - projection = "grch37", - this_seq_type = "genome", - tool_name = "slms-3", - this_study, - verbose = FALSE, - ... -) -} -\arguments{ -\item{these_sample_ids}{Optional, a vector of multiple sample_id (or a single sample ID as a string) that you want results for.} - -\item{these_samples_metadata}{Optional, a metadata table (with sample IDs in a column) to subset the return to. -If not provided (and if \code{these_sample_ids} is not provided), the function will return all samples from the specified seq_type in the metadata.} - -\item{maf_data}{Optional data frame with mutations in MAF format. -If user provides a maf, the function trusts that the user has already subset this to samples of interest, correct seq_type.
-i.e the following parameters are ignored; \code{these_samples_metadata}, \code{these_sample_ids}, and \code{this_seq_type}} - -\item{chromosome}{The chromosome you are restricting to (with or without a chr prefix).} - -\item{qstart}{Query start coordinate of the range you are restricting to.} - -\item{qend}{Query end coordinate of the range you are restricting to.} - -\item{region}{Region formatted like chrX:1234-5678 instead of specifying chromosome, start and end separately.} - -\item{streamlined}{Return Start_Position and Tumor_Smaple_Barcode as the only two MAF columns. Default is FALSE.} - -\item{projection}{Obtain variants projected to this reference (one of grch37 or hg38).} - -\item{this_seq_type}{The seq_type you want back, default is genome.} - -\item{tool_name}{Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers.} - -\item{this_study}{Optionally specify first name of the author for the paper -from which the variants should be returned for.} - -\item{verbose}{Set to FALSE to prevent ANY message to be printed. -In most cases, this parameter should be left to TRUE. -The parameter was added to accommodate for noisy output -when running this function in a loop for retrieving SSM -for multiple regions \link{get_ssm_by_regions}.} - -\item{...}{Any additional parameters.} -} -\value{ -A data frame containing all mutations (MAF) in the specified region. -} -\description{ -Retrieve all SSMs from the GAMBL database within a single genomic coordinate range. -} -\details{ -This function lets the user specify a region of interest for returning SSM calls within that region. -There are multiple ways a region can be specified. For example, the user can provide the full region in a "region" format (chr:start-end) to the \code{region} parameter. 
-Or, the user can provide chromosome, start and end coordinates individually with the \code{chromosome}, \code{qstart}, and \code{qend} parameters. -} -\examples{ -my_mutations = get_ssm_by_region(region = "chr8:128,723,128-128,774,067") - -#specifying chromosome, start and end individually -my_mutations = get_ssm_by_region(chromosome = "8", - qstart = 128723128, - qend = 128774067) - -} diff --git a/man/get_ssm_by_regions.Rd b/man/get_ssm_by_regions.Rd deleted file mode 100644 index f36a948..0000000 --- a/man/get_ssm_by_regions.Rd +++ /dev/null @@ -1,69 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_ssm_by_regions.R -\name{get_ssm_by_regions} -\alias{get_ssm_by_regions} -\title{Get SSM By Regions.} -\usage{ -get_ssm_by_regions( - these_samples_metadata, - regions_list, - regions_bed, - this_seq_type = "genome", - streamlined = TRUE, - projection = "grch37", - verbose = FALSE, - tool_name = "slms-3", - ... -) -} -\arguments{ -\item{these_samples_metadata}{Optional, a metadata table (with sample IDs in a column) to subset the return to.} - -\item{regions_list}{A vector of regions in the chr:start-end format to restrict the returned SSM calls to.} - -\item{regions_bed}{A data frame in BED format with the coordinates you want to retrieve (recommended). -This parameter can also accept an additional column with region names that will be added to the return if \code{use_name_column = TRUE}} - -\item{this_seq_type}{The seq_type you want back, default is genome.} - -\item{streamlined}{If set to TRUE (default) only 3 columns will be kept in the returned data frame (start, sample_id and region_name).} - -\item{projection}{Obtain variants projected to this reference (one of grch37 or hg38), default is grch37.} - -\item{verbose}{Set to TRUE to maximize the output to console. Default is FALSE.
-This parameter also dictates the verbosity of any helper function internally called inside the main function.} - -\item{tool_name}{Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers.} - -\item{...}{Any additional parameters.} -} -\value{ -Returns a data frame of variants in MAF-like format. -} -\description{ -Efficiently retrieve all mutations across a range of genomic regions. -} -\details{ -This function internally calls get_ssm_by_region to retrieve SSM calls for the specified regions. -} -\examples{ -#basic usage, adding custom names from bundled ashm data frame -regions_bed = create_bed_data( GAMBLR.data::grch37_ashm_regions, - fix_names = "concat", - concat_cols = c("gene","region"), - sep="-") - -my_meta = get_gambl_metadata() -# get a full MAF-format data frame for all aSHM regions on grch37 coordinates -ashm_maf = get_ssm_by_regions(regions_bed = regions_bed, - these_samples_metadata = my_meta, - streamlined = FALSE) - -# This example intentionally fails -ashm_maf = get_ssm_by_regions(regions_bed = regions_bed, - these_samples_metadata = my_meta, - projection="hg38") -# Error in get_ssm_by_regions(regions_bed = regions_bed, these_samples_metadata = my_meta, : -# requested projection: hg38 and genome_build of regions_bed: grch37 don't match - -} diff --git a/man/get_ssm_by_samples.Rd b/man/get_ssm_by_samples.Rd deleted file mode 100644 index d975111..0000000 --- a/man/get_ssm_by_samples.Rd +++ /dev/null @@ -1,54 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_ssm_by_samples.R -\name{get_ssm_by_samples} -\alias{get_ssm_by_samples} -\title{Get SSM By Samples.} -\usage{ -get_ssm_by_samples( - these_sample_ids = NULL, - these_samples_metadata = NULL, - this_seq_type = "genome", - projection = "grch37", - tool_name = "slms-3", - verbose = FALSE, - ... 
-) -} -\arguments{ -\item{these_sample_ids}{A vector of one or more sample IDs that you want results for.} - -\item{these_samples_metadata}{Optional, a metadata table (with sample IDs in a column) to auto-subset the data to samples in that table before returning. -If not provided and these_sample_ids is also not provided, the function will return SSM for all samples from the specified seq_type in the bundled metadata.} - -\item{this_seq_type}{Default is genome.} - -\item{projection}{The projection genome build. Supports hg38 and grch37.} - -\item{tool_name}{Optionally specify which tool to report variant from. The default is slms-3, also supports "publication" to return the exact variants as reported in the original papers.} - -\item{verbose}{Enable for debugging/noisier output.} - -\item{...}{Any additional parameters.} -} -\value{ -data frame in MAF format. -} -\description{ -Get the SSMs (i.e. load MAF) for a single sample or a collection of samples. -} -\details{ -Retrieve a maf for a specific sample or a set of samples. -Either specify the sample IDs of interest with \code{these_sample_ids}. -Or a metadata table subset to the sample IDs of interest with \code{these_samples_metadata}. 
-} -\examples{ -#load a common dependency -library(dplyr) - -#Get genome-wide set of mutations from all DLBCL cell lines -cell_line_meta = get_gambl_metadata() \%>\% - dplyr::filter(cohort == "DLBCL_cell_lines") - -dlbcl_maf = get_ssm_by_samples(these_samples_metadata = cell_line_meta) - -} diff --git a/man/id_ease.Rd b/man/id_ease.Rd deleted file mode 100644 index 7ee88e9..0000000 --- a/man/id_ease.Rd +++ /dev/null @@ -1,61 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/id_ease.R -\name{id_ease} -\alias{id_ease} -\alias{id_ease,} -\alias{id} -\alias{ease} -\title{ID Ease} -\usage{ -id_ease( - these_samples_metadata = NULL, - these_sample_ids = NULL, - this_seq_type = c("genome", "capture"), - verbose = FALSE -) -} -\arguments{ -\item{these_samples_metadata}{An optional data frame with metadata, subset to sample IDs of interest. -If not provided will retrieve GAMBL metadata for all available samples.} - -\item{these_sample_ids}{Optional character vector of GAMBL sample IDs.} - -\item{this_seq_type}{The seq type of interest. Default is both genome and exome, with priority for genome when a sample has >1 seq_type.} - -\item{verbose}{Set to FALSE to limit the information that gets printed to the console. Default is FALSE.} -} -\value{ -Metadata (data frame). -} -\description{ -Internal convenience function that standardize the way GAMBLR functions deals with sample IDs (these_sample_ids) -and metadata (these_samples_metadata). -} -\details{ -This function can take sample IDs as a vector of characters, or a metadata table in data frame format. -If no sample IDs are provided to the function, the function will operate on all gambl sample IDs available for the given seq type. -It is highly recommended to run this function with \code{verbose = TRUE}. -Since this will not only improve the overall logic on how the function operates. -But also might help with debugging functions that are internally calling this function. 
-The function also performs sanity checks and notifies the user if any of the requested sample IDs are not found in the metadata. -In addition, the function also notifies the dimensions of the returned object, providing further insight to what is returned. -As with all GAMBLR functions, providing a curated metadata table to any GAMBLR function (as opposed to a vector of IDs) is the safest way to ensure you get the expected result. -} -\examples{ -#load packages -library(dplyr) - -#give the function nothing (i.e return all sample IDs in the metadata for the default seq type) -#return metadata for all samples in the default seq type -all_meta = id_ease() - -#return metadata based on a sample ID -sample_meta = id_ease(these_sample_ids = "94-15772_tumorA") - -#return sample IDs based on an already filtered metadata -this_metadata = get_gambl_metadata(seq_type_filter = "genome") \%>\% - head(5) - -these_ids = id_ease(these_samples_metadata = this_metadata) - -} diff --git a/man/preserve_genomic_attributes.Rd b/man/preserve_genomic_attributes.Rd deleted file mode 100644 index 45ed20b..0000000 --- a/man/preserve_genomic_attributes.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{preserve_genomic_attributes} -\alias{preserve_genomic_attributes} -\title{Preserve Genomic Attributes} -\usage{ -preserve_genomic_attributes(new_data, old_data) -} -\arguments{ -\item{new_data}{A data frame resulting from dplyr operations.} - -\item{old_data}{The original data frame with genomic attributes.} -} -\value{ -A data frame with preserved genomic attributes. -} -\description{ -This function preserves the genomic attributes and class after dplyr operations. 
-} diff --git a/man/process_regions.Rd b/man/process_regions.Rd deleted file mode 100644 index 4dc42e3..0000000 --- a/man/process_regions.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/process_regions.R -\name{process_regions} -\alias{process_regions} -\title{Process Regions objects.} -\usage{ -process_regions( - regions_list = NULL, - regions_bed = NULL, - region_padding = 0, - skip_regions = NULL, - only_regions = NULL, - projection = "grch37", - sort = FALSE -) -} -\arguments{ -\item{regions_list}{Character vector of genomic regions. If neither regions nor regions_df is specified, will use GAMBLR aSHM regions} - -\item{regions_bed}{Data frame of genomic regions with column names "chrom", "start", "end", "name"} - -\item{region_padding}{Amount to pad the start and end coordinates by. The default is 0 (no padding).} - -\item{skip_regions}{Character vector of genes to drop from GAMBLR aSHM regions.} - -\item{only_regions}{Character vector of genes to include from GAMBLR aSHM regions.} - -\item{projection}{Specify which genome build projection to use. The default is "grch37", also accepts "hg38".} - -\item{sort}{Set to TRUE to force regions_bed to be ordered on chromosome and coordinate} -} -\value{ -A list with two objects, regions as a vector and in bed format. -} -\description{ -INTERNAL FUNCTION to harmonize genomic regions specified as character vectors or data frames. -} -\details{ -INTERNAL FUNCTION to harmonize genomic regions specified as character vectors or data frames. 
-} -\examples{ -library(dplyr) - -regions <- setNames( - c("chr1:10000-15000", "chr1:100000000-100005000"), - c("one_region", "another_region") -) -process_regions(regions_list = regions) - -reg_bed = GAMBLR.data::grch37_ashm_regions \%>\% -dplyr::filter(chr_name == "chr17") \%>\% - mutate(name = region, chrom = chr_name, start = hg19_start, end = hg19_end) \%>\% - select(chrom, start, end, name) - -process_regions(regions_bed = reg_bed) - -} diff --git a/man/region_to_chunks.Rd b/man/region_to_chunks.Rd deleted file mode 100644 index a4ab176..0000000 --- a/man/region_to_chunks.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/region_to_chunks.R -\name{region_to_chunks} -\alias{region_to_chunks} -\title{Separate a chromosome region into chunks} -\usage{ -region_to_chunks(region) -} -\arguments{ -\item{region}{A single string that stores a chromosome region. Any format like -"chr1:100000-200000", "1:100000-200000", "chr1:100'000-200'000" is possible.} -} -\value{ -A list with length 3 and names "chromosome", "start" and "end. -} -\description{ -\code{region_to_chunks} breaks the input string that stores a chromosome -region to create a list with chromosome number and start and end positions as -separated elements. 
-} -\examples{ -region_to_chunks(region = "chr1:100000-200000") - -} diff --git a/man/review_hotspots.Rd b/man/review_hotspots.Rd deleted file mode 100644 index f0f253e..0000000 --- a/man/review_hotspots.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/review_hotspots.R -\name{review_hotspots} -\alias{review_hotspots} -\title{Review Hotspots.} -\usage{ -review_hotspots( - annotated_maf, - genes_of_interest = c("FOXO1", "MYD88", "CREBBP", "NOTCH1", "NOTCH2", "CD79B", "EZH2"), - genome_build -) -} -\arguments{ -\item{annotated_maf}{A data frame in MAF format that has hotspots annotated using the function annotate_hotspots().} - -\item{genes_of_interest}{A vector of genes for hotspot review. Currently only FOXO1, MYD88, CREBBP, NOTCH1, NOTCH2, CD79B and EZH2 are supported.} - -\item{genome_build}{Reference genome build for the coordinates in the MAF file. The default is grch37 genome build.} -} -\value{ -The same data frame (as given to the \code{annotated_maf} parameter) with the reviewed column "hot_spot". -} -\description{ -Annotate MAF-like data frame with a hot_spot column indicating recurrent mutations. -} -\details{ -This function takes an annotated MAF (with \link{annotate_hotspots}) and updates an existing column, "hot_spot", in the same data frame. -Genes for hotspot review are supplied with the \code{genes_of_interest} parameter. -Currently only a few sets of genes are supported, see parameter description for more information and limitations. -The desired genome build can be specified with the \code{genome_build} parameter. It should be the same as the build of the incoming MAF.
-} -\examples{ -hot_ssms = review_hotspots(annotate_hotspots(get_coding_ssm(this_seq_type = "genome")), - genes_of_interest = c("CREBBP")) - -} diff --git a/man/strip_genomic_classes.Rd b/man/strip_genomic_classes.Rd deleted file mode 100644 index 5c08846..0000000 --- a/man/strip_genomic_classes.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genomic_data.R -\name{strip_genomic_classes} -\alias{strip_genomic_classes} -\title{Strip Genomic Data Classes} -\usage{ -strip_genomic_classes(x, classes = c("genomic_data", "maf_data", "bed_data")) -} -\arguments{ -\item{x}{An object, such as one of your genomic data objects.} - -\item{classes}{A character vector of class names to remove. The default is -c("genomic_data", "maf_data", "bed_data").} -} -\value{ -The object with the specified classes removed. -} -\description{ -This function removes custom classes associated with genomic data objects -(by default, "genomic_data", "maf_data", and "bed_data") from the class attribute -of an object. This can be useful when you want to revert an S3 object to its -underlying data.frame (or data.table) classes without converting the object. -}