Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Loader for mutationTimeR (get_timed_mutations) #87

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
^.*\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
^vignettes/articles$
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ export(get_ssh_session)
export(get_ssm_by_regions)
export(get_ssm_by_samples)
export(get_study_info)
export(get_timed_mutations)
export(id_ease)
export(link_seq_types)
export(maf_to_custom_track)
Expand Down
206 changes: 102 additions & 104 deletions R/annotate_maf_triplet.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,48 +32,47 @@
#' @export
#'
#' @examples
#' maf <- GAMBLR.open::get_coding_ssm(projection="hg38") %>% head(n=500)
#' maf <- GAMBLR.open::get_coding_ssm(projection = "hg38") %>% head(n = 500)
#' # peek at the data
#' dplyr::select(maf,1:12) %>% head()
#' dplyr::select(maf, 1:12) %>% head()
#'
#' maf_anno <- annotate_maf_triplet(maf)
#' dplyr::select(maf_anno,1:12,seq) %>% head()
#' dplyr::select(maf_anno, 1:12, seq) %>% head()
#' # Each mutation is now associated with it's sequence context in the
#' # reference genome in a column named seq
#' \dontrun{
#' annotate_maf_triplet(maf, all_SNVs = FALSE, "C", "T")
#' annotate_maf_triplet(maf, ref = "C", alt = "T", pyrimidine_collapse = TRUE)
#' annotate_maf_triplet(maf, all_SNVs = FALSE, "C", "T")
#' annotate_maf_triplet(maf, ref = "C", alt = "T", pyrimidine_collapse = TRUE)
#' }

annotate_maf_triplet = function(maf,
all_SNVs = TRUE,
ref,
alt,
genome_build,
fastaPath,
pyrimidine_collapse = FALSE){
genome = ""
annotate_maf_triplet <- function(maf,
all_SNVs = TRUE,
ref,
alt,
genome_build,
fastaPath,
pyrimidine_collapse = FALSE) {
genome <- ""
# Get projection from NCBI_Build column of the maf
if ("NCBI_Build" %in% colnames(maf)) {
genome_build = tolower(maf$NCBI_Build[1])
genome_build <- tolower(maf$NCBI_Build[1])
} # If it is not in the maf, look for it in bsgenome_name
else if (!missing(bsgenome_name)) {
bsgenome_parts = unlist(strsplit(bsgenome_name, "\\."))
genome_build = bsgenome_parts[4]
bsgenome_parts <- unlist(strsplit(bsgenome_name, "\\."))
genome_build <- bsgenome_parts[4]
} # Cannot find it ask for mutating it
else{
else {
stop("Please add the 'NCBI_Build' column to your MAF file to specify the genome build.")
}
bsgenome_loaded = FALSE
bsgenome_loaded <- FALSE

# If there is no fastaPath, it will read it from config key

# Based on the genome_build the fasta file which will be loaded is different
if (missing(fastaPath)){
if("maf_data" %in% class(maf)){
genome_build = get_genome_build(maf)
if (missing(fastaPath)) {
if ("maf_data" %in% class(maf)) {
genome_build <- get_genome_build(maf)
}
if(missing(genome_build)){
if (missing(genome_build)) {
stop("no genome_build information provided or present in maf")
}
base <- check_config_value(config::get("repo_base"))
Expand All @@ -83,145 +82,144 @@ annotate_maf_triplet = function(maf,
genome_build,
"/genome_fasta/genome.fa"
)
if(!file.exists(fastaPath)){
#try BSgenome
installed = installed.genomes()
if(genome_build=="hg38"){
bsgenome_name = "BSgenome.Hsapiens.UCSC.hg38"
}else if(genome_build == "grch37"){
bsgenome_name = "BSgenome.Hsapiens.NCBI.GRCh37"
}else{
stop(paste("unsupported genome:",genome_build))
if (!file.exists(fastaPath)) {
# try BSgenome
installed <- installed.genomes()
if (genome_build == "hg38") {
bsgenome_name <- "BSgenome.Hsapiens.UCSC.hg38"
} else if (genome_build == "grch37") {
bsgenome_name <- "BSgenome.Hsapiens.NCBI.GRCh37"
} else {
stop(paste("unsupported genome:", genome_build))
}
if(bsgenome_name %in% installed){
genome = getBSgenome(bsgenome_name)
bsgenome_loaded = TRUE
}else{
if (bsgenome_name %in% installed) {
genome <- getBSgenome(bsgenome_name)
bsgenome_loaded <- TRUE
} else {
print(installed)
print(paste("Local Fasta file cannot be found and missing genome_build",bsgenome_name,"Supply a fastaPath for a local fasta file or install the missing BSGenome package and re-run"))
print(paste("Local Fasta file cannot be found and missing genome_build", bsgenome_name, "Supply a fastaPath for a local fasta file or install the missing BSGenome package and re-run"))
}
}
}
# It checks for the presence of a local fastaPath
if(!bsgenome_loaded){
if (!bsgenome_loaded) {
# Create a reference to an indexed fasta file.
if (!file.exists(fastaPath)) {
stop("Failed to find the fasta file and no compatible BSgenome found")
}
fasta = Rsamtools::FaFile(file = fastaPath)
fasta <- Rsamtools::FaFile(file = fastaPath)
}

# Store the complement of ref and alt alleles
complement <- c(
'A'= 'T',
'T'= 'A',
'C'= 'G',
'G'= 'C'
"A" = "T",
"T" = "A",
"C" = "G",
"G" = "C"
)
if(pyrimidine_collapse){
if (pyrimidine_collapse) {
maf <- maf %>%
dplyr::mutate(
mutation_strand=ifelse(
Reference_Allele %in% c("A","G"),
"-",
"+"
)
) %>%
dplyr::mutate(
mutation = ifelse(
Reference_Allele %in% c("A","G"),
paste0(complement[Reference_Allele],">",complement[Tumor_Seq_Allele2]),
paste0(Reference_Allele,">",Tumor_Seq_Allele2)))
}else{
maf = dplyr::mutate(
maf,
mutation_strand="+")
dplyr::mutate(
mutation_strand = ifelse(
Reference_Allele %in% c("A", "G"),
"-",
"+"
)
) %>%
dplyr::mutate(
mutation = ifelse(
Reference_Allele %in% c("A", "G"),
paste0(complement[Reference_Allele], ">", complement[Tumor_Seq_Allele2]),
paste0(Reference_Allele, ">", Tumor_Seq_Allele2)
)
)
} else {
maf <- dplyr::mutate(
maf,
mutation_strand = "+"
)
}

CompRef = complement[ref]
CompAlt = complement[alt]
CompRef <- complement[ref]
CompAlt <- complement[alt]
# Provide triple sequence for all the SNVs
# Using BSgenome
if (all_SNVs == TRUE){
if(bsgenome_loaded){
# Using BSgenome
if (all_SNVs == TRUE) {
if (bsgenome_loaded) {
sequences <- maf %>%
dplyr::mutate(
seq = ifelse(
(nchar(maf$Reference_Allele) == 1 &
nchar(maf$Tumor_Seq_Allele2) == 1
nchar(maf$Tumor_Seq_Allele2) == 1
),
as.character(
Rsamtools::getSeq(
genome,
maf$Chromosome,
start = maf$Start_Position-1,
end = maf$Start_Position + 1,
strand = mutation_strand
)
Rsamtools::getSeq(
genome,
maf$Chromosome,
start = maf$Start_Position - 1,
end = maf$Start_Position + 1,
strand = mutation_strand
)
),
"NA"
)
)

}else{
} else {
# Using local Fasta file
sequences <- maf %>%
dplyr::mutate(
seq = ifelse(
(nchar(maf$Reference_Allele) == 1 &
nchar(maf$Tumor_Seq_Allele2) == 1
nchar(maf$Tumor_Seq_Allele2) == 1
),
as.character(
Rsamtools::getSeq(
fasta,
GenomicRanges::GRanges(
maf$Chromosome,
IRanges(
start = maf$Start_Position - 1,
end = maf$Start_Position + 1
),
strand = maf$mutation_strand
)
Rsamtools::getSeq(
fasta,
GenomicRanges::GRanges(
maf$Chromosome,
IRanges(
start = maf$Start_Position - 1,
end = maf$Start_Position + 1
),
strand = maf$mutation_strand
)
)
),
"NA"
)
)

}

}else{
} else {
# Provide triple sequence of + strand and reverse complement of - strand
# Mutations on + strand with chosen ref and alt alleles
# Mutations on - strand with complement ref and alt alleles
sequences <- maf %>%
dplyr::mutate(
seq = ifelse(
(maf$mutation_strand == "+" &
maf$Reference_Allele == ref &
maf$Tumor_Seq_Allele2 == alt
)|(
maf$Reference_Allele == ref &
maf$Tumor_Seq_Allele2 == alt
) | (
maf$mutation_strand == "-" &
maf$Reference_Allele == CompRef &
maf$Tumor_Seq_Allele2 == CompAlt
),
as.character(
Rsamtools::getSeq(
fasta,
GenomicRanges::GRanges(
maf$Chromosome,
IRanges(
start = maf$Start_Position - 1,
end = maf$Start_Position + 1
),
strand = maf$mutation_strand
)
)
Rsamtools::getSeq(
fasta,
GenomicRanges::GRanges(
maf$Chromosome,
IRanges(
start = maf$Start_Position - 1,
end = maf$Start_Position + 1
),
strand = maf$mutation_strand
)
)
),
"NA"
)
)

}
return(sequences)
}
Loading