From 4bf5b00e32802c1a42997d510789a590af0ba174 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Wed, 20 Nov 2024 19:14:00 +0200 Subject: [PATCH] add getCaseStudyReport function Signed-off-by: Awa Synthia --- R/ipr2viz.R | 2 +- inst/report/scripts/generate_report.R | 18 +++++--- inst/report/scripts/run_molevolvr_pipeline.R | 43 ++++++++++++++++---- inst/report/scripts/viz_utils.R | 34 +++++++++------- 4 files changed, 69 insertions(+), 28 deletions(-) diff --git a/R/ipr2viz.R b/R/ipr2viz.R index e5fed49f..fb145c67 100644 --- a/R/ipr2viz.R +++ b/R/ipr2viz.R @@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), analysis_labeler <- analyses %>% pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis) - system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE) + lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE) lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols) lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R index 06128453..d1dd178b 100644 --- a/inst/report/scripts/generate_report.R +++ b/inst/report/scripts/generate_report.R @@ -1,6 +1,12 @@ # Author(s): Awa Synthia # Last modified: 2024 +# load functions to use +source("MolEvolData_class.R") +source("run_molevolvr_pipeline.R") +source("viz_utils.R") + +# example usage: getCaseStudyReport("Acinetobacter baumannii", "Beta-lactams") getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) { cat("\n") cat("\033[1;36m") # Cyan for the "MolEvolvR" header @@ -616,7 +622,7 @@ runAnalysis <- function( domarch_cols_value <- getDomArchCols(app_data, DASelect) - query_domarch_cols_value <- getDomArchCols(query_data@df) + query_domarch_cols_value <- getQueryDomArchCols(query_data@df) mainTable_value <- getDataTable(data) @@ -630,8 +636,8 @@ runAnalysis <- function( rs_IprGenes_value <- getIPRGenesVisualization(data, app_data, - input_rs_iprDatabases, - input_rs_iprVisType) + query_iprDatabases, + query_iprVisType) rs_network_layout_value <- getRSNetworkLayout(data, app_data, @@ -641,9 +647,9 @@ runAnalysis <- function( rs_data_table_value <- getDataTable(data) da_IprGenes_value <- getDomArchIPRGenesPlot(app_data, - da_iprDatabases, - da_iprVisType, - DASelect) + query_iprDatabases, + query_iprVisType, + DASelect) query_heatmap_value <- getQueryHeatmap(query_data@df, heatmap_select = "All", diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R index c9c0deeb..94324599 100644 --- a/inst/report/scripts/run_molevolvr_pipeline.R +++ b/inst/report/scripts/run_molevolvr_pipeline.R @@ -740,18 +740,47 @@ runCDHIT <- function(infile, suffix, outdir) { "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8", infile, outfile ) - clean_cdhit_format <- sprintf( - "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s", - input_file, - outfile - ) + # clean_cdhit_format <- sprintf( + # "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s", + # input_file, + # outfile + # ) cat("\nPerforming BLASTCLUST analysis on", infile, "\n") # system(blastclust_cmd) system(cdhit_command) - system(clean_cdhit_format) + # system(clean_cdhit_format) + cleanCDHIT(input_file, outfile) + cat("Cleaning CDHIT results completed") +} +# extract_sequences("input_file.tsv.clstr", "output_file.tsv") +cleanCDHIT <- function(input_file, output_file) { + # Read the input file line by line + lines <- readLines(input_file) + + # Initialize an empty vector to store extracted identifiers + extracted <- c() + + # Loop through lines and extract content between ">" and "..." + for (line in lines) { + # Only process lines containing valid sequences (skip cluster headers) + if (grepl(">", line) && grepl("\\.\\.\\.", line)) { + # Extract the sequence identifier between ">" and "..." + match <- regmatches(line, regexpr(">(.*?)\\.\\.\\.", line)) + if (length(match) > 0) { + sequence <- gsub("^>", "", match) # Remove the leading ">" + sequence <- gsub("\\.\\.\\.$", "", sequence) # Remove trailing dots + extracted <- c(extracted, sequence) + } + } + } + + # Write the extracted sequences to the output file + writeLines(extracted, output_file) + + cat("Sequences extracted and written to:", output_file, "\n") } # Function to format blastclust output @@ -902,7 +931,7 @@ ipr2Linear <- function(ipr, acc2info, prefix) { # add lookup table to iprscan file lookup_tbl <- fread( - system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE), + system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE), sep = "\t", header = TRUE, fill = TRUE) %>% distinct() if ("AccNum.x" %in% names(ipr_lin)) { diff --git a/inst/report/scripts/viz_utils.R b/inst/report/scripts/viz_utils.R index 573e0739..aa62e126 100644 --- a/inst/report/scripts/viz_utils.R +++ b/inst/report/scripts/viz_utils.R @@ -10,8 +10,8 @@ library(plotly) # Function to generate the InterProScan Visualization getIPRGenesVisualization <- function(data, app_data, - input_rs_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"), - input_rs_iprVisType = "Analysis") { + query_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"), + query_iprVisType = "Analysis") { # Check if analysis is loaded if (nrow(data@df) == 0 || app_data@ipr_path == "") { @@ -33,8 +33,8 @@ getIPRGenesVisualization <- function(data, app_data, ipr_plot <- plotIPR2VizWeb( infile_ipr = data@ipr_path, accessions = data@df$Name, - analysis = input_rs_iprDatabases, - group_by = input_rs_iprVisType, + analysis = query_iprDatabases, + group_by = query_iprVisType, name = n ) @@ -45,8 +45,8 @@ getIPRGenesVisualization <- function(data, app_data, infile_ipr = data@ipr_path, infile_full = data@df, accessions = unique(data@df$Name), - analysis = input_rs_iprDatabases, - group_by = input_rs_iprVisType, + analysis = query_iprDatabases, + group_by = query_iprVisType, topn = 20, # This value is hardcoded in the original code query = "All" ) @@ -269,7 +269,13 @@ getMSAData <- function(msa_path) { if (is.null(msa_path) || msa_path == "") { stop("Error: MSA path is not provided.") } - return(read_file(msa_path)) + # Attempt to read the file and handle potential errors + if (file.exists(msa_path)) { + return(read_file(msa_path)) + } else { + warning(sprintf("Warning: Unable to read the file at path '%s'. Ignoring...", msa_path)) + return(NULL) # Return NULL if the file cannot be read + } } # Function to generate a heatmap @@ -307,7 +313,7 @@ getQueryHeatmap <- function(query_data_df, } # Function to retrieve domain architecture columns -getDomArchCols <- function(query_data_df) { +getQueryDomArchCols <- function(query_data_df) { # Check if query data exists if (nrow(query_data_df) == 0) { stop("No query data available.") @@ -712,8 +718,8 @@ getDomArchCols <- function(app_data, DASelect) { } # Function to generate the IPR genes plot -getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases, - da_iprVisType, DASelect) { +getDomArchIPRGenesPlot <- function(app_data, query_iprDatabases, + query_iprVisType, DASelect) { if (app_data@ipr_path == "") { stop("IPR path is not set.") @@ -739,8 +745,8 @@ getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases, plot <- plotIPR2VizWeb( infile_ipr = app_data@ipr_path, accessions = df$Name, - analysis = da_iprDatabases, - group_by = da_iprVisType, + analysis = query_iprDatabases, + group_by = query_iprVisType, name = name_column ) } else { @@ -749,8 +755,8 @@ getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases, infile_ipr = app_data@ipr_path, infile_full = df, accessions = unique(df$Name), - analysis = da_iprDatabases, - group_by = da_iprVisType, + analysis = query_iprDatabases, + group_by = query_iprVisType, topn = 20, query = DASelect )