add getCaseStudyReport function

Signed-off-by: Awa Synthia <[email protected]>
JRaviLab · Nov 20, 2024 · 4bf5b00 · 4bf5b00
1 parent 83ccfb7
commit 4bf5b00
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 28 deletions.
diff --git a/R/ipr2viz.R b/R/ipr2viz.R
@@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
     analysis_labeler <- analyses %>%
         pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis)
 
-    system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE)
+    lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE)
     lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols)
 
     lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc

diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R
@@ -1,6 +1,12 @@
 # Author(s): Awa Synthia
 # Last modified: 2024
 
+# load functions to use
+source("MolEvolData_class.R")
+source("run_molevolvr_pipeline.R")
+source("viz_utils.R")
+
+# example usage: getCaseStudyReport("Acinetobacter baumannii", "Beta-lactams")
 getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) {
   cat("\n")
   cat("\033[1;36m")  # Cyan for the "MolEvolvR" header
@@ -616,7 +622,7 @@ runAnalysis <- function(
 
     domarch_cols_value <- getDomArchCols(app_data, DASelect)
 
-    query_domarch_cols_value <- getDomArchCols(query_data@df)
+    query_domarch_cols_value <- getQueryDomArchCols(query_data@df)
 
     mainTable_value <- getDataTable(data)
 
@@ -630,8 +636,8 @@ runAnalysis <- function(
 
     rs_IprGenes_value <- getIPRGenesVisualization(data,
                                                           app_data,
-                                                          input_rs_iprDatabases,
-                                                          input_rs_iprVisType)
+                                                          query_iprDatabases,
+                                                          query_iprVisType)
 
     rs_network_layout_value <- getRSNetworkLayout(data,
                                                           app_data,
@@ -641,9 +647,9 @@ runAnalysis <- function(
     rs_data_table_value <- getDataTable(data)
 
     da_IprGenes_value <- getDomArchIPRGenesPlot(app_data,
-                                                    da_iprDatabases,
-                                                    da_iprVisType,
-                                                    DASelect)
+                                                query_iprDatabases,
+                                                query_iprVisType,
+                                                DASelect)
 
     query_heatmap_value <- getQueryHeatmap(query_data@df,
                                                   heatmap_select = "All",

diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R
@@ -740,18 +740,47 @@ runCDHIT <- function(infile, suffix, outdir) {
     "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8",
     infile, outfile
   )
-  clean_cdhit_format <- sprintf(
-    "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s",
-    input_file,
-    outfile
-  )
+  # clean_cdhit_format <- sprintf(
+  #   "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s",
+  #   input_file,
+  #   outfile
+  # )
 
   cat("\nPerforming BLASTCLUST analysis on", infile, "\n")
   # system(blastclust_cmd)
 
   system(cdhit_command)
-  system(clean_cdhit_format)
+  # system(clean_cdhit_format)
+  cleanCDHIT(input_file, outfile)
+  cat("Cleaning CDHIT results completed")
+}
 
+# extract_sequences("input_file.tsv.clstr", "output_file.tsv")
+cleanCDHIT <- function(input_file, output_file) {
+  # Read the input file line by line
+  lines <- readLines(input_file)
+
+  # Initialize an empty vector to store extracted identifiers
+  extracted <- c()
+
+  # Loop through lines and extract content between ">" and "..."
+  for (line in lines) {
+    # Only process lines containing valid sequences (skip cluster headers)
+    if (grepl(">", line) && grepl("\\.\\.\\.", line)) {
+      # Extract the sequence identifier between ">" and "..."
+      match <- regmatches(line, regexpr(">(.*?)\\.\\.\\.", line))
+      if (length(match) > 0) {
+        sequence <- gsub("^>", "", match)  # Remove the leading ">"
+        sequence <- gsub("\\.\\.\\.$", "", sequence)  # Remove trailing dots
+        extracted <- c(extracted, sequence)
+      }
+    }
+  }
+
+  # Write the extracted sequences to the output file
+  writeLines(extracted, output_file)
+
+  cat("Sequences extracted and written to:", output_file, "\n")
 }
 
 # Function to format blastclust output
@@ -902,7 +931,7 @@ ipr2Linear <- function(ipr, acc2info, prefix) {
 
   # add lookup table to iprscan file
   lookup_tbl <- fread(
-    system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
+    system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE),
                       sep = "\t", header = TRUE, fill = TRUE) %>%
     distinct()
   if ("AccNum.x" %in% names(ipr_lin)) {

diff --git a/inst/report/scripts/viz_utils.R b/inst/report/scripts/viz_utils.R
@@ -10,8 +10,8 @@ library(plotly)
 
 # Function to generate the InterProScan Visualization
 getIPRGenesVisualization <- function(data, app_data,
-                                             input_rs_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"),
-                                             input_rs_iprVisType = "Analysis") {
+                                    query_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"),
+                                    query_iprVisType = "Analysis") {
 
     # Check if analysis is loaded
     if (nrow(data@df) == 0 || app_data@ipr_path == "") {
@@ -33,8 +33,8 @@ getIPRGenesVisualization <- function(data, app_data,
         ipr_plot <- plotIPR2VizWeb(
             infile_ipr = data@ipr_path,
             accessions = data@df$Name,
-            analysis = input_rs_iprDatabases,
-            group_by = input_rs_iprVisType,
+            analysis = query_iprDatabases,
+            group_by = query_iprVisType,
             name = n
         )
 
@@ -45,8 +45,8 @@ getIPRGenesVisualization <- function(data, app_data,
             infile_ipr = data@ipr_path,
             infile_full = data@df,
             accessions = unique(data@df$Name),
-            analysis = input_rs_iprDatabases,
-            group_by = input_rs_iprVisType,
+            analysis = query_iprDatabases,
+            group_by = query_iprVisType,
             topn = 20,   # This value is hardcoded in the original code
             query = "All"
         )
@@ -269,7 +269,13 @@ getMSAData <- function(msa_path) {
     if (is.null(msa_path) || msa_path == "") {
         stop("Error: MSA path is not provided.")
     }
-    return(read_file(msa_path))
+    # Attempt to read the file and handle potential errors
+    if (file.exists(msa_path)) {
+        return(read_file(msa_path))
+    } else {
+        warning(sprintf("Warning: Unable to read the file at path '%s'. Ignoring...", msa_path))
+        return(NULL)  # Return NULL if the file cannot be read
+    }
 }
 
 # Function to generate a heatmap
@@ -307,7 +313,7 @@ getQueryHeatmap <- function(query_data_df,
 }
 
 # Function to retrieve domain architecture columns
-getDomArchCols <- function(query_data_df) {
+getQueryDomArchCols <- function(query_data_df) {
     # Check if query data exists
     if (nrow(query_data_df) == 0) {
         stop("No query data available.")
@@ -712,8 +718,8 @@ getDomArchCols <- function(app_data, DASelect) {
 }
 
 # Function to generate the IPR genes plot
-getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases,
-                                       da_iprVisType, DASelect) {
+getDomArchIPRGenesPlot <- function(app_data, query_iprDatabases,
+                                   query_iprVisType, DASelect) {
 
     if (app_data@ipr_path == "") {
         stop("IPR path is not set.")
@@ -739,8 +745,8 @@ getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases,
         plot <- plotIPR2VizWeb(
             infile_ipr = app_data@ipr_path,
             accessions = df$Name,
-            analysis = da_iprDatabases,
-            group_by = da_iprVisType,
+            analysis = query_iprDatabases,
+            group_by = query_iprVisType,
             name = name_column
         )
     } else {
@@ -749,8 +755,8 @@ getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases,
             infile_ipr = app_data@ipr_path,
             infile_full = df,
             accessions = unique(df$Name),
-            analysis = da_iprDatabases,
-            group_by = da_iprVisType,
+            analysis = query_iprDatabases,
+            group_by = query_iprVisType,
             topn = 20,
             query = DASelect
         )