Skip to content

Commit

Permalink
add getCaseStudyReport function
Browse files Browse the repository at this point in the history
Signed-off-by: Awa Synthia <[email protected]>
  • Loading branch information
awasyn committed Nov 20, 2024
1 parent 83ccfb7 commit 4bf5b00
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 28 deletions.
2 changes: 1 addition & 1 deletion R/ipr2viz.R
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
analysis_labeler <- analyses %>%
pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis)

system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE)
lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE)
lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols)

lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc
Expand Down
18 changes: 12 additions & 6 deletions inst/report/scripts/generate_report.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# Author(s): Awa Synthia
# Last modified: 2024

# load functions to use
source("MolEvolData_class.R")
source("run_molevolvr_pipeline.R")
source("viz_utils.R")

# example usage: getCaseStudyReport("Acinetobacter baumannii", "Beta-lactams")
getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) {
cat("\n")
cat("\033[1;36m") # Cyan for the "MolEvolvR" header
Expand Down Expand Up @@ -616,7 +622,7 @@ runAnalysis <- function(

domarch_cols_value <- getDomArchCols(app_data, DASelect)

query_domarch_cols_value <- getDomArchCols(query_data@df)
query_domarch_cols_value <- getQueryDomArchCols(query_data@df)

mainTable_value <- getDataTable(data)

Expand All @@ -630,8 +636,8 @@ runAnalysis <- function(

rs_IprGenes_value <- getIPRGenesVisualization(data,
app_data,
input_rs_iprDatabases,
input_rs_iprVisType)
query_iprDatabases,
query_iprVisType)

rs_network_layout_value <- getRSNetworkLayout(data,
app_data,
Expand All @@ -641,9 +647,9 @@ runAnalysis <- function(
rs_data_table_value <- getDataTable(data)

da_IprGenes_value <- getDomArchIPRGenesPlot(app_data,
da_iprDatabases,
da_iprVisType,
DASelect)
query_iprDatabases,
query_iprVisType,
DASelect)

query_heatmap_value <- getQueryHeatmap(query_data@df,
heatmap_select = "All",
Expand Down
43 changes: 36 additions & 7 deletions inst/report/scripts/run_molevolvr_pipeline.R
Original file line number Diff line number Diff line change
Expand Up @@ -740,18 +740,47 @@ runCDHIT <- function(infile, suffix, outdir) {
"cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8",
infile, outfile
)
clean_cdhit_format <- sprintf(
"awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s",
input_file,
outfile
)
# clean_cdhit_format <- sprintf(
# "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s",
# input_file,
# outfile
# )

cat("\nPerforming BLASTCLUST analysis on", infile, "\n")
# system(blastclust_cmd)

system(cdhit_command)
system(clean_cdhit_format)
# system(clean_cdhit_format)
cleanCDHIT(input_file, outfile)
cat("Cleaning CDHIT results completed")
}

# extract_sequences("input_file.tsv.clstr", "output_file.tsv")
cleanCDHIT <- function(input_file, output_file) {
# Read the input file line by line
lines <- readLines(input_file)

# Initialize an empty vector to store extracted identifiers
extracted <- c()

# Loop through lines and extract content between ">" and "..."
for (line in lines) {
# Only process lines containing valid sequences (skip cluster headers)
if (grepl(">", line) && grepl("\\.\\.\\.", line)) {
# Extract the sequence identifier between ">" and "..."
match <- regmatches(line, regexpr(">(.*?)\\.\\.\\.", line))
if (length(match) > 0) {
sequence <- gsub("^>", "", match) # Remove the leading ">"
sequence <- gsub("\\.\\.\\.$", "", sequence) # Remove trailing dots
extracted <- c(extracted, sequence)
}
}
}

# Write the extracted sequences to the output file
writeLines(extracted, output_file)

cat("Sequences extracted and written to:", output_file, "\n")
}

# Function to format blastclust output
Expand Down Expand Up @@ -902,7 +931,7 @@ ipr2Linear <- function(ipr, acc2info, prefix) {

# add lookup table to iprscan file
lookup_tbl <- fread(
system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE),
sep = "\t", header = TRUE, fill = TRUE) %>%
distinct()
if ("AccNum.x" %in% names(ipr_lin)) {
Expand Down
34 changes: 20 additions & 14 deletions inst/report/scripts/viz_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ library(plotly)

# Function to generate the InterProScan Visualization
getIPRGenesVisualization <- function(data, app_data,
input_rs_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"),
input_rs_iprVisType = "Analysis") {
query_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"),
query_iprVisType = "Analysis") {

# Check if analysis is loaded
if (nrow(data@df) == 0 || app_data@ipr_path == "") {
Expand All @@ -33,8 +33,8 @@ getIPRGenesVisualization <- function(data, app_data,
ipr_plot <- plotIPR2VizWeb(
infile_ipr = data@ipr_path,
accessions = data@df$Name,
analysis = input_rs_iprDatabases,
group_by = input_rs_iprVisType,
analysis = query_iprDatabases,
group_by = query_iprVisType,
name = n
)

Expand All @@ -45,8 +45,8 @@ getIPRGenesVisualization <- function(data, app_data,
infile_ipr = data@ipr_path,
infile_full = data@df,
accessions = unique(data@df$Name),
analysis = input_rs_iprDatabases,
group_by = input_rs_iprVisType,
analysis = query_iprDatabases,
group_by = query_iprVisType,
topn = 20, # This value is hardcoded in the original code
query = "All"
)
Expand Down Expand Up @@ -269,7 +269,13 @@ getMSAData <- function(msa_path) {
if (is.null(msa_path) || msa_path == "") {
stop("Error: MSA path is not provided.")
}
return(read_file(msa_path))
# Attempt to read the file and handle potential errors
if (file.exists(msa_path)) {
return(read_file(msa_path))
} else {
warning(sprintf("Warning: Unable to read the file at path '%s'. Ignoring...", msa_path))
return(NULL) # Return NULL if the file cannot be read
}
}

# Function to generate a heatmap
Expand Down Expand Up @@ -307,7 +313,7 @@ getQueryHeatmap <- function(query_data_df,
}

# Function to retrieve domain architecture columns
getDomArchCols <- function(query_data_df) {
getQueryDomArchCols <- function(query_data_df) {
# Check if query data exists
if (nrow(query_data_df) == 0) {
stop("No query data available.")
Expand Down Expand Up @@ -712,8 +718,8 @@ getDomArchCols <- function(app_data, DASelect) {
}

# Function to generate the IPR genes plot
getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases,
da_iprVisType, DASelect) {
getDomArchIPRGenesPlot <- function(app_data, query_iprDatabases,
query_iprVisType, DASelect) {

if (app_data@ipr_path == "") {
stop("IPR path is not set.")
Expand All @@ -739,8 +745,8 @@ getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases,
plot <- plotIPR2VizWeb(
infile_ipr = app_data@ipr_path,
accessions = df$Name,
analysis = da_iprDatabases,
group_by = da_iprVisType,
analysis = query_iprDatabases,
group_by = query_iprVisType,
name = name_column
)
} else {
Expand All @@ -749,8 +755,8 @@ getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases,
infile_ipr = app_data@ipr_path,
infile_full = df,
accessions = unique(df$Name),
analysis = da_iprDatabases,
group_by = da_iprVisType,
analysis = query_iprDatabases,
group_by = query_iprVisType,
topn = 20,
query = DASelect
)
Expand Down

0 comments on commit 4bf5b00

Please sign in to comment.