update paths and address suggestions

Signed-off-by: Awa Synthia <[email protected]>
JRaviLab · Nov 20, 2024 · 397c02c · 397c02c
1 parent 9ce780b
commit 397c02c
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 49 deletions.
diff --git a/R/ipr2viz.R b/R/ipr2viz.R
@@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
     analysis_labeler <- analyses %>%
         pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis)
 
-    lookup_tbl_path <- "~/awasyn/new_trial/cln_lookup_tbl.tsv"
+    system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE)
     lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols)
 
     lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc

diff --git a/inst/report/scripts/MolEvolData_class.R b/inst/report/scripts/MolEvolData_class.R
@@ -41,37 +41,6 @@ setClass("seqUpload",
          )
 )
 
-# Group by lineage + DA then take top 20
-top_acc <- function(cln_file, DA_col = "DomArch.Pfam",
-                    lin_col = "Lineage", n = 20) {
-    lin_sym <- sym(lin_col)
-    DA_sym <- sym(DA_col)
-
-    cln <- fread(cln_file, sep = "\t", fill = T)
-
-    grouped <- cln %>%
-        group_by({{ lin_sym }}, {{ DA_sym }}) %>%
-        summarise(count = n()) %>%
-        arrange(-count) %>%
-        filter(!is.na({{ lin_sym }}) & !is.na({{ DA_sym }}))
-
-    top_acc <- character(n)
-    for (r in 1:min(nrow(grouped), n))
-    {
-        l <- (grouped %>% pull({{ lin_sym }}))[r]
-        DA <- (grouped %>% pull({{ DA_sym }}))[r]
-
-        filt <- cln %>% filter({{ lin_sym }} == l & {{ DA_sym }} == DA)
-
-        top <- filt[which(filt$PcPositive == max(filt$PcPositive))[1], ]
-
-        top_acc[r] <- top$AccNum
-    }
-    top_acc <- top_acc[which(top_acc != "")]
-    return(top_acc)
-}
-
-
 combineFilesNopmap <- function(inpath, pattern, outpath,
                                  delim = "\t", skip = 0,
                                  col_names) {

diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R
@@ -303,7 +303,7 @@ runAnalysis <- function(
 
     # After uploading the sequence data, you would check the uploaded data
     if (sequence_upload_data@seqs == "") {
-        stop("Error: Please upload a protein sequence")
+        stop("Error: Please upload a protein sequence.")
     }
     OUT_PATH <- getwd()
     unavailable_pins <- list.files(OUT_PATH)
@@ -339,7 +339,7 @@ runAnalysis <- function(
                        validateFasta(tmp_file)
                    },
                    error = function(e) {
-                       warning("Error: Failed to run input FASTA verification")
+                       warning("Error: Failed to run input FASTA verification.")
                        return(FALSE)  # Return FALSE if an error occurs
                    },
                    finally = {
@@ -526,7 +526,7 @@ runAnalysis <- function(
         }
         if (ipr_upload_data()@seqs == "" && !ipr_ncbi_check) {
             stop("Error: Please provide a file containing sequences or check the
-                 box to use fetch sequences for NCBI accession numbers.")
+                box to fetch sequences using NCBI accession numbers.")
         }
 
         ipr <- read_tsv(ipr_upload_data()@df, col_names = FALSE)

diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R
@@ -187,7 +187,7 @@ runMolevolvrPipeline <- function(input_paths, db, nhits, eval,
     # setwd(OUTDIR)
 
     # Run DELTABLAST
-    runDeltablast(input_paths, PREFIX, OUTDIR, db, nhits, eval)
+    runDELTABLAST(input_paths, PREFIX, OUTDIR, db, nhits, eval)
 
     # Run ACC2FA
     convertAccNum2Fasta(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")),
@@ -207,7 +207,7 @@ runMolevolvrPipeline <- function(input_paths, db, nhits, eval,
   # Sys.sleep(30)
 
   # Run BLASTCLUST
-  runBlastclust(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
+  runCDHIT(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
                  PREFIX, OUTDIR )
 
   # Convert clusters to table
@@ -541,7 +541,7 @@ replaceAccNums <- function(path_acc2info,
 }
 
 
-runDeltablast <- function(infile, prefix, outdir,
+runDELTABLAST <- function(infile, prefix, outdir,
                            db = "refseq_protein",
                            nhits = 5000, evalue = 1e-5,
                            threads = 10) {
@@ -700,9 +700,9 @@ cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) {
 
   # TaxID to lineage mapping
   cleanedup_blast$TaxID <- as.integer(cleanedup_blast$TaxID)
-  lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt",
-                       header = TRUE, fill = TRUE,
-                       colClasses = lineage_map_cols)
+  lineage_map <- fread(
+    system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
+    header = TRUE, fill = TRUE, colClasses = lineage_map_cols)
 
   # Merge with lineage map and clean up columns
   mergedLins <- merge(cleanedup_blast, lineage_map, by = "TaxID",
@@ -722,7 +722,7 @@ cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) {
 
 
 # Function to run BLASTCLUST on given input
-runBlastclust <- function(infile, suffix, outdir) {
+runCDHIT <- function(infile, suffix, outdir) {
 
   # Prepare output file path
   outfile <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv"))
@@ -734,13 +734,13 @@ runBlastclust <- function(infile, suffix, outdir) {
   cat("## Now running BLASTCLUST on file(s):", infile, "\n")
   cat("#####################################\n")
 
-  # Run BLASTCLUST
+  # Cluster sequences w/ BLASTCLUST/CD-HIT
   # blastclust_cmd <- paste("blastclust -i", infile, "-o", outfile, "-p T -L .6 -b T -S 80 -a 8")
   cdhit_command <- sprintf(
     "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8",
     infile, outfile
   )
-  clean_command <- sprintf(
+  clean_cdhit_format <- sprintf(
     "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s",
     input_file,
     outfile
@@ -750,7 +750,7 @@ runBlastclust <- function(infile, suffix, outdir) {
   # system(blastclust_cmd)
 
   system(cdhit_command)
-  system(clean_command)
+  system(clean_cdhit_format)
 
 }
 
@@ -881,8 +881,9 @@ ipr2Linear <- function(ipr, acc2info, prefix) {
   ipr_tax <- left_join(ipr_in, acc2info_out, by = "AccNum")
 
   # read in lineage map
-  lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt",
-                       header = T, fill = T)
+  lineage_map <- fread(
+    system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
+    header = TRUE, fill = TRUE)
 
   # merge ipr+info w/ lineage
   # both tables have a species column, but only
@@ -895,8 +896,9 @@ ipr2Linear <- function(ipr, acc2info, prefix) {
     select(-Species.x, -Species.y)
 
   # add lookup table to iprscan file
-  lookup_tbl <- fread(input = "~/awasyn/new_trial/cln_lookup_tbl.tsv",
-                      sep = "\t", header = T, fill = T) %>%
+  lookup_tbl <- fread(
+    system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
+                      sep = "\t", header = TRUE, fill = TRUE) %>%
     distinct()
   if ("AccNum.x" %in% names(ipr_lin)) {
     ipr_lin <- ipr_lin %>%