From 0e79c409e03eadec4eb5037f99c2d2cf78e2b80c Mon Sep 17 00:00:00 2001
From: deliaBlue <iris.mepa13@gmail.com>
Date: Sun, 11 Feb 2024 20:06:44 +0100
Subject: [PATCH 1/3] refactor: allow empty input table

---
 scripts/merge_tables.R | 136 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 113 insertions(+), 23 deletions(-)

diff --git a/scripts/merge_tables.R b/scripts/merge_tables.R
index 69416f36..8952202c 100755
--- a/scripts/merge_tables.R
+++ b/scripts/merge_tables.R
@@ -7,8 +7,12 @@
 #################
 
 # Import required packages
-if ( suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE ) { stop("[ERROR] Package 'optparse' required! Aborted.") }
-if ( suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE ) { stop("[ERROR] Package 'dplyr' required! Aborted.") }
+if (suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE) {
+  stop("[ERROR] Package 'optparse' required! Aborted.")
+}
+if (suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE) {
+  stop("[ERROR] Package 'dplyr' required! Aborted.")
+}
 
 
 #######################
@@ -21,9 +25,10 @@ script <- sub("--file=", "", basename(commandArgs(trailingOnly=FALSE)[4]))
 # Build description message
 description <- "Merge miRNAs quantification tables.\n"
 author <- "Author: Paula Iborra, Biozentrum, University of Basel"
-version <- "Version: 1.0.0 (JUN-2019)"
-requirements <- "Requires: optparse"
-msg <- paste(description, author, version, requirements, sep="\n")
+mantainer <- "Refactor and documentation: Iris Mestres"
+version <- "Version: 1.1.0 (FEB-2024)"
+requirements <- "Requires: optparse, dplyr"
+msg <- paste(description, author, mantainer, version, requirements, sep="\n")
 
 # Define list of arguments
 option_list <- list(
@@ -73,7 +78,18 @@ option_list <- list(
 )
 
 # Parse command-line arguments
-opt_parser <- OptionParser(usage=paste("Usage:", script, "[OPTIONS] --input_dir <path/to/input/files>\n", sep=" "), option_list = option_list, add_help_option=FALSE, description=msg)
+opt_parser <-
+  OptionParser(
+    usage = paste(
+      "Usage:",
+      script,
+      "[OPTIONS] --input_dir <path/to/input/files>\n",
+      sep = " "
+    ),
+    option_list = option_list,
+    add_help_option = FALSE,
+    description = msg
+  )
 opt <- parse_args(opt_parser)
 
 # Re-assign variables
@@ -92,19 +108,80 @@ if ( is.null(in.dir) ) {
 ###    FUNCTIONS   ###
 ######################
 
-merge_tables <- function(cwd, prefix){
-  dataFiles <- dir(cwd, prefix, full.names=TRUE)
+#' Read and process input table
+#'
+#' `get_table()` uses `tryCatch()` to read the file in `tbl_pth`. If the table
+#' is empty and an error is raised, a data frame is created.
+#'
+#' @param tbl_pth Path to the input table.
+#' @param prefix String to be removed from the input file name. It must be
+#'   present in all the tables to be merged.
+#'
+#' @returns `get_table()` returns a data frame containing the miRNA species to
+#'   be counted in first column, named `ID`, and their counts in that file in
+#'   the second one. The name of the second column in the data frame is obtained
+#'   by removing the `prefix` from the input file name. If no `prefix` is given,
+#'   the whole file name is used.
+#'
+#'   If the input file is empty, the returned data frame will consist on one row
+#'   with a `NA` in both fields.
+#'
+#' @seealso [tryCatch()] which this function uses.
+get_table <- function(tbl_pth, prefix) {
+  sample <- gsub(prefix, "", tbl_pth)
+  fields <- c("ID", basename(sample))
+  
+  tryCatch(
+    expr = {
+      table <- read.table(tbl_pth, sep = '\t', col.names = fields)
+      return(table)
+    },
+    error = function(e) {
+      table <- data.frame(matrix(NA, ncol = 2, nrow = 1))
+      colnames(table) <- fields
+      return(table)
+    }
+  )
+}
+
+#' Merge tables with the same prefix
+#'
+#' `merge_tables()` takes all the files in `cwd` that start with `prefix` and
+#' merge them keeping all the miRNA species present in each of the tables.
+#'
+#' @details The function `get_table()` is used to make sure that even if an
+#'   empty input file is given, the merge can still be done. Thus, before
+#'   returning the merged table, the row with a `NA` in the `ID` field, if any,
+#'   is removed.
+#'
+#'   The function `dplyr::full_join()` method is used for the merge, therefore,
+#'   if a miRNA species in `ID` is missing in any of the tables being joined,
+#'   its value is set to `NA` in that column.
+#'
+#' @param cwd Path to the input tables directory.
+#' @param prefix String used in all the tables to be selected for the merge. If
+#'    not provided, all the files in `cwd` are used.
+#'
+#' @returns `merge_tables()` returns a single data frame, `mat`, with all the
+#'   miRNA species present in the input tables in the first column, `ID`, and
+#'   their counts. Each input file has it own column.
+#'
+#'   If all the input tables are empty, the output consists only on the table's
+#'   header, and if no files starting with `prefix` are found, nothing is
+#'   returned.
+#'
+#' @seealso [get_table()], [dplyr::full_join()] which this function uses.
+merge_tables <- function(cwd, prefix) {
+  dataFiles <- dir(cwd, prefix, full.names = TRUE)
   mat <- NULL
+  
   if (length(dataFiles)) {
-    mat <- read.table(dataFiles[1], sep='\t')
-    sample <- gsub(prefix, "", dataFiles[1])
-    colnames(mat)[2] <- basename(sample)
-    for (i in seq_len(length(dataFiles)-1)) {
-       mat <- full_join(mat, read.table(dataFiles[i+1], sep = "\t"), by='V1')
-       sample <- gsub(prefix, "", dataFiles[i+1])
-       colnames(mat)[i + 2] <- basename(sample)
+    mat <- get_table(dataFiles[1], prefix)
+    
+    for (i in seq_len(length(dataFiles) - 1)) {
+      mat <- full_join(mat, get_table(dataFiles[i + 1], prefix), by = "ID")
     }
-    colnames(mat)[1] <- "ID"
+    mat <- filter(mat, !is.na(ID))
   }
   return(mat)
 }
@@ -113,22 +190,35 @@ merge_tables <- function(cwd, prefix){
 ###      MAIN      ###
 ######################
 # Write log
-if ( verb ) cat("Creating output directory...\n", sep="")
+if (verb)
+  cat("Creating output directory...\n", sep = "")
 
 # Create output directories
-dir.create(dirname(out.file), recursive=TRUE, showWarnings=FALSE)
+dir.create(dirname(out.file),
+           recursive = TRUE,
+           showWarnings = FALSE)
 
 # Write log
-if ( verb ) cat("Creating table...\n", sep="")
+if (verb)
+  cat("Creating table...\n", sep = "")
 
 # Create table from input directory files
-myTable <- merge_tables(cwd=in.dir, prefix=prefix)
+myTable <- merge_tables(cwd = in.dir, prefix = prefix)
 
 # Write log
-if ( verb ) cat(paste("Writing table: ", out.file, "\n", sep=""), sep="")
+if (verb)
+  cat(paste("Writing table: ", out.file, "\n", sep = ""), sep = "")
 
 # Writing table
-write.table(myTable, out.file, row.names=FALSE, col.names=TRUE, quote=FALSE, sep="\t")
+write.table(
+  myTable,
+  out.file,
+  row.names = FALSE,
+  col.names = TRUE,
+  quote = FALSE,
+  sep = "\t"
+)
 
 # Write log
-if ( verb ) cat("Done.\n", sep="")
+if (verb)
+  cat("Done.\n", sep = "")

From ebdf0372d61d7c670ed50f8647196fb9065ed187 Mon Sep 17 00:00:00 2001
From: deliaBlue <iris.mepa13@gmail.com>
Date: Fri, 23 Feb 2024 18:11:32 +0100
Subject: [PATCH 2/3] Commit to checkout

---
 scripts/merge_tables.R    | 3 +--
 workflow/rules/pileup.smk | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/merge_tables.R b/scripts/merge_tables.R
index 8952202c..7047784a 100755
--- a/scripts/merge_tables.R
+++ b/scripts/merge_tables.R
@@ -177,9 +177,8 @@ merge_tables <- function(cwd, prefix) {
   
   if (length(dataFiles)) {
     mat <- get_table(dataFiles[1], prefix)
-    
     for (i in seq_len(length(dataFiles) - 1)) {
-      mat <- full_join(mat, get_table(dataFiles[i + 1], prefix), by = "ID")
+      mat <- full_join(mat, get_table(dataFiles[i + 1], prefix), by = "ID", relationship = "many-to-many")
     }
     mat <- filter(mat, !is.na(ID))
   }
diff --git a/workflow/rules/pileup.smk b/workflow/rules/pileup.smk
index 97654c3e..b0ea7696 100644
--- a/workflow/rules/pileup.smk
+++ b/workflow/rules/pileup.smk
@@ -113,6 +113,8 @@ rule compress_reference_genome:
         LOCAL_LOG / "compress_reference_genome.log",
     container:
         "docker://quay.io/biocontainers/samtools:1.16.1--h00cdaf9_2"
+    conda:
+        ENV_DIR / "samtools.yaml"
     shell:
         "(bgzip < {input.genome} > {output.genome}) &> {log}"
 

From 09b1284032b4d50f007db7814c84ed93dbe29666 Mon Sep 17 00:00:00 2001
From: deliaBlue <iris.mepa13@gmail.com>
Date: Sun, 25 Feb 2024 22:06:32 +0100
Subject: [PATCH 3/3] apply changes

---
 scripts/merge_tables.R | 141 +++++++++++++++++++++++------------------
 1 file changed, 78 insertions(+), 63 deletions(-)

diff --git a/scripts/merge_tables.R b/scripts/merge_tables.R
index 7047784a..ac22b762 100755
--- a/scripts/merge_tables.R
+++ b/scripts/merge_tables.R
@@ -7,10 +7,14 @@
 #################
 
 # Import required packages
-if (suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE) {
+if (
+  suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE
+) {
   stop("[ERROR] Package 'optparse' required! Aborted.")
 }
-if (suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE) {
+if (
+  suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE
+) {
   stop("[ERROR] Package 'dplyr' required! Aborted.")
 }
 
@@ -20,60 +24,60 @@ if (suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE)
 #######################
 
 # Get script name
-script <- sub("--file=", "", basename(commandArgs(trailingOnly=FALSE)[4]))
+script <- sub("--file=", "", basename(commandArgs(trailingOnly = FALSE)[4]))
 
 # Build description message
 description <- "Merge miRNAs quantification tables.\n"
-author <- "Author: Paula Iborra, Biozentrum, University of Basel"
-mantainer <- "Refactor and documentation: Iris Mestres"
+author <- "Author: Zavolan Lab <zavolab-biozentrum@unibas.ch>"
+maintainer <- "Maintainer: Iris Mestres <zavolab-biozentrum@unibas.ch>"
 version <- "Version: 1.1.0 (FEB-2024)"
 requirements <- "Requires: optparse, dplyr"
-msg <- paste(description, author, mantainer, version, requirements, sep="\n")
+msg <- paste(description, author, maintainer, version, requirements, sep = "\n")
 
 # Define list of arguments
 option_list <- list(
   make_option(
     "--input_dir",
-    action="store",
-    type="character",
-    default=getwd(),
-    help="Absolute path from where input files shall be readed. Required!",
-    metavar="directory"
+    action = "store",
+    type = "character",
+    default = getwd(),
+    help = "Absolute path from where input files shall be readed. Required!",
+    metavar = "directory"
   ),
   make_option(
     "--output_file",
-    action="store",
-    type="character",
-    default=file.path(getwd(), "counts.tab"),
-    help="Table output file path. Default: $PWD/counts.tab",
-    metavar="directory"
+    action = "store",
+    type = "character",
+    default = file.path(getwd(), "counts.tab"),
+    help = "Table output file path. Default: $PWD/counts.tab",
+    metavar = "directory"
   ),
   make_option(
     c("--prefix"),
-    action="store_true",
-    type="character",
-    default=NULL,
-    help="Prefix for reading input files. Default: NULL.",
-    metavar="file"
+    action = "store_true",
+    type = "character",
+    default = NULL,
+    help = "Prefix for reading input files. Default: NULL.",
+    metavar = "file"
   ),
   make_option(
     c("-h", "--help"),
-    action="store_true",
-    default=FALSE,
-    help="Show this information and die."
+    action = "store_true",
+    default = FALSE,
+    help = "Show this information and die."
   ),
   make_option(
     c("-u", "--usage"),
-    action="store_true",
-    default=FALSE,
-    dest="help",
-    help="Show this information and die."
+    action = "store_true",
+    default = FALSE,
+    dest = "help",
+    help = "Show this information and die."
   ),
   make_option(
     c("-v", "--verbose"),
-    action="store_true",
-    default=FALSE,
-    help="Print log messages to STDOUT."
+    action = "store_true",
+    default = FALSE,
+    help = "Print log messages to STDOUT."
   )
 )
 
@@ -93,13 +97,13 @@ opt_parser <-
 opt <- parse_args(opt_parser)
 
 # Re-assign variables
-in.dir <- opt$`input_dir`
+in_dir <- opt$`input_dir`
 prefix <- opt$`prefix`
-out.file <- opt$`output_file`
+out_file <- opt$`output_file`
 verb <- opt$`verbose`
 
 # Validate required arguments
-if ( is.null(in.dir) ) {
+if (is.null(in_dir)) {
   print_help(opt_parser)
   stop("[ERROR] Required argument missing! Aborted.")
 }
@@ -123,17 +127,17 @@ if ( is.null(in.dir) ) {
 #'   by removing the `prefix` from the input file name. If no `prefix` is given,
 #'   the whole file name is used.
 #'
-#'   If the input file is empty, the returned data frame will consist on one row
+#'   If the input file is empty, the returned data frame consist of one row
 #'   with a `NA` in both fields.
 #'
 #' @seealso [tryCatch()] which this function uses.
 get_table <- function(tbl_pth, prefix) {
   sample <- gsub(prefix, "", tbl_pth)
   fields <- c("ID", basename(sample))
-  
+
   tryCatch(
     expr = {
-      table <- read.table(tbl_pth, sep = '\t', col.names = fields)
+      table <- read.table(tbl_pth, sep = "\t", col.names = fields)
       return(table)
     },
     error = function(e) {
@@ -147,16 +151,17 @@ get_table <- function(tbl_pth, prefix) {
 #' Merge tables with the same prefix
 #'
 #' `merge_tables()` takes all the files in `cwd` that start with `prefix` and
-#' merge them keeping all the miRNA species present in each of the tables.
+#' merges them keeping all the miRNA species present in each of the tables.
 #'
 #' @details The function `get_table()` is used to make sure that even if an
-#'   empty input file is given, the merge can still be done. Thus, before
-#'   returning the merged table, the row with a `NA` in the `ID` field, if any,
-#'   is removed.
+#'   empty input file is given, the merge can still be done by creating a
+#'   data frame with a single row made of `MA`s. Therefore, prior to the
+#'   returning of the merged table, if there is a row with a `NA` in the
+#'   `ID` filed, it is removed.
 #'
-#'   The function `dplyr::full_join()` method is used for the merge, therefore,
-#'   if a miRNA species in `ID` is missing in any of the tables being joined,
-#'   its value is set to `NA` in that column.
+#'   The function `dplyr::full_join()` method is used for the merge. This
+#'   implies that if a miRNA species in `ID` is missing in any of the tables
+#'   being joined, its value is set to `NA` in that column.
 #'
 #' @param cwd Path to the input tables directory.
 #' @param prefix String used in all the tables to be selected for the merge. If
@@ -166,21 +171,25 @@ get_table <- function(tbl_pth, prefix) {
 #'   miRNA species present in the input tables in the first column, `ID`, and
 #'   their counts. Each input file has it own column.
 #'
-#'   If all the input tables are empty, the output consists only on the table's
+#'   If all the input tables are empty, the output only consist of the table's
 #'   header, and if no files starting with `prefix` are found, nothing is
 #'   returned.
 #'
 #' @seealso [get_table()], [dplyr::full_join()] which this function uses.
 merge_tables <- function(cwd, prefix) {
-  dataFiles <- dir(cwd, prefix, full.names = TRUE)
+  data_files <- dir(cwd, prefix, full.names = TRUE)
   mat <- NULL
-  
-  if (length(dataFiles)) {
-    mat <- get_table(dataFiles[1], prefix)
-    for (i in seq_len(length(dataFiles) - 1)) {
-      mat <- full_join(mat, get_table(dataFiles[i + 1], prefix), by = "ID", relationship = "many-to-many")
+
+  if (length(data_files)) {
+    mat <- get_table(data_files[1], prefix)
+    for (i in seq_len(length(data_files) - 1)) {
+      mat <- full_join(
+        mat,
+        get_table(data_files[i + 1], prefix),
+        by = "ID"
+      )
     }
-    mat <- filter(mat, !is.na(ID))
+    mat <- filter(mat, !is.na("ID"))
   }
   return(mat)
 }
@@ -189,29 +198,34 @@ merge_tables <- function(cwd, prefix) {
 ###      MAIN      ###
 ######################
 # Write log
-if (verb)
+if (verb) {
   cat("Creating output directory...\n", sep = "")
+}
 
 # Create output directories
-dir.create(dirname(out.file),
-           recursive = TRUE,
-           showWarnings = FALSE)
+dir.create(
+  dirname(out_file),
+  recursive = TRUE,
+  showWarnings = FALSE
+)
 
 # Write log
-if (verb)
+if (verb) {
   cat("Creating table...\n", sep = "")
+}
 
 # Create table from input directory files
-myTable <- merge_tables(cwd = in.dir, prefix = prefix)
+my_table <- merge_tables(cwd = in_dir, prefix = prefix)
 
 # Write log
-if (verb)
-  cat(paste("Writing table: ", out.file, "\n", sep = ""), sep = "")
+if (verb) {
+  cat(paste("Writing table: ", out_file, "\n", sep = ""), sep = "")
+}
 
 # Writing table
 write.table(
-  myTable,
-  out.file,
+  my_table,
+  out_file,
   row.names = FALSE,
   col.names = TRUE,
   quote = FALSE,
@@ -219,5 +233,6 @@ write.table(
 )
 
 # Write log
-if (verb)
+if (verb) {
   cat("Done.\n", sep = "")
+}