From 0e79c409e03eadec4eb5037f99c2d2cf78e2b80c Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Sun, 11 Feb 2024 20:06:44 +0100 Subject: [PATCH 1/3] refactor: allow empty input table --- scripts/merge_tables.R | 136 ++++++++++++++++++++++++++++++++++------- 1 file changed, 113 insertions(+), 23 deletions(-) diff --git a/scripts/merge_tables.R b/scripts/merge_tables.R index 69416f36..8952202c 100755 --- a/scripts/merge_tables.R +++ b/scripts/merge_tables.R @@ -7,8 +7,12 @@ ################# # Import required packages -if ( suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE ) { stop("[ERROR] Package 'optparse' required! Aborted.") } -if ( suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE ) { stop("[ERROR] Package 'dplyr' required! Aborted.") } +if (suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE) { + stop("[ERROR] Package 'optparse' required! Aborted.") +} +if (suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE) { + stop("[ERROR] Package 'dplyr' required! Aborted.") +} ####################### @@ -21,9 +25,10 @@ script <- sub("--file=", "", basename(commandArgs(trailingOnly=FALSE)[4])) # Build description message description <- "Merge miRNAs quantification tables.\n" author <- "Author: Paula Iborra, Biozentrum, University of Basel" -version <- "Version: 1.0.0 (JUN-2019)" -requirements <- "Requires: optparse" -msg <- paste(description, author, version, requirements, sep="\n") +mantainer <- "Refactor and documentation: Iris Mestres" +version <- "Version: 1.1.0 (FEB-2024)" +requirements <- "Requires: optparse, dplyr" +msg <- paste(description, author, mantainer, version, requirements, sep="\n") # Define list of arguments option_list <- list( @@ -73,7 +78,18 @@ option_list <- list( ) # Parse command-line arguments -opt_parser <- OptionParser(usage=paste("Usage:", script, "[OPTIONS] --input_dir \n", sep=" "), option_list = option_list, add_help_option=FALSE, description=msg) +opt_parser <- + OptionParser( + usage = paste( + "Usage:", + script, + "[OPTIONS] --input_dir \n", + sep = " " + ), + option_list = option_list, + add_help_option = FALSE, + description = msg + ) opt <- parse_args(opt_parser) # Re-assign variables @@ -92,19 +108,80 @@ if ( is.null(in.dir) ) { ### FUNCTIONS ### ###################### -merge_tables <- function(cwd, prefix){ - dataFiles <- dir(cwd, prefix, full.names=TRUE) +#' Read and process input table +#' +#' `get_table()` uses `tryCatch()` to read the file in `tbl_pth`. If the table +#' is empty and an error is raised, a data frame is created. +#' +#' @param tbl_pth Path to the input table. +#' @param prefix String to be removed from the input file name. It must be +#' present in all the tables to be merged. +#' +#' @returns `get_table()` returns a data frame containing the miRNA species to +#' be counted in first column, named `ID`, and their counts in that file in +#' the second one. The name of the second column in the data frame is obtained +#' by removing the `prefix` from the input file name. If no `prefix` is given, +#' the whole file name is used. +#' +#' If the input file is empty, the returned data frame will consist on one row +#' with a `NA` in both fields. +#' +#' @seealso [tryCatch()] which this function uses. +get_table <- function(tbl_pth, prefix) { + sample <- gsub(prefix, "", tbl_pth) + fields <- c("ID", basename(sample)) + + tryCatch( + expr = { + table <- read.table(tbl_pth, sep = '\t', col.names = fields) + return(table) + }, + error = function(e) { + table <- data.frame(matrix(NA, ncol = 2, nrow = 1)) + colnames(table) <- fields + return(table) + } + ) +} + +#' Merge tables with the same prefix +#' +#' `merge_tables()` takes all the files in `cwd` that start with `prefix` and +#' merge them keeping all the miRNA species present in each of the tables. +#' +#' @details The function `get_table()` is used to make sure that even if an +#' empty input file is given, the merge can still be done. Thus, before +#' returning the merged table, the row with a `NA` in the `ID` field, if any, +#' is removed. +#' +#' The function `dplyr::full_join()` method is used for the merge, therefore, +#' if a miRNA species in `ID` is missing in any of the tables being joined, +#' its value is set to `NA` in that column. +#' +#' @param cwd Path to the input tables directory. +#' @param prefix String used in all the tables to be selected for the merge. If +#' not provided, all the files in `cwd` are used. +#' +#' @returns `merge_tables()` returns a single data frame, `mat`, with all the +#' miRNA species present in the input tables in the first column, `ID`, and +#' their counts. Each input file has it own column. +#' +#' If all the input tables are empty, the output consists only on the table's +#' header, and if no files starting with `prefix` are found, nothing is +#' returned. +#' +#' @seealso [get_table()], [dplyr::full_join()] which this function uses. +merge_tables <- function(cwd, prefix) { + dataFiles <- dir(cwd, prefix, full.names = TRUE) mat <- NULL + if (length(dataFiles)) { - mat <- read.table(dataFiles[1], sep='\t') - sample <- gsub(prefix, "", dataFiles[1]) - colnames(mat)[2] <- basename(sample) - for (i in seq_len(length(dataFiles)-1)) { - mat <- full_join(mat, read.table(dataFiles[i+1], sep = "\t"), by='V1') - sample <- gsub(prefix, "", dataFiles[i+1]) - colnames(mat)[i + 2] <- basename(sample) + mat <- get_table(dataFiles[1], prefix) + + for (i in seq_len(length(dataFiles) - 1)) { + mat <- full_join(mat, get_table(dataFiles[i + 1], prefix), by = "ID") } - colnames(mat)[1] <- "ID" + mat <- filter(mat, !is.na(ID)) } return(mat) } @@ -113,22 +190,35 @@ merge_tables <- function(cwd, prefix){ ### MAIN ### ###################### # Write log -if ( verb ) cat("Creating output directory...\n", sep="") +if (verb) + cat("Creating output directory...\n", sep = "") # Create output directories -dir.create(dirname(out.file), recursive=TRUE, showWarnings=FALSE) +dir.create(dirname(out.file), + recursive = TRUE, + showWarnings = FALSE) # Write log -if ( verb ) cat("Creating table...\n", sep="") +if (verb) + cat("Creating table...\n", sep = "") # Create table from input directory files -myTable <- merge_tables(cwd=in.dir, prefix=prefix) +myTable <- merge_tables(cwd = in.dir, prefix = prefix) # Write log -if ( verb ) cat(paste("Writing table: ", out.file, "\n", sep=""), sep="") +if (verb) + cat(paste("Writing table: ", out.file, "\n", sep = ""), sep = "") # Writing table -write.table(myTable, out.file, row.names=FALSE, col.names=TRUE, quote=FALSE, sep="\t") +write.table( + myTable, + out.file, + row.names = FALSE, + col.names = TRUE, + quote = FALSE, + sep = "\t" +) # Write log -if ( verb ) cat("Done.\n", sep="") +if (verb) + cat("Done.\n", sep = "") From ebdf0372d61d7c670ed50f8647196fb9065ed187 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Fri, 23 Feb 2024 18:11:32 +0100 Subject: [PATCH 2/3] Commit to checkout --- scripts/merge_tables.R | 3 +-- workflow/rules/pileup.smk | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/merge_tables.R b/scripts/merge_tables.R index 8952202c..7047784a 100755 --- a/scripts/merge_tables.R +++ b/scripts/merge_tables.R @@ -177,9 +177,8 @@ merge_tables <- function(cwd, prefix) { if (length(dataFiles)) { mat <- get_table(dataFiles[1], prefix) - for (i in seq_len(length(dataFiles) - 1)) { - mat <- full_join(mat, get_table(dataFiles[i + 1], prefix), by = "ID") + mat <- full_join(mat, get_table(dataFiles[i + 1], prefix), by = "ID", relationship = "many-to-many") } mat <- filter(mat, !is.na(ID)) } diff --git a/workflow/rules/pileup.smk b/workflow/rules/pileup.smk index 97654c3e..b0ea7696 100644 --- a/workflow/rules/pileup.smk +++ b/workflow/rules/pileup.smk @@ -113,6 +113,8 @@ rule compress_reference_genome: LOCAL_LOG / "compress_reference_genome.log", container: "docker://quay.io/biocontainers/samtools:1.16.1--h00cdaf9_2" + conda: + ENV_DIR / "samtools.yaml" shell: "(bgzip < {input.genome} > {output.genome}) &> {log}" From 09b1284032b4d50f007db7814c84ed93dbe29666 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Sun, 25 Feb 2024 22:06:32 +0100 Subject: [PATCH 3/3] apply changes --- scripts/merge_tables.R | 141 +++++++++++++++++++++++------------------ 1 file changed, 78 insertions(+), 63 deletions(-) diff --git a/scripts/merge_tables.R b/scripts/merge_tables.R index 7047784a..ac22b762 100755 --- a/scripts/merge_tables.R +++ b/scripts/merge_tables.R @@ -7,10 +7,14 @@ ################# # Import required packages -if (suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE) { +if ( + suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE +) { stop("[ERROR] Package 'optparse' required! Aborted.") } -if (suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE) { +if ( + suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE +) { stop("[ERROR] Package 'dplyr' required! Aborted.") } @@ -20,60 +24,60 @@ if (suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE) ####################### # Get script name -script <- sub("--file=", "", basename(commandArgs(trailingOnly=FALSE)[4])) +script <- sub("--file=", "", basename(commandArgs(trailingOnly = FALSE)[4])) # Build description message description <- "Merge miRNAs quantification tables.\n" -author <- "Author: Paula Iborra, Biozentrum, University of Basel" -mantainer <- "Refactor and documentation: Iris Mestres" +author <- "Author: Zavolan Lab " +maintainer <- "Maintainer: Iris Mestres " version <- "Version: 1.1.0 (FEB-2024)" requirements <- "Requires: optparse, dplyr" -msg <- paste(description, author, mantainer, version, requirements, sep="\n") +msg <- paste(description, author, maintainer, version, requirements, sep = "\n") # Define list of arguments option_list <- list( make_option( "--input_dir", - action="store", - type="character", - default=getwd(), - help="Absolute path from where input files shall be readed. Required!", - metavar="directory" + action = "store", + type = "character", + default = getwd(), + help = "Absolute path from where input files shall be readed. Required!", + metavar = "directory" ), make_option( "--output_file", - action="store", - type="character", - default=file.path(getwd(), "counts.tab"), - help="Table output file path. Default: $PWD/counts.tab", - metavar="directory" + action = "store", + type = "character", + default = file.path(getwd(), "counts.tab"), + help = "Table output file path. Default: $PWD/counts.tab", + metavar = "directory" ), make_option( c("--prefix"), - action="store_true", - type="character", - default=NULL, - help="Prefix for reading input files. Default: NULL.", - metavar="file" + action = "store_true", + type = "character", + default = NULL, + help = "Prefix for reading input files. Default: NULL.", + metavar = "file" ), make_option( c("-h", "--help"), - action="store_true", - default=FALSE, - help="Show this information and die." + action = "store_true", + default = FALSE, + help = "Show this information and die." ), make_option( c("-u", "--usage"), - action="store_true", - default=FALSE, - dest="help", - help="Show this information and die." + action = "store_true", + default = FALSE, + dest = "help", + help = "Show this information and die." ), make_option( c("-v", "--verbose"), - action="store_true", - default=FALSE, - help="Print log messages to STDOUT." + action = "store_true", + default = FALSE, + help = "Print log messages to STDOUT." ) ) @@ -93,13 +97,13 @@ opt_parser <- opt <- parse_args(opt_parser) # Re-assign variables -in.dir <- opt$`input_dir` +in_dir <- opt$`input_dir` prefix <- opt$`prefix` -out.file <- opt$`output_file` +out_file <- opt$`output_file` verb <- opt$`verbose` # Validate required arguments -if ( is.null(in.dir) ) { +if (is.null(in_dir)) { print_help(opt_parser) stop("[ERROR] Required argument missing! Aborted.") } @@ -123,17 +127,17 @@ if ( is.null(in.dir) ) { #' by removing the `prefix` from the input file name. If no `prefix` is given, #' the whole file name is used. #' -#' If the input file is empty, the returned data frame will consist on one row +#' If the input file is empty, the returned data frame consist of one row #' with a `NA` in both fields. #' #' @seealso [tryCatch()] which this function uses. get_table <- function(tbl_pth, prefix) { sample <- gsub(prefix, "", tbl_pth) fields <- c("ID", basename(sample)) - + tryCatch( expr = { - table <- read.table(tbl_pth, sep = '\t', col.names = fields) + table <- read.table(tbl_pth, sep = "\t", col.names = fields) return(table) }, error = function(e) { @@ -147,16 +151,17 @@ get_table <- function(tbl_pth, prefix) { #' Merge tables with the same prefix #' #' `merge_tables()` takes all the files in `cwd` that start with `prefix` and -#' merge them keeping all the miRNA species present in each of the tables. +#' merges them keeping all the miRNA species present in each of the tables. #' #' @details The function `get_table()` is used to make sure that even if an -#' empty input file is given, the merge can still be done. Thus, before -#' returning the merged table, the row with a `NA` in the `ID` field, if any, -#' is removed. +#' empty input file is given, the merge can still be done by creating a +#' data frame with a single row made of `MA`s. Therefore, prior to the +#' returning of the merged table, if there is a row with a `NA` in the +#' `ID` filed, it is removed. #' -#' The function `dplyr::full_join()` method is used for the merge, therefore, -#' if a miRNA species in `ID` is missing in any of the tables being joined, -#' its value is set to `NA` in that column. +#' The function `dplyr::full_join()` method is used for the merge. This +#' implies that if a miRNA species in `ID` is missing in any of the tables +#' being joined, its value is set to `NA` in that column. #' #' @param cwd Path to the input tables directory. #' @param prefix String used in all the tables to be selected for the merge. If @@ -166,21 +171,25 @@ get_table <- function(tbl_pth, prefix) { #' miRNA species present in the input tables in the first column, `ID`, and #' their counts. Each input file has it own column. #' -#' If all the input tables are empty, the output consists only on the table's +#' If all the input tables are empty, the output only consist of the table's #' header, and if no files starting with `prefix` are found, nothing is #' returned. #' #' @seealso [get_table()], [dplyr::full_join()] which this function uses. merge_tables <- function(cwd, prefix) { - dataFiles <- dir(cwd, prefix, full.names = TRUE) + data_files <- dir(cwd, prefix, full.names = TRUE) mat <- NULL - - if (length(dataFiles)) { - mat <- get_table(dataFiles[1], prefix) - for (i in seq_len(length(dataFiles) - 1)) { - mat <- full_join(mat, get_table(dataFiles[i + 1], prefix), by = "ID", relationship = "many-to-many") + + if (length(data_files)) { + mat <- get_table(data_files[1], prefix) + for (i in seq_len(length(data_files) - 1)) { + mat <- full_join( + mat, + get_table(data_files[i + 1], prefix), + by = "ID" + ) } - mat <- filter(mat, !is.na(ID)) + mat <- filter(mat, !is.na("ID")) } return(mat) } @@ -189,29 +198,34 @@ merge_tables <- function(cwd, prefix) { ### MAIN ### ###################### # Write log -if (verb) +if (verb) { cat("Creating output directory...\n", sep = "") +} # Create output directories -dir.create(dirname(out.file), - recursive = TRUE, - showWarnings = FALSE) +dir.create( + dirname(out_file), + recursive = TRUE, + showWarnings = FALSE +) # Write log -if (verb) +if (verb) { cat("Creating table...\n", sep = "") +} # Create table from input directory files -myTable <- merge_tables(cwd = in.dir, prefix = prefix) +my_table <- merge_tables(cwd = in_dir, prefix = prefix) # Write log -if (verb) - cat(paste("Writing table: ", out.file, "\n", sep = ""), sep = "") +if (verb) { + cat(paste("Writing table: ", out_file, "\n", sep = ""), sep = "") +} # Writing table write.table( - myTable, - out.file, + my_table, + out_file, row.names = FALSE, col.names = TRUE, quote = FALSE, @@ -219,5 +233,6 @@ write.table( ) # Write log -if (verb) +if (verb) { cat("Done.\n", sep = "") +}