From 13d156826b40de604b293cf3ad7d622fe25d6ab4 Mon Sep 17 00:00:00 2001 From: deliaBlue <103108590+deliaBlue@users.noreply.github.com> Date: Wed, 31 Jul 2024 14:35:43 +0200 Subject: [PATCH] fix: allow empty input tables (#140) * refactor: allow empty input table --- scripts/merge_tables.R | 212 ++++++++++++++++++++++++++++---------- workflow/rules/pileup.smk | 2 + 2 files changed, 160 insertions(+), 54 deletions(-) diff --git a/scripts/merge_tables.R b/scripts/merge_tables.R index 69416f36..ac22b762 100755 --- a/scripts/merge_tables.R +++ b/scripts/merge_tables.R @@ -7,8 +7,16 @@ ################# # Import required packages -if ( suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE ) { stop("[ERROR] Package 'optparse' required! Aborted.") } -if ( suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE ) { stop("[ERROR] Package 'dplyr' required! Aborted.") } +if ( + suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE +) { + stop("[ERROR] Package 'optparse' required! Aborted.") +} +if ( + suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE +) { + stop("[ERROR] Package 'dplyr' required! Aborted.") +} ####################### @@ -16,74 +24,86 @@ if ( suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE ####################### # Get script name -script <- sub("--file=", "", basename(commandArgs(trailingOnly=FALSE)[4])) +script <- sub("--file=", "", basename(commandArgs(trailingOnly = FALSE)[4])) # Build description message description <- "Merge miRNAs quantification tables.\n" -author <- "Author: Paula Iborra, Biozentrum, University of Basel" -version <- "Version: 1.0.0 (JUN-2019)" -requirements <- "Requires: optparse" -msg <- paste(description, author, version, requirements, sep="\n") +author <- "Author: Zavolan Lab " +maintainer <- "Maintainer: Iris Mestres " +version <- "Version: 1.1.0 (FEB-2024)" +requirements <- "Requires: optparse, dplyr" +msg <- paste(description, author, maintainer, version, requirements, sep = "\n") # Define list of arguments option_list <- list( make_option( "--input_dir", - action="store", - type="character", - default=getwd(), - help="Absolute path from where input files shall be readed. Required!", - metavar="directory" + action = "store", + type = "character", + default = getwd(), + help = "Absolute path from where input files shall be readed. Required!", + metavar = "directory" ), make_option( "--output_file", - action="store", - type="character", - default=file.path(getwd(), "counts.tab"), - help="Table output file path. Default: $PWD/counts.tab", - metavar="directory" + action = "store", + type = "character", + default = file.path(getwd(), "counts.tab"), + help = "Table output file path. Default: $PWD/counts.tab", + metavar = "directory" ), make_option( c("--prefix"), - action="store_true", - type="character", - default=NULL, - help="Prefix for reading input files. Default: NULL.", - metavar="file" + action = "store_true", + type = "character", + default = NULL, + help = "Prefix for reading input files. Default: NULL.", + metavar = "file" ), make_option( c("-h", "--help"), - action="store_true", - default=FALSE, - help="Show this information and die." + action = "store_true", + default = FALSE, + help = "Show this information and die." ), make_option( c("-u", "--usage"), - action="store_true", - default=FALSE, - dest="help", - help="Show this information and die." + action = "store_true", + default = FALSE, + dest = "help", + help = "Show this information and die." ), make_option( c("-v", "--verbose"), - action="store_true", - default=FALSE, - help="Print log messages to STDOUT." + action = "store_true", + default = FALSE, + help = "Print log messages to STDOUT." ) ) # Parse command-line arguments -opt_parser <- OptionParser(usage=paste("Usage:", script, "[OPTIONS] --input_dir \n", sep=" "), option_list = option_list, add_help_option=FALSE, description=msg) +opt_parser <- + OptionParser( + usage = paste( + "Usage:", + script, + "[OPTIONS] --input_dir \n", + sep = " " + ), + option_list = option_list, + add_help_option = FALSE, + description = msg + ) opt <- parse_args(opt_parser) # Re-assign variables -in.dir <- opt$`input_dir` +in_dir <- opt$`input_dir` prefix <- opt$`prefix` -out.file <- opt$`output_file` +out_file <- opt$`output_file` verb <- opt$`verbose` # Validate required arguments -if ( is.null(in.dir) ) { +if (is.null(in_dir)) { print_help(opt_parser) stop("[ERROR] Required argument missing! Aborted.") } @@ -92,19 +112,84 @@ if ( is.null(in.dir) ) { ### FUNCTIONS ### ###################### -merge_tables <- function(cwd, prefix){ - dataFiles <- dir(cwd, prefix, full.names=TRUE) +#' Read and process input table +#' +#' `get_table()` uses `tryCatch()` to read the file in `tbl_pth`. If the table +#' is empty and an error is raised, a data frame is created. +#' +#' @param tbl_pth Path to the input table. +#' @param prefix String to be removed from the input file name. It must be +#' present in all the tables to be merged. +#' +#' @returns `get_table()` returns a data frame containing the miRNA species to +#' be counted in first column, named `ID`, and their counts in that file in +#' the second one. The name of the second column in the data frame is obtained +#' by removing the `prefix` from the input file name. If no `prefix` is given, +#' the whole file name is used. +#' +#' If the input file is empty, the returned data frame consist of one row +#' with a `NA` in both fields. +#' +#' @seealso [tryCatch()] which this function uses. +get_table <- function(tbl_pth, prefix) { + sample <- gsub(prefix, "", tbl_pth) + fields <- c("ID", basename(sample)) + + tryCatch( + expr = { + table <- read.table(tbl_pth, sep = "\t", col.names = fields) + return(table) + }, + error = function(e) { + table <- data.frame(matrix(NA, ncol = 2, nrow = 1)) + colnames(table) <- fields + return(table) + } + ) +} + +#' Merge tables with the same prefix +#' +#' `merge_tables()` takes all the files in `cwd` that start with `prefix` and +#' merges them keeping all the miRNA species present in each of the tables. +#' +#' @details The function `get_table()` is used to make sure that even if an +#' empty input file is given, the merge can still be done by creating a +#' data frame with a single row made of `MA`s. Therefore, prior to the +#' returning of the merged table, if there is a row with a `NA` in the +#' `ID` filed, it is removed. +#' +#' The function `dplyr::full_join()` method is used for the merge. This +#' implies that if a miRNA species in `ID` is missing in any of the tables +#' being joined, its value is set to `NA` in that column. +#' +#' @param cwd Path to the input tables directory. +#' @param prefix String used in all the tables to be selected for the merge. If +#' not provided, all the files in `cwd` are used. +#' +#' @returns `merge_tables()` returns a single data frame, `mat`, with all the +#' miRNA species present in the input tables in the first column, `ID`, and +#' their counts. Each input file has it own column. +#' +#' If all the input tables are empty, the output only consist of the table's +#' header, and if no files starting with `prefix` are found, nothing is +#' returned. +#' +#' @seealso [get_table()], [dplyr::full_join()] which this function uses. +merge_tables <- function(cwd, prefix) { + data_files <- dir(cwd, prefix, full.names = TRUE) mat <- NULL - if (length(dataFiles)) { - mat <- read.table(dataFiles[1], sep='\t') - sample <- gsub(prefix, "", dataFiles[1]) - colnames(mat)[2] <- basename(sample) - for (i in seq_len(length(dataFiles)-1)) { - mat <- full_join(mat, read.table(dataFiles[i+1], sep = "\t"), by='V1') - sample <- gsub(prefix, "", dataFiles[i+1]) - colnames(mat)[i + 2] <- basename(sample) + + if (length(data_files)) { + mat <- get_table(data_files[1], prefix) + for (i in seq_len(length(data_files) - 1)) { + mat <- full_join( + mat, + get_table(data_files[i + 1], prefix), + by = "ID" + ) } - colnames(mat)[1] <- "ID" + mat <- filter(mat, !is.na("ID")) } return(mat) } @@ -113,22 +198,41 @@ merge_tables <- function(cwd, prefix){ ### MAIN ### ###################### # Write log -if ( verb ) cat("Creating output directory...\n", sep="") +if (verb) { + cat("Creating output directory...\n", sep = "") +} # Create output directories -dir.create(dirname(out.file), recursive=TRUE, showWarnings=FALSE) +dir.create( + dirname(out_file), + recursive = TRUE, + showWarnings = FALSE +) # Write log -if ( verb ) cat("Creating table...\n", sep="") +if (verb) { + cat("Creating table...\n", sep = "") +} # Create table from input directory files -myTable <- merge_tables(cwd=in.dir, prefix=prefix) +my_table <- merge_tables(cwd = in_dir, prefix = prefix) # Write log -if ( verb ) cat(paste("Writing table: ", out.file, "\n", sep=""), sep="") +if (verb) { + cat(paste("Writing table: ", out_file, "\n", sep = ""), sep = "") +} # Writing table -write.table(myTable, out.file, row.names=FALSE, col.names=TRUE, quote=FALSE, sep="\t") +write.table( + my_table, + out_file, + row.names = FALSE, + col.names = TRUE, + quote = FALSE, + sep = "\t" +) # Write log -if ( verb ) cat("Done.\n", sep="") +if (verb) { + cat("Done.\n", sep = "") +} diff --git a/workflow/rules/pileup.smk b/workflow/rules/pileup.smk index 00768c26..827c7aa8 100644 --- a/workflow/rules/pileup.smk +++ b/workflow/rules/pileup.smk @@ -113,6 +113,8 @@ rule compress_reference_genome: LOCAL_LOG / "compress_reference_genome.log", container: "docker://quay.io/biocontainers/samtools:1.16.1--h00cdaf9_2" + conda: + ENV_DIR / "samtools.yaml" shell: "(bgzip < {input.genome} > {output.genome}) &> {log}"