Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: allow empty input tables #140

Merged
merged 7 commits into from
Jul 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 158 additions & 54 deletions scripts/merge_tables.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,83 +7,103 @@
#################

# Import required packages
if ( suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE ) { stop("[ERROR] Package 'optparse' required! Aborted.") }
if ( suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE ) { stop("[ERROR] Package 'dplyr' required! Aborted.") }
if (
suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE
) {
stop("[ERROR] Package 'optparse' required! Aborted.")
}
if (
suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE
) {
stop("[ERROR] Package 'dplyr' required! Aborted.")
}


#######################
### PARSE OPTIONS ###
#######################

# Get script name
script <- sub("--file=", "", basename(commandArgs(trailingOnly=FALSE)[4]))
script <- sub("--file=", "", basename(commandArgs(trailingOnly = FALSE)[4]))

# Build description message
description <- "Merge miRNAs quantification tables.\n"
author <- "Author: Paula Iborra, Biozentrum, University of Basel"
version <- "Version: 1.0.0 (JUN-2019)"
requirements <- "Requires: optparse"
msg <- paste(description, author, version, requirements, sep="\n")
author <- "Author: Zavolan Lab <[email protected]>"
maintainer <- "Maintainer: Iris Mestres <[email protected]>"
version <- "Version: 1.1.0 (FEB-2024)"
requirements <- "Requires: optparse, dplyr"
deliaBlue marked this conversation as resolved.
Show resolved Hide resolved
msg <- paste(description, author, maintainer, version, requirements, sep = "\n")

# Define list of arguments
option_list <- list(
make_option(
"--input_dir",
action="store",
type="character",
default=getwd(),
help="Absolute path from where input files shall be readed. Required!",
metavar="directory"
action = "store",
type = "character",
default = getwd(),
help = "Absolute path from where input files shall be readed. Required!",
metavar = "directory"
),
make_option(
"--output_file",
action="store",
type="character",
default=file.path(getwd(), "counts.tab"),
help="Table output file path. Default: $PWD/counts.tab",
metavar="directory"
action = "store",
type = "character",
default = file.path(getwd(), "counts.tab"),
help = "Table output file path. Default: $PWD/counts.tab",
metavar = "directory"
),
make_option(
c("--prefix"),
action="store_true",
type="character",
default=NULL,
help="Prefix for reading input files. Default: NULL.",
metavar="file"
action = "store_true",
type = "character",
default = NULL,
help = "Prefix for reading input files. Default: NULL.",
metavar = "file"
),
make_option(
c("-h", "--help"),
action="store_true",
default=FALSE,
help="Show this information and die."
action = "store_true",
default = FALSE,
help = "Show this information and die."
),
make_option(
c("-u", "--usage"),
action="store_true",
default=FALSE,
dest="help",
help="Show this information and die."
action = "store_true",
default = FALSE,
dest = "help",
help = "Show this information and die."
),
make_option(
c("-v", "--verbose"),
action="store_true",
default=FALSE,
help="Print log messages to STDOUT."
action = "store_true",
default = FALSE,
help = "Print log messages to STDOUT."
)
)

# Parse command-line arguments
opt_parser <- OptionParser(usage=paste("Usage:", script, "[OPTIONS] --input_dir <path/to/input/files>\n", sep=" "), option_list = option_list, add_help_option=FALSE, description=msg)
opt_parser <-
OptionParser(
usage = paste(
"Usage:",
script,
"[OPTIONS] --input_dir <path/to/input/files>\n",
sep = " "
),
option_list = option_list,
add_help_option = FALSE,
description = msg
)
deliaBlue marked this conversation as resolved.
Show resolved Hide resolved
opt <- parse_args(opt_parser)

# Re-assign variables
in.dir <- opt$`input_dir`
in_dir <- opt$`input_dir`
prefix <- opt$`prefix`
out.file <- opt$`output_file`
out_file <- opt$`output_file`
verb <- opt$`verbose`

# Validate required arguments
if ( is.null(in.dir) ) {
if (is.null(in_dir)) {
print_help(opt_parser)
stop("[ERROR] Required argument missing! Aborted.")
}
Expand All @@ -92,19 +112,84 @@ if ( is.null(in.dir) ) {
### FUNCTIONS ###
######################

merge_tables <- function(cwd, prefix){
dataFiles <- dir(cwd, prefix, full.names=TRUE)
#' Read and process input table
#'
#' `get_table()` uses `tryCatch()` to read the file in `tbl_pth`. If the table
#' is empty and an error is raised, a data frame is created.
#'
#' @param tbl_pth Path to the input table.
#' @param prefix String to be removed from the input file name. It must be
#' present in all the tables to be merged.
#'
#' @returns `get_table()` returns a data frame containing the miRNA species to
#' be counted in first column, named `ID`, and their counts in that file in
#' the second one. The name of the second column in the data frame is obtained
#' by removing the `prefix` from the input file name. If no `prefix` is given,
#' the whole file name is used.
#'
#' If the input file is empty, the returned data frame consist of one row
#' with a `NA` in both fields.
#'
#' @seealso [tryCatch()] which this function uses.
get_table <- function(tbl_pth, prefix) {
sample <- gsub(prefix, "", tbl_pth)
fields <- c("ID", basename(sample))

tryCatch(
expr = {
table <- read.table(tbl_pth, sep = "\t", col.names = fields)
return(table)
},
error = function(e) {
table <- data.frame(matrix(NA, ncol = 2, nrow = 1))
colnames(table) <- fields
return(table)
}
)
}

#' Merge tables with the same prefix
#'
#' `merge_tables()` takes all the files in `cwd` that start with `prefix` and
#' merges them keeping all the miRNA species present in each of the tables.
#'
#' @details The function `get_table()` is used to make sure that even if an
#' empty input file is given, the merge can still be done by creating a
#' data frame with a single row made of `MA`s. Therefore, prior to the
#' returning of the merged table, if there is a row with a `NA` in the
#' `ID` filed, it is removed.
#'
#' The function `dplyr::full_join()` method is used for the merge. This
#' implies that if a miRNA species in `ID` is missing in any of the tables
#' being joined, its value is set to `NA` in that column.
#'
#' @param cwd Path to the input tables directory.
#' @param prefix String used in all the tables to be selected for the merge. If
#' not provided, all the files in `cwd` are used.
#'
#' @returns `merge_tables()` returns a single data frame, `mat`, with all the
#' miRNA species present in the input tables in the first column, `ID`, and
#' their counts. Each input file has it own column.
#'
#' If all the input tables are empty, the output only consist of the table's
#' header, and if no files starting with `prefix` are found, nothing is
#' returned.
#'
#' @seealso [get_table()], [dplyr::full_join()] which this function uses.
merge_tables <- function(cwd, prefix) {
data_files <- dir(cwd, prefix, full.names = TRUE)
mat <- NULL
if (length(dataFiles)) {
mat <- read.table(dataFiles[1], sep='\t')
sample <- gsub(prefix, "", dataFiles[1])
colnames(mat)[2] <- basename(sample)
for (i in seq_len(length(dataFiles)-1)) {
mat <- full_join(mat, read.table(dataFiles[i+1], sep = "\t"), by='V1')
sample <- gsub(prefix, "", dataFiles[i+1])
colnames(mat)[i + 2] <- basename(sample)

if (length(data_files)) {
mat <- get_table(data_files[1], prefix)
for (i in seq_len(length(data_files) - 1)) {
mat <- full_join(
mat,
get_table(data_files[i + 1], prefix),
by = "ID"
)
}
colnames(mat)[1] <- "ID"
mat <- filter(mat, !is.na("ID"))
}
return(mat)
}
Expand All @@ -113,22 +198,41 @@ merge_tables <- function(cwd, prefix){
### MAIN ###
######################
# Write log
if ( verb ) cat("Creating output directory...\n", sep="")
if (verb) {
cat("Creating output directory...\n", sep = "")
}

# Create output directories
dir.create(dirname(out.file), recursive=TRUE, showWarnings=FALSE)
dir.create(
dirname(out_file),
recursive = TRUE,
showWarnings = FALSE
)

# Write log
if ( verb ) cat("Creating table...\n", sep="")
if (verb) {
cat("Creating table...\n", sep = "")
}

# Create table from input directory files
myTable <- merge_tables(cwd=in.dir, prefix=prefix)
my_table <- merge_tables(cwd = in_dir, prefix = prefix)

# Write log
if ( verb ) cat(paste("Writing table: ", out.file, "\n", sep=""), sep="")
if (verb) {
cat(paste("Writing table: ", out_file, "\n", sep = ""), sep = "")
}

# Writing table
write.table(myTable, out.file, row.names=FALSE, col.names=TRUE, quote=FALSE, sep="\t")
write.table(
my_table,
out_file,
row.names = FALSE,
col.names = TRUE,
quote = FALSE,
sep = "\t"
)

# Write log
if ( verb ) cat("Done.\n", sep="")
if (verb) {
cat("Done.\n", sep = "")
}
2 changes: 2 additions & 0 deletions workflow/rules/pileup.smk
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ rule compress_reference_genome:
LOCAL_LOG / "compress_reference_genome.log",
container:
"docker://quay.io/biocontainers/samtools:1.16.1--h00cdaf9_2"
conda:
ENV_DIR / "samtools.yaml"
shell:
"(bgzip < {input.genome} > {output.genome}) &> {log}"

Expand Down
Loading