Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: allow empty input tables #140

Merged
merged 7 commits into from
Jul 31, 2024
136 changes: 113 additions & 23 deletions scripts/merge_tables.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@
#################

# Import required packages
if ( suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE ) { stop("[ERROR] Package 'optparse' required! Aborted.") }
if ( suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE ) { stop("[ERROR] Package 'dplyr' required! Aborted.") }
if (suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE) {
stop("[ERROR] Package 'optparse' required! Aborted.")
}
if (suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE) {
stop("[ERROR] Package 'dplyr' required! Aborted.")
}
deliaBlue marked this conversation as resolved.
Show resolved Hide resolved


#######################
Expand All @@ -21,9 +25,10 @@ script <- sub("--file=", "", basename(commandArgs(trailingOnly=FALSE)[4]))
# Build description message
description <- "Merge miRNAs quantification tables.\n"
author <- "Author: Paula Iborra, Biozentrum, University of Basel"
version <- "Version: 1.0.0 (JUN-2019)"
requirements <- "Requires: optparse"
msg <- paste(description, author, version, requirements, sep="\n")
mantainer <- "Refactor and documentation: Iris Mestres"
deliaBlue marked this conversation as resolved.
Show resolved Hide resolved
version <- "Version: 1.1.0 (FEB-2024)"
requirements <- "Requires: optparse, dplyr"
deliaBlue marked this conversation as resolved.
Show resolved Hide resolved
msg <- paste(description, author, mantainer, version, requirements, sep="\n")

# Define list of arguments
option_list <- list(
Expand Down Expand Up @@ -73,7 +78,18 @@ option_list <- list(
)

# Parse command-line arguments
opt_parser <- OptionParser(usage=paste("Usage:", script, "[OPTIONS] --input_dir <path/to/input/files>\n", sep=" "), option_list = option_list, add_help_option=FALSE, description=msg)
opt_parser <-
OptionParser(
usage = paste(
"Usage:",
script,
"[OPTIONS] --input_dir <path/to/input/files>\n",
sep = " "
),
option_list = option_list,
add_help_option = FALSE,
description = msg
)
deliaBlue marked this conversation as resolved.
Show resolved Hide resolved
opt <- parse_args(opt_parser)

# Re-assign variables
Expand All @@ -92,19 +108,80 @@ if ( is.null(in.dir) ) {
### FUNCTIONS ###
######################

merge_tables <- function(cwd, prefix){
dataFiles <- dir(cwd, prefix, full.names=TRUE)
#' Read and process input table
#'
#' `get_table()` uses `tryCatch()` to read the file in `tbl_pth`. If the table
#' is empty and an error is raised, a data frame is created.
#'
#' @param tbl_pth Path to the input table.
#' @param prefix String to be removed from the input file name. It must be
#' present in all the tables to be merged.
#'
#' @returns `get_table()` returns a data frame containing the miRNA species to
#' be counted in first column, named `ID`, and their counts in that file in
#' the second one. The name of the second column in the data frame is obtained
#' by removing the `prefix` from the input file name. If no `prefix` is given,
#' the whole file name is used.
#'
#' If the input file is empty, the returned data frame will consist of one row
deliaBlue marked this conversation as resolved.
Show resolved Hide resolved
#' with a `NA` in both fields.
#'
#' @seealso [tryCatch()] which this function uses.
get_table <- function(tbl_pth, prefix) {
  # Header of the counts column: the file name with `prefix` stripped out.
  fields <- c("ID", basename(gsub(prefix, "", tbl_pth)))

  tryCatch(
    # `check.names = FALSE` keeps the sample name exactly as derived from the
    # file name. With the default, `read.table()` passes `col.names` through
    # `make.names()` (e.g. "sample-1" becomes "sample.1"), so non-empty tables
    # would be labelled differently from the empty-table fallback below.
    read.table(
      tbl_pth,
      sep = "\t",
      col.names = fields,
      check.names = FALSE
    ),
    error = function(e) {
      # A zero-byte file is the expected "empty table" case and stays silent.
      # Any other failure is still handled as an empty table (best effort),
      # but it is reported so a malformed table is not dropped unnoticed.
      if (!isTRUE(file.size(tbl_pth) == 0)) {
        warning(
          "Could not read '", tbl_pth, "' (", conditionMessage(e),
          "); treating it as an empty table.",
          call. = FALSE
        )
      }
      # Single all-NA row so the table can still take part in a join on `ID`.
      placeholder <- data.frame(matrix(NA, ncol = 2, nrow = 1))
      colnames(placeholder) <- fields
      placeholder
    }
  )
}

#' Merge tables with the same prefix
#'
#' `merge_tables()` takes all the files in `cwd` that start with `prefix` and
#' merges them, keeping all the miRNA species present in each of the tables.
deliaBlue marked this conversation as resolved.
Show resolved Hide resolved
#'
#' @details The function `get_table()` is used to make sure that even if an
#' empty input file is given, the merge can still be done. Thus, before
#' returning the merged table, the row with a `NA` in the `ID` field, if any,
#' is removed.
#'
#' The `dplyr::full_join()` function is used for the merge; therefore,
#' if a miRNA species in `ID` is missing in any of the tables being joined,
#' its value is set to `NA` in that column.
deliaBlue marked this conversation as resolved.
Show resolved Hide resolved
#'
#' @param cwd Path to the input tables directory.
#' @param prefix String used in all the tables to be selected for the merge. If
#' not provided, all the files in `cwd` are used.
#'
#' @returns `merge_tables()` returns a single data frame, `mat`, with all the
#' miRNA species present in the input tables in the first column, `ID`, and
#' their counts. Each input file has its own column.
#'
#' If all the input tables are empty, the output consists only of the table's
#' header, and if no files starting with `prefix` are found, `NULL` is
#' returned.
#'
#' @seealso [get_table()], [dplyr::full_join()] which this function uses.
merge_tables <- function(cwd, prefix) {
dataFiles <- dir(cwd, prefix, full.names = TRUE)
mat <- NULL

if (length(dataFiles)) {
mat <- read.table(dataFiles[1], sep='\t')
sample <- gsub(prefix, "", dataFiles[1])
colnames(mat)[2] <- basename(sample)
for (i in seq_len(length(dataFiles)-1)) {
mat <- full_join(mat, read.table(dataFiles[i+1], sep = "\t"), by='V1')
sample <- gsub(prefix, "", dataFiles[i+1])
colnames(mat)[i + 2] <- basename(sample)
mat <- get_table(dataFiles[1], prefix)

for (i in seq_len(length(dataFiles) - 1)) {
mat <- full_join(mat, get_table(dataFiles[i + 1], prefix), by = "ID")
}
colnames(mat)[1] <- "ID"
mat <- filter(mat, !is.na(ID))
}
return(mat)
}
Expand All @@ -113,22 +190,35 @@ merge_tables <- function(cwd, prefix){
### MAIN ###
######################
# Write log
if ( verb ) cat("Creating output directory...\n", sep="")
if (verb)
cat("Creating output directory...\n", sep = "")

# Create output directories
dir.create(dirname(out.file), recursive=TRUE, showWarnings=FALSE)
dir.create(dirname(out.file),
recursive = TRUE,
showWarnings = FALSE)
deliaBlue marked this conversation as resolved.
Show resolved Hide resolved

# Write log
if ( verb ) cat("Creating table...\n", sep="")
if (verb)
cat("Creating table...\n", sep = "")

# Create table from input directory files
myTable <- merge_tables(cwd=in.dir, prefix=prefix)
myTable <- merge_tables(cwd = in.dir, prefix = prefix)

# Write log
if ( verb ) cat(paste("Writing table: ", out.file, "\n", sep=""), sep="")
if (verb)
cat(paste("Writing table: ", out.file, "\n", sep = ""), sep = "")

# Writing table
write.table(myTable, out.file, row.names=FALSE, col.names=TRUE, quote=FALSE, sep="\t")
write.table(
myTable,
out.file,
row.names = FALSE,
col.names = TRUE,
quote = FALSE,
sep = "\t"
)

# Write log
if ( verb ) cat("Done.\n", sep="")
if (verb)
cat("Done.\n", sep = "")
Loading