Skip to content

Commit

Permalink
making progress
Browse files Browse the repository at this point in the history
  • Loading branch information
sage-wright committed May 17, 2024
1 parent bb718da commit 5874671
Show file tree
Hide file tree
Showing 8 changed files with 217 additions and 0 deletions.
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Byte-compiled / optimized / DLL files
__pycache__/

# IDE
.vscode/

# output from pytest
*.csv
.pytest_cache/
*.pyc
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Sage M. Wright
19 changes: 19 additions & 0 deletions mercury/CheckInputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os
import argparse

def is_table_valid(filename):
    """Validate the path supplied for the input TSV table.

    Written to be used as an argparse ``type=`` callable.

    Args:
        filename (str): Path to the input table, or "-" which is passed
            through without any filesystem check.

    Returns:
        str: ``filename`` unchanged when it is acceptable.

    Raises:
        argparse.ArgumentTypeError: If the path does not exist or points
            to a directory.
    """
    if filename == "-":
        return filename
    if not os.path.exists(filename):
        raise argparse.ArgumentTypeError("{0} cannot be accessed".format(filename))
    # A directory exists on disk but can never be read as a table, so reject
    # it here instead of failing later when the table is parsed.
    if os.path.isdir(filename):
        raise argparse.ArgumentTypeError("{0} is a directory, not a file".format(filename))
    return filename

def is_comma_delimited_list(string):
    """Split a comma-delimited string into a list of clean items.

    Written to be used as an argparse ``type=`` callable.

    Args:
        string (str): Comma-separated values, e.g. ``"sample1,sample2"``.

    Returns:
        list[str]: The items with surrounding whitespace removed and empty
        entries (from doubled or trailing commas) dropped.

    Raises:
        argparse.ArgumentTypeError: If ``string`` is None or contains no
            non-empty items.
    """
    if string is None:
        raise argparse.ArgumentTypeError("{0} is not a valid list".format(string))
    # Strip whitespace so "a, b" yields "b" rather than " b", which would
    # never match a sample name in the table.
    items = [item.strip() for item in string.split(",")]
    items = [item for item in items if item]
    if not items:
        raise argparse.ArgumentTypeError("{0} is not a valid list".format(string))
    return items
54 changes: 54 additions & 0 deletions mercury/Runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import mercury.Table as Table
import logging
import subprocess
import sys

class Runner:
    """This class initiates Mercury.

    It configures logging from the command-line options and stores every
    option as an instance attribute for the later processing steps.
    """

    def __init__(self, options):
        """Configure logging and copy the parsed options onto the instance.

        Args:
            options: Parsed command-line options (e.g. an
                ``argparse.Namespace``) exposing the attributes read below.
        """
        # `encoding` is only used by basicConfig when it creates a file
        # handler, so it is not passed together with `stream`.
        logging.basicConfig(level=logging.ERROR, stream=sys.stderr)
        self.logger = logging.getLogger(__name__)
        # --debug is documented as taking precedence over --verbose, so it
        # has to be tested first.
        if options.debug:
            self.logger.setLevel(logging.DEBUG)
            self.logger.debug("RUNNER:Debug mode enabled")
        elif options.verbose:
            self.logger.setLevel(logging.INFO)
            self.logger.info("RUNNER:Verbose mode enabled")

        self.input_table = options.input_table
        self.table_name = options.table_name
        self.samplenames = options.samplenames
        self.output_prefix = options.output_prefix
        self.gcp_bucket_uri = options.gcp_bucket_uri
        self.organism = options.organism
        self.skip_ncbi = options.skip_ncbi
        self.skip_county = options.skip_county
        self.usa_territory = options.usa_territory
        self.clearlabs_data = options.using_clearlabs_data
        self.reads_dehosted = options.using_reads_dehosted
        self.vadr_alert_limit = options.vadr_alert_limit
        self.number_n_threshold = options.number_n_threshold

    def check_gcloud_dependency(self):
        """Exit with status 1 unless `gcloud storage cp` can be invoked."""
        try:
            result = subprocess.run(
                ["gcloud", "storage", "cp", "--help"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
        except OSError:
            # A missing (or non-executable) gcloud binary raises instead of
            # producing a non-zero return code.
            result = None
        if result is None or result.returncode != 0:
            self.logger.error("RUNNER:Error: gcloud storage cp command not found")
            sys.exit(1)
        self.logger.info("RUNNER:Found `gcloud storage cp` command, continuing")

    def run(self):
        """This method runs the different parts of Mercury."""
        self.logger.info("RUNNER:Starting to run Mercury")
        self.logger.debug("RUNNER:Checking for `gcloud storage cp` command")
        self.check_gcloud_dependency()




68 changes: 68 additions & 0 deletions mercury/Table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import pandas as pd
import numpy as np
import re


class Table:
    """This class controls the manipulation of the table."""

    # Anchored at the start only, so trailing text after YYYY-MM-DD is allowed.
    _DATE_PATTERN = re.compile(r"^\d{4}-\d{2}-\d{2}")

    def __init__(self, input_table, table_name, samplenames, skip_county, clearlabs_data, reads_dehosted):
        """Store the table location and the processing options.

        Args:
            input_table: Path of the tab-separated metadata table.
            table_name (str): Name of the column holding the sample names.
            samplenames: Sample names to extract, either a list or a
                comma-delimited string.
            skip_county, clearlabs_data, reads_dehosted: Option flags stored
                for later use.
        """
        self.input_table = input_table
        self.table_name = table_name
        self.samplenames = samplenames
        self.skip_county = skip_county
        self.clearlabs_data = clearlabs_data
        self.reads_dehosted = reads_dehosted

    @staticmethod
    def get_year_from_date(date):
        """Return the year part of a YYYY-MM-DD date, or NaN if it is invalid.

        Args:
            date: Collection date value taken from the table.

        Returns:
            The four-digit year as a string, or ``np.nan`` when the date is
            missing or does not start with YYYY-MM-DD.
        """
        if pd.isna(date):
            print("Incorrect collection date format; collection date must be in YYYY-MM-DD format. Invalid date was: NaN")
            return np.nan
        # str() guards against non-string cells (e.g. a bare year parsed as int).
        date_text = str(date)
        if Table._DATE_PATTERN.match(date_text) is None:
            print("Incorrect collection date format; collection date must be in YYYY-MM-DD format. Invalid date was: " + date_text)
            return np.nan
        return date_text.split("-")[0]

    def create_table(self):
        """Read the input TSV, keeping the sample-name column as strings."""
        table = pd.read_csv(self.input_table, sep="\t", header=0, dtype={self.table_name: 'str'})
        return table

    def extract_samples(self, table):
        """Return only the requested samples, with lower-cased column names.

        Args:
            table (DataFrame): Table as returned by ``create_table``.
        """
        names = self.samplenames
        # The command line already splits the names into a list; a plain
        # comma-delimited string is still accepted.
        if isinstance(names, str):
            names = names.split(",")
        # Copy so that renaming the columns does not act on a view of `table`.
        working_table = table[table[self.table_name].isin(names)].copy()
        working_table.columns = working_table.columns.str.lower()
        return working_table

    def create_standard_variables(self, table):
        """Add the derived ``year`` and ``host`` columns to ``table`` in place."""
        table["year"] = table["collection_date"].apply(self.get_year_from_date)
        table["host"] = "Human" #(????)

    def _sample_id_column(self, table):
        """Return the name of the column in ``table`` that holds the sample names."""
        candidates = (
            self.table_name,
            self.table_name.lower(),
            "{0}_id".format(self.table_name).lower(),
        )
        for candidate in candidates:
            if candidate in table.columns:
                return candidate
        raise KeyError("no sample name column found for table name {0}".format(self.table_name))

    def remove_nas(self, table, required_metadata):
        """This function removes rows with missing values in the required metadata columns and returns the cleaned table and a table of excluded samples

        Args:
            table (DataFrame): Table containing metadata; modified in place
            required_metadata (List): List of columns that are required to have values

        Returns:
            tuple: (table without the incomplete rows, DataFrame indexed by
            sample name showing only the required columns that are missing
            for at least one excluded sample)
        """
        # replace blank cells with NaNs (blanks are missing values); done in
        # place because replace() otherwise returns a new table
        table.replace(r'^\s*$', np.nan, regex=True, inplace=True)
        # select rows with missing values in the required metadata columns
        missing_rows = table[required_metadata].isna().any(axis=1)
        # set the index to the sample name
        excluded_samples = table[missing_rows].set_index(self._sample_id_column(table))
        # remove all optional columns so only required columns are shown
        excluded_samples = excluded_samples[excluded_samples.columns.intersection(required_metadata)]
        # remove all NON-NA columns so only columns with NAs remain
        excluded_samples = excluded_samples.loc[:, excluded_samples.isna().any()]
        # remove all rows that are required with NaNs from table
        table.dropna(subset=required_metadata, axis=0, how='any', inplace=True)

        return table, excluded_samples

    def process_table(self):
        """Read the table, keep the requested samples and add standard columns."""
        table = self.create_table()
        table = self.extract_samples(table)
        self.create_standard_variables(table)
        return table
1 change: 1 addition & 0 deletions mercury/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__VERSION__ = "v0.0.1"
62 changes: 62 additions & 0 deletions mercury/mercury.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python3
import CheckInputs
import argparse
from __init__ import __VERSION__
from Runner import Runner

def main(argv=None):
    """Parse the command line and start a Mercury run.

    Args:
        argv (list[str] | None): Arguments to parse instead of ``sys.argv[1:]``.
            The default of None keeps the normal command-line behaviour.
    """
    parser = argparse.ArgumentParser(
        prog = "mercury",
        description = "Mercury prepares and formats metadata for submission to national & international genomic databases",
        usage = "python3 /mercury/mercury/mercury.py <input_table.tsv> [<args>]",
        epilog = "Please contact [email protected] with any questions",
        formatter_class = lambda prog: argparse.RawTextHelpFormatter(prog, max_help_position=10)
    )
    parser.add_argument("-v", "--version", action="version", version=str(__VERSION__))

    parser.add_argument("input_table",
                        help="The table containing the metadata for the samples to be submitted", type=CheckInputs.is_table_valid)
    parser.add_argument("table_name",
                        help="The name of the first column in the table (A1)", type=str)
    parser.add_argument("samplenames",
                        help="The sample names to be extracted from the table", type=CheckInputs.is_comma_delimited_list)
    parser.add_argument("-o", "--output_prefix",
                        help="The prefix for the output files", type=str)
    parser.add_argument("-b", "--gcp_bucket_uri",
                        help="The GCP bucket URI to temporarily store the read files", type=str)

    submission_type_arguments = parser.add_argument_group("submission type arguments", "options that determine submission type")
    # metavar="\b" emits a backspace in place of the metavar in the help output.
    submission_type_arguments.add_argument("-g", "--organism",
                                           help="The organism type of the samples in the table\ndefault=\"sars-cov-2\"", default="sars-cov-2", metavar="\b",type=str)
    submission_type_arguments.add_argument("-s", "--skip_ncbi",
                                           help="Add to skip NCBI metadata preparation; prep only for GISAID submission", action="store_true", default=False)

    customization_arguments = parser.add_argument_group("submission customization arguments", "options that customize the submission")
    customization_arguments.add_argument("-c", "--skip_county",
                                         help="Add to skip adding county to location in GISAID metadata", action="store_true", default=False)
    customization_arguments.add_argument("-u", "--usa_territory",
                                         help="Add if the country is a USA territory to use the territory name in the state column", action="store_true", default=False)
    customization_arguments.add_argument("-d", "--using_clearlabs_data",
                                         help="Add if using Clearlabs-generated data and metrics", action="store_true", default=False)
    customization_arguments.add_argument("-r", "--using_reads_dehosted",
                                         help="Add if using reads_dehosted instead of clearlabs data", action="store_true", default=False)

    qc_arguments = parser.add_argument_group("quality control arguments", "options that control quality thresholds")
    qc_arguments.add_argument("-a", "--vadr_alert_limit",
                              help="The maximum number of VADR alerts allowed for SARS-CoV-2 samples\ndefault=0", default=0, type=int)
    qc_arguments.add_argument("-n", "--number_n_threshold",
                              help="The maximum number of Ns allowed in SARS-CoV-2 assemblies\ndefault=5000", default=5000, type=int)

    logging_arguments = parser.add_argument_group("logging arguments", "options that change the verbosity of the stdout logging")
    logging_arguments.add_argument("--verbose",
                                   help="Add to enable verbose logging", action="store_true", default=False)
    logging_arguments.add_argument("--debug",
                                   help="Add to enable debug logging; overwrites --verbose", action="store_true", default=False)

    # parse_args(None) falls back to sys.argv[1:], so existing callers are unaffected.
    options = parser.parse_args(argv)

    parse = Runner(options)
    parse.run()


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pandas >= 1.4.2
numpy >= 1.2

0 comments on commit 5874671

Please sign in to comment.