-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
bb718da
commit 5874671
Showing
8 changed files
with
217 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
|
||
# IDE | ||
.vscode/ | ||
|
||
# output from pytest | ||
*.csv | ||
.pytest_cache/ | ||
*.pyc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Sage M. Wright |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import os | ||
import argparse | ||
|
||
def is_table_valid(filename):
    """Argparse ``type`` callback for the input table argument.

    Returns ``filename`` unchanged when it is the literal ``"-"`` or names a
    path that exists; otherwise raises ``argparse.ArgumentTypeError``.
    Only existence is checked -- the file contents are not inspected here.
    """
    reads_dash = filename == "-"
    if reads_dash or os.path.exists(filename):
        return filename
    raise argparse.ArgumentTypeError("{0} cannot be accessed".format(filename))
|
||
def is_comma_delimited_list(string):
    """Argparse ``type`` callback that parses a comma-delimited string.

    Args:
        string: Raw command-line value, e.g. ``"sample1,sample2"``.

    Returns:
        list: The individual entries with surrounding whitespace removed.
        Empty entries (from doubled or trailing commas) are dropped.

    Raises:
        argparse.ArgumentTypeError: If ``string`` is ``None`` or contains no
            non-blank entries.
    """
    if string is None:
        raise argparse.ArgumentTypeError("{0} is not a valid list".format(string))

    # Tolerate "a, b" and "a,b," so downstream exact-match lookups do not
    # silently miss " b" or match an empty name.
    stripped = (entry.strip() for entry in string.split(","))
    entries = [entry for entry in stripped if entry]

    if not entries:
        raise argparse.ArgumentTypeError("{0} is not a valid list".format(string))
    return entries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import mercury.Table as Table | ||
import logging | ||
import subprocess | ||
import sys | ||
|
||
class Runner:
    """This class initiates Mercury.

    It configures logging from the parsed command-line options, stores every
    option as an attribute, and exposes ``run()`` as the entry point.
    """

    def __init__(self, options):
        """Configure logging and copy the parsed options onto the instance.

        Args:
            options: Parsed argument namespace. Must provide ``verbose``,
                ``debug`` and every option attribute read below.
        """
        logging.basicConfig(encoding='utf-8', level=logging.ERROR, stream=sys.stderr)
        self.logger = logging.getLogger(__name__)
        # Debug is the more detailed level, so it must win when both flags
        # are supplied; checking verbose first would mask it.
        if options.debug:
            self.logger.setLevel(logging.DEBUG)
            self.logger.debug("RUNNER:Debug mode enabled")
        elif options.verbose:
            self.logger.setLevel(logging.INFO)
            self.logger.info("RUNNER:Verbose mode enabled")

        self.input_table = options.input_table
        self.table_name = options.table_name
        self.samplenames = options.samplenames
        self.output_prefix = options.output_prefix
        self.gcp_bucket_uri = options.gcp_bucket_uri
        self.organism = options.organism
        self.skip_ncbi = options.skip_ncbi
        self.skip_county = options.skip_county
        self.usa_territory = options.usa_territory
        self.clearlabs_data = options.using_clearlabs_data
        self.reads_dehosted = options.using_reads_dehosted
        self.vadr_alert_limit = options.vadr_alert_limit
        self.number_n_threshold = options.number_n_threshold

    def check_gcloud_dependency(self):
        """Verify that ``gcloud storage cp`` can be invoked.

        Logs an error and exits the process with status 1 when the command
        is missing or returns a non-zero exit code.
        """
        try:
            result = subprocess.run(
                ["gcloud", "storage", "cp", "--help"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
        except OSError:
            # subprocess.run raises (e.g. FileNotFoundError) instead of
            # returning when the executable cannot be launched at all.
            available = False
        else:
            available = result.returncode == 0

        if not available:
            self.logger.error("RUNNER:Error: gcloud storage cp command not found")
            sys.exit(1)
        self.logger.info("RUNNER:Found `gcloud storage cp` command, continuing")

    def run(self):
        """This method runs the different parts of Mercury.
        """
        self.logger.info("RUNNER:Starting to run Mercury")
        self.logger.debug("RUNNER:Checking for `gcloud storage cp` command")
        # The message above was previously logged without performing the check.
        self.check_gcloud_dependency()
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import pandas as pd | ||
import numpy as np | ||
import re | ||
|
||
|
||
class Table:
    """This class controls the manipulation of the table.

    It reads the tab-separated input, keeps only the requested samples,
    lower-cases the column names and adds derived columns.
    """

    def __init__(self, input_table, table_name, samplenames, skip_county, clearlabs_data, reads_dehosted):
        """Store the table location and the processing options.

        Args:
            input_table: Path (or ``"-"`` for standard input, or an open
                file-like object) of the tab-separated table.
            table_name: Name of the column holding the sample names.
            samplenames: Sample names to keep, either as a list or as a
                single comma-delimited string.
            skip_county: Stored for later use; not read in this class yet.
            clearlabs_data: Stored for later use; not read in this class yet.
            reads_dehosted: Stored for later use; not read in this class yet.
        """
        self.input_table = input_table
        self.table_name = table_name
        self.samplenames = samplenames
        self.skip_county = skip_county
        self.clearlabs_data = clearlabs_data
        self.reads_dehosted = reads_dehosted

    @staticmethod
    def get_year_from_date(date):
        """Return the year part of a ``YYYY-MM-DD`` date.

        Declared static because it needs no instance state; it remains
        callable as ``self.get_year_from_date(x)``.

        Args:
            date: Collection date value from the table.

        Returns:
            The four-digit year as a string, or ``numpy.nan`` (after printing
            a message) when the value is missing or not in YYYY-MM-DD format.
        """
        if pd.isna(date):
            print("Incorrect collection date format; collection date must be in YYYY-MM-DD format. Invalid date was: NaN")
            return np.nan
        # Non-string cells (e.g. a bare year parsed as an integer) would make
        # the regex match raise, so compare on the text form.
        text = str(date)
        if re.match(r'^\d{4}-\d{2}-\d{2}', text) is None:
            print("Incorrect collection date format; collection date must be in YYYY-MM-DD format. Invalid date was: " + text)
            return np.nan
        return text.split("-")[0]

    def create_table(self):
        """Read the input table into a DataFrame.

        The sample-name column is forced to string dtype. An input of ``"-"``
        is read from standard input.
        """
        source = self.input_table
        if isinstance(source, str) and source == "-":
            import sys
            source = sys.stdin
        return pd.read_csv(source, sep="\t", header=0, dtype={self.table_name: 'str'})

    def extract_samples(self, table):
        """Return the rows for the requested samples with lower-cased columns.

        Args:
            table (DataFrame): Full table; must contain the ``table_name`` column.

        Returns:
            DataFrame: An independent copy holding only the requested samples.
        """
        wanted = self.samplenames
        if isinstance(wanted, str):
            wanted = wanted.split(",")
        # Copy so later column assignments do not act on a view of `table`.
        working_table = table[table[self.table_name].isin(wanted)].copy()
        working_table.columns = working_table.columns.str.lower()
        return working_table

    def create_standard_variables(self, table):
        """Add the derived ``year`` and ``host`` columns to ``table`` in place.

        Args:
            table (DataFrame): Table with a ``collection_date`` column.
        """
        table["year"] = table["collection_date"].apply(self.get_year_from_date)
        # NOTE(review): host is hard-coded for every sample; the original
        # author marked this with "(????)" -- confirm it is intended.
        table["host"] = "Human"

    def remove_nas(self, table, required_metadata):
        """This function removes rows with missing values in the required metadata columns and returns the cleaned table and a table of excluded samples

        Args:
            table (DataFrame): Table containing metadata; modified in place
            required_metadata (List): List of columns that are required to have values

        Returns:
            tuple: ``(table, excluded_samples)`` where ``excluded_samples`` is
            indexed by sample name and holds only the required columns that
            contain at least one missing value.
        """
        # replace blank cells with NaNs (blanks are missing values); this must
        # be done in place, otherwise the replaced frame is thrown away
        table.replace(r'^\s+$', np.nan, regex=True, inplace=True)
        # collect rows with missing values in the required metadata columns
        excluded_samples = table[table[required_metadata].isna().any(axis=1)]
        # set the index to the sample name
        # NOTE(review): assumes the sample-name column is the lower-cased
        # `table_name`, as produced by extract_samples -- confirm.
        excluded_samples = excluded_samples.set_index(self.table_name.lower())
        # remove all optional columns so only required columns are shown
        excluded_samples = excluded_samples[excluded_samples.columns.intersection(required_metadata)]
        # remove all NON-NA columns so only columns with NAs remain
        excluded_samples = excluded_samples.loc[:, excluded_samples.isna().any()]
        # remove all rows that are required with NaNs from table
        table.dropna(subset=required_metadata, axis=0, how='any', inplace=True)

        return table, excluded_samples

    def process_table(self):
        """Read the table, keep the requested samples and add derived columns.

        Returns:
            DataFrame: The processed table.
        """
        table = self.create_table()
        table = self.extract_samples(table)
        self.create_standard_variables(table)
        return table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Package version string; reported by the command-line `--version` flag.
__VERSION__ = "v0.0.1" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/usr/bin/env python3 | ||
import CheckInputs | ||
import argparse | ||
from __init__ import __VERSION__ | ||
from Runner import Runner | ||
|
||
def main(argv=None):
    """Parse the command line and start a Mercury run.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the behaviour when the
            script is executed directly.
    """
    parser = argparse.ArgumentParser(
        prog = "mercury",
        description = "Mercury prepares and formats metadata for submission to national & international genomic databases",
        usage = "python3 /mercury/mercury/mercury.py <input_table.tsv> [<args>]",
        epilog = "Please contact [email protected] with any questions",
        formatter_class = lambda prog: argparse.RawTextHelpFormatter(prog, max_help_position=10)
    )
    parser.add_argument("-v", "--version", action="version", version=str(__VERSION__))

    # positional and general arguments
    parser.add_argument("input_table",
        help="The table containing the metadata for the samples to be submitted", type=CheckInputs.is_table_valid)
    parser.add_argument("table_name",
        help="The name of the first column in the table (A1)", type=str)
    parser.add_argument("samplenames",
        help="The sample names to be extracted from the table", type=CheckInputs.is_comma_delimited_list)
    parser.add_argument("-o", "--output_prefix",
        help="The prefix for the output files", type=str)
    parser.add_argument("-b", "--gcp_bucket_uri",
        help="The GCP bucket URI to temporarily store the read files", type=str)

    submission_type_arguments = parser.add_argument_group("submission type arguments", "options that determine submission type")
    # metavar="\b" is kept from the original; presumably it hides the
    # metavar in the help output -- confirm before changing.
    submission_type_arguments.add_argument("-g", "--organism",
        help="The organism type of the samples in the table\ndefault=\"sars-cov-2\"", default="sars-cov-2", metavar="\b",type=str)
    submission_type_arguments.add_argument("-s", "--skip_ncbi",
        help="Add to skip NCBI metadata preparation; prep only for GISAID submission", action="store_true", default=False)

    customization_arguments = parser.add_argument_group("submission customization arguments", "options that customize the submission")
    customization_arguments.add_argument("-c", "--skip_county",
        help="Add to skip adding county to location in GISAID metadata", action="store_true", default=False)
    customization_arguments.add_argument("-u", "--usa_territory",
        help="Add if the country is a USA territory to use the territory name in the state column", action="store_true", default=False)
    customization_arguments.add_argument("-d", "--using_clearlabs_data",
        help="Add if using Clearlabs-generated data and metrics", action="store_true", default=False)
    customization_arguments.add_argument("-r", "--using_reads_dehosted",
        help="Add if using reads_dehosted instead of clearlabs data", action="store_true", default=False)

    qc_arguments = parser.add_argument_group("quality control arguments", "options that control quality thresholds")
    qc_arguments.add_argument("-a", "--vadr_alert_limit",
        help="The maximum number of VADR alerts allowed for SARS-CoV-2 samples\ndefault=0", default=0, type=int)
    qc_arguments.add_argument("-n", "--number_n_threshold",
        help="The maximum number of Ns allowed in SARS-CoV-2 assemblies\ndefault=5000", default=5000, type=int)

    logging_arguments = parser.add_argument_group("logging arguments", "options that change the verbosity of the logging output")
    logging_arguments.add_argument("--verbose",
        help="Add to enable verbose logging", action="store_true", default=False)
    logging_arguments.add_argument("--debug",
        help="Add to enable debug logging; overrides --verbose", action="store_true", default=False)

    options = parser.parse_args(argv)

    runner = Runner(options)
    runner.run()


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
pandas >= 1.4.2 | ||
numpy >= 1.2 |