making progress

theiagen · May 17, 2024 · 5874671 · 5874671
1 parent bb718da
commit 5874671
Show file tree

Hide file tree

Showing 8 changed files with 217 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,10 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+
+# IDE
+.vscode/
+
+# output from pytest
+*.csv
+.pytest_cache/
+*.pyc
diff --git a/AUTHORS b/AUTHORS
@@ -0,0 +1 @@
+Sage M. Wright
diff --git a/mercury/CheckInputs.py b/mercury/CheckInputs.py
@@ -0,0 +1,19 @@
+import os
+import argparse
+
+def is_table_valid(filename):
+  """
+  Checks if the input TSV file is valid
+  """
+  if not os.path.exists(filename) and filename != "-":
+    raise argparse.ArgumentTypeError("{0} cannot be accessed".format(filename))
+  return filename
+
+def is_comma_delimited_list(string):
+  """
+  Checks if the input string is a list
+  """
+  if string is not None:
+    return string.split(",")
+  else:
+    raise argparse.ArgumentTypeError("{0} is not a valid list".format(string))
diff --git a/mercury/Runner.py b/mercury/Runner.py
@@ -0,0 +1,54 @@
+import mercury.Table as Table
+import logging
+import subprocess
+import sys
+
+class Runner:
+  """This class intiates Mercury 
+  """
+
+  def __init__(self, options):
+    logging.basicConfig(encoding='utf-8', level=logging.ERROR, stream=sys.stderr)
+    self.logger = logging.getLogger(__name__)
+    if options.verbose:
+        self.logger.setLevel(logging.INFO)
+        self.logger.info("RUNNER:Verbose mode enabled")
+    elif options.debug:
+        self.logger.setLevel(logging.DEBUG)
+        self.logger.debug("RUNNER:Debug mode enabled")
+
+    self.input_table = options.input_table
+    self.table_name = options.table_name
+    self.samplenames = options.samplenames
+    self.output_prefix = options.output_prefix
+    self.gcp_bucket_uri = options.gcp_bucket_uri
+    self.organism = options.organism
+    self.skip_ncbi = options.skip_ncbi
+    self.skip_county = options.skip_county
+    self.usa_territory = options.usa_territory
+    self.clearlabs_data = options.using_clearlabs_data
+    self.reads_dehosted = options.using_reads_dehosted
+    self.vadr_alert_limit = options.vadr_alert_limit
+    self.number_n_threshold = options.number_n_threshold
+
+  def check_gcloud_dependency(self):
+    result = subprocess.run(
+      ["gcloud", "storage", "cp", "--help"],
+      stdout=subprocess.PIPE,
+      stderr=subprocess.PIPE,
+      text=True,
+    )
+    if result.returncode != 0:
+      self.logger.error("RUNNER:Error: gcloud storage cp command not found")
+      sys.exit(1)
+    self.logger.info("RUNNER:Found `gcloud storage cp` command, continuing")
+
+  def run(self):
+    """This class runs the different parts of Mercury
+    """
+    self.logger.info("RUNNER:Starting to run Mercury")
+    self.logger.debug("RUNNER:Checking for `gcloud storage cp` command")
+
+
+
+
diff --git a/mercury/Table.py b/mercury/Table.py
@@ -0,0 +1,68 @@
+import pandas as pd
+import numpy as np
+import re
+
+
+class Table:
+  """This class controls the manipulation of the table
+  """
+
+  def __init__(self, input_table, table_name, samplenames, skip_county, clearlabs_data, reads_dehosted):
+    self.input_table = input_table
+    self.table_name = table_name
+    self.samplenames = samplenames
+    self.skip_county = skip_county
+    self.clearlabs_data = clearlabs_data
+    self.reads_dehosted = reads_dehosted  
+
+  def get_year_from_date(date):
+    r = re.compile('^\d{4}-\d{2}-\d{2}')      
+    if pd.isna(date):
+      print("Incorrect collection date format; collection date must be in YYYY-MM-DD format. Invalid date was: NaN")
+    elif r.match(date) is None:
+      print("Incorrect collection date format; collection date must be in YYYY-MM-DD format. Invalid date was: " + str(date))
+      return np.nan
+    else:
+      return date.split("-")[0]
+
+  def create_table(self):
+    table = pd.read_csv(self.input_table, sep="\t", header=0, dtype={self.table_name: 'str'})
+    return table
+
+  def extract_samples(self, table):
+    working_table = table[table[self.table_name].isin(self.samplenames.split(","))]
+    working_table.columns = working_table.columns.str.lower()
+    return working_table
+
+  def create_standard_variables(self, table):
+    table["year"] = table["collection_date"].apply(lambda x: self.get_year_from_date(x))
+    table["host"] = "Human" #(????)
+
+  def remove_nas(self, table, required_metadata):
+    """This function removes rows with missing values in the required metadata columns and returns the cleaned table and a table of excluded samples
+
+    Args:
+        table (DataFrame): Table containing metadata
+        required_metadata (List): List of columns that are required to have values
+    """
+    # replace blank cells with NaNs (blanks are missing values)
+    table.replace(r'^\s+$', np.nan, regex=True)
+    # remove rows with missing values in the required metadata columns
+    excluded_samples = table[table[required_metadata].isna().any(axis=1)]
+    # set the index to the sample name
+    excluded_samples.set_index("~{table_name}_id".lower(), inplace=True)
+    # remove all optional columns so only required columns are shown
+    excluded_samples = excluded_samples[excluded_samples.columns.intersection(required_metadata)]
+    # remove all NON-NA columns so only columns with NAs remain; Shelly is a wizard and I love her 
+    excluded_samples = excluded_samples.loc[:, excluded_samples.isna().any()] 
+    # remove all rows that are required with NaNs from table
+    table.dropna(subset=required_metadata, axis=0, how='any', inplace=True) 
+
+    return table, excluded_samples
+
+
+  def process_table(self):
+    table = self.create_table()
+    table = self.extract_samples(table)
+    self.create_standard_variables(table)
+    return table
diff --git a/mercury/__init__.py b/mercury/__init__.py
@@ -0,0 +1 @@
+# Version string of the mercury package
+__VERSION__ = "v0.0.1"
diff --git a/mercury/mercury.py b/mercury/mercury.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+import CheckInputs
+import argparse
+from __init__ import __VERSION__
+from Runner import Runner
+
+def main():
+  parser = argparse.ArgumentParser(
+    prog = "mercury",
+    description = "Mercury prepares and formats metadata for submission to national & international genomic databases",
+    usage = "python3 /mercury/mercury/mercury.py <input_table.tsv> [<args>]",
+    epilog = "Please contact [email protected] with any questions",
+    formatter_class = lambda prog: argparse.RawTextHelpFormatter(prog, max_help_position=10)
+  )
+  parser.add_argument("-v", "--version", action="version", version=str(__VERSION__))
+
+  parser.add_argument("input_table",
+                      help="The table containing the metadata for the samples to be submitted", type=CheckInputs.is_table_valid)
+  parser.add_argument("table_name",
+                      help="The name of the first column in the table (A1)", type=str)
+  parser.add_argument("samplenames",
+                      help="The sample names to be extracted from the table", type=CheckInputs.is_comma_delimited_list)
+  parser.add_argument("-o", "--output_prefix",
+                      help="The prefix for the output files", type=str)
+  parser.add_argument("-b", "--gcp_bucket_uri",
+                      help="The GCP bucket URI to store the temporarily store the read files", type=str)
+
+  submission_type_arguments = parser.add_argument_group("submission type arguments", "options that determine submission type")
+  submission_type_arguments.add_argument("-g", "--organism", 
+                                         help="The organism type of the samples in the table\ndefault=\"sars-cov-2\"", default="sars-cov-2", metavar="\b",type=str)
+  submission_type_arguments.add_argument("-s", "--skip_ncbi", 
+                                         help="Add to skip NCBI metadata preparation; prep only for GISAID submission", action="store_true", default=False)
+
+  customization_arguments = parser.add_argument_group("submission customization arguments", "options that customize the submission")
+  customization_arguments.add_argument("-c", "--skip_county",
+                                       help="Add to skip adding county to location in GISAID metadata", action="store_true", default=False)
+  customization_arguments.add_argument("-u", "--usa_territory",
+                                       help="Add if the country is a USA territory to use the territory name in the state column", action="store_true", default=False)
+  customization_arguments.add_argument("-d", "--using_clearlabs_data",
+                                       help="Add if using Clearlabs-generated data and metrics", action="store_true", default=False)
+  customization_arguments.add_argument("-r", "--using_reads_dehosted",
+                                       help="Add if using reads_dehosted instead of clearlabs data", action="store_true", default=False)
+
+  qc_arguments = parser.add_argument_group("quality control arguments", "options that control quality thresholds")
+  qc_arguments.add_argument("-a", "--vadr_alert_limit",
+                            help="The maximum number of VADR alerts allowed for SARS-CoV-2 samples\ndefault=0", default=0, type=int)
+  qc_arguments.add_argument("-n", "--number_n_threshold",
+                            help="The maximum number of Ns allowed in SARS-CoV-2 assemblies\ndefault=5000", default=5000, type=int)
+
+  logging_arguments = parser.add_argument_group("logging arguments", "options that change the verbosity of the stdout logging")
+  logging_arguments.add_argument("--verbose",
+                                 help="Add to enable verbose logging", action="store_true", default=False)
+  logging_arguments.add_argument("--debug",
+                                  help="Add to enable debug logging; overwrites --verbose", action="store_true", default=False)
+
+  options = parser.parse_args()
+
+  parse = Runner(options)
+  parse.run()
+
+if __name__ == "__main__":
+  main()
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+pandas >= 1.4.2
+numpy >= 1.2