
Commit

Replace manual HTTP requests with PubMLSTClient
ahdamin committed Dec 17, 2024
1 parent a5f4c71 commit 977a240
Showing 1 changed file with 109 additions and 64 deletions.
173 changes: 109 additions & 64 deletions microSALT/utils/referencer.py
@@ -10,6 +10,7 @@
import subprocess
import urllib.request
import zipfile
+from microSALT.utils.pubmlst.client import PubMLSTClient

from Bio import Entrez
import xml.etree.ElementTree as ET
@@ -43,6 +44,8 @@ def __init__(self, config, log, sampleinfo={}, force=False):
self.sampleinfo = self.sampleinfo[0]
self.name = self.sampleinfo.get("CG_ID_sample")
self.sample = self.sampleinfo
+self.client = PubMLSTClient()


def identify_new(self, cg_id="", project=False):
""" Automatically downloads pubMLST & NCBI organisms not already downloaded """
@@ -385,92 +388,134 @@ def add_pubmlst(self, organism):

def query_pubmlst(self):
""" Returns a json object containing all organisms available via pubmlst.org """
-# Example request URI: http://rest.pubmlst.org/db/pubmlst_neisseria_seqdef/schemes/1/profiles_csv
-seqdef_url = dict()
-databases = "http://rest.pubmlst.org/db"
-db_req = urllib.request.Request(databases)
-with urllib.request.urlopen(db_req) as response:
-    db_query = json.loads(response.read().decode("utf-8"))
+client = PubMLSTClient()
+db_query = client.query_databases()
return db_query


def get_mlst_scheme(self, subtype_href):
""" Returns the path for the MLST data scheme at pubMLST """
try:
-mlst = False
-record_req_1 = urllib.request.Request("{}/schemes/1".format(subtype_href))
-with urllib.request.urlopen(record_req_1) as response:
-    scheme_query_1 = json.loads(response.read().decode("utf-8"))
-if "MLST" in scheme_query_1["description"]:
-    mlst = "{}/schemes/1".format(subtype_href)
-if not mlst:
-    record_req = urllib.request.Request("{}/schemes".format(subtype_href))
-    with urllib.request.urlopen(record_req) as response:
-        record_query = json.loads(response.read().decode("utf-8"))
-    for scheme in record_query["schemes"]:
-        if scheme["description"] == "MLST":
-            mlst = scheme["scheme"]
+parsed_data = self.client.parse_pubmlst_url(subtype_href)
+db = parsed_data.get('db')
+if not db:
+    self.logger.warning(f"Could not extract database name from URL: {subtype_href}")
+    return None
+
+# First, check scheme 1
+scheme_query_1 = self.client.retrieve_scheme_info(db, 1)
+mlst = None
+if "MLST" in scheme_query_1.get("description", ""):
+    mlst = f"{subtype_href}/schemes/1"
+else:
+    # If scheme 1 isn't MLST, list all schemes and find the one with 'description' == 'MLST'
+    record_query = self.client.list_schemes(db)
+    for scheme in record_query.get("schemes", []):
+        if scheme.get("description") == "MLST":
+            mlst = scheme.get("scheme")
+            break

if mlst:
self.logger.debug("Found data at pubMLST: {}".format(mlst))
self.logger.debug(f"Found data at pubMLST: {mlst}")
return mlst
-else:
-    self.logger.warning("Could not find MLST data at {}".format(subtype_href))
+else:
+    self.logger.warning(f"Could not find MLST data at {subtype_href}")
+    return None
except Exception as e:
self.logger.warning(e)
return None


def external_version(self, organism, subtype_href):
""" Returns the version (date) of the data available on pubMLST """
-mlst_href = self.get_mlst_scheme(subtype_href)
try:
-with urllib.request.urlopen(mlst_href) as response:
-    ver_query = json.loads(response.read().decode("utf-8"))
-return ver_query["last_updated"]
+mlst_href = self.get_mlst_scheme(subtype_href)
+if not mlst_href:
+    self.logger.warning(f"MLST scheme not found for URL: {subtype_href}")
+    return None
+
+parsed_data = self.client.parse_pubmlst_url(mlst_href)
+db = parsed_data.get('db')
+scheme_id = parsed_data.get('scheme_id')
+if not db or not scheme_id:
+    self.logger.warning(f"Could not extract database name or scheme ID from MLST URL: {mlst_href}")
+    return None
+
+scheme_info = self.client.retrieve_scheme_info(db, scheme_id)
+last_updated = scheme_info.get("last_updated")
+if last_updated:
+    self.logger.debug(f"Retrieved last_updated: {last_updated} for organism: {organism}")
+    return last_updated
+else:
+    self.logger.warning(f"No 'last_updated' field found for db: {db}, scheme_id: {scheme_id}")
+    return None
except Exception as e:
self.logger.warning("Could not determine pubMLST version for {}".format(organism))
self.logger.warning(f"Could not determine pubMLST version for {organism}")
self.logger.warning(e)
+return None


def download_pubmlst(self, organism, subtype_href, force=False):
""" Downloads ST and loci for a given organism stored on pubMLST if it is more recent. Returns update date """
organism = organism.lower().replace(" ", "_")

-# Pull version
-extver = self.external_version(organism, subtype_href)
-currver = self.db_access.get_version("profile_{}".format(organism))
-if (
-    int(extver.replace("-", ""))
-    <= int(currver.replace("-", ""))
-    and not force
-):
-    # self.logger.info("Profile for {} already at latest version".format(organism.replace('_' ,' ').capitalize()))
-    return currver
-
-# Pull ST file
-mlst_href = self.get_mlst_scheme(subtype_href)
-st_target = "{}/{}".format(self.config["folders"]["profiles"], organism)
-st_input = "{}/profiles_csv".format(mlst_href)
-urllib.request.urlretrieve(st_input, st_target)
-
-# Pull locus files
-loci_input = mlst_href
-loci_req = urllib.request.Request(loci_input)
-with urllib.request.urlopen(loci_req) as response:
-    loci_query = json.loads(response.read().decode("utf-8"))
-
-output = "{}/{}".format(self.config["folders"]["references"], organism)

try:
+# Pull version
+extver = self.external_version(organism, subtype_href)
+currver = self.db_access.get_version(f"profile_{organism}")
+if (
+    int(extver.replace("-", ""))
+    <= int(currver.replace("-", ""))
+    and not force
+):
+    self.logger.info(f"Profile for {organism.replace('_', ' ').capitalize()} already at the latest version.")
+    return currver
+
+# Retrieve the MLST scheme URL
+mlst_href = self.get_mlst_scheme(subtype_href)
+if not mlst_href:
+    self.logger.warning(f"MLST scheme not found for URL: {subtype_href}")
+    return None
+
+# Parse the database name and scheme ID
+parsed_data = self.client.parse_pubmlst_url(mlst_href)
+db = parsed_data.get('db')
+scheme_id = parsed_data.get('scheme_id')
+if not db or not scheme_id:
+    self.logger.warning(f"Could not extract database name or scheme ID from MLST URL: {mlst_href}")
+    return None
+
+# Step 1: Download the profiles CSV
+st_target = f"{self.config['folders']['profiles']}/{organism}"
+profiles_csv = self.client.download_profiles_csv(db, scheme_id)
+with open(st_target, "w") as profile_file:
+    profile_file.write(profiles_csv)
+self.logger.info(f"Profiles CSV downloaded to {st_target}")
+
+# Step 2: Fetch scheme information to get loci
+scheme_info = self.client.retrieve_scheme_info(db, scheme_id)
+loci_list = scheme_info.get("loci", [])
+
+# Step 3: Download loci FASTA files
+output = f"{self.config['folders']['references']}/{organism}"
if os.path.isdir(output):
shutil.rmtree(output)
-except FileNotFoundError as e:
-    pass
-os.makedirs(output)

-for locipath in loci_query["loci"]:
-    loci = os.path.basename(os.path.normpath(locipath))
-    urllib.request.urlretrieve(
-        "{}/alleles_fasta".format(locipath), "{}/{}.tfa".format(output, loci)
-    )
-# Create new indexes
-self.index_db(output, ".tfa")
+os.makedirs(output)

+for locus_uri in loci_list:
+    locus_name = os.path.basename(os.path.normpath(locus_uri))
+    loci_fasta = self.client.download_locus(db, locus_name)
+    with open(f"{output}/{locus_name}.tfa", "w") as fasta_file:
+        fasta_file.write(loci_fasta)
+    self.logger.info(f"Locus FASTA downloaded: {locus_name}.tfa")

+# Step 4: Create new indexes
+self.index_db(output, ".tfa")

return extver
+except Exception as e:
+    self.logger.error(f"Failed to download data for {organism}: {e}")
+    return None


def fetch_pubmlst(self, force=False):
""" Updates reference for data that is stored on pubMLST """
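For orientation, the new code path relies on six PubMLSTClient calls: query_databases, parse_pubmlst_url, retrieve_scheme_info, list_schemes, download_profiles_csv and download_locus. The stub below is a minimal sketch of that surface inferred from the call sites in this diff; the method names are real, but the signatures, return shapes and the placeholder class name PubMLSTClientSketch are assumptions, not the actual microSALT.utils.pubmlst.client implementation.

# Illustrative sketch only: inferred from the call sites above, not the real client.
from typing import Any, Dict, Optional


class PubMLSTClientSketch:
    """Hypothetical outline of the client surface this commit depends on."""

    def query_databases(self) -> Any:
        """JSON listing of available databases (the old code fetched http://rest.pubmlst.org/db)."""
        ...

    def parse_pubmlst_url(self, url: str) -> Dict[str, Optional[Any]]:
        """Split a pubMLST URL into identifiers; the diff reads the keys 'db' and 'scheme_id'."""
        ...

    def retrieve_scheme_info(self, db: str, scheme_id: int) -> Dict[str, Any]:
        """Scheme metadata; the diff reads 'description', 'last_updated' and 'loci'."""
        ...

    def list_schemes(self, db: str) -> Dict[str, Any]:
        """Scheme listing; the diff iterates over its 'schemes' entries."""
        ...

    def download_profiles_csv(self, db: str, scheme_id: int) -> str:
        """ST profile table as CSV text, written to the profiles folder."""
        ...

    def download_locus(self, db: str, locus: str) -> str:
        """Allele FASTA for one locus as text, written as <locus>.tfa."""
        ...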

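As a concrete walk-through of how get_mlst_scheme and external_version now cooperate, the snippet below pushes the Neisseria example URL from the removed comment through the assumed calls; the parsed dictionary and the returned date are illustrative assumptions, not output captured from the real client.

# Hypothetical walk-through; commented values are assumptions for illustration.
from microSALT.utils.pubmlst.client import PubMLSTClient

subtype_href = "http://rest.pubmlst.org/db/pubmlst_neisseria_seqdef"
client = PubMLSTClient()

parsed = client.parse_pubmlst_url(subtype_href)    # assumed: {"db": "pubmlst_neisseria_seqdef", "scheme_id": None}
scheme_1 = client.retrieve_scheme_info(parsed["db"], 1)
if "MLST" in scheme_1.get("description", ""):      # scheme 1 is frequently the MLST scheme
    mlst_href = f"{subtype_href}/schemes/1"
    version = scheme_1.get("last_updated")          # e.g. "2024-12-17"

# download_pubmlst compares versions as integers once the dashes are dropped:
assert int("2024-12-17".replace("-", "")) > int("2023-05-02".replace("-", ""))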