Skip to content

Commit

Permalink
Making tool and its module more robust to errors
Browse files (browse the repository at this point in the history)
  • Loading branch information
kbessonov1984 committed Jul 30, 2024
1 parent 74edb81 commit 22b6a98
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 27 deletions.
1 change: 1 addition & 0 deletions ectyper/ectyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ def run_program():
# Add empty rows for genomes without a blast result or non-E.coli samples that did not undergo typing
final_predictions = predictionFunctions.add_non_predicted(
raw_genome_files, predictions_dict, other_genomes_dict, filesnotfound_dict, ecoli_genomes_dict)
print(final_predictions)

for sample in final_predictions.keys():
final_predictions[sample]["database"] = "v"+ectyperdb_dict["version"] + " (" + ectyperdb_dict["date"] + ")"
Expand Down
6 changes: 3 additions & 3 deletions ectyper/genomeFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,14 @@ def get_files_as_list(files_or_directories, max_depth_level):
for file_or_directory in sorted([os.path.abspath(p) for p in files_or_directories]):

dir_level_current = get_relative_directory_level(file_or_directory, init_min_dir_level)
LOG.info(f"Gathering genomes from directory {file_or_directory} at level {dir_level_current} ...")


if dir_level_current > max_depth_level:
LOG.info(f"Directory level exceeded ({dir_level_current} > {max_depth_level}), skipping directory {file_or_directory} ...")
LOG.info(f"Directory level exceeded ({dir_level_current} > {max_depth_level}), skipping {file_or_directory} ...")
continue

# if single directory is specified
if os.path.isdir(file_or_directory):
LOG.info(f"Gathering genomes from directory {file_or_directory} at level {dir_level_current} ...")
# Create a list containing the file names
for root, dirs, files in os.walk(os.path.abspath(file_or_directory)):
dir_level = get_relative_directory_level(root, init_min_dir_level)
Expand Down
10 changes: 5 additions & 5 deletions ectyper/predictionFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,7 @@ def shiga_toxing_subtyping(pathotype_genes_tmp_df, output_dir, debug):
if len(results_dict[k]) != 0:
results_dict[k] = ";".join([results_dict[k][i] for i in sorted_order])
else:
results_dict[k] = "-"
print(results_dict)
results_dict[k] = "-"
return results_dict

def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_genomes_dict, temp_dir, verify_species_flag, pident, pcov,
Expand All @@ -193,10 +192,10 @@ def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_gen
Returns:
list: list of pathotypes
"""
LOG.info(f"Starting pathotype predictions on {len(ecoli_genome_files_dict.keys())} samples. Reminder: Please use --verify option to run pathotype predictions only on E.coli samples ...")
LOG.info(f"Starting pathotype predictions on {len(ecoli_genome_files_dict.keys())} E.coli and non-E.coli {len(other_genomes_dict.keys())} samples. Reminder: Please use --verify option to run pathotype predictions only on E.coli samples ...")

if len(other_genomes_dict.keys()) > 0 and verify_species_flag == True:
LOG.info(f"A total of {len(other_genomes_dict.keys())} non-E.coli sample(s) will not be pathotyped. Omit --verify option if you want to type ALL samples regardless ...")
LOG.info(f"A total of {len(other_genomes_dict.keys())} non-E.coli sample(s) will not be pathotyped. If you still want to type ALL samples regardless omit --verify option ...")

path2patho_db = json2fasta(definitions.PATHOTYPE_ALLELE_JSON, temp_dir)
json_patho_db = load_json(definitions.PATHOTYPE_ALLELE_JSON)
Expand All @@ -223,7 +222,8 @@ def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_gen
"-outfmt", "6 qseqid qlen sseqid length pident sstart send sframe slen qcovhsp bitscore sseq"
]
LOG.debug(f"BLASTN results on pathotype database written to {temp_dir}/blast_pathotype_result.txt ...")
subprocess_util.run_subprocess(cmd)
cmd_status = subprocess_util.run_subprocess(cmd)

if os.stat(f'{temp_dir}/blast_pathotype_result.txt').st_size == 0:
LOG.warning(f"No pathotype signatures found for sample {g} as pathotype BLAST results file {temp_dir}/blast_pathotype_result.txt is empty. Skipping ...")
predictions_pathotype_dict[g]={field:'-' for field in definitions.PATHOTYPE_TOXIN_FIELDS}
Expand Down
34 changes: 17 additions & 17 deletions ectyper/speciesIdentification.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def get_species(file, args, cores=1):
Returns:
str: name of estimated species
"""

LOG.debug(f"Get species prediction for {file}")
top_match="-"; top_match_dist="-"; top_match_hashratio="-"; species="-"
sketch_metadata_file = args.reference+'.txt'
if os.path.exists(sketch_metadata_file) == False:
Expand Down Expand Up @@ -202,36 +202,36 @@ def get_species(file, args, cores=1):


if len(top_hit_lines) < 1:
top_hit_line = ""
LOG.warning('For {file} no hits returned by MASH species id sketch search. Species identification failed!')
else:
top_hit_line = top_hit_lines[0]
LOG.info('For {} following top hits and hash ratios returned by MASH {}'.format(file,
[(top_hit_line.split("\t")[0],top_hit_line.split("\t")[4]) for top_hit_line in top_hit_lines if len(top_hit_line.split("\t")[0])>0]))


top_hit_line_elements = top_hit_line.split()

for top_hit_line in top_hit_lines:

top_hit_line_elements = top_hit_line.split()

if len(top_hit_line_elements) < 5:
LOG.warning("No columns in the mash results output to split. Species identification failed!")
continue

if len(top_hit_line_elements) < 5:
LOG.warning("No columns in the mash results output to split. Species identification failed!")
species = "-"
else:
top_match = top_hit_line_elements[0]; top_match_dist = top_hit_line_elements[2]; top_match_hashratio = top_hit_line_elements[4]
matched_hashes = top_match_hashratio.split('/')[0]
matched_meta_line = subprocess_util.run_subprocess(['grep',top_match, sketch_metadata_file],
ignorereturncode=True).stdout.decode('utf-8').split('\t')
ignorereturncode=True).stdout.decode('utf-8').split('\t')
if len(matched_meta_line) == 4 and matched_hashes != '0':
m=re.search('s__(.+)',matched_meta_line[3])
if m:
species = m.group(1).strip('"')
LOG.info(
"MASH species top hit {} identified as {} with distance {} to {} and shared hashes ratio {}".format(top_match, species, top_match_dist, file,
top_match_hashratio))
LOG.info("MASH dist predicted species name: '{}' based on species ID sketch {}".format(species, args.reference))
species = m.group(1).strip('"')
LOG.info(
"MASH species top hit {} identified as {} with distance {} to {} and shared hashes ratio {}".format(top_match, species, top_match_dist, file,
top_match_hashratio))
LOG.info("MASH dist predicted species name: '{}' based on species ID sketch {}".format(species, args.reference))
else:
LOG.warning(f"Could not determine species based on MASH distance for {file}")
species = "-"
return species
species = "-"
return species


def getSampleName(file):
Expand Down
5 changes: 3 additions & 2 deletions ectyper/subprocess_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def run_subprocess(cmd, input_data=None, un=False, ignorereturncode=False):
else:
LOG.error("Error in subprocess. The following command failed: {}".format(cmd))
LOG.error("Subprocess failed with error: \"{}\"".format(comp_proc.stderr.decode("utf-8")))
LOG.critical("ectyper has stopped")
raise Exception(f"subprocess failure while running {cmd} command")
#LOG.critical("ectyper has stopped")
return comp_proc
#raise Exception(f"subprocess failure while running {cmd} command")

0 comments on commit 22b6a98

Please sign in to comment.