From 22b6a98b9b1608ffe8e8d861628de478aaffa969 Mon Sep 17 00:00:00 2001 From: Kirill Bessonov Date: Tue, 30 Jul 2024 14:25:58 -0400 Subject: [PATCH] Making tool and its module more robust to errors --- ectyper/ectyper.py | 1 + ectyper/genomeFunctions.py | 6 +++--- ectyper/predictionFunctions.py | 10 +++++----- ectyper/speciesIdentification.py | 34 ++++++++++++++++---------------- ectyper/subprocess_util.py | 5 +++-- 5 files changed, 29 insertions(+), 27 deletions(-) diff --git a/ectyper/ectyper.py b/ectyper/ectyper.py index a3f9898..906c873 100644 --- a/ectyper/ectyper.py +++ b/ectyper/ectyper.py @@ -196,6 +196,7 @@ def run_program(): # Add empty rows for genomes without a blast result or non-E.coli samples that did not undergo typing final_predictions = predictionFunctions.add_non_predicted( raw_genome_files, predictions_dict, other_genomes_dict, filesnotfound_dict, ecoli_genomes_dict) + print(final_predictions) for sample in final_predictions.keys(): final_predictions[sample]["database"] = "v"+ectyperdb_dict["version"] + " (" + ectyperdb_dict["date"] + ")" diff --git a/ectyper/genomeFunctions.py b/ectyper/genomeFunctions.py index 184adb4..f062290 100644 --- a/ectyper/genomeFunctions.py +++ b/ectyper/genomeFunctions.py @@ -40,14 +40,14 @@ def get_files_as_list(files_or_directories, max_depth_level): for file_or_directory in sorted([os.path.abspath(p) for p in files_or_directories]): dir_level_current = get_relative_directory_level(file_or_directory, init_min_dir_level) - LOG.info(f"Gathering genomes from directory {file_or_directory} at level {dir_level_current} ...") - + if dir_level_current > max_depth_level: - LOG.info(f"Directory level exceeded ({dir_level_current} > {max_depth_level}), skipping directory {file_or_directory} ...") + LOG.info(f"Directory level exceeded ({dir_level_current} > {max_depth_level}), skipping {file_or_directory} ...") continue # if single directory is specified if os.path.isdir(file_or_directory): + LOG.info(f"Gathering genomes from directory {file_or_directory} at level {dir_level_current} ...") # Create a list containing the file names for root, dirs, files in os.walk(os.path.abspath(file_or_directory)): dir_level = get_relative_directory_level(root, init_min_dir_level) diff --git a/ectyper/predictionFunctions.py b/ectyper/predictionFunctions.py index 8e59932..0721bad 100644 --- a/ectyper/predictionFunctions.py +++ b/ectyper/predictionFunctions.py @@ -174,8 +174,7 @@ def shiga_toxing_subtyping(pathotype_genes_tmp_df, output_dir, debug): if len(results_dict[k]) != 0: results_dict[k] = ";".join([results_dict[k][i] for i in sorted_order]) else: - results_dict[k] = "-" - print(results_dict) + results_dict[k] = "-" return results_dict def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_genomes_dict, temp_dir, verify_species_flag, pident, pcov, @@ -193,10 +192,10 @@ def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_gen Returns: list: list of pathotypes """ - LOG.info(f"Starting pathotype predictions on {len(ecoli_genome_files_dict.keys())} samples. Reminder: Please use --verify option to run pathotype predictions only on E.coli samples ...") + LOG.info(f"Starting pathotype predictions on {len(ecoli_genome_files_dict.keys())} E.coli and non-E.coli {len(other_genomes_dict.keys())} samples. Reminder: Please use --verify option to run pathotype predictions only on E.coli samples ...") if len(other_genomes_dict.keys()) > 0 and verify_species_flag == True: - LOG.info(f"A total of {len(other_genomes_dict.keys())} non-E.coli sample(s) will not be pathotyped. Omit --verify option if you want to type ALL samples regardless ...") + LOG.info(f"A total of {len(other_genomes_dict.keys())} non-E.coli sample(s) will not be pathotyped. If you still want to type ALL samples regardless omit --verify option ...") path2patho_db = json2fasta(definitions.PATHOTYPE_ALLELE_JSON, temp_dir) json_patho_db = load_json(definitions.PATHOTYPE_ALLELE_JSON) @@ -223,7 +222,8 @@ def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_gen "-outfmt", "6 qseqid qlen sseqid length pident sstart send sframe slen qcovhsp bitscore sseq" ] LOG.debug(f"BLASTN results on pathotype database written to {temp_dir}/blast_pathotype_result.txt ...") - subprocess_util.run_subprocess(cmd) + cmd_status = subprocess_util.run_subprocess(cmd) + if os.stat(f'{temp_dir}/blast_pathotype_result.txt').st_size == 0: LOG.warning(f"No pathotype signatures found for sample {g} as pathotype BLAST results file {temp_dir}/blast_pathotype_result.txt is empty. Skipping ...") predictions_pathotype_dict[g]={field:'-' for field in definitions.PATHOTYPE_TOXIN_FIELDS} diff --git a/ectyper/speciesIdentification.py b/ectyper/speciesIdentification.py index dd3901b..ac19473 100644 --- a/ectyper/speciesIdentification.py +++ b/ectyper/speciesIdentification.py @@ -162,7 +162,7 @@ def get_species(file, args, cores=1): Returns: str: name of estimated species """ - + LOG.debug(f"Get species prediction for {file}") top_match="-"; top_match_dist="-"; top_match_hashratio="-"; species="-" sketch_metadata_file = args.reference+'.txt' if os.path.exists(sketch_metadata_file) == False: @@ -202,36 +202,36 @@ def get_species(file, args, cores=1): if len(top_hit_lines) < 1: + top_hit_line = "" LOG.warning('For {file} no hits returned by MASH species id sketch search. Species identification failed!') else: + top_hit_line = top_hit_lines[0] LOG.info('For {} following top hits and hash ratios returned by MASH {}'.format(file, [(top_hit_line.split("\t")[0],top_hit_line.split("\t")[4]) for top_hit_line in top_hit_lines if len(top_hit_line.split("\t")[0])>0])) + + top_hit_line_elements = top_hit_line.split() - for top_hit_line in top_hit_lines: - - top_hit_line_elements = top_hit_line.split() - - if len(top_hit_line_elements) < 5: - LOG.warning("No columns in the mash results output to split. Species identification failed!") - continue - + if len(top_hit_line_elements) < 5: + LOG.warning("No columns in the mash results output to split. Species identification failed!") + species = "-" + else: top_match = top_hit_line_elements[0]; top_match_dist = top_hit_line_elements[2]; top_match_hashratio = top_hit_line_elements[4] matched_hashes = top_match_hashratio.split('/')[0] matched_meta_line = subprocess_util.run_subprocess(['grep',top_match, sketch_metadata_file], - ignorereturncode=True).stdout.decode('utf-8').split('\t') + ignorereturncode=True).stdout.decode('utf-8').split('\t') if len(matched_meta_line) == 4 and matched_hashes != '0': m=re.search('s__(.+)',matched_meta_line[3]) if m: - species = m.group(1).strip('"') - LOG.info( - "MASH species top hit {} identified as {} with distance {} to {} and shared hashes ratio {}".format(top_match, species, top_match_dist, file, - top_match_hashratio)) - LOG.info("MASH dist predicted species name: '{}' based on species ID sketch {}".format(species, args.reference)) + species = m.group(1).strip('"') + LOG.info( + "MASH species top hit {} identified as {} with distance {} to {} and shared hashes ratio {}".format(top_match, species, top_match_dist, file, + top_match_hashratio)) + LOG.info("MASH dist predicted species name: '{}' based on species ID sketch {}".format(species, args.reference)) else: LOG.warning(f"Could not determine species based on MASH distance for {file}") - species = "-" - return species + species = "-" + return species def getSampleName(file): diff --git a/ectyper/subprocess_util.py b/ectyper/subprocess_util.py index a9d2cba..7cd78a8 100644 --- a/ectyper/subprocess_util.py +++ b/ectyper/subprocess_util.py @@ -37,6 +37,7 @@ def run_subprocess(cmd, input_data=None, un=False, ignorereturncode=False): else: LOG.error("Error in subprocess. The following command failed: {}".format(cmd)) LOG.error("Subprocess failed with error: \"{}\"".format(comp_proc.stderr.decode("utf-8"))) - LOG.critical("ectyper has stopped") - raise Exception(f"subprocess failure while running {cmd} command") + #LOG.critical("ectyper has stopped") + return comp_proc + #raise Exception(f"subprocess failure while running {cmd} command")