From 22b6a98b9b1608ffe8e8d861628de478aaffa969 Mon Sep 17 00:00:00 2001
From: Kirill Bessonov <kbessonov@gmail.com>
Date: Tue, 30 Jul 2024 14:25:58 -0400
Subject: [PATCH] Making tool and its module  more robust to errors

---
 ectyper/ectyper.py               |  1 +
 ectyper/genomeFunctions.py       |  6 +++---
 ectyper/predictionFunctions.py   | 10 +++++-----
 ectyper/speciesIdentification.py | 34 ++++++++++++++++----------------
 ectyper/subprocess_util.py       |  5 +++--
 5 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/ectyper/ectyper.py b/ectyper/ectyper.py
index a3f9898..906c873 100644
--- a/ectyper/ectyper.py
+++ b/ectyper/ectyper.py
@@ -196,6 +196,7 @@ def run_program():
     # Add empty rows for genomes without a blast result or non-E.coli samples that did not undergo typing
     final_predictions = predictionFunctions.add_non_predicted(
         raw_genome_files, predictions_dict, other_genomes_dict, filesnotfound_dict, ecoli_genomes_dict)
+    print(final_predictions)
     
     for sample in final_predictions.keys():
         final_predictions[sample]["database"] = "v"+ectyperdb_dict["version"] + " (" + ectyperdb_dict["date"] + ")"
diff --git a/ectyper/genomeFunctions.py b/ectyper/genomeFunctions.py
index 184adb4..f062290 100644
--- a/ectyper/genomeFunctions.py
+++ b/ectyper/genomeFunctions.py
@@ -40,14 +40,14 @@ def get_files_as_list(files_or_directories, max_depth_level):
     for file_or_directory in sorted([os.path.abspath(p) for p in files_or_directories]):
 
         dir_level_current = get_relative_directory_level(file_or_directory, init_min_dir_level)
-        LOG.info(f"Gathering genomes from directory {file_or_directory} at level {dir_level_current} ...")
-
+        
         if dir_level_current > max_depth_level:
-            LOG.info(f"Directory level exceeded ({dir_level_current} > {max_depth_level}), skipping directory {file_or_directory} ...")
+            LOG.info(f"Directory level exceeded ({dir_level_current} > {max_depth_level}), skipping {file_or_directory} ...")
             continue
         
         # if single directory is specified
         if os.path.isdir(file_or_directory):
+            LOG.info(f"Gathering genomes from directory {file_or_directory} at level {dir_level_current} ...")
             # Create a list containing the file names
             for root, dirs, files in os.walk(os.path.abspath(file_or_directory)):
                 dir_level = get_relative_directory_level(root, init_min_dir_level)
diff --git a/ectyper/predictionFunctions.py b/ectyper/predictionFunctions.py
index 8e59932..0721bad 100644
--- a/ectyper/predictionFunctions.py
+++ b/ectyper/predictionFunctions.py
@@ -174,8 +174,7 @@ def shiga_toxing_subtyping(pathotype_genes_tmp_df, output_dir, debug):
         if len(results_dict[k]) != 0:
             results_dict[k] = ";".join([results_dict[k][i] for i in sorted_order])
         else:
-            results_dict[k] = "-"   
-    print(results_dict)        
+            results_dict[k] = "-"          
     return results_dict
 
 def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_genomes_dict, temp_dir, verify_species_flag, pident, pcov, 
@@ -193,10 +192,10 @@ def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_gen
     Returns:
         list: list of pathotypes
     """
-    LOG.info(f"Starting pathotype predictions on {len(ecoli_genome_files_dict.keys())} samples. Reminder: Please use --verify option to run pathotype predictions only on E.coli samples ...")
+    LOG.info(f"Starting pathotype predictions on {len(ecoli_genome_files_dict.keys())} E.coli and non-E.coli {len(other_genomes_dict.keys())} samples. Reminder: Please use --verify option to run pathotype predictions only on E.coli samples ...")
 
     if len(other_genomes_dict.keys()) > 0 and verify_species_flag == True:
-        LOG.info(f"A total of {len(other_genomes_dict.keys())} non-E.coli sample(s) will not be pathotyped. Omit --verify option if you want to type ALL samples regardless ...")
+        LOG.info(f"A total of {len(other_genomes_dict.keys())} non-E.coli sample(s) will not be pathotyped. If you still want to type ALL samples regardless omit --verify option ...")
     
     path2patho_db = json2fasta(definitions.PATHOTYPE_ALLELE_JSON, temp_dir)
     json_patho_db = load_json(definitions.PATHOTYPE_ALLELE_JSON)
@@ -223,7 +222,8 @@ def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_gen
             "-outfmt", "6 qseqid qlen sseqid length pident sstart send sframe slen qcovhsp bitscore sseq"
         ]
         LOG.debug(f"BLASTN results on pathotype database written to {temp_dir}/blast_pathotype_result.txt ...")
-        subprocess_util.run_subprocess(cmd)
+        cmd_status = subprocess_util.run_subprocess(cmd)
+        
         if os.stat(f'{temp_dir}/blast_pathotype_result.txt').st_size == 0:
             LOG.warning(f"No pathotype signatures found for sample {g} as pathotype BLAST results file {temp_dir}/blast_pathotype_result.txt is empty. Skipping ...")
             predictions_pathotype_dict[g]={field:'-' for field in definitions.PATHOTYPE_TOXIN_FIELDS}
diff --git a/ectyper/speciesIdentification.py b/ectyper/speciesIdentification.py
index dd3901b..ac19473 100644
--- a/ectyper/speciesIdentification.py
+++ b/ectyper/speciesIdentification.py
@@ -162,7 +162,7 @@ def get_species(file, args, cores=1):
     Returns:
         str: name of estimated species
     """
-
+    LOG.debug(f"Get species prediction for {file}")
     top_match="-"; top_match_dist="-"; top_match_hashratio="-"; species="-"
     sketch_metadata_file = args.reference+'.txt' 
     if os.path.exists(sketch_metadata_file) == False:
@@ -202,36 +202,36 @@ def get_species(file, args, cores=1):
 
 
     if len(top_hit_lines) < 1:
+        top_hit_line = ""
         LOG.warning('For {file} no hits returned by MASH species id sketch search. Species identification failed!')
     else:
+        top_hit_line = top_hit_lines[0]
         LOG.info('For {} following top hits and hash ratios returned by MASH {}'.format(file,
                                                                         [(top_hit_line.split("\t")[0],top_hit_line.split("\t")[4]) for top_hit_line in top_hit_lines if len(top_hit_line.split("\t")[0])>0]))
 
+    
+    top_hit_line_elements = top_hit_line.split()
 
-    for top_hit_line in top_hit_lines:
-
-        top_hit_line_elements = top_hit_line.split()
-
-        if len(top_hit_line_elements) < 5:
-            LOG.warning("No columns in the mash results output to split. Species identification failed!")
-            continue
-
+    if len(top_hit_line_elements) < 5:
+        LOG.warning("No columns in the mash results output to split. Species identification failed!")
+        species = "-"
+    else:
         top_match = top_hit_line_elements[0]; top_match_dist = top_hit_line_elements[2]; top_match_hashratio = top_hit_line_elements[4]
         matched_hashes = top_match_hashratio.split('/')[0]
         matched_meta_line = subprocess_util.run_subprocess(['grep',top_match, sketch_metadata_file],
-                                                      ignorereturncode=True).stdout.decode('utf-8').split('\t')
+                                                        ignorereturncode=True).stdout.decode('utf-8').split('\t')
         if len(matched_meta_line) == 4 and matched_hashes != '0':
             m=re.search('s__(.+)',matched_meta_line[3])
             if m:
-               species = m.group(1).strip('"')
-               LOG.info(
-        "MASH species top hit {} identified as {} with distance {} to {} and shared hashes ratio {}".format(top_match, species, top_match_dist, file,
-                                                                                            top_match_hashratio))
-               LOG.info("MASH dist predicted species name: '{}' based on species ID sketch {}".format(species, args.reference))
+                species = m.group(1).strip('"')
+                LOG.info(
+            "MASH species top hit {} identified as {} with distance {} to {} and shared hashes ratio {}".format(top_match, species, top_match_dist, file,
+                                                                                                top_match_hashratio))
+                LOG.info("MASH dist predicted species name: '{}' based on species ID sketch {}".format(species, args.reference))
         else:
             LOG.warning(f"Could not determine species based on MASH distance for {file}")
-            species = "-"
-        return species    
+            species = "-" 
+    return species    
         
 
 def getSampleName(file):
diff --git a/ectyper/subprocess_util.py b/ectyper/subprocess_util.py
index a9d2cba..7cd78a8 100644
--- a/ectyper/subprocess_util.py
+++ b/ectyper/subprocess_util.py
@@ -37,6 +37,7 @@ def run_subprocess(cmd, input_data=None, un=False, ignorereturncode=False):
     else:
         LOG.error("Error in subprocess. The following command failed: {}".format(cmd))
         LOG.error("Subprocess failed with error: \"{}\"".format(comp_proc.stderr.decode("utf-8")))
-        LOG.critical("ectyper has stopped")
-        raise Exception(f"subprocess failure while running {cmd} command")
+        #LOG.critical("ectyper has stopped")
+        return comp_proc
+        #raise Exception(f"subprocess failure while running {cmd} command")