From ec7677cb4d59aad6ccba68f89a5f5cfc44c79c27 Mon Sep 17 00:00:00 2001
From: Josh Chorlton <josh@bugseq.com>
Date: Tue, 21 Jan 2025 17:03:48 +0000
Subject: [PATCH 1/2] detect gzip inputs by path suffix instead of substring match; strip trailing whitespace

---
 ectyper/ectyper.py | 84 +++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/ectyper/ectyper.py b/ectyper/ectyper.py
index da58316..e4447bd 100644
--- a/ectyper/ectyper.py
+++ b/ectyper/ectyper.py
@@ -47,11 +47,11 @@ def check_database_struct(db, dbpath):
 
 def decompress_gunzip_files(raw_genome_files, temp_dir):
     for idx, g in enumerate(raw_genome_files):
-        if 'gz' in g:
+        if g.endswith("gz"):
             LOG.info(f"Decompression of the gunzip {g} file started ...")
             with open(g, 'rb') as inf, open(f'{temp_dir}/{os.path.basename(g)[:-3]}', 'w', encoding='utf8') as tof:
                 decom_str = gzip.decompress(inf.read()).decode('utf-8')
-                tof.write(decom_str)    
+                tof.write(decom_str)
             LOG.info(f"Wrote decompressed file to {temp_dir}/{os.path.basename(g)[:-3]}")
             raw_genome_files[idx]=os.path.join(temp_dir,os.path.basename(g)[:-3])
     return raw_genome_files
@@ -64,19 +64,19 @@ def run_program():
     """
     LOG.setLevel(logging.INFO)
     args = commandLineOptions.parse_command_line()
-    
-    
+
+
     output_directory = create_output_directory(args)
-    
+
     # Create a file handler for log messages in the output directory for the root thread
     fh = logging.FileHandler(os.path.join(output_directory, 'ectyper.log'), 'w', 'utf-8')
-    
+
     if args.debug:
         fh.setLevel(logging.DEBUG)
         LOG.setLevel(logging.DEBUG)
     else:
         fh.setLevel(logging.INFO)
-        
+
     LOG.addHandler(fh)
 
     #try to load database
@@ -97,13 +97,13 @@ def run_program():
 
 
     check_database_struct(ectyperdb_dict, dbpath)
-    
+
     with open(definitions.PATHOTYPE_ALLELE_JSON) as fp:
         pathotype_db = json.load(fp)
 
     LOG.info("Starting ectyper v{} running on O and H antigen allele database v{} ({}) and pathotype database v{}".format(
-        __version__, 
-        ectyperdb_dict["version"], 
+        __version__,
+        ectyperdb_dict["version"],
         ectyperdb_dict["date"],
         pathotype_db['version']))
     LOG.debug("Command-line arguments were:\n{}".format(args))
@@ -113,18 +113,18 @@ def run_program():
     # Init MASH species database for species identification
     if speciesIdentification.get_species_mash(args.reference) == False:
         LOG.critical("MASH RefSeq sketch does not exists and was not able to be downloaded. Aborting run ...")
-        exit("No MASH RefSeq sketch file found at {}".format(args.reference))    
+        exit("No MASH RefSeq sketch file found at {}".format(args.reference))
 
-    # Initialize ectyper temporary directory. If --debug is specified then temp folder will be not be deleted. 
+    # Initialize ectyper temporary directory. If --debug is specified then temp folder will be not be deleted.
     # Python 3.12 introduced delete = False/True option in tempfile lib, so using explicit code supporting Python < 3.12
     temp_dir = create_temporary_directory(output_directory)
     os.makedirs(temp_dir, exist_ok=True)
-   
+
     LOG.info("Gathering genome files list ...")
-    
+
     input_files_list = genomeFunctions.get_files_as_list(args.input, args.maxdirdepth)
     raw_genome_files = decompress_gunzip_files(input_files_list, temp_dir)
-  
+
     LOG.info(f"Identifying genome file types on {len(raw_genome_files)} inputs ...")
     raw_files_dict = genomeFunctions.identify_raw_files(raw_genome_files,
                                                             args)
@@ -136,11 +136,11 @@ def run_program():
             genomeFunctions.create_combined_alleles_and_markers_file(
                 alleles_fasta_file, temp_dir, args.pathotype) #create a fasta reference from O-H alleles and optionally from pathotypes alleles database
 
-    
+
     bowtie_base = genomeFunctions.create_bowtie_base(temp_dir,
                                                          combined_fasta, args.cores) if \
                                                          raw_files_dict['fastq'] else None #only run this function on raw read inputs
-        
+
 
     # Assemble any fastq files, get final fasta list
     LOG.info("Assembling final list of fasta files")
@@ -157,10 +157,10 @@ def run_program():
             raw_files_dict['other'],
             raw_files_dict['filesnotfound'],
             args)
-    
-    
+
+
     LOG.info("Standardizing the E.coli genome headers based on file names")
-    
+
     predictions_dict={}; predictions_pathotype_dict={}
     if ecoli_genomes_dict:
         ecoli_genomes_dict = genomeFunctions.get_genome_names_from_files(
@@ -174,31 +174,31 @@ def run_program():
                                           alleles_fasta_file,
                                           temp_dir,
                                           ectyperdb_dict)
-           
-          
+
+
     if other_genomes_dict:
         other_genomes_dict = genomeFunctions.get_genome_names_from_files(
                 other_genomes_dict,
                 temp_dir,
                 args
-                )  
-            
-                
+                )
+
+
 
     # Run pathotype predictions if requested irrespective of --verify option
     if args.pathotype:
         predictions_pathotype_dict = predictionFunctions.predict_pathotype_and_shiga_toxin_subtype(ecoli_genomes_dict, other_genomes_dict,
                                                                                    temp_dir,
                                                                                    args.verify,
-                                                                                   args.percentIdentityPathotype, 
-                                                                                   args.percentCoveragePathotype, 
+                                                                                   args.percentIdentityPathotype,
+                                                                                   args.percentCoveragePathotype,
                                                                                    args.output, args.debug, pathotype_db)
 
     # Add empty rows for genomes without a blast result or non-E.coli samples that did not undergo typing
     final_predictions = predictionFunctions.add_non_predicted(
         raw_genome_files, predictions_dict, other_genomes_dict, filesnotfound_dict, ecoli_genomes_dict)
-    
-    
+
+
     for sample in final_predictions.keys():
         final_predictions[sample]["database"] = "v"+ectyperdb_dict["version"] + " (" + ectyperdb_dict["date"] + ")"
         if args.pathotype:
@@ -206,10 +206,10 @@ def run_program():
                 final_predictions[sample]["pathotype"] = "/".join(sorted(predictions_pathotype_dict[sample]['pathotype']))
                 for field_name in  [f for f in definitions.PATHOTYPE_TOXIN_FIELDS if 'pathotype_' in f]:
                     final_predictions[sample][field_name] = predictions_pathotype_dict[sample][field_name]
-                
+
                 for field_name in [f for f in definitions.PATHOTYPE_TOXIN_FIELDS if 'stx_' in f]:
                     final_predictions[sample][field_name] = predictions_pathotype_dict[sample][field_name]
-                    
+
         if 'O' in final_predictions[sample]: #not all samples will have O-antigen dictionary
             highsimilar_Ogroup = getOantigenHighSimilarGroup(final_predictions,sample)
 
@@ -227,13 +227,13 @@ def run_program():
 
         if args.verify:
             final_predictions[sample]["QC"] = predictionFunctions.getQuality_control_results(sample,final_predictions,ectyperdb_dict)
-  
+
     # Store most recent result in working directory
     LOG.info("Reporting final results to output.tsv file ...")
     predictionFunctions.report_result(final_predictions, output_directory,
                                           os.path.join(output_directory,
                                                        'output.tsv'),args)
-    
+
     if args.debug == False:
         shutil.rmtree(temp_dir, ignore_errors=True)
     LOG.info(f"ECTyper has finished successfully. Results available at {os.path.abspath(args.output)}")
@@ -265,7 +265,7 @@ def create_output_directory(args):
     :return: The output directory
     """
     # If no output directory is specified for the run, create a one based on time
-    
+
 
 
     if args.output is None:
@@ -280,19 +280,19 @@ def create_output_directory(args):
         args.output = out_dir
     else:
         if os.path.isabs(args.output):
-            out_dir = args.output 
+            out_dir = args.output
         else:
             out_dir = os.path.join(definitions.WORKPLACE_DIR, args.output)
 
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
-    
-    # clean previous ECTyper output files if the directory was used in previous runs 
+
+    # clean previous ECTyper output files if the directory was used in previous runs
     for file in definitions.OUTPUT_FILES_LIST:
         path2file = os.path.join(out_dir,file)
         if os.path.exists(path2file):
             LOG.info(f"Cleaning ECTyper previous files. Removing previously generated {path2file} ...")
-            os.remove(path2file) 
+            os.remove(path2file)
     return out_dir
 
 
@@ -351,7 +351,7 @@ def run_prediction(genome_files_dict, args, alleles_fasta, temp_dir, ectyperdb_d
         # merge the database predictions with the final predictions dict
         for r in results:
             predictions_dict = {**r, **predictions_dict}
-           
+
     for genome_name in predictions_dict.keys():
         predictions_dict[genome_name]["species"] = "-"
         predictions_dict[genome_name]["species_mash_hash_ratio2ref"] = "-"
@@ -379,7 +379,7 @@ def genome_group_prediction(g_group, alleles_fasta, args, temp_dir, ectyperdb_di
     :param temp_dir: ectyper run temp dir
     :return: dictionary of the results for the g_group
     """
-    
+
     # create a temp dir for blastdb -- each process gets its own directory
     LOG.setLevel(logging.INFO) #set level to info as by default only WARNING level is set at init time
     temp_dir_group = create_temporary_directory(temp_dir)
@@ -417,7 +417,7 @@ def genome_group_prediction(g_group, alleles_fasta, args, temp_dir, ectyperdb_di
             blast_output_file,
             ectyperdb_dict,
             args);
-    
+
     blast_output_file_path = os.path.join(args.output,f"blastn_output_alleles.txt")
     if os.path.exists(blast_output_file_path) == False:
         blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , sep="\t", index=False)
@@ -425,6 +425,6 @@ def genome_group_prediction(g_group, alleles_fasta, args, temp_dir, ectyperdb_di
     else:
         blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , mode="a", header=False, sep="\t", index=False)
         LOG.info("Appending BLAST output file against reference alleles at {}".format(blast_output_file_path))
-    
+
 
     return db_prediction_dict

From 8ced875d691fff402d8556784d09c270214bf44d Mon Sep 17 00:00:00 2001
From: Josh Chorlton <josh@bugseq.com>
Date: Tue, 21 Jan 2025 17:06:09 +0000
Subject: [PATCH 2/2] require the ".gz" suffix (including the dot) when detecting gzip inputs

---
 ectyper/ectyper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ectyper/ectyper.py b/ectyper/ectyper.py
index e4447bd..ad258f5 100644
--- a/ectyper/ectyper.py
+++ b/ectyper/ectyper.py
@@ -47,7 +47,7 @@ def check_database_struct(db, dbpath):
 
 def decompress_gunzip_files(raw_genome_files, temp_dir):
     for idx, g in enumerate(raw_genome_files):
-        if g.endswith("gz"):
+        if g.endswith(".gz"):
             LOG.info(f"Decompression of the gunzip {g} file started ...")
             with open(g, 'rb') as inf, open(f'{temp_dir}/{os.path.basename(g)[:-3]}', 'w', encoding='utf8') as tof:
                 decom_str = gzip.decompress(inf.read()).decode('utf-8')