Merge branch 'gtonkinhill:devel' into devel

nzmacalasdair · Feb 11, 2025 · 73efdbb · 73efdbb
2 parents aa64fed + 4dc499e
commit 73efdbb
Show file tree

Hide file tree

Showing 12 changed files with 179 additions and 152 deletions.
diff --git a/.github/workflows/panaroo_test.yml b/.github/workflows/panaroo_test.yml
@@ -21,9 +21,9 @@ jobs:
       matrix:
         python-version: [3.7, 3.8, 3.9]
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
 

diff --git a/panaroo/__init__.py b/panaroo/__init__.py
@@ -1,2 +1,2 @@
 '''An updated pipeline for pangenome investigation'''
-__version__ = '1.4.2'
+__version__ = '1.5.1'
diff --git a/panaroo/__main__.py b/panaroo/__main__.py
@@ -114,6 +114,10 @@ def get_options(args):
                           dest="len_dif_percent",
                           help="length difference cutoff (default=0.98)",
                           type=float)
+    matching.add_argument("--family_len_dif_percent",
+                          dest="family_len_dif_percent",
+                          help="length difference cutoff at the gene family level (default=0.0)",
+                          type=float)
     matching.add_argument("--merge_paralogs",
                           dest="merge_paralogs",
                           help="don't split paralogs",
@@ -135,11 +139,24 @@ def get_options(args):
               "be found in order to consider it a match"),
         default=0.2,
         type=float)
-    refind.add_argument("--refind_strict",
-                          dest="only_valid_genes",
-                          help="Prevent fragmented, misassembled, or potential pseudogene sequences from being re-found.",
-                          action='store_true',
-                          default=False)
+
+    refind.add_argument(
+        "--refind-mode",
+        dest="refind_mode",
+        help=
+        ('''R|The stringency mode at which to re-find genes.
+
+default: 
+Will re-find similar gene sequences. Allows for premature stop codons and incorrect lengths \
+to account for misassemblies.
+
+strict: 
+Prevents fragmented, misassembled, or potential pseudogene sequences from being re-found.
+
+off: 
+Turns off all re-finding steps.'''),
+        choices=['default', 'strict', 'off'],
+        default='default')
 
     graph = parser.add_argument_group('Graph correction')
 
@@ -226,7 +243,7 @@ def get_options(args):
         help=
         "Specify an aligner. Options:'prank', 'clustal', and default: 'mafft'",
         type=str,
-        choices=['prank', 'clustal', 'mafft'],
+        choices=['prank', 'clustal', 'mafft', 'none'],
         default="mafft")
     core.add_argument(
         "--codons",
@@ -373,6 +390,7 @@ def main():
                           outdir=temp_dir,
                           dna_error_threshold=0.98,
                           correct_mistranslations=True,
+                          family_len_dif_percent=args.family_len_dif_percent,
                           length_outlier_support_proportion=args.
                           length_outlier_support_proportion,
                           n_cpu=args.n_cpu,
@@ -388,6 +406,7 @@ def main():
         outdir=temp_dir,
         family_threshold=args.family_threshold,
         correct_mistranslations=False,
+        family_len_dif_percent=args.family_len_dif_percent,
         length_outlier_support_proportion=args.
         length_outlier_support_proportion,
         n_cpu=args.n_cpu,
@@ -406,41 +425,48 @@ def main():
             "reducing your clustering sequence identity thresholds and/or running "
             "Panaroo in sensitive mode.")
 
-    if args.verbose:
-        print("refinding genes...")
-
-    # find genes that Prokka has missed
-    G = find_missing(G,
-                     args.input_files,
-                     dna_seq_file=args.output_dir + "combined_DNA_CDS.fasta",
-                     prot_seq_file=args.output_dir +
-                     "combined_protein_CDS.fasta",
-                     gene_data_file=args.output_dir + "gene_data.csv",
-                     remove_by_consensus=args.remove_by_consensus,
-                     search_radius=args.search_radius,
-                     prop_match=args.refind_prop_match,
-                     pairwise_id_thresh=args.id,
-                     merge_id_thresh=max(0.8, args.family_threshold),
-                     only_valid_genes=args.only_valid_genes,
-                     n_cpu=args.n_cpu,
-                     verbose=args.verbose)
-
-    # remove edges that are likely due to misassemblies (by consensus)
-
-    # merge again in case refinding has resolved issues
-    if args.verbose:
-        print("collapse gene families with refound genes...")
-    G = collapse_families(G,
-                          seqid_to_centroid=seqid_to_centroid,
-                          outdir=temp_dir,
-                          family_threshold=args.family_threshold,
-                          correct_mistranslations=False,
-                          length_outlier_support_proportion=args.
-                          length_outlier_support_proportion,
-                          n_cpu=args.n_cpu,
-                          quiet=(not args.verbose),
-                          distances_bwtn_centroids=distances_bwtn_centroids,
-                          centroid_to_index=centroid_to_index)[0]
+    if args.refind_mode!="off":
+        if args.refind_mode=="strict":
+            only_valid_genes=True
+        else:
+            only_valid_genes=False
+
+        if args.verbose:
+            print("refinding genes...")
+
+        # find genes that Prokka has missed
+        G = find_missing(G,
+                        args.input_files,
+                        dna_seq_file=args.output_dir + "combined_DNA_CDS.fasta",
+                        prot_seq_file=args.output_dir +
+                        "combined_protein_CDS.fasta",
+                        gene_data_file=args.output_dir + "gene_data.csv",
+                        remove_by_consensus=args.remove_by_consensus,
+                        search_radius=args.search_radius,
+                        prop_match=args.refind_prop_match,
+                        pairwise_id_thresh=args.id,
+                        merge_id_thresh=max(0.8, args.family_threshold),
+                        only_valid_genes=only_valid_genes,
+                        n_cpu=args.n_cpu,
+                        verbose=args.verbose)
+
+        # remove edges that are likely due to misassemblies (by consensus)
+
+        # merge again in case refinding has resolved issues
+        if args.verbose:
+            print("collapse gene families with refound genes...")
+        G = collapse_families(G,
+                            seqid_to_centroid=seqid_to_centroid,
+                            outdir=temp_dir,
+                            family_threshold=args.family_threshold,
+                            correct_mistranslations=False,
+                            family_len_dif_percent=args.family_len_dif_percent,
+                            length_outlier_support_proportion=args.
+                            length_outlier_support_proportion,
+                            n_cpu=args.n_cpu,
+                            quiet=(not args.verbose),
+                            distances_bwtn_centroids=distances_bwtn_centroids,
+                            centroid_to_index=centroid_to_index)[0]
 
     if args.clean_edges:
         G = clean_misassembly_edges(
@@ -524,9 +550,10 @@ def main():
         if args.verbose: print("generating pan genome MSAs...")
         generate_pan_genome_alignment(G, temp_dir, args.output_dir, args.n_cpu,
                                       args.alr, args.codons, isolate_names)
-        core_nodes = get_core_gene_nodes(G, args.core, len(args.input_files))
-        core_names = [G.nodes[x]["name"] for x in core_nodes]
-        concatenate_core_genome_alignments(core_names, args.output_dir, args.hc_threshold)
+        if args.alr!='none':
+            core_nodes = get_core_gene_nodes(G, args.core, len(args.input_files))
+            core_names = [G.nodes[x]["name"] for x in core_nodes]
+            concatenate_core_genome_alignments(core_names, args.output_dir, args.hc_threshold)
     elif args.aln == "core":
         if args.verbose: print("generating core genome MSAs...")
         generate_core_genome_alignment(G, temp_dir, args.output_dir,

diff --git a/panaroo/cdhit.py b/panaroo/cdhit.py
@@ -190,7 +190,7 @@ def cluster_nodes_cdhit(
                       s=s,
                       aL=aL,
                       AL=AL,
-                      aS=AS,
+                      aS=aS,
                       accurate=accurate,
                       use_local=use_local,
                       strand=strand,
@@ -203,7 +203,7 @@ def cluster_nodes_cdhit(
                   s=s,
                   aL=aL,
                   AL=AL,
-                  aS=AS,
+                  aS=aS,
                   accurate=accurate,
                   use_local=use_local,
                   quiet=quiet,

diff --git a/panaroo/clean_network.py b/panaroo/clean_network.py
@@ -92,6 +92,7 @@ def collapse_families(G,
                       outdir,
                       family_threshold=0.7,
                       dna_error_threshold=0.99,
+                      family_len_dif_percent=0,
                       correct_mistranslations=False,
                       length_outlier_support_proportion=0.01,
                       n_cpu=1,
@@ -113,6 +114,7 @@ def collapse_families(G,
         cdhit_clusters = iterative_cdhit(G,
                                          outdir,
                                          thresholds=threshold,
+                                         s=family_len_dif_percent,
                                          n_cpu=n_cpu,
                                          quiet=True,
                                          dna=True,
@@ -124,6 +126,7 @@ def collapse_families(G,
         cdhit_clusters = iterative_cdhit(G,
                                          outdir,
                                          thresholds=threshold,
+                                         s=family_len_dif_percent,
                                          n_cpu=n_cpu,
                                          quiet=True,
                                          dna=False)

diff --git a/panaroo/extract_gene_fasta.py b/panaroo/extract_gene_fasta.py
@@ -109,6 +109,10 @@ def main():
 
     # make sure trailing forward slash is present
     args.output_dir = os.path.join(args.output_dir, "")
+    #create folder for output gene sequences
+    extracted_folder = args.output_dir + "extracted_gene_sequences/"
+    if not os.path.isdir(extracted_folder):
+        os.mkdir(extracted_folder)
 
     with open(args.pa_file, 'r') as infile:
         header = next(infile).strip().split(',')
@@ -122,9 +126,10 @@ def main():
                     genes = genes.replace("_len", "").replace("_pseudo", "").split(";")
                     for g in genes:
                         geneids.add((genome,g))
+                outfile_name = extracted_folder + line[0][:248] + ".fasta"                    
                 generate_fasta(
                     geneids,
-                    outputfile=args.output_dir + line[0] + ".fasta",
+                    outputfile=outfile_name,
                     genedata=args.gene_data,
                     isdna=args.isdna,
                     idtype=args.idtype,

diff --git a/panaroo/find_missing.py b/panaroo/find_missing.py
@@ -101,7 +101,8 @@ def find_missing(G,
                             search_radius=search_radius,
                             prop_match=prop_match,
                             pairwise_id_thresh=pairwise_id_thresh,
-                            merge_id_thresh=merge_id_thresh)
+                            merge_id_thresh=merge_id_thresh,
+                            only_valid_genes=only_valid_genes)
         for member, gff_handle in tqdm(enumerate(gff_file_handles),
                                        disable=(not verbose))))
 
@@ -184,7 +185,6 @@ def find_missing(G,
                         if (node, member) in bad_node_mem_pairs: continue
 
                         hit_protein = hits_trans_dict[member][i]
-                        if not is_valid_gene(dna_hit, hit_protein) and only_valid_genes: continue
 
                         hit_strand = '+' if node_locs[node][1][2]==0 else '-'
                         G.nodes[node]['members'].add(member)
@@ -226,6 +226,7 @@ def search_gff(node_search_dict,
                prop_match=0.2,
                pairwise_id_thresh=0.95,
                merge_id_thresh=0.7,
+               only_valid_genes=False,
                n_cpu=1):
 
     gff_handle = open(gff_handle_name, 'r')
@@ -323,6 +324,10 @@ def search_gff(node_search_dict,
             if len(hit) > len(best_hit):
                 best_hit = hit
                 best_loc = [gene[0], loc]
+
+        if only_valid_genes:
+            if not is_valid_gene(hit, translate(search[0])):
+                continue
 
         hits.append((node, best_hit))
         if (best_loc is not None) and (best_hit != ""):