Skip to content

Commit

Permalink
Merge branch 'gtonkinhill:devel' into devel
Browse files Browse the repository at this point in the history
  • Loading branch information
nzmacalasdair authored Feb 11, 2025
2 parents aa64fed + 4dc499e commit 73efdbb
Show file tree
Hide file tree
Showing 12 changed files with 179 additions and 152 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/panaroo_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ jobs:
matrix:
python-version: [3.7, 3.8, 3.9]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

Expand Down
2 changes: 1 addition & 1 deletion panaroo/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
'''An updated pipeline for pangenome investigation'''
__version__ = '1.4.2'
__version__ = '1.5.1'
115 changes: 71 additions & 44 deletions panaroo/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ def get_options(args):
dest="len_dif_percent",
help="length difference cutoff (default=0.98)",
type=float)
matching.add_argument("--family_len_dif_percent",
dest="family_len_dif_percent",
help="length difference cutoff at the gene family level (default=0.0)",
type=float)
matching.add_argument("--merge_paralogs",
dest="merge_paralogs",
help="don't split paralogs",
Expand All @@ -135,11 +139,24 @@ def get_options(args):
"be found in order to consider it a match"),
default=0.2,
type=float)
refind.add_argument("--refind_strict",
dest="only_valid_genes",
help="Prevent fragmented, misassembled, or potential pseudogene sequences from being re-found.",
action='store_true',
default=False)

refind.add_argument(
"--refind-mode",
dest="refind_mode",
help=
('''R|The stringency mode at which to re-find genes.
default:
Will re-find similar gene sequences. Allows for premature stop codons and incorrect lengths \
to account for misassemblies.
strict:
Prevents fragmented, misassembled, or potential pseudogene sequences from being re-found.
off:
Turns off all re-finding steps.'''),
choices=['default', 'strict', 'off'],
default='default')

graph = parser.add_argument_group('Graph correction')

Expand Down Expand Up @@ -226,7 +243,7 @@ def get_options(args):
help=
"Specify an aligner. Options:'prank', 'clustal', and default: 'mafft'",
type=str,
choices=['prank', 'clustal', 'mafft'],
choices=['prank', 'clustal', 'mafft', 'none'],
default="mafft")
core.add_argument(
"--codons",
Expand Down Expand Up @@ -373,6 +390,7 @@ def main():
outdir=temp_dir,
dna_error_threshold=0.98,
correct_mistranslations=True,
family_len_dif_percent=args.family_len_dif_percent,
length_outlier_support_proportion=args.
length_outlier_support_proportion,
n_cpu=args.n_cpu,
Expand All @@ -388,6 +406,7 @@ def main():
outdir=temp_dir,
family_threshold=args.family_threshold,
correct_mistranslations=False,
family_len_dif_percent=args.family_len_dif_percent,
length_outlier_support_proportion=args.
length_outlier_support_proportion,
n_cpu=args.n_cpu,
Expand All @@ -406,41 +425,48 @@ def main():
"reducing your clustering sequence identity thresholds and/or running "
"Panaroo in sensitive mode.")

if args.verbose:
print("refinding genes...")

# find genes that Prokka has missed
G = find_missing(G,
args.input_files,
dna_seq_file=args.output_dir + "combined_DNA_CDS.fasta",
prot_seq_file=args.output_dir +
"combined_protein_CDS.fasta",
gene_data_file=args.output_dir + "gene_data.csv",
remove_by_consensus=args.remove_by_consensus,
search_radius=args.search_radius,
prop_match=args.refind_prop_match,
pairwise_id_thresh=args.id,
merge_id_thresh=max(0.8, args.family_threshold),
only_valid_genes=args.only_valid_genes,
n_cpu=args.n_cpu,
verbose=args.verbose)

# remove edges that are likely due to misassemblies (by consensus)

# merge again in case refinding has resolved issues
if args.verbose:
print("collapse gene families with refound genes...")
G = collapse_families(G,
seqid_to_centroid=seqid_to_centroid,
outdir=temp_dir,
family_threshold=args.family_threshold,
correct_mistranslations=False,
length_outlier_support_proportion=args.
length_outlier_support_proportion,
n_cpu=args.n_cpu,
quiet=(not args.verbose),
distances_bwtn_centroids=distances_bwtn_centroids,
centroid_to_index=centroid_to_index)[0]
if args.refind_mode!="off":
if args.refind_mode=="strict":
only_valid_genes=True
else:
only_valid_genes=False

if args.verbose:
print("refinding genes...")

# find genes that Prokka has missed
G = find_missing(G,
args.input_files,
dna_seq_file=args.output_dir + "combined_DNA_CDS.fasta",
prot_seq_file=args.output_dir +
"combined_protein_CDS.fasta",
gene_data_file=args.output_dir + "gene_data.csv",
remove_by_consensus=args.remove_by_consensus,
search_radius=args.search_radius,
prop_match=args.refind_prop_match,
pairwise_id_thresh=args.id,
merge_id_thresh=max(0.8, args.family_threshold),
only_valid_genes=only_valid_genes,
n_cpu=args.n_cpu,
verbose=args.verbose)

# remove edges that are likely due to misassemblies (by consensus)

# merge again in case refinding has resolved issues
if args.verbose:
print("collapse gene families with refound genes...")
G = collapse_families(G,
seqid_to_centroid=seqid_to_centroid,
outdir=temp_dir,
family_threshold=args.family_threshold,
correct_mistranslations=False,
family_len_dif_percent=args.family_len_dif_percent,
length_outlier_support_proportion=args.
length_outlier_support_proportion,
n_cpu=args.n_cpu,
quiet=(not args.verbose),
distances_bwtn_centroids=distances_bwtn_centroids,
centroid_to_index=centroid_to_index)[0]

if args.clean_edges:
G = clean_misassembly_edges(
Expand Down Expand Up @@ -524,9 +550,10 @@ def main():
if args.verbose: print("generating pan genome MSAs...")
generate_pan_genome_alignment(G, temp_dir, args.output_dir, args.n_cpu,
args.alr, args.codons, isolate_names)
core_nodes = get_core_gene_nodes(G, args.core, len(args.input_files))
core_names = [G.nodes[x]["name"] for x in core_nodes]
concatenate_core_genome_alignments(core_names, args.output_dir, args.hc_threshold)
if args.alr!='none':
core_nodes = get_core_gene_nodes(G, args.core, len(args.input_files))
core_names = [G.nodes[x]["name"] for x in core_nodes]
concatenate_core_genome_alignments(core_names, args.output_dir, args.hc_threshold)
elif args.aln == "core":
if args.verbose: print("generating core genome MSAs...")
generate_core_genome_alignment(G, temp_dir, args.output_dir,
Expand Down
4 changes: 2 additions & 2 deletions panaroo/cdhit.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def cluster_nodes_cdhit(
s=s,
aL=aL,
AL=AL,
aS=AS,
aS=aS,
accurate=accurate,
use_local=use_local,
strand=strand,
Expand All @@ -203,7 +203,7 @@ def cluster_nodes_cdhit(
s=s,
aL=aL,
AL=AL,
aS=AS,
aS=aS,
accurate=accurate,
use_local=use_local,
quiet=quiet,
Expand Down
3 changes: 3 additions & 0 deletions panaroo/clean_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def collapse_families(G,
outdir,
family_threshold=0.7,
dna_error_threshold=0.99,
family_len_dif_percent=0,
correct_mistranslations=False,
length_outlier_support_proportion=0.01,
n_cpu=1,
Expand All @@ -113,6 +114,7 @@ def collapse_families(G,
cdhit_clusters = iterative_cdhit(G,
outdir,
thresholds=threshold,
s=family_len_dif_percent,
n_cpu=n_cpu,
quiet=True,
dna=True,
Expand All @@ -124,6 +126,7 @@ def collapse_families(G,
cdhit_clusters = iterative_cdhit(G,
outdir,
thresholds=threshold,
s=family_len_dif_percent,
n_cpu=n_cpu,
quiet=True,
dna=False)
Expand Down
7 changes: 6 additions & 1 deletion panaroo/extract_gene_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ def main():

# make sure trailing forward slash is present
args.output_dir = os.path.join(args.output_dir, "")
#create folder for output gene sequences
extracted_folder = args.output_dir + "extracted_gene_sequences/"
if not os.path.isdir(extracted_folder):
os.mkdir(extracted_folder)

with open(args.pa_file, 'r') as infile:
header = next(infile).strip().split(',')
Expand All @@ -122,9 +126,10 @@ def main():
genes = genes.replace("_len", "").replace("_pseudo", "").split(";")
for g in genes:
geneids.add((genome,g))
outfile_name = extracted_folder + line[0][:248] + ".fasta"
generate_fasta(
geneids,
outputfile=args.output_dir + line[0] + ".fasta",
outputfile=outfile_name,
genedata=args.gene_data,
isdna=args.isdna,
idtype=args.idtype,
Expand Down
9 changes: 7 additions & 2 deletions panaroo/find_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ def find_missing(G,
search_radius=search_radius,
prop_match=prop_match,
pairwise_id_thresh=pairwise_id_thresh,
merge_id_thresh=merge_id_thresh)
merge_id_thresh=merge_id_thresh,
only_valid_genes=only_valid_genes)
for member, gff_handle in tqdm(enumerate(gff_file_handles),
disable=(not verbose))))

Expand Down Expand Up @@ -184,7 +185,6 @@ def find_missing(G,
if (node, member) in bad_node_mem_pairs: continue

hit_protein = hits_trans_dict[member][i]
if not is_valid_gene(dna_hit, hit_protein) and only_valid_genes: continue

hit_strand = '+' if node_locs[node][1][2]==0 else '-'
G.nodes[node]['members'].add(member)
Expand Down Expand Up @@ -226,6 +226,7 @@ def search_gff(node_search_dict,
prop_match=0.2,
pairwise_id_thresh=0.95,
merge_id_thresh=0.7,
only_valid_genes=False,
n_cpu=1):

gff_handle = open(gff_handle_name, 'r')
Expand Down Expand Up @@ -323,6 +324,10 @@ def search_gff(node_search_dict,
if len(hit) > len(best_hit):
best_hit = hit
best_loc = [gene[0], loc]

if only_valid_genes:
if not is_valid_gene(hit, translate(search[0])):
continue

hits.append((node, best_hit))
if (best_loc is not None) and (best_hit != ""):
Expand Down
Loading

0 comments on commit 73efdbb

Please sign in to comment.