Skip to content

Commit

Permalink
Add automatic reference selection (#2)
Browse files Browse the repository at this point in the history
* added find_best_reference.py script

* added necessary dependencies to env.yaml

* updated nextflow scripts to include find_best_reference
  • Loading branch information
jpalmer37 authored Oct 8, 2024
1 parent 81bdebb commit 3619c22
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 16 deletions.
131 changes: 131 additions & 0 deletions bin/find_best_reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/env python3
import os
import pandas as pd
from glob import glob
from Bio import Entrez
import argparse


def collect_refseeker_files(input_path):
print("Collecting referenceseeker files ...")

refseeker_files = glob(os.path.join(input_path, '*', '*_referenceseeker.tsv'))

if len(refseeker_files) == 0:
print("ERROR: No referenceseeker files found in input directory.")
raise ValueError

return refseeker_files

def find_best_reference(filepaths: list) -> str:

print("Finding best reference ...")

dfs = []

for filepath in filepaths:
refseeker_df = pd.read_csv(filepath, sep='\t')

refseeker_df.insert(0, 'sample_id', os.path.basename(filepath).split("_")[0])
refseeker_df['z_mash'] = - (refseeker_df['Mash Distance'] - refseeker_df['Mash Distance'].mean()) / refseeker_df['Mash Distance'].std()
refseeker_df['z_ani'] = (refseeker_df['ANI'] - refseeker_df['ANI'].mean()) / refseeker_df['ANI'].std()
refseeker_df['z_con_dna'] = (refseeker_df['Con. DNA'] - refseeker_df['Con. DNA'].mean()) / refseeker_df['Con. DNA'].std()
refseeker_df['z_total'] = refseeker_df['z_mash'] + refseeker_df['z_ani'] + refseeker_df['z_con_dna']
refseeker_df = refseeker_df.sort_values('z_total',ascending=False)
dfs.append(refseeker_df)

concat_df = pd.concat(dfs)

best_references = concat_df.groupby('#ID')['z_total'].sum().sort_values(ascending=False)

return best_references.index[0]

def write_reference_file(assembly_id, outfile_name):

print("Searching for top reference assembly ...")

success = False
attempt = 1
while not success and attempt <= 3:
try:

with Entrez.esearch(db='nucleotide', term=assembly_id, retmax=1, idtype='acc') as handle:
results = Entrez.read(handle)
success = True

except RuntimeError as e :
print(f"Encountered RuntimeEerror with Entrez esearch on attempt #{attempt}. Retrying ...")
attempt += 1

if not success:
raise ValueError

if "IdList" not in results or len(results['IdList']) == 0:
print("ERROR: No GenBank entries found")
raise ValueError

if len(results['IdList']) > 1:
print("ERROR: Multiple GenBank entries found")
raise ValueError

best_reference_accno = results['IdList'][0]


print("Fetching and writing FASTA reference file ...")

success = False
attempt = 1
while not success and attempt <= 3:
try:
with Entrez.efetch(db='nucleotide', id=best_reference_accno, rettype='fasta', retmode='text') as infile, open(outfile_name + '.fa', 'w') as outfile:
for line in infile.readlines():
outfile.write(line)

success = True
except RuntimeError as e:
print(f"Encountered RuntimeEerror with Entrez efetch on attempt #{attempt}. Retrying ...")
attempt += 1

if not success:
raise ValueError

print("Fetching and writing GenBank reference file ...")

success = False
attempt = 1
while not success and attempt <= 3:
try:
with Entrez.efetch(db='nucleotide', id=best_reference_accno, rettype='gb', retmode='text') as infile, open(outfile_name + '.gb', 'w') as outfile:
for line in infile.readlines():
outfile.write(line)

success = True
except RuntimeError as e:
print(f"Encountered RuntimeEerror with Entrez efetch on attempt #{attempt}. Retrying ...")
attempt += 1

if not success:
raise ValueError

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', required=True, help='Referenceseeker results path')
parser.add_argument('-o', '--outname', required=True, help='Prefix of output file')
return parser.parse_args()

def main():

args = get_args()

refseeker_files = collect_refseeker_files(args.input)

best_reference_id = find_best_reference(refseeker_files)

write_reference_file(best_reference_id, args.outname)

print("Complete.")


if __name__ == '__main__':
main()

3 changes: 3 additions & 0 deletions environments/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ channels:
dependencies:
- python=3
- csvkit=1.0.5
- pandas=2.1.3
- biopython=1.81
- snippy=4.6.0
- fastp=0.20.1
- qualimap=2.2.2d
- samtools=1.20
38 changes: 22 additions & 16 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,26 @@

nextflow.enable.dsl = 2

include { fastp } from './modules/snippy.nf'
include { snippy } from './modules/snippy.nf'
include { count_variants } from './modules/snippy.nf'
include { qualimap_bamqc } from './modules/snippy.nf'
include { fastp } from './modules/snippy.nf'
include { find_best_reference } from './modules/snippy.nf'
include { snippy } from './modules/snippy.nf'
include { count_variants } from './modules/snippy.nf'
include { qualimap_bamqc } from './modules/snippy.nf'

workflow {



// samplesheet input
if (params.samplesheet_input != 'NO_FILE') {

// if --ref file provided, use this reference for all entries
if (params.ref != 'NO_FILE'){
ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2']] }
ch_ref = ch_fastq.map{it -> it[0]}.combine(Channel.fromPath( "${params.ref}", type: 'file'))

// if no --ref file provided, look for REF column in the samplesheet on a per-sample basis
ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2']] }.unique{ it -> it[0] }
} else {
ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2'], it['REF']] }
ch_ref = ch_fastq.map{it -> [it[0], it[3]]}
ch_fastq = ch_fastq.map{it -> it.subList(0,3)}
ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2'], it['REF']] }.unique{ it -> it[0] }
}

// fastq input
} else {
if (params.ref != 'NO_FILE'){
ch_fastq = Channel.fromFilePairs( params.fastq_search_path, flat: true ).map{ it -> [it[0].split('_')[0], it[1], it[2]] }.unique{ it -> it[0] }
ch_ref = ch_fastq.map{it -> it[0]}.combine(Channel.fromPath( "${params.ref}", type: 'file'))
} else {
error "ERROR: Reference file must be supplied with --ref when using --fastq_input"
}
Expand All @@ -42,6 +33,21 @@ workflow {
// ch_ref: [sample_id, reference_path]

main:

if (params.ref != 'NO_FILE' && file(params.ref).isFile()){
println("Detected file as reference input. Using reference file for all samples ...")
ch_ref = ch_fastq.map{it -> it[0]}.combine(Channel.fromPath( "${params.ref}", type: 'file'))
}else if (params.ref != 'NO_FILE' && file(params.ref).isDirectory()){
println("Detected directory as reference input. Finding best reference based on referenceseeker results ...")
find_best_reference(Channel.fromPath(params.ref))
ch_ref = ch_fastq.map{it -> it[0]}.combine(find_best_reference.out.fasta)
}else if (params.ref == 'NO_FILE' && params.samplesheet_input != 'NO_FILE'){
ch_ref = ch_fastq.map{it -> [it[0], it[3]]}
ch_fastq = ch_fastq.map{it -> it.subList(0,3)}
}else {
error "ERROR: Invalid input."
}

fastp(ch_fastq)
snippy(fastp.out.reads.join(ch_ref))
qualimap_bamqc(snippy.out.alignment)
Expand Down
21 changes: 21 additions & 0 deletions modules/snippy.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ process fastp {
script:
"""
fastp \
--thread ${task.cpus} \
-i ${reads_r1} \
-I ${reads_r2} \
--cut_tail \
Expand All @@ -26,6 +27,26 @@ process fastp {
}


process find_best_reference {

executor 'local'

publishDir "${params.outdir}", mode: 'copy', pattern: "reference.*"

input:
path(refseeker_results_path)

output:
path("reference.fa"), emit: fasta
path("reference.gb"), emit: gb

script:
"""
find_best_reference.py --input ${refseeker_results_path} --outname reference
"""

}

process snippy {

tag { sample_id }
Expand Down

0 comments on commit 3619c22

Please sign in to comment.