Add automatic reference selection (#2)

* added find_best_reference.py script * added necessary dependencies to env.yaml * updated nextflow scripts to include find_best_reference
jpalmer37 · Oct 8, 2024 · 3619c22 · 3619c22
1 parent 81bdebb
commit 3619c22
Show file tree

Hide file tree

Showing 4 changed files with 177 additions and 16 deletions.
diff --git a/bin/find_best_reference.py b/bin/find_best_reference.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+import os 
+import pandas as pd 
+from glob import glob 
+from Bio import Entrez
+import argparse
+
+
+def collect_refseeker_files(input_path):
+	print("Collecting referenceseeker files ...")
+
+	refseeker_files = glob(os.path.join(input_path, '*', '*_referenceseeker.tsv'))
+
+	if len(refseeker_files) == 0:
+		print("ERROR: No referenceseeker files found in input directory.")
+		raise ValueError
+
+	return refseeker_files
+
+def find_best_reference(filepaths: list) -> str:
+
+	print("Finding best reference ...")
+
+	dfs = []
+
+	for filepath in filepaths:
+		refseeker_df = pd.read_csv(filepath, sep='\t')
+
+		refseeker_df.insert(0, 'sample_id', os.path.basename(filepath).split("_")[0])
+		refseeker_df['z_mash']    = - (refseeker_df['Mash Distance'] - refseeker_df['Mash Distance'].mean()) / refseeker_df['Mash Distance'].std()
+		refseeker_df['z_ani']     = (refseeker_df['ANI'] - refseeker_df['ANI'].mean()) / refseeker_df['ANI'].std()
+		refseeker_df['z_con_dna'] = (refseeker_df['Con. DNA'] - refseeker_df['Con. DNA'].mean()) / refseeker_df['Con. DNA'].std()
+		refseeker_df['z_total']   = refseeker_df['z_mash'] + refseeker_df['z_ani'] + refseeker_df['z_con_dna']
+		refseeker_df = refseeker_df.sort_values('z_total',ascending=False)
+		dfs.append(refseeker_df)
+
+	concat_df = pd.concat(dfs)
+
+	best_references = concat_df.groupby('#ID')['z_total'].sum().sort_values(ascending=False)
+
+	return best_references.index[0]
+
+def write_reference_file(assembly_id, outfile_name):
+
+	print("Searching for top reference assembly ...")
+
+	success = False
+	attempt = 1
+	while not success and attempt <= 3:
+		try: 
+
+			with Entrez.esearch(db='nucleotide', term=assembly_id, retmax=1, idtype='acc') as handle:
+				results = Entrez.read(handle)
+			success = True
+
+		except RuntimeError as e :
+			print(f"Encountered RuntimeEerror with Entrez esearch on attempt #{attempt}. Retrying ...")
+			attempt += 1
+
+	if not success:
+		raise ValueError
+
+	if "IdList" not in results or len(results['IdList']) == 0:
+		print("ERROR: No GenBank entries found") 
+		raise ValueError
+
+	if len(results['IdList']) > 1:
+		print("ERROR: Multiple GenBank entries found") 
+		raise ValueError
+
+	best_reference_accno = results['IdList'][0]
+
+
+	print("Fetching and writing FASTA reference file ...")
+
+	success = False
+	attempt = 1
+	while not success and attempt <= 3:
+		try:
+			with Entrez.efetch(db='nucleotide', id=best_reference_accno, rettype='fasta', retmode='text') as infile, open(outfile_name + '.fa', 'w') as outfile:
+				for line in infile.readlines():
+					outfile.write(line)
+
+			success = True
+		except RuntimeError as e:
+			print(f"Encountered RuntimeEerror with Entrez efetch on attempt #{attempt}. Retrying ...")
+			attempt += 1
+
+	if not success:
+		raise ValueError
+
+	print("Fetching and writing GenBank reference file ...")
+
+	success = False
+	attempt = 1
+	while not success and attempt <= 3:
+		try:
+			with Entrez.efetch(db='nucleotide', id=best_reference_accno, rettype='gb', retmode='text') as infile, open(outfile_name + '.gb', 'w') as outfile:
+				for line in infile.readlines():
+					outfile.write(line)
+
+			success = True
+		except RuntimeError as e:
+			print(f"Encountered RuntimeEerror with Entrez efetch on attempt #{attempt}. Retrying ...")
+			attempt += 1
+
+	if not success:
+		raise ValueError
+
+def get_args():
+	parser = argparse.ArgumentParser()
+	parser.add_argument('-i', '--input', required=True, help='Referenceseeker results path')
+	parser.add_argument('-o', '--outname', required=True, help='Prefix of output file')
+	return parser.parse_args()
+
+def main():
+
+	args = get_args()
+
+	refseeker_files = collect_refseeker_files(args.input)
+
+	best_reference_id = find_best_reference(refseeker_files)
+
+	write_reference_file(best_reference_id, args.outname)
+
+	print("Complete.")
+
+
+if __name__ == '__main__':
+	main()
+
diff --git a/environments/environment.yml b/environments/environment.yml
@@ -6,6 +6,9 @@ channels:
 dependencies:
   - python=3
   - csvkit=1.0.5
+  - pandas=2.1.3
+  - biopython=1.81
   - snippy=4.6.0
   - fastp=0.20.1
   - qualimap=2.2.2d
+  - samtools=1.20
diff --git a/main.nf b/main.nf
@@ -2,35 +2,26 @@
 
 nextflow.enable.dsl = 2
 
-include { fastp }          from './modules/snippy.nf'
-include { snippy }         from './modules/snippy.nf'
-include { count_variants } from './modules/snippy.nf'
-include { qualimap_bamqc } from './modules/snippy.nf'
+include { fastp }                     from './modules/snippy.nf'
+include { find_best_reference }          from './modules/snippy.nf'
+include { snippy }                    from './modules/snippy.nf'
+include { count_variants }            from './modules/snippy.nf'
+include { qualimap_bamqc }            from './modules/snippy.nf'
 
 workflow {
 
-
-
   // samplesheet input
   if (params.samplesheet_input != 'NO_FILE') {
-
-    // if --ref file provided, use this reference for all entries
     if (params.ref != 'NO_FILE'){
-      ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2']] }
-      ch_ref = ch_fastq.map{it -> it[0]}.combine(Channel.fromPath( "${params.ref}", type: 'file'))
-
-    // if no --ref file provided, look for REF column in the samplesheet on a per-sample basis
+      ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2']] }.unique{ it -> it[0] }
     } else {
-      ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2'], it['REF']] }
-      ch_ref = ch_fastq.map{it -> [it[0], it[3]]}
-      ch_fastq = ch_fastq.map{it -> it.subList(0,3)}
+      ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2'], it['REF']] }.unique{ it -> it[0] }
     }
 
   // fastq input 
   } else {
     if (params.ref != 'NO_FILE'){
       ch_fastq = Channel.fromFilePairs( params.fastq_search_path, flat: true ).map{ it -> [it[0].split('_')[0], it[1], it[2]] }.unique{ it -> it[0] }
-      ch_ref = ch_fastq.map{it -> it[0]}.combine(Channel.fromPath( "${params.ref}", type: 'file'))
     } else {
       error "ERROR: Reference file must be supplied with --ref when using --fastq_input"
     }
@@ -42,6 +33,21 @@ workflow {
   // ch_ref: [sample_id, reference_path]
 
   main:
+
+    if (params.ref != 'NO_FILE' && file(params.ref).isFile()){
+      println("Detected file as reference input. Using reference file for all samples ...")
+      ch_ref = ch_fastq.map{it -> it[0]}.combine(Channel.fromPath( "${params.ref}", type: 'file'))
+    }else if (params.ref != 'NO_FILE' && file(params.ref).isDirectory()){
+      println("Detected directory as reference input. Finding best reference based on referenceseeker results ...")
+      find_best_reference(Channel.fromPath(params.ref))
+      ch_ref = ch_fastq.map{it -> it[0]}.combine(find_best_reference.out.fasta)
+    }else if (params.ref == 'NO_FILE' && params.samplesheet_input != 'NO_FILE'){
+      ch_ref = ch_fastq.map{it -> [it[0], it[3]]}
+      ch_fastq = ch_fastq.map{it -> it.subList(0,3)}
+    }else {
+      error "ERROR: Invalid input."
+    }
+
     fastp(ch_fastq)
     snippy(fastp.out.reads.join(ch_ref))
     qualimap_bamqc(snippy.out.alignment)

diff --git a/modules/snippy.nf b/modules/snippy.nf
@@ -15,6 +15,7 @@ process fastp {
   script:
   """
   fastp \
+    --thread ${task.cpus} \
     -i ${reads_r1} \
     -I ${reads_r2} \
     --cut_tail \
@@ -26,6 +27,26 @@ process fastp {
 }
 
 
+process find_best_reference {
+
+    executor 'local'
+
+    publishDir "${params.outdir}", mode: 'copy', pattern: "reference.*"
+
+    input:
+    path(refseeker_results_path)
+
+    output:
+    path("reference.fa"),                                               emit: fasta
+    path("reference.gb"),                                               emit: gb
+
+    script:
+    """
+    find_best_reference.py --input ${refseeker_results_path} --outname reference
+    """
+
+}
+
 process snippy {
 
     tag { sample_id }