This repository has been archived by the owner on Feb 5, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPhyloFlash_16S_ID.sh
110 lines (88 loc) · 6.41 KB
/
PhyloFlash_16S_ID.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/bin/bash -l
# Slurm batch script: builds a phyloFlash reference database and runs phyloFlash
# on each paired-end sample. Edit the placeholder values below before submitting.
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
# NOTE(review): --mem-per-cpu is multiplied by --cpus-per-task, so this requests
# 4 x 500G = 2000G in total. If 500G total was intended, use --mem=500G -- confirm.
#SBATCH --mem-per-cpu=500G
#SBATCH --time=5-00:00:00 # 5 days
#SBATCH --output=your_output_file_date.stdout
#SBATCH --mail-user=your_email@example.com
#SBATCH --mail-type=ALL
#SBATCH --job-name="Job title and date here"
#SBATCH -p highmem
# you can use any of the following: intel, batch, highmem, gpu
# Change to your working directory for sanity's sake; stop here if it does not
# exist so nothing below runs in (and writes to) an unexpected location.
cd your/path/here || { echo "ERROR: cannot cd to working directory" >&2; exit 1; }
# Date stamp in month.day.year form (format string +%m.%d.%y), used to tag output directory names
today="$(date +%m.%d.%y)"
# Environment setup: drop miniconda2, then load the modules needed for phyloFlash
# (the programs recruited within phyloFlash), in the order listed.
module unload miniconda2
for mod in \
  miniconda3 \
  funannotate \
  perl \
  BBMap/38.86 \
  vsearch \
  SPAdes/3.14.1 \
  bedtools \
  mafft \
  barrnap \
  phyloFlash/3.3b4; do
  module load "$mod"
done
## Create the PhyloFlash database from locally downloaded databases (Silva, UniVec).
## For database creation, visit https://hrgv.github.io/phyloFlash/install.html and see step 4, "Setting up the reference database".
## * The database cannot be named "SILVA" because the db file has the same name -- the program uses that to ID the file, so the actual db name has to be different.
# Build only when the renamed database directory is not already present: on a
# re-run, `mv 138.1 138_1db` would otherwise move the fresh build *inside* the
# existing 138_1db directory instead of replacing it.
if [[ ! -d 138_1db ]]; then
  phyloFlash_makedb.pl --univec_file path/to/file --silva_file path/to/file.1_SSURef_NR99_tax_silva_trunc.fasta.gz \
    || { echo "ERROR: phyloFlash_makedb.pl failed; database was not built" >&2; exit 1; }
  # Rename the database so it's compatible with PhyloFlash and has a name that
  # sort of makes sense -- using Silva version 138.1.
  mv 138.1 138_1db \
    || { echo "ERROR: could not rename database directory 138.1 to 138_1db" >&2; exit 1; }
fi
## Run phyloFlash on every paired-end sample (*_R1_CLEANEST.fastq plus its R2 mate) found in $path.
## Each sample gets its own output directory, ${SAMPLE}_PhyloFlash_Output_${today}, inside $path2.
path=/path/to/where/sequences/are/stored # directory where the cleaned fastq files are stored
path2=${path}/PhyloFlash_Results # directory where results are stored
db_dir=${path2}/138_1db # formatted reference database handed to -dbhome

# Start from the results directory so the first sample's output lands beside
# all the others (every iteration returns here when it finishes).
mkdir -p "$path2" || { echo "ERROR: cannot create results directory $path2" >&2; exit 1; }
cd "$path2" || { echo "ERROR: cannot cd to results directory $path2" >&2; exit 1; }

# Fail once, up front, rather than once per sample when the database is missing.
[[ -d "$db_dir" ]] || { echo "ERROR: database directory $db_dir not found" >&2; exit 1; }

for read1 in "$path"/*_R1_CLEANEST.fastq; do
  # An unmatched glob is left as the literal pattern; skip it.
  [[ -e "$read1" ]] || continue

  # The mate file shares the name of the forward read, with R2 in place of R1.
  read2="${read1%_R1_CLEANEST.fastq}_R2_CLEANEST.fastq"
  if [[ ! -f "$read2" ]]; then
    echo "WARNING: reverse read file $read2 not found; skipping $read1" >&2
    continue
  fi

  ## Grab the sample name: drop everything up to and including the last "Pool-",
  ## then the trailing lane/read suffix -- adjust for your own sample file names.
  SAMPLE=${read1##*Pool-}
  SAMPLE=${SAMPLE%_L001_R1_CLEANEST.fastq}
  echo "${SAMPLE}" "Finding 16S regions in trimmed, raw metagenome reads"

  out_dir="${SAMPLE}_PhyloFlash_Output_${today}" # directory for this sample's results
  mkdir -p "$out_dir" || { echo "ERROR: cannot create $out_dir" >&2; exit 1; }
  # Abort if the cd fails: the rename below acts on every file in the current directory.
  cd "./$out_dir" || { echo "ERROR: cannot cd to $out_dir" >&2; exit 1; }

  # Use the CPU count granted by Slurm when available; otherwise fall back to 4.
  phyloFlash.pl -lib Lib_phyflash -readlength 150 -CPUs "${SLURM_CPUS_PER_TASK:-4}" -poscov -log \
    -dbhome "$db_dir" -read1 "$read1" -read2 "$read2" \
    || echo "WARNING: phyloFlash.pl exited with an error for sample ${SAMPLE}" >&2

  # Rename all files to include the sample name in the file name; must have the rename command installed.
  rename -v Lib_phyflash "${SAMPLE}_lib" *

  cd "$path2" || { echo "ERROR: cannot return to $path2" >&2; exit 1; }
done
## PhyloFlash Notes
# -log - writes status messages to log file
# -dbhome DIR Directory containing phyloFlash reference databases, prepared with phyloFlash_makedb.pl. If not specified, phyloFlash will check for an environment variable $PHYLOFLASH_DBHOME, then look in the current directory, the home directory, and the directory where the phyloFlash.pl script is located, for a suitable database directory containing the necessary files.
# - If you think phyloFlash is not detecting a certain organism that is very distant from the known SSU rRNA sequences please try lowering the minimum sequence identity for a mapping hit by using e.g. -id 0.63
# The name of the Fasta file should begin with SILVA_{DBNAME}_ where DBNAME is the name of the database (e.g. CustomDB), and will also be the name of the output folder containing the formatted database files
## PhyloFlash Output File Descriptions
#Report files
#These are the main human-readable output from phyloFlash.
#
#LIBNAME.phyloFlash.html phyloFlash report file in HTML format, with a report on the taxonomic composition of SSU rRNA reads, quality metrics for the library, and affiliation of the reconstructed/assembled full-length sequences.
#LIBNAME.phyloFlash plain text file version of the report
#Unassembled sequence files
#Reads that map to the reference database are extracted to these files in Fastq format
#
#LIBNAME.test_F.fq.gz.SSU.1.fq the filtered SSU reads and their paired read, forward read file
#LIBNAME.test_F.fq.gz.SSU.2.fq the filtered SSU reads and their paired read, reverse read file
#Assembled/reconstructed sequence files
#Assembled or reconstructed full-length SSU rRNA reads are output unless the -skip_spades or -skip_emirge options are used.
#
#LIBNAME.spades_rRNAs.final.fasta assembled OTUs from SPAdes with phyloFlash simplified headers
#
#LIBNAME.emirge.final.phyloFlash.notmatched.fa a fasta file with the reconstructed SSU sequences with no significant hit in the provided SSU database
#LIBNAME.emirge.final.fa a fasta file with the Emirge reconstructed SSU OTUs
#LIBNAME.emirge.final.phyloFlash.dbhits.fa a fasta file with the best hits for the reconstructed SSU sequences in the provided SSU database
#
#LIBNAME.all.final.fasta All assembled and reconstructed sequences from SPAdes and/or EMIRGE in a single file
#LIBNAME.all.final.phyloFlash.dbhits.fa
#LIBNAME.all.final.phyloFlash.notmatched.fa
#
#LIBNAME.all.dbhits.NR97.fa Reference sequences from database with hits from the supplied reads, clustered at 97% identity
#Alignments
#LIBNAME.SSU.collection.alignment.fasta an aligned multifasta of all the predicted OTUs and the references
#LIBNAME.SSU.collection.fasta a multifasta of all the predicted OTUs and the references
#LIBNAME.SSU.collection.fasta.tree an NJ tree of the mafft alignment of all the predicted OTUs and the references. PDF and PNG versions are created for the HTML report if the -html option is set
#Other statistics
#LIBNAME.inserthistogram Histogram of detected insert sizes in tab-separated format, if paired-end reads were input. PDF and PNG versions are created for the HTML report if the -html option is set
#LIBNAME.idhistogram Histogram of the % identity of reads vs. reference database sequences, in tab-separated format. PDF and PNG versions are created for the HTML report if the -html option is set
#LIBNAME.phyloFlash.NTUabundance.csv the list of unique higher level taxa (e.g. orders for bacteria) in the order of their appearance
#LIBNAME.scaffolds.arch.gff 16S rRNA gene predictions for assembled OTUs based on archaeal SSU rRNA hmm profile
#LIBNAME.scaffolds.bac.gff 16S rRNA gene predictions for assembled OTUs based on bacterial SSU rRNA hmm profile
#LIBNAME.scaffolds.euk.gff 18S rRNA gene predictions for assembled OTUs based on eukaryote SSU rRNA hmm profile
#CSV files used for multiple-sample comparison *** aka what to import into R!
#These files are used by the phyloFlash_heatmap.R script if you wish to compare multiple samples by their taxonomic composition.
#LIBNAME.phyloFlash.NTUabundance.csv
#LIBNAME.phyloFlash.report.csv