-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathseparate_genes.py
executable file
·159 lines (126 loc) · 5.52 KB
/
separate_genes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Script to separate genes from MiSeq run fastQ files (NEXTERA style)
AFTER the reads have been trimmed for the tail and adapter (see trim_adaptors.py)
Trim forward and reverse primers.
It will produce one file for each gene (4 here - 16S, 18S, UPA, tufA)
Created on Mon Jul 28 17:27:33 2014
@author: VanessaRM
"""
""" USAGE:
python speratae_genes.py my_fastq_file.fastq
"""
from Bio import SeqIO
import regex
import sys
# Help text
if len(sys.argv) == 1:
print ""
print "Script to separate genes and trim primers from a fastQ file."
print "Allows one mismatch in the primer sequence"
print ""
print "Usage: supply the file name"
print "ex: python speratae_genes.py my_fastq_file.fastq"
print ""
print "If you want to run it for multiple files, use the shell:"
print "for file in *_R1_001.fastq; do python separate_genes.py $file; done >> screen.out 2>> screen.err &"
print ""
sys.exit()
# input files and arguments:
input_file = str(sys.argv[1])
output_file_16S = input_file + "_16S.fastq"
output_file_18S = input_file + "_18S.fastq"
output_file_UPA = input_file + "_UPA.fastq"
output_file_tufA = input_file + "_tufA.fastq"
output_file_umatched = input_file + "_unmatched.fastq"
# Globals
finder = ""
# Create lists to store seqs
#all_seqs =[] #just to see if the code behaves well
s16_list =[]
s18_list = []
UPA_list=[]
tufA_list =[]
unmatched=[]
# Forward Primers (\w = ambiguous positions, only 12 "internal" bases used), allowing for 1 mismatch.
p16S_f = '(GTGCCAGC\wGCCGCGGTAA){e<=1}' #F515
p18S_f = '(GGTGGTGCATGGCCGTTCTTAGTT){e<=1}' #NF1
pUPA_f = '(GGACAGAAAGACCCTATGAA){e<=1}' #p23SrV_f1
ptufA_f = '(AC\wGG\wCG\wGG\wAC\wGT){e<=1}' #Oq_tuf2F
#Reverse primers(their reverse complement):
p16S_r = '(ATTAGA\wACCC\w\wGTAGTCC){e<=1}' #R806 rc ATTAGAWACCCBDGTAGTCC
p18S_r = '(ATTACGTCCCTGCCCTTTGTA){e<=1}' # B18Sr2B rc
pUPA_r = '(CTCTAGGGATAACAGGCTGA){e<=1}' #p23SrV_r1 rc
ptufA_r = '(GCG\wTT\wGC\wATTCG\wGAAGG){e<=1}' #tufAR rc
#define primer_finder function
def primer_finder (records, primer_f1, primer_r1, primer_f2, primer_r2, primer_f3, primer_r3, primer_f4, primer_r4):
"Trims the primers and saves the sequences in amplicon-separated files. Put primers in order: 16S, 18S, UPA, tufA"
for record in records:
sequence = str(record.seq)
#Initial values of cut_off - in case you don't need to trim anything
cut_off_f= 0
cut_off_r = len(sequence)
#Search the primers f and r
index_f = regex.search((primer_f1), sequence)
index_r = regex.search((primer_r1), sequence)
if index_f != None:
#found the forward primer, so define where the sequence needs to be trimmed
cut_off_f = int(index_f.span()[1])
if index_r != None:
#found the reverse primer
cut_off_r = int(index_r.span()[0]+1) #the +1 is to cut just when the primer starts.
#Store the trimmed seq
if index_f or index_r != None:
s16_list.append(record [cut_off_f:cut_off_r])
else: #search for next primer
index_f = regex.search((primer_f2), sequence)
index_r = regex.search((primer_r2), sequence)
if index_f != None:
cut_off_f = int(index_f.span()[1])
if index_r != None:
cut_off_r = int(index_r.span()[0]+1)
if index_f or index_r != None:
s18_list.append(record [cut_off_f:cut_off_r])
else:
index_f = regex.search((primer_f3), sequence)
index_r = regex.search((primer_r3), sequence)
if index_f != None:
cut_off_f = int(index_f.span()[1])
if index_r != None:
cut_off_r = int(index_r.span()[0]+1)
if index_f or index_r != None:
UPA_list.append(record [cut_off_f:cut_off_r])
else:
index_f = regex.search((primer_f4), sequence)
index_r = regex.search((primer_r4), sequence)
if index_f != None:
cut_off_f = int(index_f.span()[1])
if index_r != None:
cut_off_r = int(index_r.span()[0]+1)
if index_f or index_r != None:
tufA_list.append(record [cut_off_f:cut_off_r])
else:
unmatched.append(record)
#Iterate over fastq file
print ("separating genes... It can take a while...")
original_reads = SeqIO.parse(input_file, "fastq")
do_it = primer_finder(original_reads,p16S_f,p16S_r,p18S_f,p18S_r,pUPA_f,pUPA_r,ptufA_f,ptufA_r)
count_um = SeqIO.write(unmatched, output_file_umatched, "fastq")
print ""
print "%i sequences did not match the primers and were stored in the %s file." %(count_um, output_file_umatched)
print ""
count_16S = SeqIO.write(s16_list, output_file_16S, "fastq")
print"",
print "Saved %i reads in the %s file." %(count_16S, output_file_16S)
count_18S = SeqIO.write(s18_list, output_file_18S, "fastq")
print""
print "Saved %i reads in the %s file." %(count_18S, output_file_18S)
count_UPA = SeqIO.write(UPA_list, output_file_UPA, "fastq")
print""
print "Saved %i reads in the %s file." %(count_UPA, output_file_UPA)
count_tufA = SeqIO.write(tufA_list, output_file_tufA, "fastq")
print""
print "Saved %i reads in the %s file." %(count_tufA, output_file_tufA)
print""
print "Done!"