-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathunmerged.py
executable file
·163 lines (141 loc) · 6.37 KB
/
unmerged.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
"""
**unmergerged.py
** Copyright (C) 2018 Jose Sergio Hleap
Given a pairend sequencing, if the amplicon is too long for the reads,
blast each one separately and report the matches nhgc
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
E-mail: [email protected]
"""
from io import StringIO
import pandas as pd
from multiprocessing import Pool
from joblib import Parallel, delayed
from subprocess import Popen, PIPE
import optparse
from itertools import zip_longest
import gzip
import csv
import os
names = 'qseqid sseqid pident evalue qcovs qlen length staxid'.split()
def iterzip(a, b, kwargs=None):
"""
zip two iterators (assumes the same length)
:param a: iterator one
:param b: iterator 2
:return: tuple of elements
"""
for e_a in a:
e_b = next(b)
assert e_a['id'].strip().split()[0] == e_b['id'].strip().split()[0]
yield e_a, e_b, kwargs
def grouper(iterable, n, fillvalue=None):
"""
Collect data into fixed-length chunks or blocks
From itertools recipe
:param iterable: container to be iterate over
:param n: Number of items to return
:param fillvalue: Padding
:return: zip_longest iterator
"""
args = [iter(iterable)] * n
return zip_longest(*args, fillvalue=fillvalue)
def read_fastq(fn):
"""
Iterator over fastq files
:param fn: filename
:return: dictionary
"""
fnc = gzip.open if 'gz' in fn else open
fields = ['id', 'seq', 'plus', 'qual']
with fnc(fn) as f:
for lines in grouper(f, 4, ''):
assert len(lines) == 4
yield dict(zip(fields,lines))
def blast_helper(idx, r):
r1, r2, kwargs = r
db, evalue, p_id, tgts = kwargs
blast_line = ['blastn', '-db', db, '-evalue', str(evalue),
'-perc_identity', str(p_id), '-max_target_seqs', str(tgts),
'-outfmt', '6 qseqid sseqid pident evalue qcovs qlen length '
'staxid']
seq1 = '>read%d\n%s' % (idx, r1['seq'].decode('utf-8').strip())
seq2 = '>read%d\n%s' % (idx, r2['seq'].decode('utf-8').strip())
mapreads = '%s\t%s\tread%d' % (r1['id'].strip(), r2['id'].strip(), idx)
# run blasts
p1 = Popen(blast_line, stdin=PIPE, stdout=PIPE, stderr=PIPE,
env=os.environ.copy(), encoding='utf8', cwd=os.getcwd())
o1, e1 = p1.communicate(r1['seq'].decode('utf-8').strip())
p2 = Popen(blast_line, stdin=PIPE, stdout=PIPE, stderr=PIPE,
env=os.environ.copy(), encoding='utf8', cwd=os.getcwd())
o2, e2 = p2.communicate(r2['seq'].decode('utf-8').strip())
# read outputs
df1 = pd.read_table(StringIO(o1), sep='\t', header=None, names=names,
comment='#', quoting=csv.QUOTE_NONE, encoding='utf-8')
df2 = pd.read_table(StringIO(o2), sep='\t', header=None, names=names,
comment='#', quoting=csv.QUOTE_NONE, encoding='utf-8')
# check taxids
b = df2.staxid.isin(df1.staxid)
c = df1.staxid.isin(df2.staxid)
if b.any():
concordant = df1[c].merge(df2[b], on=['qseqid', 'staxid'],
suffixes=('_R1', '_R2'),)
discordant = None
c_appended = '>seq%s_appended\n%s%s' % (idx, r1['seq'].decode(
'utf-8').strip(), r2['seq'].decode('utf-8').strip())
d_appended = None
else:
df1['qseqid'] = df1.qseqid.apply(lambda x: '%s_R1' % x)
df2['qseqid'] = df2.qseqid.apply(lambda x: '%s_R2' % x)
discordant = pd.concat([df1, df2])
concordant = None
c_appended = None
d_appended = (seq1, seq2)
return concordant, discordant, mapreads, c_appended, d_appended
def blast_pair(prefix, db, evalue=1E-10, p_id=95, tgts=50, cpus=-1):
cpus = os.cpu_count() if cpus == -1 else cpus
fwd = '%s_R1.fastq.gz' % prefix
rev = '%s_R2.fastq.gz' % prefix
kwargs = (db, evalue, p_id, tgts)
it = iterzip(read_fastq(fwd), read_fastq(rev), kwargs)
with Pool(processes=cpus) as pool:
out = pool.starmap(blast_helper, enumerate(it))
#out = Parallel(n_jobs=cpus)(delayed(blast_helper)(
# r[0], r[1], idx, db, evalue, p_id, tgts) for idx, r in enumerate(it))
concordant, discordant, mapreads, c_appended, d_appended = zip(*out)
pd.concat(concordant).to_csv('%s_concordantblast.tsv' % prefix, sep='\t',
index=False)
pd.concat(concordant).to_csv('%s_discordantblast.tsv' % prefix, sep='\t',
index=False)
fwd, rev = zip(*[x for x in d_appended if x is not None])
with open('%s_mapreads.txt' % prefix, 'w') as m, \
open('%s_appended.fasta' % prefix, 'w') as a, \
open('%s_discordant_R1.fasta' % prefix, 'w') as r1, \
open('%s_discordant_R2.fasta' % prefix, 'w') as r2:
m.write('R1\tR2\tread\n%s' % '\n'.join(mapreads))
a.write('\n'.join([x for x in c_appended if x is not None]))
r1.write('\n'.join(fwd))
r2.write('\n'.join(rev))
if __name__ == '__main__':
intro = '''python %prog fastq_prefix db [options]'''
opts = optparse.OptionParser(usage='%s' % intro)
opts.add_option('--pident', '-p', action='store', type=float, default=95,
help='Minimum percent identity [default: %default]')
opts.add_option('--eval', '-e', action='store', type=float, default=1E-10,
help='Maximum evalue [default: %default]')
opts.add_option('--max_target_seqs', '-m', action='store', type=int,
default=50, help='Maximum number of targets '
'[default: %default]')
opts.add_option('--threads', '-t', action='store', type=int,
default=-1, help='threads to use [default: %default]')
opt, arg = opts.parse_args()
blast_pair(arg[0], arg[1], evalue=opt.eval, p_id=opt.pident,
tgts=opt.max_target_seqs, cpus=opt.threads)