barcount.py
import sys
import argparse
from warnings import warn
import pandas as ps
import numpy as np
from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid

def levenshtein_distance(s1, s2):
    """
    Pure-Python Levenshtein distance for compatibility. The third-party
    python-Levenshtein library is much faster and recommended. Adapted from
    the recipe at:
    http://code.activestate.com/recipes/576874-levenshtein-distance/
    """
    l1 = len(s1)
    l2 = len(s2)
    matrix = [list(range(l1 + 1))] * (l2 + 1)
    for zz in range(l2 + 1):
        matrix[zz] = list(range(zz, zz + l1 + 1))
    for zz in range(0, l2):
        for sz in range(0, l1):
            if s1[sz] == s2[zz]:
                matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1,
                                             matrix[zz][sz + 1] + 1, matrix[zz][sz])
            else:
                matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1,
                                             matrix[zz][sz + 1] + 1, matrix[zz][sz] + 1)
    return matrix[l2][l1]


try:
    import Levenshtein
    levenshtein_distance = Levenshtein.distance
except ImportError:
    warn("python-Levenshtein package not found. The package is " +
         "recommended as it makes barcode splitting much faster. " +
         "Using the native Python Levenshtein distance function instead.")


def hamming2(s1, s2):
    """Calculate the Hamming distance between two equal-length strings"""
    assert len(s1) == len(s2)
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))
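
# Illustrative sanity checks for the two distance helpers (a minimal sketch;
# the example strings are made up and not part of the original tool):
#
#     >>> levenshtein_distance('kitten', 'sitting')
#     3
#     >>> hamming2('ACGT', 'ACGA')
#     1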

def run_analysis(barcode_table, out, fastq, flanking_left, flanking_right,
                 min_read_length=None, max_read_length=None, rmdup=False,
                 max_distance_flanks=0, max_distance_barcode=1, verbose=False,
                 debug=False, seqfile_format='fastq', save_extracted_barcodes=False,
                 umiA_start=None, umiA_end=None, umiB_start=None, umiB_end=None):
    # Read the table with gene-barcode matchings
    barcode_data = ps.read_csv(barcode_table, header=None, names=['gene',], index_col=1)
    barcode_data.loc['unassigned'] = np.nan
    if len(barcode_data.index) != len(list(set(barcode_data.index))):
        raise IOError('Duplicated entry in barcode table or multiple barcodes map to the same gene.')
    if len(barcode_data.columns) != 1:
        raise IOError('Error reading barcode table. Make sure the file is comma-separated and has exactly two columns: gene id, barcode')
    if verbose: print('Read barcode table containing %i barcode-gene pairs' % len(barcode_data.index))
    barcodes = barcode_data.index.tolist()
    # Set up output stats
    stats = ps.Series()
    stats['fastq_entries'] = 0
    stats['failed_findBarcode'] = 0
    stats['unmatched'] = 0
    stats['successfully_matched'] = 0
    force_read_length = bool(min_read_length or max_read_length)
    if force_read_length:
        stats['failed_minReadLength'] = 0
        stats['failed_maxReadLength'] = 0
    barcode_data['count_postFilters'] = 0
    if rmdup:
        barcode_data['failed_rmdup'] = 0
    if umiA_start:
        barcode_data['failed_sameUMI'] = 0
    # Set up rmdup and UMI caches
    rmdup_cache = []
    umi_cache = {bc: [] for bc in barcode_data.index}
    # Matching quality
    matchingqs = {bc: [] for bc in barcode_data.index}
    barcodelengths = {bc: [] for bc in barcode_data.index}
    # Open files
    read_inf = open(fastq, 'r')
    unmatched_fastq = open(out+'_unmatched.fastq', 'w')
    no_barcode_fastq = open(out+'_noBarcode.fastq', 'w')
    if save_extracted_barcodes:
        barcodes_fastq = open(out+'_barcodes.fastq', 'w')
    if debug:
        debug_file = open(out+'_debug.csv', 'w')
        debug_log = ['fastq_entry',  # 0
                     'fastq_header',  # 1
                     'sequence',  # 2
                     'sequence_length',  # 3
                     'forceReadLengthFilter',  # 4
                     'left_matchingq',  # 5
                     'left_start',  # 6
                     'left_end',  # 7
                     'right_matchingq',  # 8
                     'right_start',  # 9
                     'right_end',  # 10
                     'barcode',  # 11
                     'found_gene_match',  # 12
                     'matched_gene',  # 13
                     'matching_quality',  # 14
                     'checksum',  # 15
                     'rmdupFilter',  # 16
                     'umiA',  # 17
                     'umiB',  # 18
                     'sameUMIFilter',  # 19
                     ]
    ### MAIN LOOP ###
    seq_count = 0
    for r in SeqIO.parse(read_inf, seqfile_format):
        # Write the debug line from the last iteration and reset
        if debug: debug_file.write(','.join(map(str, debug_log))+'\n')
        debug_log = ['NA']*20
        seq_count += 1
        if verbose:
            if seq_count % 5000 == 0:
                print('Processed %i reads' % seq_count)
        debug_log[0] = str(seq_count)
        debug_log[1] = str(r.description)
        debug_log[2] = str(r.seq)
        debug_log[3] = len(str(r.seq))
        # Apply the read length filter, move on to the next sequence if it fails
        if force_read_length:
            if (min_read_length and (len(r.seq) < min_read_length)):
                stats['failed_minReadLength'] += 1
                debug_log[4] = 'FAIL'
                continue
            elif max_read_length and (len(r.seq) > max_read_length):
                stats['failed_maxReadLength'] += 1
                debug_log[4] = 'FAIL'
                continue
            else:
                debug_log[4] = 'PASS'
        # Try to find the barcode
        barcode, debug_log[5:11] = find_region(str(r.seq), flanking_left, flanking_right, max_distance=max_distance_flanks)
        debug_log[11] = barcode
        if barcode == 'FAIL':
            stats['failed_findBarcode'] += 1
            SeqIO.write([r,], no_barcode_fastq, 'fastq')
            continue
        if save_extracted_barcodes:
            SeqIO.write([r[debug_log[7]:debug_log[9]],], barcodes_fastq, 'fastq')
        # Match barcode to gene
        matched_barcode, matchq = match_barcode(barcode, barcodes, max_distance_barcode)
        if matched_barcode == 'unassigned':
            SeqIO.write([r,], unmatched_fastq, 'fastq')
            stats['unmatched'] += 1
            debug_log[12] = 'FAIL'
            continue
        else:
            gene = barcode_data.loc[matched_barcode, 'gene']
            matchingqs[matched_barcode].append(matchq)
            debug_log[12] = 'PASS'
            debug_log[13] = gene
            debug_log[14] = str(matchq)
        stats['successfully_matched'] += 1
        barcodelengths[matched_barcode].append(len(barcode))
        # If the rmdup filter is active, check if we have seen this sequence before
        if rmdup:
            checksum = seguid(r.seq)
            debug_log[15] = checksum
            if checksum in rmdup_cache:
                debug_log[16] = 'FAIL'
                barcode_data.loc[matched_barcode, 'failed_rmdup'] += 1
                continue
            else:
                debug_log[16] = 'PASS'
                rmdup_cache.append(checksum)
        # If the UMI filter is active, check if we have seen this UMI before
        if umiA_start:
            umiA = str(r.seq)[umiA_start:umiA_end]
            debug_log[17] = umiA
            if umiB_start:
                umiB = str(r.seq)[umiB_start:umiB_end]
            else:
                umiB = ''
            debug_log[18] = umiB
            umiA += umiB
            if umiA in umi_cache[matched_barcode]:
                debug_log[19] = 'FAIL'
                barcode_data.loc[matched_barcode, 'failed_sameUMI'] += 1
                continue
            else:
                debug_log[19] = 'PASS'
                umi_cache[matched_barcode].append(umiA)
        # If the loop has not been continued yet, count this as a true hit
        barcode_data.loc[matched_barcode, 'count_postFilters'] += 1
    # Write the debug line for the final iteration
    if debug: debug_file.write(','.join(map(str, debug_log))+'\n')
    # Close all files
    read_inf.close()
    unmatched_fastq.close()
    no_barcode_fastq.close()
    if save_extracted_barcodes:
        barcodes_fastq.close()
    if debug: debug_file.close()
    if verbose: print('Processed fastq file successfully. Generating output files.')
    # Export data
    barcode_data['mean_matching_distance'] = [np.mean(matchingqs[k]) if len(matchingqs[k]) > 0 else np.nan for k in barcode_data.index]
    barcode_data['mean_barcode_lengths'] = [np.mean(barcodelengths[k]) if len(barcodelengths[k]) > 0 else np.nan for k in barcode_data.index]
    barcode_data = barcode_data.sort_values('count_postFilters', ascending=False)
    barcode_data.to_csv(out+'.csv')
    stats['fastq_entries'] = seq_count
    stats.to_csv(out + '_stats.csv')
    if verbose: print('Done.')
    return barcode_data, stats
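
# Illustrative programmatic use of run_analysis (a sketch only; the file names
# and flanking sequences below are hypothetical placeholders):
#
#     counts, stats = run_analysis('barcodes.csv', 'results', 'reads.fastq',
#                                  'GATC', 'CCGG', max_distance_flanks=1,
#                                  max_distance_barcode=1, rmdup=True)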

def find_region(rs, fleft, fright, max_distance=0):
    log = ['NA']*6  # left_matchingq, left_start, left_end, right_matchingq, right_start, right_end
    log[:3] = search_substring(rs, fleft, max_distance)
    log[3:] = search_substring(rs, fright, max_distance)
    try:
        return rs[log[2]:log[4]], log
    except Exception:
        return 'FAIL', log
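
# Illustrative example of flank-based barcode extraction (a sketch; the read
# and flank sequences are made up for demonstration):
#
#     >>> find_region('TTTGATCACGTACCGGTTT', 'GATC', 'CCGG')
#     ('ACGTA', [0, 3, 7, 0, 12, 16])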

def search_substring(rss, subrss, max_distance=0):
    '''This function finds a substring in a string using a custom, greedy
    search algorithm which returns as soon as a match is found, without
    necessarily exploring the entire search space. Barcode reads will in >80%
    of cases contain exact matches, so these are looked for first. If the
    distance is set to 1, substrings with a single base substitution are
    looked for next. Indels are considered last. This means the behaviour for
    multiple matches is slightly inconsistent: if there are multiple exact
    matches, the first one is returned; if there are multiple matches with one
    SNP, the last one is returned; if there are multiple matches with one
    indel, the first one is returned.'''
    assert max_distance in [0, 1]
    lowest = [100, None, None]  # initialise: matchingq, start, end
    # Start with substrings of the same length. If there is an exact match, return immediately
    for startpos in range(len(rss)-len(subrss)-1):
        temp_substring = rss[startpos:startpos+len(subrss)]
        temp_distance = hamming2(temp_substring, subrss)  # Hamming distance is much quicker than Levenshtein, and in 90% of cases this is all that's needed
        if temp_distance < lowest[0]:
            lowest = [temp_distance, startpos, startpos+len(subrss)]
        if lowest[0] == 0:
            return lowest
    # If there was a hit with 1 mismatch, return that one, i.e. this algorithm favours SNPs over indels
    if max_distance == 1:
        if lowest[0] == 1:
            return lowest
    # If no match was found yet, look at substrings with 1 length difference. Return immediately when there is a hit
    if max_distance == 1:
        # Start with substrings with one insertion
        for startpos in range(len(rss)-len(subrss)-2):
            temp_substring = rss[startpos:startpos+len(subrss)+1]
            temp_distance = levenshtein_distance(temp_substring, subrss)
            if temp_distance < lowest[0]:
                lowest = [temp_distance, startpos, startpos+len(subrss)+1]  # Strictly, keeping track of the lowest is not necessary here (the loop returns immediately on a hit), but it does no harm
            if lowest[0] == 1:
                return lowest
        # Next, substrings with one deletion
        for startpos in range(len(rss)-len(subrss)):
            temp_substring = rss[startpos:startpos+len(subrss)-1]
            temp_distance = levenshtein_distance(temp_substring, subrss)
            if temp_distance < lowest[0]:
                lowest = [temp_distance, startpos, startpos+len(subrss)-1]
            if lowest[0] == 1:
                return lowest
    # If no match was found, return 'FAIL'
    return ['FAIL', 'NA', 'NA']
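
# Illustrative behaviour of the greedy search (a sketch; the sequences are
# made up): an exact hit is returned with distance 0, and with max_distance=1
# a single-mismatch hit is accepted.
#
#     >>> search_substring('AAACGTAAA', 'CGT')
#     [0, 3, 6]
#     >>> search_substring('AAACTTAAA', 'CGT', max_distance=1)
#     [1, 3, 6]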

def match_barcode(barcode, barcode_list, max_distance):
    """Return the best matching barcode and the matching quality"""
    if barcode in barcode_list:
        return barcode, 0
    else:
        # find the Levenshtein distance to each barcode
        distances = [levenshtein_distance(barcode, b) for b in barcode_list]
        best = min(distances)
        # check if there is a tie or the distance is too great
        if best > max_distance or distances.count(best) > 1:
            return 'unassigned', best
        else:
            # otherwise, return the unique best match
            return barcode_list[distances.index(best)], best
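
# Illustrative example (hypothetical barcodes): an unambiguous one-mismatch
# hit is assigned, while a tie is reported as 'unassigned'.
#
#     >>> match_barcode('ACGTT', ['ACGTA', 'TTTTT'], max_distance=1)
#     ('ACGTA', 1)
#     >>> match_barcode('AAAA', ['AAAT', 'AAAC'], max_distance=1)
#     ('unassigned', 1)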

def print_dict_long(indict):
    outstr = ''
    for k, v in indict.items():
        outstr += str(k) + ': ' + str(v) + '\n'
    return outstr

if __name__ == '__main__':
    # Set up parsing of command line arguments with argparse
    parser = argparse.ArgumentParser(description='Welcome to barcount v0.9. Count barcodes in fastq files and match them to genes. Written by [email protected] and maintained at https://github.com/Bahler-Lab/barcount.')
    parser.add_argument('--barcode_table', type=str, required=True, help='Path to the barcode database. This is a comma-separated file with one gene id and its barcode per line, without additional whitespace. There must be no header.')
    parser.add_argument('--min_read_length', type=int, help='Minimum read length.')
    parser.add_argument('--max_read_length', type=int, help='Maximum read length.')
    parser.add_argument('--rmdup', default=False, action='store_true', help='If set, consider only the first occurrence of the entire read in the fastq file.')
    parser.add_argument('--flanking_left', type=str, required=True, help='Flanking sequence of the barcode region on the left.')
    parser.add_argument('--flanking_right', type=str, required=True, help='Flanking sequence of the barcode region on the right.')
    parser.add_argument('--max_distance_flanks', type=int, choices=[0, 1], help='Number of single-letter permutations (Levenshtein distance) to allow when matching the flanking sequences. The default is 0. The maximum is 1.', default=0)
    parser.add_argument('--max_distance_barcode', type=int, help='Number of single-letter permutations (Levenshtein distance) to allow between barcodes. The default is 1.', default=1)
    parser.add_argument('--out', type=str, required=True, help='Output files will be saved under this path/name. This includes the count table, qc plots and additional stats (required).')
    parser.add_argument('--verbose', default=False, action='store_true', help='If set, use verbose output.')
    parser.add_argument('--debug', default=False, action='store_true', help='If set, write an additional per-read debug csv file.')
    parser.add_argument('--seqfile_format', default='fastq', help='Format of the sequence file (fastq by default). This will be passed on to Biopython SeqIO.')
    parser.add_argument('--save_extracted_barcodes', default=False, action='store_true', help='If set, save an additional fastq file with the extracted barcodes.')
    parser.add_argument('--fastq', type=str, required=True, help='Path to the fastq file to analyse.')
    # UMI options
    parser.add_argument('--umiA_position', type=str, help='Find the first UMI by position in the read. The position must be in the format start:stop following python-like indexing, which is 0-based and does not include the stop position in the slice. Prefilter the read file for the correct read length or use with --max_read_length or --min_read_length.')
    parser.add_argument('--umiB_position', type=str, help='Find the second UMI by position in the read. The position must be in the format start:stop following python-like indexing, which is 0-based and does not include the stop position in the slice. Prefilter the read file for the correct read length or use with --max_read_length or --min_read_length.')
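
    # Illustrative command line (a sketch; the file names, flanking sequences
    # and UMI positions are hypothetical placeholders):
    #
    #     python barcount.py --barcode_table barcodes.csv --fastq reads.fastq \
    #         --flanking_left GATC --flanking_right CCGG --out results \
    #         --max_distance_flanks 1 --max_distance_barcode 1 \
    #         --umiA_position 0:8 --verbose
    #
    # where barcodes.csv contains lines such as 'gene1,ACGTACGTACGT'.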
    args = parser.parse_args()
    if args.verbose: print('Welcome to barcount v0.92 (20.6.2020). Count barcodes in fastq files and match them to genes. Written by [email protected] and maintained at https://github.com/Bahler-Lab/barcount.')
    if args.verbose: print('Command line call: ' + ' '.join(sys.argv))
    if args.umiA_position:
        try:
            umiA_start = int(args.umiA_position.split(':')[0])
            umiA_end = int(args.umiA_position.split(':')[1])
        except Exception:
            raise ValueError('umiA position is not specified correctly. The position must be in the format start:stop following python-like indexing, which is 0-based and does not include the stop position in the slice. Argument received: %s' % args.umiA_position)
    else:
        umiA_start = None
        umiA_end = None
    if args.umiB_position:
        try:
            umiB_start = int(args.umiB_position.split(':')[0])
            umiB_end = int(args.umiB_position.split(':')[1])
        except Exception:
            raise ValueError('umiB position is not specified correctly. The position must be in the format start:stop following python-like indexing, which is 0-based and does not include the stop position in the slice. Argument received: %s' % args.umiB_position)
    else:
        umiB_start = None
        umiB_end = None
    if umiA_start or umiB_start:
        warn('You have set options to find UMIs by position. This is only recommended if you first make sure that reads are of the correct length; it is your responsibility to set min_read_length and max_read_length accordingly.')
    if args.umiB_position and not args.umiA_position:
        raise ValueError('umiB positions are specified but not umiA positions. If your reads contain only one UMI, please consider this umiA.')
    if args.verbose: print('Running barcount with the following arguments:\n' + print_dict_long(vars(args)))
    # Run analysis
    _, _ = run_analysis(args.barcode_table, args.out, args.fastq, args.flanking_left, args.flanking_right,
                        min_read_length=args.min_read_length, max_read_length=args.max_read_length,
                        rmdup=args.rmdup, max_distance_flanks=args.max_distance_flanks,
                        max_distance_barcode=args.max_distance_barcode, verbose=args.verbose,
                        debug=args.debug, seqfile_format=args.seqfile_format,
                        save_extracted_barcodes=args.save_extracted_barcodes,
                        umiA_start=umiA_start, umiA_end=umiA_end, umiB_start=umiB_start, umiB_end=umiB_end)