-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathpacbio-util
executable file
·460 lines (407 loc) · 18 KB
/
pacbio-util
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
#!/usr/bin/env perl
# Copyright (c) 2015 Douglas G. Scofield, Uppsala University
#
# No warranty is implied or assumed by this code. Covered by
# the Gnu General Public License v2. See LICENSE.
#
# Please send bugs, suggestions etc.
#
# Note: samtools >= 0.1.19 required for its --ff flag
#
# Changelog:
#
# 0.2 : added check of samtools version (20151201)
# initial version unversioned
use constant VERSION => '0.2';
use strict;
use warnings;
# base modules
use POSIX;
use Getopt::Long;
# BioPerl modules
use Bio::Seq;
use Bio::SeqIO;
my $o_verbose = 0;
sub print_usage_and_exit($$);
sub merge_pileup_columns($);
sub indel_targets();
sub indel_apply();
################
# main loop
my %command_values = ( "indel-targets" => "indel-targets",
"indel-apply" => "indel-apply");
my $command = shift @ARGV;
if (not defined $command or not defined $command_values{$command}) {
print STDERR "$0 indel-targets ...\n";
print STDERR "$0 indel-apply ...\n";
print STDERR "\nVersion: ".VERSION."\n";
exit 0;
} elsif ($command eq "indel-targets") {
indel_targets();
} elsif ($command eq "indel-apply") {
indel_apply();
}
################
sub print_usage_and_exit($$) {
my $usage = shift;
my $msg = shift;
print "$msg\n" if $msg;
print $usage;
exit 0;
}
################
my $do_merge = 0;
sub merge_pileup_columns($) {
# each BAM has 4 columns: coverage, bases, quality, mapping quality
# if coverage is 0 *and* the next two are '*', then there are only
# three columns. if the *and* is not true, then reads have been
# removed because of flags, and there are 4 columns of output
my $l = shift; # reference to array of pileup columns
my ($cvg, $base, $qual, $mapqual) = ( 0, "", "", "" );
my ($n_cols, $n_bams) = ( scalar(@$l), 0 );
my $i = 3; # start at 4th column (coverage)
while ($i < $n_cols) {
if ($l->[$i] > 0) { # coverage > 0, 4 columns
$cvg += $l->[$i];
$base .= $l->[$i + 1];
$qual .= $l->[$i + 2];
$mapqual .= $l->[$i + 3];
$i += 4;
} elsif ($l->[$i] == 0 and $l->[$i + 1] eq '*' and $l->[$i + 2] eq '*') {
if ($l->[$i + 3] eq '*') {
# samtools 1.0+
$i += 4;
} else {
# pre-samtools 1.0, this was a bug, with coverage 0 there are just 3 columns
$i += 3;
}
} else { # coverage 0 but 4 columns
$i += 4;
}
++$n_bams;
}
if (not $do_merge) {
print STDERR "Merging columns from $n_bams BAM files ...\n" if $o_verbose;
$do_merge = 1;
}
return ($l->[0], $l->[1], $l->[2], $cvg, $base, $qual, $mapqual);
}
################
sub indel_targets() {
my $o_fasta = "";
my $o_stdin;
my %mode_values = ( "filter" => "filter", "model" => "model");
my $o_mode = "filter";
my $o_indelfrac = 0.80;
my $o_indelmincov = 10;
my $o_includebadindels = 0;
my $o_maxindelsize = 1000;
my $o_mappingquality = 0;
my $mapQ_base = 33;
my $o_help = 0;
my $o_samtools;
my $o_allreads;
my $min_mapqual = 1;
my $min_basequal = 13;
my $samtools_filterflags = "--ff ".(256 + 1024); # not primary, is duplicate
my $o_allbases;
my $samtools = "samtools";
my $n_targets = 0;
my $usage = "
$0 indel-targets -f FASTA FILE1.bam [ FILE2.bam ... ]
Version: ".VERSION."
Generats indel targets for correction. Uses samtools for indel detection.
Targets are written to standard output.
Targets will only be generated where all read mappings containing an indel
agree on the size and sequence of the indel. Further criteria for minimum
indel fraction, minimum coverage, and maximum indel size may be adjusted via
options.
The first line of the targets output is the name of the assembly Fasta file
against which the targets were generated. Following that, the columns are
scaffold name, target position, reference base, total read coverage, type of
target (always 'indel' for this command), whether the target passed quality
criteria (`good` or `bad`), the frequency of coverage supporting the indel, the
size of the indel (positive for insertion, negative for deletion), a string
describing the indel size and sequence, the number of reads supporting the
indel, and the mean mapping quality of reads.
OPTIONS
-f FILE, --fasta FILE PacBio assembly in Fasta format
FILE1.bam [ FILE2.bam ] BAM files of reads mapped to
- Read mpileup from stdin rather than running
samtools. Note that mapping qualities
(option `-s`) are expected in the mpileup input.
--mode model | filter Determine indel targets using a probabilistic
model (not yet implemented!) or naive filtering,
which is what we currently do.
NAIVE-MODE OPTIONS
--indel-frac FLOAT Do not report indels at a position if the
fraction of reads containing them is below FLOAT
[default $o_indelfrac]
--indel-min-cov INT Minimum read coverage to consider an indel
[default $o_indelmincov]
--include-bad-indels Include indels in target set that do not pass
our criteria, these lines are marked 'bad' in
the sixth column
--max-indel-size INT Maximum indel size to accept, beyond this the
position is considered in error [default $o_maxindelsize]
--mapping-quality FLOAT Minimum mean mapping quality of reads covering a
target site. Note that a value of 0 means no
minimum applied; reads with 0 mapQ are still
excluded by default (see --all-reads) [default $o_mappingquality]
--all-reads Include all reads. Default is to exclude
multiply-mapped reads (mapQ < $min_mapqual), alignments
that are not the primary (SAM flag 256), and
duplicate alignments (flag 1024).
--all-bases Include all bases. Default is to exclude bases
below quality $min_basequal.
GENERAL OPTIONS
--summary Produce an informational summary to standard
error at the end of execution.
--samtools FILE Location of samtools executable
[default $samtools]
--verbose Print informational messages
--help, -? help message
";
print_usage_and_exit($usage, "") if not @ARGV;
GetOptions("" => \$o_stdin,
"fasta=s" => \$o_fasta,
"indel-frac=f" => \$o_indelfrac,
"indel-min-cov=i" => \$o_indelmincov,
"include-bad-indels" => \$o_includebadindels,
"mapping-quality=f" => \$o_mappingquality,
"max-indel-size=i" => \$o_maxindelsize,
"all-reads" => \$o_allreads,
"all-bases" => \$o_allbases,
"samtools=s" => \$o_samtools,
"verbose" => \$o_verbose,
"help|?" => \$o_help
) or print_usage_and_exit($usage, "unknown option");
print_usage_and_exit($usage, "") if $o_help;
# find samtools and check version
$samtools = $o_samtools if $o_samtools;
my $t = qx/$samtools 2>&1/;
die "cannot execute '$samtools'" if not defined $t;
$t =~ m/Version: ([0-9.]+)/; # finds version string containing just numbers and '.'
$t = $1;
#print STDERR "Version of samtools determined to be '$t'\n";
die "$samtools version ($t) too low, must be >= 0.1.19" if $t =~ m/^0\./ and $t ne '0.1.19';
die "unknown --mode" if not defined($mode_values{lc($o_mode)});
$o_mode = $mode_values{lc($o_mode)};
-f $o_fasta or die "cannot find reference file: $o_fasta";
my @BAM = grep { -f or die "cannot find BAM: $_" } @ARGV;
my $n_BAM = scalar(@BAM);
# set up samtools argument list
my $samtools_args = "-s -B -d 10000 -L 10000";
$samtools_args .= " $samtools_filterflags" if not $o_allreads;
$min_mapqual = 0 if $o_allreads;
$samtools_args .= " -q $min_mapqual";
$min_basequal = 0 if $o_allbases;
$samtools_args .= " -Q $min_basequal";
# open samtools on a pipe
my $samtools_pipe = "$samtools mpileup $samtools_args -f $o_fasta ".join(" ", @BAM);
$samtools_pipe .= ($o_verbose ? "" : " 2>/dev/null") . " |";
print STDERR "samtools pipe command: '$samtools_pipe'\n" if $o_verbose;
open(PILEUP, $samtools_pipe) or die "Could not initiate samtools pipe: $!";
print STDOUT "#assembly:$o_fasta\n";
while (<PILEUP>) {
next if m/^#/;
chomp;
my @l = split /\t/;
my ($ref, $pos, $refcall, $cvg, $base, $qual, $mapqual) = merge_pileup_columns(\@l);
die "Coverage contains non-numeric values: $cvg" if $cvg !~ /^[[:digit:]]+$/;
if ($refcall eq "*") {
print STDERR "skipping reference deletion '*', should this happen??\n";
next; # skip indel lines
}
# code modified from https://github.com/douglasgscofield/bioinfo/tree/master/pileVar
my %indels;
my %indel_ops;
my %indel_lengths;
my $in_reads = 0;
my $del_reads = 0;
my $mean_mapQ = 0;
my $n_bases;
if ($base =~ m/[\$\^\+-]/) {
$base =~ s/\^.//g; #removing the start of the read segement mark
$base =~ s/\$//g; #removing end of the read segment mark
while ($base =~ m/([\+-]){1}(\d+)/g) {
my $indel_type = $1;
my $indel_len = $2;
if ($indel_len > $o_maxindelsize) { # problem with pileup line?
print STDERR "line $. indel too big, $indel_len\n";
next;
}
# remove indel info from read base field
$base =~ s/([\+-]{1}$indel_len.{$indel_len})//;
my $indel_contents = $1;
++$indels{"indel, $indel_type, $indel_len bases, $indel_contents"};
++$indel_lengths{($indel_type eq "+" ? +$indel_len : -$indel_len)};
++$indel_ops{uc($indel_contents)};
++$in_reads if $indel_type eq "+";
++$del_reads if $indel_type eq "-";
}
$n_bases = length($base);
die "n bases $n_bases != number of base qualities" if $n_bases != length($qual);
my $qsum = 0;
map { $qsum += (ord($_) - $mapQ_base) } split //, $mapqual;
$mean_mapQ = sprintf("%.1f", $qsum / length($mapqual)) + 0.0;
next if $o_mappingquality and $mean_mapQ < $o_mappingquality;
}
my $indel_reads = $in_reads + $del_reads;
next if $indel_reads == 0; # no indel
my $indel_frac_pos = $indel_reads / $n_bases;
my $indel_qual = ($indel_frac_pos >= $o_indelfrac and
$indel_reads >= $o_indelmincov and
scalar(keys %indel_lengths) == 1 and
scalar(keys %indel_ops) == 1) ? "good" : "bad";
next if not $o_includebadindels and $indel_qual eq "bad";
my @out;
push @out, ($ref, $pos, $refcall, $cvg, "indel");
push @out, $indel_qual;
push @out, (sprintf "%.4f", $indel_frac_pos);
push @out, join(",", keys %indel_lengths);
push @out, join(",", keys %indel_ops);
push @out, $in_reads + $del_reads;
push @out, $mean_mapQ;
++$n_targets;
print STDOUT join("\t", @out), "\n";
}
print STDERR "$0 $command: $n_targets targets generated for assembly '$o_fasta'\n" if $o_verbose;
close PILEUP;
}
################
sub advance_to_seq($$$);
# One of the things I don't want to do is use Bio::Seq::LargeSeq, as that
# appears to dump directly to disk, at least in earlier version of BioPerl.
# Better to just assemble the sequences into a simple string, we should
# have the RAM, then pack it into Bio::Seq for writing.
sub indel_apply() {
my $o_fasta;
my $o_targets;
my $o_includebadindels = 0;
my $o_skipdeletionverify = 0;
my $o_skipsoftmasked = 0;
my $o_help = 0;
my $n_applied = 0;
my $usage = "
$0 indel-apply [ -f FASTA ] [ -t TARGETS ]
Version: ".VERSION."
Apply indel targets to correct an assembly. The corrected assembly is written
to standard output. The order of sequences in the assembly and targets must be
the same.
OPTIONS
-f | --fasta FILE PacBio assembly in Fasta format, to be corrected.
Must be the same assembly used for
'$0 indel-targets'.
If the assembly is not specified, it is determined
from the first line of the target list.
-t | --targets TARGETS List of indel targets to apply, standard output
from '$0 indel-targets -f FILE ...'. If no
targets are specified, they are read from stdin.
--include-bad-indels Apply indels in target set that are marked 'bad'.
--skip-deletion-verify Do not verify that a deletion matches the
sequence at that position in the assembly
[default $o_skipdeletionverify]
--skip-softmasked Do not apply targets to softmasked regions of the
assembly, as determined by use of lowercase
[default $o_skipsoftmasked]
--verbose Print informational messages
--help, -? help message
";
GetOptions("fasta=s" => \$o_fasta,
"targets=s" => \$o_targets,
"include-bad-indels" => \$o_includebadindels,
"skip-deletion-verify" => \$o_skipdeletionverify,
"verbose" => \$o_verbose,
"help|?" => \$o_help
) or print_usage_and_exit($usage, "unknown option");
print_usage_and_exit($usage, "") if $o_help;
# open targets
if (not $o_targets) {
*TARGETS = *STDIN;
} else {
open(TARGETS, "<$o_targets") or die "Could not open targets file '$o_targets': $!";
}
my $target_assembly = <TARGETS>; # first line is '#assembly:filename'
die "'$o_targets' does not contain assembly filename" if $target_assembly !~ m/^#assembly:/;
chomp $target_assembly;
$target_assembly = substr($target_assembly, 10); # remove '#assembly:' prefix
# open assembly
if (not defined $o_fasta) {
print STDERR "$0 $command: Opening target assembly '$target_assembly'\n" if not $o_fasta and $o_verbose;
$o_fasta = $target_assembly;
} elsif ($o_fasta ne $target_assembly) {
print STDERR "\n$0 $command: *** Warning: -f and target assembly do not match\n\n";
}
my $IN = Bio::SeqIO->new(-file => "<$o_fasta", -format => "fasta") or
die "Could not open assembly file '$o_fasta': $!";
my $OUT = Bio::SeqIO->new(-fh => \*STDOUT, -format => "fasta") or
die "Could not open output: $!";
my $seq = $IN->next_seq() or die "Could not read first assembly sequence";
my $seq_start = 1; # position in the original seq
my $sequence = ""; # for the actual sequence letters
while (<TARGETS>) {
next if m/^#/;
chomp;
my @l = split /\t/;
next if $l[5] eq "bad" and not $o_includebadindels;
# we now know we are going to apply this indel
if ($l[0] ne $seq->id) {
# wrap up the old sequence, move to a new seq
$sequence .= $seq->subseq($seq_start, $seq->length());
$seq->seq($sequence);
$OUT->write_seq($seq);
$seq = advance_to_seq($IN, $l[0], $OUT) or die "no assembly sequence '$l[0]'";
# initiate sequence
$sequence = "";
$seq_start = 1;
}
die "pos mismatch ".$seq->id.": seq_start $seq_start > target $l[1]" if $seq_start > $l[1];
# update sequence
$sequence .= $seq->subseq($seq_start, $l[1]);
$seq_start = $l[1] + 1;
# if skipping softmasked regions, if this is softmasked just go to the next target
my $base = $seq->subseq($seq_start, $seq_start);
next if $o_skipsoftmasked and $base ne lc $base;
# apply the indel, is it insertion or deletion?
my $sign = $l[7] <=> 0;
my $size = abs($l[7]);
my $indel = substr($l[8], 1 + length($size)); # remove string '+n' or '-n'
if ($sign < 0) {
# we need to delete from the assembly
if (not $o_skipdeletionverify) { # verify deletion sequence matches
my $del = $seq->subseq($seq_start, $seq_start + $size - 1);
if ($indel ne uc($del)) {
die "deletion '$indel' mismatch vs assembly ".$seq->id.":$seq_start, '$del'";
}
}
$seq_start += $size;
} elsif ($sign > 0) {
# we need to insert into the assembly
$sequence .= $indel;
} else {
die "indel sign is 0 at target line $.";
}
++$n_applied;
}
# wrap up final sequences
$sequence .= $seq->subseq($seq_start, $seq->length());
$seq->seq($sequence);
$OUT->write_seq($seq);
while (my $s = $IN->next_seq()) {
print STDERR "$0 $command: ".$s->id.", no targets\n" if $o_verbose;
$OUT->write_seq($s);
}
print STDERR "$0 $command: $n_applied targets applied to assembly '$o_fasta'\n" if $o_verbose;
}
sub advance_to_seq($$$) {
# if we skip a sequence, output it in its entirety, it has no targets
my ($seqin, $seq_name, $seqout) = @_;
while (my $s = $seqin->next_seq()) {
return $s if $s->id eq $seq_name;
print STDERR "$0 $command: ".$s->id.", no targets\n" if $o_verbose;
$seqout->write_seq($s);
}
return 0;
}