diff --git a/README.md b/README.md index 0ee425b9db1..55556f6f44d 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Please cite: * [The VG Paper](https://doi.org/10.1038/nbt.4227) when using `vg` * [The VG Giraffe Paper](https://doi.org/10.1126/science.abg8871) when using `vg giraffe` * [The VG Call Paper](https://doi.org/10.1186/s13059-020-1941-7) when SV genotyping with `vg call` -* [The GBZ Paper](https://doi.org/10.1093/bioinformatics/btad097) when using GBZ +* [The GBZ Paper](https://doi.org/10.1093/bioinformatics/btac656) when using GBZ * [The HPRC Paper](https://doi.org/10.1038/s41586-023-05896-x) when using `vg deconstruct` * [The Snarls Paper](https://doi.org/10.1089/cmb.2017.0251) when using `vg snarls` * [The Personalized Pangenome Paper](https://doi.org/10.1101/2023.12.13.571553) when using `vg haplotypes` and/or `vg giraffe --haplotype-name` @@ -228,7 +228,7 @@ There are multiple read mappers in `vg`: * `vg giraffe` is designed to be fast for highly accurate short reads, against graphs with haplotype information. * `vg map` is a general-purpose read mapper. -* `vg mpmap` does "munti-path" mapping, to allow describing local alignment uncertainty. [This is useful for transcriptomics.](#Transcriptomic-analysis) +* `vg mpmap` does "multi-path" mapping, to allow describing local alignment uncertainty. [This is useful for transcriptomics.](#Transcriptomic-analysis) #### Mapping with `vg giraffe` diff --git a/deps/gbwtgraph b/deps/gbwtgraph index 958dc8d6a00..de61d340695 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit 958dc8d6a003f485bed5c0b841cd99abf284e046 +Subproject commit de61d340695f64b8aa978816b195d6687e7fda5a diff --git a/scripts/plot-pr.R b/scripts/plot-pr.R index 0fbda529781..2c6c9214bcc 100755 --- a/scripts/plot-pr.R +++ b/scripts/plot-pr.R @@ -10,11 +10,16 @@ require("tidyverse") require("ggrepel") # Read in the combined toil-vg stats.tsv, listing: -# correct, mapq, aligner (really graph name), read name, count -dat <- read.table(commandArgs(TRUE)[1], header=T) +# correct, mapq, aligner (really graph name), read name, count, eligible +dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor")) + +if (("eligible" %in% names(dat))) { + # If the eligible column is present, remove ineligible reads + dat <- dat[dat$eligible == 1, ] +} if (! ("count" %in% names(dat))) { - # If the count column is not present, add i + # If the count column is not present, add it dat$count <- rep(1, nrow(dat)) } @@ -48,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names) name.lists <- name.lists[name.order] # Determine colors for aligners -bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b") -light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee") +bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1") +light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9") # We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors. cursor <- 1 @@ -95,6 +100,25 @@ colors <- colors[aligner.names] # Add a bin "factor" to each row, binning float MAPQs into bins from 0 to 60 (and inclusing bins for out of range on each end) dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf)) +# We need to work out our scales +reads.per.condition <- sum(dat$count) / length(aligner.names) +# Start with small scale +labels <- c("1e-0","1e-1","1e-2","1e-3","1e-4") +breaks <- c(0,1,2,3,4) +limits <- c(0, 4) +if ( reads.per.condition > 10000 ) { + # Use big scale if there are a lot of reads + labels <- c(labels, "1e-5","1e-6") + breaks <- c(breaks, 5,6) + limits <- c(0, 6) +} +if ( reads.per.condition > 1000000 ) { + # Use big scale if there are a lot of reads + labels <- c(labels, "1e-7","1e-8","1e-9") + breaks <- c(breaks, 7,8,9) + limits <- c(0, 9) +} + # Now we break out the cool dplyr/magrittr/tidyverse tools like %>% pipe operators. dat.roc <- dat %>% # Make positive and negative count columns @@ -127,15 +151,15 @@ dat.plot <- dat.roc %>% # There will be points with variable sizes geom_point(aes(size=Positive+Negative)) + # We manually assign these selected colors - scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) + + scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) + # And we want a size legend scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) + # And we want a fake log Y axis - scale_y_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) + + scale_y_continuous(labels=labels, breaks=breaks, limits=limits) + # Label it ylab("1 - Precision") + # And we want a fake log X axis - scale_x_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) + + scale_x_continuous(labels=labels, breaks=breaks, limits=limits) + # Label it xlab("1 - Recall") + # And we want this cool theme diff --git a/scripts/plot-qq.R b/scripts/plot-qq.R index 0f5a8b7074a..434b9455805 100755 --- a/scripts/plot-qq.R +++ b/scripts/plot-qq.R @@ -2,15 +2,21 @@ # plot-qq.R [ [title]] -list.of.packages <- c("tidyverse", "ggrepel", "svglite") +list.of.packages <- c("tidyverse", "ggrepel", "svglite", "binom") new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] if(length(new.packages)) install.packages(new.packages) require("tidyverse") require("ggrepel") +require("binom") # Read in the combined toil-vg stats.tsv, listing: -# correct, mapq, aligner (really graph name), read name, count -dat <- read.table(commandArgs(TRUE)[1], header=T) +# correct, mapq, aligner (really graph name), read name, count, eligible +dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor")) + +if (("eligible" %in% names(dat))) { + # If the eligible column is present, remove ineligible reads + dat <- dat[dat$eligible == 1, ] +} if (! ("count" %in% names(dat))) { # If the count column is not present, add i @@ -47,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names) name.lists <- name.lists[name.order] # Determine colors for aligners -bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b") -light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee") +bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1") +light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9") # We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors. cursor <- 1 @@ -93,14 +99,20 @@ colors <- colors[aligner.names] dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf)) -x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count))) +x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count), select(binom.confint(sum(correct * count), sum(count), conf.level=0.9, methods="lrt"), c("lower", "upper")))) + +print(names(x)) +print(x$ci) -dat.plot <- ggplot(x, aes(1-mapprob+1e-9, 1-observed+1e-9, color=aligner, size=N, weight=N, label=round(mapq,2))) + - scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) + - scale_y_log10("measured error", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) + - scale_x_log10("error estimate", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) + +# Now plot the points as different sizes, but the error bar line ranges as a consistent size +dat.plot <- ggplot(x, aes(1-mapprob+1e-7, 1-observed+1e-7, color=aligner, size=N, weight=N, label=round(mapq,2))) + + scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) + + scale_y_log10("measured error", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) + + scale_x_log10("error estimate", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) + scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) + geom_point() + + # Only aesthetics that depend on each point need to be in the aes() mapping + geom_linerange(aes(x=1-mapprob+1e-7, ymin=1-upper+1e-7, ymax=1-lower+1e-7), linewidth=0.2, position=position_dodge(.05)) + geom_smooth() + geom_abline(intercept=0, slope=1, linetype=2) + theme_bw() diff --git a/scripts/plot-roc-log.R b/scripts/plot-roc-log.R index 54c3a653436..87321054a9f 100755 --- a/scripts/plot-roc-log.R +++ b/scripts/plot-roc-log.R @@ -20,8 +20,13 @@ require("tidyverse") require("ggrepel") # Read in the combined toil-vg stats.tsv, listing: -# correct, mapq, aligner (really graph name), read name, count -dat <- read.table(commandArgs(TRUE)[1], header=T) +# correct, mapq, aligner (really graph name), read name, count, eligible +dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor")) + +if (("eligible" %in% names(dat))) { + # If the eligible column is present, remove ineligible reads + dat <- dat[dat$eligible == 1, ] +} if (! ("count" %in% names(dat))) { # If the count column is not present, add i diff --git a/scripts/plot-roc.R b/scripts/plot-roc.R index 3353f0c4d6b..657b4a9782e 100755 --- a/scripts/plot-roc.R +++ b/scripts/plot-roc.R @@ -21,8 +21,13 @@ require("ggrepel") require("scales") # For squish # Read in the combined toil-vg stats.tsv, listing: -# correct, mapq, aligner (really graph name), read name, count -dat <- read.table(commandArgs(TRUE)[1], header=T) +# correct, mapq, aligner (really graph name), read name, count, eligible +dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor")) + +if (("eligible" %in% names(dat))) { + # If the eligible column is present, remove ineligible reads + dat <- dat[dat$eligible == 1, ] +} if (! ("count" %in% names(dat))) { # If the count column is not present, add i diff --git a/src/aligner.cpp b/src/aligner.cpp index f10c8512a27..808a4d189e5 100644 --- a/src/aligner.cpp +++ b/src/aligner.cpp @@ -1421,7 +1421,8 @@ void Aligner::align_pinned_multi(Alignment& alignment, vector& alt_al } void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g, - int32_t band_padding, bool permissive_banding) const { + int32_t band_padding, bool permissive_banding, + const unordered_map* left_align_strand) const { if (alignment.sequence().empty()) { // we can save time by using a specialized deletion aligner for empty strings @@ -1446,7 +1447,8 @@ void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g, g, band_padding, permissive_banding, - false); + false, + left_align_strand); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { @@ -1455,7 +1457,8 @@ void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g, g, band_padding, permissive_banding, - false); + false, + left_align_strand); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { @@ -1464,7 +1467,8 @@ void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g, g, band_padding, permissive_banding, - false); + false, + left_align_strand); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else { @@ -1473,14 +1477,16 @@ void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g, g, band_padding, permissive_banding, - false); + false, + left_align_strand); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } } void Aligner::align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, - int32_t max_alt_alns, int32_t band_padding, bool permissive_banding) const { + int32_t max_alt_alns, int32_t band_padding, bool permissive_banding, + const unordered_map* left_align_strand) const { if (alignment.sequence().empty()) { // we can save time by using a specialized deletion aligner for empty strings @@ -1505,7 +1511,8 @@ void Aligner::align_global_banded_multi(Alignment& alignment, vector& max_alt_alns, band_padding, permissive_banding, - false); + false, + left_align_strand); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { @@ -1516,7 +1523,8 @@ void Aligner::align_global_banded_multi(Alignment& alignment, vector& max_alt_alns, band_padding, permissive_banding, - false); + false, + left_align_strand); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { @@ -1527,7 +1535,8 @@ void Aligner::align_global_banded_multi(Alignment& alignment, vector& max_alt_alns, band_padding, permissive_banding, - false); + false, + left_align_strand); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else { @@ -1538,7 +1547,8 @@ void Aligner::align_global_banded_multi(Alignment& alignment, vector& max_alt_alns, band_padding, permissive_banding, - false); + false, + left_align_strand); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } @@ -2095,7 +2105,8 @@ void QualAdjAligner::align_pinned_multi(Alignment& alignment, vector& } void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph& g, - int32_t band_padding, bool permissive_banding) const { + int32_t band_padding, bool permissive_banding, + const unordered_map* left_align_strand) const { if (alignment.sequence().empty()) { // we can save time by using a specialized deletion aligner for empty strings @@ -2118,7 +2129,8 @@ void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph g, band_padding, permissive_banding, - true); + true, + left_align_strand); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { @@ -2127,7 +2139,8 @@ void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph g, band_padding, permissive_banding, - true); + true, + left_align_strand); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { @@ -2136,7 +2149,8 @@ void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph g, band_padding, permissive_banding, - true); + true, + left_align_strand); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else { @@ -2145,14 +2159,16 @@ void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph g, band_padding, permissive_banding, - true); + true, + left_align_strand); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } } void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, - int32_t max_alt_alns, int32_t band_padding, bool permissive_banding) const { + int32_t max_alt_alns, int32_t band_padding, bool permissive_banding, + const unordered_map* left_align_strand) const { if (alignment.sequence().empty()) { // we can save time by using a specialized deletion aligner for empty strings @@ -2177,7 +2193,8 @@ void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector::max() && worst_score >= numeric_limits::min()) { @@ -2188,7 +2205,8 @@ void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector::max() && worst_score >= numeric_limits::min()) { @@ -2199,7 +2217,8 @@ void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector* left_align_strand = nullptr) const = 0; /// store top scoring global alignments in the vector in descending score order up to a maximum number /// of alternate alignments (including the optimal alignment). if there are fewer than the maximum @@ -161,7 +162,8 @@ namespace vg { /// optimal alignment will be stored in both the vector and the original alignment object virtual void align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, int32_t max_alt_alns, int32_t band_padding = 0, - bool permissive_banding = true) const = 0; + bool permissive_banding = true, + const unordered_map* left_align_strand = nullptr) const = 0; /// xdrop aligner virtual void align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& mems, bool reverse_complemented, uint16_t max_gap_length = default_xdrop_max_gap_length) const = 0; @@ -358,14 +360,16 @@ namespace vg { /// permissive banding auto detects the width of band needed so that paths can travel /// through every node in the graph void align_global_banded(Alignment& alignment, const HandleGraph& g, - int32_t band_padding = 0, bool permissive_banding = true) const; + int32_t band_padding = 0, bool permissive_banding = true, + const unordered_map* left_align_strand = nullptr) const; /// store top scoring global alignments in the vector in descending score order up to a maximum number /// of alternate alignments (including the optimal alignment). if there are fewer than the maximum /// number of alignments in the return value, then the vector contains all possible alignments. the /// optimal alignment will be stored in both the vector and the original alignment object void align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, - int32_t max_alt_alns, int32_t band_padding = 0, bool permissive_banding = true) const; + int32_t max_alt_alns, int32_t band_padding = 0, bool permissive_banding = true, + const unordered_map* left_align_strand = nullptr) const; /// xdrop aligner void align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& mems, @@ -428,11 +432,13 @@ namespace vg { void align(Alignment& alignment, const HandleGraph& g, bool traceback_aln) const; void align_global_banded(Alignment& alignment, const HandleGraph& g, - int32_t band_padding = 0, bool permissive_banding = true) const; + int32_t band_padding = 0, bool permissive_banding = true, + const unordered_map* left_align_strand = nullptr) const; void align_pinned(Alignment& alignment, const HandleGraph& g, bool pin_left, bool xdrop = false, uint16_t xdrop_max_gap_length = default_xdrop_max_gap_length) const; void align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, - int32_t max_alt_alns, int32_t band_padding = 0, bool permissive_banding = true) const; + int32_t max_alt_alns, int32_t band_padding = 0, bool permissive_banding = true, + const unordered_map* left_align_strand = nullptr) const; void align_pinned_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, bool pin_left, int32_t max_alt_alns) const; diff --git a/src/banded_global_aligner.cpp b/src/banded_global_aligner.cpp index 0dca6eb05bb..fb8ff9f73b9 100644 --- a/src/banded_global_aligner.cpp +++ b/src/banded_global_aligner.cpp @@ -209,13 +209,14 @@ void BandedGlobalAligner::BABuilder::finalize_alignment(const list BandedGlobalAligner::BAMatrix::BAMatrix(Alignment& alignment, handle_t node, int64_t top_diag, int64_t bottom_diag, - const vector& seeds, int64_t cumulative_seq_len) : + const vector& seeds, int64_t cumulative_seq_len, bool left_alignment_strand) : node(node), top_diag(top_diag), bottom_diag(bottom_diag), seeds(seeds), alignment(alignment), cumulative_seq_len(cumulative_seq_len), + left_alignment_strand(left_alignment_strand), match(nullptr), insert_col(nullptr), insert_row(nullptr) @@ -810,6 +811,11 @@ void BandedGlobalAligner::BAMatrix::traceback(const HandleGraph& graph, continue; } + vector prev_mats{InsertCol, Match, InsertRow}; + if (left_alignment_strand) { + std::swap(prev_mats[0], prev_mats[2]); + } + // find optimal traceback idx = i * ncols + j; bool found_trace = false; @@ -843,51 +849,71 @@ void BandedGlobalAligner::BAMatrix::traceback(const HandleGraph& graph, cerr << "[BAMatrix::traceback] transitioning from match, current score " << (int) match[idx] << " match/mismatch score " << (int) match_score << " from node char " << j << " (" << node_seq[j] << ") and read char " << i + top_diag + j << " (" << read[i + top_diag + j] << ")" << endl; #endif - source_score = match[next_idx]; - score_diff = curr_score - (source_score + match_score); - if (score_diff == 0) { + for (auto prev_mat : prev_mats) { + + switch (prev_mat) { + case Match: + { + source_score = match[next_idx]; + score_diff = curr_score - (source_score + match_score); + if (score_diff == 0) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback] found next cell in match matrix with score " << (int) match[next_idx] << endl; -#endif - mat = Match; - found_trace = true; - } - else if (source_score != min_inf) { - alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, Match); - } - - source_score = insert_row[next_idx]; - if (source_score > min_inf) { - score_diff = curr_score - (source_score + match_score); - if (!found_trace && score_diff == 0) { + cerr << "[BAMatrix::traceback] found next cell in match matrix with score " << (int) match[next_idx] << endl; +#endif + mat = Match; + found_trace = true; + } + else if (source_score != min_inf) { + alt_score = curr_traceback_score - score_diff; + traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, Match); + } + break; + } + case InsertRow: + { + source_score = insert_row[next_idx]; + if (source_score > min_inf) { + score_diff = curr_score - (source_score + match_score); + if (!found_trace && score_diff == 0) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback] found next cell in insert row matrix with score " << (int) insert_row[next_idx] << endl; -#endif - mat = InsertRow; - found_trace = true; - } - else { - alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, InsertRow); - } - } - - source_score = insert_col[next_idx]; - if (source_score > min_inf) { - score_diff = curr_score - (source_score + match_score); - if (!found_trace && score_diff == 0) { + cerr << "[BAMatrix::traceback] found next cell in insert row matrix with score " << (int) insert_row[next_idx] << endl; +#endif + mat = InsertRow; + found_trace = true; + } + else { + alt_score = curr_traceback_score - score_diff; + traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, InsertRow); + } + } + break; + } + case InsertCol: + { + source_score = insert_col[next_idx]; + if (source_score > min_inf) { + score_diff = curr_score - (source_score + match_score); + if (!found_trace && score_diff == 0) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback] found next cell in insert column matrix with score " << (int) insert_col[next_idx] << endl; -#endif - mat = InsertCol; - found_trace = true; - } - else if (source_score != min_inf) { - alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, InsertCol); + cerr << "[BAMatrix::traceback] found next cell in insert column matrix with score " << (int) insert_col[next_idx] << endl; +#endif + mat = InsertCol; + found_trace = true; + } + else if (source_score != min_inf) { + alt_score = curr_traceback_score - score_diff; + traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, InsertCol); + } + + } + break; + } + default: + { + cerr << "error: invalid previous matrix" << endl; + exit(1); + } } - } if (!found_trace) { @@ -918,41 +944,61 @@ void BandedGlobalAligner::BAMatrix::traceback(const HandleGraph& graph, curr_score = insert_row[idx]; next_idx = (i - 1) * ncols + j; - - source_score = match[next_idx]; - score_diff = curr_score - (source_score - gap_open); - if (score_diff == 0) { - mat = Match; - found_trace = true; - } - else if (source_score != min_inf) { - alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, Match); - } - - source_score = insert_row[next_idx]; - if (source_score > min_inf) { - score_diff = curr_score - (source_score - gap_extend); - if (!found_trace && score_diff == 0) { - mat = InsertRow; - found_trace = true; - } - else if (source_score != min_inf) { - alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, InsertRow); - } - } - - source_score = insert_col[next_idx]; - if (source_score > min_inf) { - score_diff = curr_score - (source_score - gap_open); - if (!found_trace && score_diff == 0) { - mat = InsertCol; - found_trace = true; - } - else if (source_score != min_inf) { - alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, InsertCol); + + for (auto prev_mat : prev_mats) { + + switch (prev_mat) { + case Match: + { + source_score = match[next_idx]; + score_diff = curr_score - (source_score - gap_open); + if (score_diff == 0) { + mat = Match; + found_trace = true; + } + else if (source_score != min_inf) { + alt_score = curr_traceback_score - score_diff; + traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, Match); + } + break; + } + case InsertRow: + { + source_score = insert_row[next_idx]; + if (source_score > min_inf) { + score_diff = curr_score - (source_score - gap_extend); + if (!found_trace && score_diff == 0) { + mat = InsertRow; + found_trace = true; + } + else if (source_score != min_inf) { + alt_score = curr_traceback_score - score_diff; + traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, InsertRow); + } + } + break; + } + case InsertCol: + { + source_score = insert_col[next_idx]; + if (source_score > min_inf) { + score_diff = curr_score - (source_score - gap_open); + if (!found_trace && score_diff == 0) { + mat = InsertCol; + found_trace = true; + } + else if (source_score != min_inf) { + alt_score = curr_traceback_score - score_diff; + traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, InsertCol); + } + } + break; + } + default: + { + cerr << "error: invalid previous matrix" << endl; + exit(1); + } } } @@ -978,40 +1024,59 @@ void BandedGlobalAligner::BAMatrix::traceback(const HandleGraph& graph, curr_score = insert_col[idx]; next_idx = (i + 1) * ncols + j - 1; - source_score = match[next_idx]; - score_diff = curr_score - (source_score - gap_open); - if (score_diff == 0) { - mat = Match; - found_trace = true; - } - else if (source_score != min_inf) { - alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, Match); - } - - source_score = insert_row[next_idx]; - if (source_score > min_inf) { - score_diff = curr_score - (source_score - gap_open); - if (!found_trace && score_diff == 0) { - mat = InsertRow; - found_trace = true; - } - else if (source_score != min_inf) { - alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, InsertRow); - } - } - - source_score = insert_col[next_idx]; - if (source_score > min_inf) { - score_diff = curr_score - (source_score - gap_extend); - if (!found_trace && score_diff == 0) { - mat = InsertCol; - found_trace = true; - } - else if (source_score != min_inf) { - alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, InsertCol); + for (auto prev_mat : prev_mats) { + switch (prev_mat) { + case Match: + { + source_score = match[next_idx]; + score_diff = curr_score - (source_score - gap_open); + if (score_diff == 0) { + mat = Match; + found_trace = true; + } + else if (source_score != min_inf) { + alt_score = curr_traceback_score - score_diff; + traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, Match); + } + break; + } + case InsertRow: + { + source_score = insert_row[next_idx]; + if (source_score > min_inf) { + score_diff = curr_score - (source_score - gap_open); + if (!found_trace && score_diff == 0) { + mat = InsertRow; + found_trace = true; + } + else if (source_score != min_inf) { + alt_score = curr_traceback_score - score_diff; + traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, InsertRow); + } + } + break; + } + case InsertCol: + { + source_score = insert_col[next_idx]; + if (source_score > min_inf) { + score_diff = curr_score - (source_score - gap_extend); + if (!found_trace && score_diff == 0) { + mat = InsertCol; + found_trace = true; + } + else if (source_score != min_inf) { + alt_score = curr_traceback_score - score_diff; + traceback_stack.propose_deflection(alt_score, node_id, i, j, node_id, InsertCol); + } + } + break; + } + default: + { + cerr << "error: invalid previous matrix" << endl; + exit(1); + } } } @@ -1352,6 +1417,10 @@ void BandedGlobalAligner::BAMatrix::traceback_over_edge(const HandleGra #ifdef debug_banded_aligner_traceback cerr << "[BAMatrix::traceback_over_edge] checking seed rectangular coordinates (" << seed_row << ", " << seed_col << "), with indices calculated from current diagonal " << curr_diag << " (top diag " << top_diag << " + offset " << i << "), seed top diagonal " << seed->top_diag << ", seed seq length " << seed_ncols << " with insert column offset " << (mat == InsertCol) << endl; #endif + vector prev_mats{InsertCol, Match, InsertRow}; + if (left_alignment_strand) { + std::swap(prev_mats[0], prev_mats[2]); + } switch (mat) { case Match: @@ -1388,64 +1457,83 @@ void BandedGlobalAligner::BAMatrix::traceback_over_edge(const HandleGra break; } - source_score = seed->match[next_idx]; - // don't need to check edge condition because match does not have min inf - score_diff = curr_score - (source_score + match_score); - if (score_diff == 0 && !found_trace) { - traceback_mat = Match; - traceback_seed = seed; - traceback_seed_row = seed_row; - traceback_seed_col = seed_col; - found_trace = true; - empty_intermediate_nodes = seed_record.second; + for (auto prev_mat : prev_mats) { + switch (prev_mat) { + case Match: + { + source_score = seed->match[next_idx]; + // don't need to check edge condition because match does not have min inf + score_diff = curr_score - (source_score + match_score); + if (score_diff == 0 && !found_trace) { + traceback_mat = Match; + traceback_seed = seed; + traceback_seed_row = seed_row; + traceback_seed_col = seed_col; + found_trace = true; + empty_intermediate_nodes = seed_record.second; #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_over_edge] hit found in match matrix with score " << (int) seed->match[next_idx] << endl; -#endif - } - else if (source_score != min_inf) { - alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, Match); - } - - source_score = seed->insert_col[next_idx]; - // check edge condition - if (source_score > min_inf) { - score_diff = curr_score - (source_score + match_score); - if (score_diff == 0 && !found_trace) { + cerr << "[BAMatrix::traceback_over_edge] hit found in match matrix with score " << (int) seed->match[next_idx] << endl; +#endif + } + else if (source_score != min_inf) { + alt_score = curr_traceback_score - score_diff; + traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, Match); + } + break; + } + case InsertRow: + { + source_score = seed->insert_row[next_idx]; + // check edge condition + if (source_score > min_inf) { + score_diff = curr_score - (source_score + match_score); + if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_over_edge] hit found in insert column matrix with score " << (int) seed->insert_col[next_idx] << endl; -#endif - traceback_mat = InsertCol; - traceback_seed = seed; - traceback_seed_row = seed_row; - traceback_seed_col = seed_col; - found_trace = true; - empty_intermediate_nodes = seed_record.second; - } - else { - alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, InsertCol); - } - } - - source_score = seed->insert_row[next_idx]; - // check edge condition - if (source_score > min_inf) { - score_diff = curr_score - (source_score + match_score); - if (score_diff == 0 && !found_trace) { + cerr << "[BAMatrix::traceback_over_edge] hit found in insert row matrix with score " << (int) seed->insert_row[next_idx] << endl; +#endif + traceback_mat = InsertRow; + traceback_seed = seed; + traceback_seed_row = seed_row; + traceback_seed_col = seed_col; + found_trace = true; + empty_intermediate_nodes = seed_record.second; + } + else { + alt_score = curr_traceback_score - score_diff; + traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, InsertRow); + } + } + break; + } + case InsertCol: + { + source_score = seed->insert_col[next_idx]; + // check edge condition + if (source_score > min_inf) { + score_diff = curr_score - (source_score + match_score); + if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_over_edge] hit found in insert row matrix with score " << (int) seed->insert_row[next_idx] << endl; -#endif - traceback_mat = InsertRow; - traceback_seed = seed; - traceback_seed_row = seed_row; - traceback_seed_col = seed_col; - found_trace = true; - empty_intermediate_nodes = seed_record.second; - } - else { - alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, InsertRow); + cerr << "[BAMatrix::traceback_over_edge] hit found in insert column matrix with score " << (int) seed->insert_col[next_idx] << endl; +#endif + traceback_mat = InsertCol; + traceback_seed = seed; + traceback_seed_row = seed_row; + traceback_seed_col = seed_col; + found_trace = true; + empty_intermediate_nodes = seed_record.second; + } + else { + alt_score = curr_traceback_score - score_diff; + traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, InsertCol); + } + } + break; + } + default: + { + cerr << "error: invalid matrix type" << endl; + exit(1); + } } } @@ -1454,73 +1542,92 @@ void BandedGlobalAligner::BAMatrix::traceback_over_edge(const HandleGra case InsertCol: { - source_score = seed->match[next_idx]; - // don't need to check edge condition because match does not have min inf - score_diff = curr_score - (source_score - gap_open); - if (score_diff == 0 && !found_trace) { + for (auto prev_mat : prev_mats) { + switch (prev_mat) { + case Match: + { + source_score = seed->match[next_idx]; + // don't need to check edge condition because match does not have min inf + score_diff = curr_score - (source_score - gap_open); + if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_over_edge] hit found in match matrix with score " << (int) seed->match[next_idx] << endl; -#endif - traceback_mat = Match; - traceback_seed = seed; - traceback_seed_row = seed_row; - traceback_seed_col = seed_col; - found_trace = true; - empty_intermediate_nodes = seed_record.second; - } - else if (source_score != min_inf) { - alt_score = curr_traceback_score - score_diff; + cerr << "[BAMatrix::traceback_over_edge] hit found in match matrix with score " << (int) seed->match[next_idx] << endl; +#endif + traceback_mat = Match; + traceback_seed = seed; + traceback_seed_row = seed_row; + traceback_seed_col = seed_col; + found_trace = true; + empty_intermediate_nodes = seed_record.second; + } + else if (source_score != min_inf) { + alt_score = curr_traceback_score - score_diff; #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_over_edge] no hit in match matrix, proposing deflection with alt score " << (int) alt_score << " from current traceback score " << (int) curr_traceback_score << " and score diff " << (int) score_diff << endl; -#endif - traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, Match); - } - - source_score = seed->insert_col[next_idx]; - // check edge condition - if (source_score > min_inf) { - score_diff = curr_score - (source_score - gap_extend); - if (score_diff == 0 && !found_trace) { + cerr << "[BAMatrix::traceback_over_edge] no hit in match matrix, proposing deflection with alt score " << (int) alt_score << " from current traceback score " << (int) curr_traceback_score << " and score diff " << (int) score_diff << endl; +#endif + traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, Match); + } + break; + } + case InsertRow: + { + source_score = seed->insert_row[next_idx]; + // check edge condition + if (source_score > min_inf) { + score_diff = curr_score - (source_score - gap_open); + if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_over_edge] hit found in insert column matrix with score " << (int) seed->match[next_idx] << endl; -#endif - traceback_mat = InsertCol; - traceback_seed = seed; - traceback_seed_row = seed_row; - traceback_seed_col = seed_col; - found_trace = true; - empty_intermediate_nodes = seed_record.second; - } - else { - alt_score = curr_traceback_score - score_diff; + cerr << "[BAMatrix::traceback_over_edge] hit found in insert row matrix with score " << (int) seed->match[next_idx] << endl; +#endif + traceback_mat = InsertRow; + traceback_seed = seed; + traceback_seed_row = seed_row; + traceback_seed_col = seed_col; + found_trace = true; + empty_intermediate_nodes = seed_record.second; + } + else { + alt_score = curr_traceback_score - score_diff; #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_over_edge] no hit in insert row matrix, proposing deflection with alt score " << (int) alt_score << " from current traceback score " << (int) curr_traceback_score << " and score diff " << (int) score_diff << endl; -#endif - traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, InsertCol); - } - } - - source_score = seed->insert_row[next_idx]; - // check edge condition - if (source_score > min_inf) { - score_diff = curr_score - (source_score - gap_open); - if (score_diff == 0 && !found_trace) { + cerr << "[BAMatrix::traceback_over_edge] no hit in insert column matrix, proposing deflection with alt score " << (int) alt_score << " from current traceback score " << (int) curr_traceback_score << " and score diff " << (int) score_diff << endl; +#endif + traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, InsertRow); + } + } + break; + } + case InsertCol: + { + source_score = seed->insert_col[next_idx]; + // check edge condition + if (source_score > min_inf) { + score_diff = curr_score - (source_score - gap_extend); + if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_over_edge] hit found in insert row matrix with score " << (int) seed->match[next_idx] << endl; -#endif - traceback_mat = InsertRow; - traceback_seed = seed; - traceback_seed_row = seed_row; - traceback_seed_col = seed_col; - found_trace = true; - empty_intermediate_nodes = seed_record.second; - } - else { - alt_score = curr_traceback_score - score_diff; + cerr << "[BAMatrix::traceback_over_edge] hit found in insert column matrix with score " << (int) seed->match[next_idx] << endl; +#endif + traceback_mat = InsertCol; + traceback_seed = seed; + traceback_seed_row = seed_row; + traceback_seed_col = seed_col; + found_trace = true; + empty_intermediate_nodes = seed_record.second; + } + else { + alt_score = curr_traceback_score - score_diff; #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_over_edge] no hit in insert column matrix, proposing deflection with alt score " << (int) alt_score << " from current traceback score " << (int) curr_traceback_score << " and score diff " << (int) score_diff << endl; -#endif - traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, InsertRow); + cerr << "[BAMatrix::traceback_over_edge] no hit in insert row matrix, proposing deflection with alt score " << (int) alt_score << " from current traceback score " << (int) curr_traceback_score << " and score diff " << (int) score_diff << endl; +#endif + traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, InsertCol); + } + } + break; + } + default: + { + cerr << "error: invalid matrix type" << endl; + exit(1); + } } } @@ -1809,12 +1916,14 @@ void BandedGlobalAligner::BAMatrix::print_band(const HandleGraph& graph template BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, int64_t band_padding, bool permissive_banding, - bool adjust_for_base_quality) : + bool adjust_for_base_quality, + const unordered_map* left_align_strand) : BandedGlobalAligner(alignment, g, nullptr, 1, band_padding, permissive_banding, - adjust_for_base_quality) + adjust_for_base_quality, + left_align_strand) { // nothing to do, just funnel into internal constructor } @@ -1824,13 +1933,15 @@ BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, const Ha vector& alt_alignments, int64_t max_multi_alns, int64_t band_padding, bool permissive_banding, - bool adjust_for_base_quality) : + bool adjust_for_base_quality, + const unordered_map* left_align_strand) : BandedGlobalAligner(alignment, g, &alt_alignments, max_multi_alns, band_padding, permissive_banding, - adjust_for_base_quality) + adjust_for_base_quality, + left_align_strand) { // check data integrity and funnel into internal constructor if (!alt_alignments.empty()) { @@ -1845,7 +1956,8 @@ BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, const Ha int64_t max_multi_alns, int64_t band_padding, bool permissive_banding, - bool adjust_for_base_quality) : + bool adjust_for_base_quality, + const unordered_map* left_align_strand) : graph(g), alignment(alignment), alt_alignments(alt_alignments), @@ -1929,12 +2041,20 @@ BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, const Ha seeds.push_back(banded_matrices[node_id_to_idx[graph.get_id(prev)]]); }); + bool strand = false; + if (left_align_strand) { + auto it = left_align_strand->find(node); + if (it != left_align_strand->end()) { + strand = it->second; + } + } banded_matrices[i] = new BAMatrix(alignment, node, band_ends[i].first, band_ends[i].second, std::move(seeds), - shortest_seqs[i]); + shortest_seqs[i], + strand); } } @@ -2367,17 +2487,43 @@ BandedGlobalAligner::AltTracebackStack::AltTracebackStack(const HandleG } else { // let the insert routine figure out which one is the best and which ones to keep in the stack - if (band_matrix->match[final_idx] != min_inf) { - insert_traceback(null_prefix, band_matrix->match[final_idx], - node_id, final_row, final_col, node_id, Match, path); + vector mats{InsertCol, Match, InsertRow}; + if (band_matrix->left_alignment_strand) { + std::swap(mats[0], mats[2]); } - if (band_matrix->insert_row[final_idx] != min_inf) { - insert_traceback(null_prefix, band_matrix->insert_row[final_idx], - node_id, final_row, final_col, node_id, InsertRow, path); - } - if (band_matrix->insert_col[final_idx] != min_inf) { - insert_traceback(null_prefix, band_matrix->insert_col[final_idx], - node_id, final_row, final_col, node_id, InsertCol, path); + for (auto mat : mats) { + switch (mat) + { + case Match: + { + if (band_matrix->match[final_idx] != min_inf) { + insert_traceback(null_prefix, band_matrix->match[final_idx], + node_id, final_row, final_col, node_id, Match, path); + } + break; + } + case InsertCol: + { + if (band_matrix->insert_row[final_idx] != min_inf) { + insert_traceback(null_prefix, band_matrix->insert_row[final_idx], + node_id, final_row, final_col, node_id, InsertRow, path); + } + break; + } + case InsertRow: + { + if (band_matrix->insert_col[final_idx] != min_inf) { + insert_traceback(null_prefix, band_matrix->insert_col[final_idx], + node_id, final_row, final_col, node_id, InsertCol, path); + } + break; + } + default: + { + cerr << "error: invalid matrix type" << endl; + exit(1); + } + } } } } diff --git a/src/banded_global_aligner.hpp b/src/banded_global_aligner.hpp index b77f9fe449f..74ff2628ce7 100644 --- a/src/banded_global_aligner.hpp +++ b/src/banded_global_aligner.hpp @@ -60,7 +60,8 @@ namespace vg { /// BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, int64_t band_padding, bool permissive_banding = false, - bool adjust_for_base_quality = false); + bool adjust_for_base_quality = false, + const unordered_map* left_align_strand = nullptr); /// Initializes banded multi-alignment, which computes the top scoring alternate alignments in addition @@ -78,7 +79,8 @@ namespace vg { BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, vector& alt_alignments, int64_t max_multi_alns, int64_t band_padding, bool permissive_banding = false, - bool adjust_for_base_quality = false); + bool adjust_for_base_quality = false, + const unordered_map* left_align_strand = nullptr); ~BandedGlobalAligner(); @@ -133,7 +135,8 @@ namespace vg { BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, vector* alt_alignments, int64_t max_multi_alns, int64_t band_padding, bool permissive_banding = false, - bool adjust_for_base_quality = false); + bool adjust_for_base_quality = false, + const unordered_map* left_align_strand = nullptr); /// Traceback through dynamic programming matrices to compute alignment void traceback(int8_t* score_mat, int8_t* nt_table, int8_t gap_open, int8_t gap_extend, IntType min_inf); @@ -156,7 +159,7 @@ namespace vg { public: BAMatrix(Alignment& alignment, handle_t node, int64_t top_diag, int64_t bottom_diag, - const vector& seeds, int64_t cumulative_seq_len); + const vector& seeds, int64_t cumulative_seq_len, bool left_alignment_strand); ~BAMatrix(); /// Use DP to fill the band with alignment scores @@ -188,6 +191,8 @@ namespace vg { handle_t node; + bool left_alignment_strand; + Alignment& alignment; /// Length of shortest sequence leading to matrix from a source node diff --git a/src/gbwt_helper.cpp b/src/gbwt_helper.cpp index cdf59e4e928..bb7b786f7f9 100644 --- a/src/gbwt_helper.cpp +++ b/src/gbwt_helper.cpp @@ -532,6 +532,38 @@ std::string compose_short_path_name(const gbwt::GBWT& gbwt_index, gbwt::size_typ //------------------------------------------------------------------------------ +void copy_reference_samples(const gbwt::GBWT& source, gbwt::GBWT& destination) { + if (source.tags.contains(gbwtgraph::REFERENCE_SAMPLE_LIST_GBWT_TAG)) { + // Reference samples tag is not copied automatically. + destination.tags.set( + gbwtgraph::REFERENCE_SAMPLE_LIST_GBWT_TAG, + source.tags.get(gbwtgraph::REFERENCE_SAMPLE_LIST_GBWT_TAG) + ); + } +} + +void copy_reference_samples(const PathHandleGraph& source, gbwt::GBWT& destination) { + std::vector reference_samples; + source.for_each_path_of_sense(PathSense::REFERENCE, [&](const path_handle_t& path) { + std::string sample_name = source.get_sample_name(path); + if (sample_name != PathMetadata::NO_SAMPLE_NAME) { + reference_samples.push_back(sample_name); + } + }); + + if (!reference_samples.empty()) { + std::string reference_samples_tag = reference_samples.front(); + for (size_t i = 1; i < reference_samples.size(); i++) { + reference_samples_tag += gbwtgraph::REFERENCE_SAMPLE_LIST_SEPARATOR + reference_samples[i]; + } + destination.tags.set( + gbwtgraph::REFERENCE_SAMPLE_LIST_GBWT_TAG, reference_samples_tag + ); + } +} + +//------------------------------------------------------------------------------ + gbwt::GBWT get_gbwt(const std::vector& paths) { gbwt::size_type node_width = 1, total_length = 0; for (auto& path : paths) { diff --git a/src/gbwt_helper.hpp b/src/gbwt_helper.hpp index 1d8bf0e5db8..cfb02d301a0 100644 --- a/src/gbwt_helper.hpp +++ b/src/gbwt_helper.hpp @@ -209,6 +209,9 @@ struct RebuildParameters { /// /// NOTE: Threads may be reordered if there are multiple jobs. Old thread ids are /// no longer valid after rebuilding the GBWT. +/// +/// NOTE: This could use the ConstructionJob / MetadataBuilder scheme for +/// parallelization, but it would change the interface. gbwt::GBWT rebuild_gbwt(const gbwt::GBWT& gbwt_index, const std::vector& jobs, const std::unordered_map& node_to_job, @@ -241,6 +244,16 @@ std::string compose_short_path_name(const gbwt::GBWT& gbwt_index, gbwt::size_typ //------------------------------------------------------------------------------ +/// Copies the reference sample tag from the source GBWT index to the destination GBWT index. +void copy_reference_samples(const gbwt::GBWT& source, gbwt::GBWT& destination); + +/// Copies reference samples from the source graph to the destination GBWT index. +/// Every sample with at least one reference path in the source graph is considered +/// a reference sample. +void copy_reference_samples(const PathHandleGraph& source, gbwt::GBWT& destination); + +//------------------------------------------------------------------------------ + /// Transform the paths into a GBWT index. Primarily for testing. gbwt::GBWT get_gbwt(const std::vector& paths); diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 53f59fa5efb..83a9c1b4a58 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -335,6 +335,42 @@ vector vcf_contigs(const string& filename) { return return_val; } +// Returns a guess for an appropriate number of parallel GBWT construction jobs. +// This is intended for GBWT recipes where the construction uses an external +// mechanism for parallelism. +size_t guess_parallel_gbwt_jobs(size_t node_count, size_t haplotype_count, size_t available_memory, size_t batch_size) { + + // Memory usage of the GBWT construction itself. + size_t bytes_per_node = 135 * std::max(std::log10(haplotype_count + 1), 1.0); + // Construction buffers typically use 3-4 bytes per node, and the builder has two buffers. + size_t bytes_per_job = batch_size * 8; + + size_t jobs = 1; + size_t max_jobs = get_thread_count(); + // We assume that the largest chromosome is 10% of the genome. + size_t job_size = std::max(node_count / 10, size_t(1)); + size_t memory_usage = bytes_per_node * job_size + bytes_per_job; + + while (jobs < max_jobs) { + // We assume that the next chromosome is 5% smaller than the previous one. + job_size = std::max(size_t(job_size * 0.95), size_t(1)); + memory_usage += bytes_per_node * job_size + bytes_per_job; + if (memory_usage > available_memory) { + break; + } + jobs++; + } + + return jobs; +} + +// Returns the in-memory size of an XG index in bytes, using the same approach as +// sdsl::size_in_bytes(). +size_t xg_index_size(const xg::XG& index) { + sdsl::nullstream ns; + return index.serialize_and_measure(ns); +} + /********************* * Indexing helper functions ***********************/ @@ -2691,30 +2727,44 @@ IndexRegistry VGIndexes::get_vg_index_registry() { std::function path_filter = [&xg_index](const path_handle_t& path) { return !Paths::is_alt(xg_index->get_path_name(path)); }; - - cover = std::move(gbwtgraph::local_haplotypes(*xg_index, *gbwt_index, - IndexingParameters::giraffe_gbwt_downsample, - IndexingParameters::downsample_context_length, - IndexingParameters::gbwt_insert_batch_size, - IndexingParameters::gbwt_sampling_interval, - true, // Also include named paths from the graph - &path_filter, - IndexingParameters::verbosity >= IndexingParameters::Debug)); + + gbwtgraph::PathCoverParameters parameters; + parameters.num_paths = IndexingParameters::giraffe_gbwt_downsample; + parameters.context = IndexingParameters::downsample_context_length; + // TODO: See path cover for handling graphs with large components. + parameters.batch_size = IndexingParameters::gbwt_insert_batch_size; + parameters.sample_interval = IndexingParameters::gbwt_sampling_interval; + std::int64_t available_memory = plan->target_memory_usage() - xg_index_size(*xg_index) - sdsl::size_in_bytes(*gbwt_index); + parameters.parallel_jobs = guess_parallel_gbwt_jobs( + xg_index->get_node_count(), + parameters.num_paths, + std::max(available_memory, std::int64_t(0)), + parameters.batch_size + ); + parameters.show_progress = (IndexingParameters::verbosity >= IndexingParameters::Debug); + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + std::cerr << "[IndexRegistry]: Running " << parameters.parallel_jobs << " jobs in parallel" << std::endl; + } + cover = std::move(gbwtgraph::local_haplotypes(*xg_index, *gbwt_index, parameters, true, &path_filter)); + // Reference samples tag is not copied automatically. + copy_reference_samples(*gbwt_index, cover); } else { // Augment the GBWT with a path cover of components without haplotypes. if (IndexingParameters::verbosity != IndexingParameters::None) { - cerr << "[IndexRegistry]: Not enough haplotypes; augmenting the full GBWT instead." << endl; + cerr << "[IndexRegistry]: Not too many haplotypes; augmenting the full GBWT instead." << endl; } gbwt::DynamicGBWT dynamic_index(*gbwt_index); gbwt_index.reset(); - gbwtgraph::augment_gbwt(*xg_index, dynamic_index, - IndexingParameters::path_cover_depth, - IndexingParameters::downsample_context_length, - IndexingParameters::gbwt_insert_batch_size, - IndexingParameters::gbwt_sampling_interval, - IndexingParameters::verbosity >= IndexingParameters::Debug); + // Note that augmenting a GBWT is always single-threaded. + gbwtgraph::PathCoverParameters parameters; + parameters.num_paths = IndexingParameters::path_cover_depth; + parameters.context = IndexingParameters::downsample_context_length; + parameters.batch_size = IndexingParameters::gbwt_insert_batch_size; + parameters.sample_interval = IndexingParameters::gbwt_sampling_interval; + parameters.show_progress = (IndexingParameters::verbosity >= IndexingParameters::Debug); + gbwtgraph::augment_gbwt(*xg_index, dynamic_index, parameters); cover = std::move(gbwt::GBWT(dynamic_index)); } @@ -2764,14 +2814,25 @@ IndexRegistry VGIndexes::get_vg_index_registry() { }; // make a GBWT from a greedy path cover - gbwt::GBWT cover = gbwtgraph::path_cover_gbwt(*xg_index, - IndexingParameters::path_cover_depth, - IndexingParameters::downsample_context_length, - std::max(IndexingParameters::gbwt_insert_batch_size, 20 * max_comp_size), // buffer size recommendation from Jouni - IndexingParameters::gbwt_sampling_interval, - true, // Also include named paths from the graph - &path_filter, - IndexingParameters::verbosity >= IndexingParameters::Debug); + gbwtgraph::PathCoverParameters parameters; + parameters.num_paths = IndexingParameters::path_cover_depth; + parameters.context = IndexingParameters::downsample_context_length; + parameters.batch_size = std::max(IndexingParameters::gbwt_insert_batch_size, 20 * max_comp_size); + parameters.sample_interval = IndexingParameters::gbwt_sampling_interval; + std::int64_t available_memory = plan->target_memory_usage() - xg_index_size(*xg_index); + parameters.parallel_jobs = guess_parallel_gbwt_jobs( + xg_index->get_node_count(), + parameters.num_paths, + std::max(available_memory, std::int64_t(0)), + parameters.batch_size + ); + parameters.show_progress = (IndexingParameters::verbosity >= IndexingParameters::Debug); + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + std::cerr << "[IndexRegistry]: Running " << parameters.parallel_jobs << " jobs in parallel" << std::endl; + } + gbwt::GBWT cover = gbwtgraph::path_cover_gbwt(*xg_index, parameters, true, &path_filter); + // Determine reference samples from reference paths. + copy_reference_samples(*xg_index, cover); save_gbwt(cover, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); @@ -3853,8 +3914,20 @@ IndexRegistry VGIndexes::get_vg_index_registry() { params.batch_size = IndexingParameters::gbwt_insert_batch_size; params.sample_interval = IndexingParameters::gbwt_sampling_interval; params.max_node_length = IndexingParameters::max_node_size; + // TODO: Here we assume that the GFA file contains 100 haplotypes. If there are more, + // we could safely launch more jobs. Using 600 as the divisor would be a better + // estimate of the number of segments, but we chop long segments to 32 bp nodes. + params.parallel_jobs = guess_parallel_gbwt_jobs( + std::max(get_file_size(gfa_filename) / 300, std::int64_t(1)), + 100, + plan->target_memory_usage(), + params.batch_size + ); params.show_progress = IndexingParameters::verbosity == IndexingParameters::Debug; - + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + std::cerr << "[IndexRegistry]: Running " << params.parallel_jobs << " jobs in parallel" << std::endl; + } + // jointly generate the GBWT and record sequences unique_ptr gbwt_index; unique_ptr seq_source; diff --git a/src/main.cpp b/src/main.cpp index dc87d7381ad..514f8b89760 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,6 +1,3 @@ -// Needed for crash.hpp to work because it uses newer types -#define _POSIX_C_SOURCE 200809L - #include #include #include diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp index ae2991c0238..4cc4224368e 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -3105,6 +3105,7 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r get_regular_aligner()->align_xdrop(rescued_alignment, cached_graph, topological_order, dozeu_seed, false, gap_limit); this->fix_dozeu_score(rescued_alignment, cached_graph, topological_order); + this->fix_dozeu_end_deletions(rescued_alignment); } else { get_regular_aligner()->align(rescued_alignment, cached_graph, topological_order); } @@ -3143,6 +3144,7 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r size_t gap_limit = this->get_regular_aligner()->longest_detectable_gap(rescued_alignment); get_regular_aligner()->align_xdrop(rescued_alignment, dagified, std::vector(), false, gap_limit); this->fix_dozeu_score(rescued_alignment, dagified, std::vector()); + this->fix_dozeu_end_deletions(rescued_alignment); } else if (this->rescue_algorithm == rescue_gssw) { get_regular_aligner()->align(rescued_alignment, dagified, true); } @@ -3199,6 +3201,55 @@ void MinimizerMapper::fix_dozeu_score(Alignment& rescued_alignment, const Handle } } +void MinimizerMapper::fix_dozeu_end_deletions(Alignment& alignment) const { + + // figure out where the first insert/aligned base occurs on the left side + size_t i = 0, j = 0; + for (; i < alignment.path().mapping_size(); ++i) { + const auto& mapping = alignment.path().mapping(i); + for (j = 0; j < mapping.edit_size(); ++j) { + if (mapping.edit(j).to_length() != 0) { + break; + } + } + if (j != mapping.edit_size()) { + break; + } + } + if (i == alignment.path().mapping_size()) { + // the entire alignment is a deletion, clear it + alignment.clear_path(); + } + else if (i != 0 || j != 0) { + // we found a deletion on the left side, remove it + auto mappings = alignment.mutable_path()->mutable_mapping(); + auto edits = (*mappings)[j].mutable_edit(); + size_t removed = 0; + for (size_t k = 0; k < j; ++k) { + removed += (*edits)[k].from_length(); + } + edits->erase(edits->begin(), edits->begin() + j); + mappings->erase(mappings->begin(), mappings->begin() + i); + auto position = (*mappings)[0].mutable_position(); + position->set_offset(position->offset() + removed); + } + + // remove deletions on the right side + for (int64_t i = alignment.path().mapping_size() - 1; i >= 0; --i) { + auto edits = alignment.mutable_path()->mutable_mapping(i)->mutable_edit(); + while (!edits->empty() && (*edits)[edits->size() - 1].to_length() == 0) { + edits->RemoveLast(); + } + if (edits->empty()) { + alignment.mutable_path()->mutable_mapping()->RemoveLast(); + } + else { + break; + } + } +} + + //----------------------------------------------------------------------------- diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp index ab653ea80a6..f71a13f107b 100644 --- a/src/minimizer_mapper.hpp +++ b/src/minimizer_mapper.hpp @@ -645,6 +645,14 @@ class MinimizerMapper : public AlignerClient { */ void fix_dozeu_score(Alignment& rescued_alignment, const HandleGraph& rescue_graph, const std::vector& topological_order) const; + + /** + * When dozeu doesn't have any seeds, it's scan heuristic can lead to + * inaccurate anchoring with the end result that one end of the alignment + * has a deletion that doesn't connect to an aligned base. This function + * removes those deletions + */ + void fix_dozeu_end_deletions(Alignment& rescued_alignment) const; //----------------------------------------------------------------------------- diff --git a/src/multipath_alignment_graph.cpp b/src/multipath_alignment_graph.cpp index a4d1c3e0cd2..a9cf989d726 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -4227,7 +4227,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap double pessimistic_tail_gap_multiplier, bool simplify_topologies, size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, const function(id_t)>* project, - bool allow_negative_scores) { + bool allow_negative_scores, unordered_map* left_align_strand) { // don't dynamically choose band padding, shim constant value into a function type function constant_padding = [&](const Alignment& seq, const HandleGraph& graph) { @@ -4248,7 +4248,8 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap cutting_snarls, dist_index, project, - allow_negative_scores); + allow_negative_scores, + left_align_strand); } void MultipathAlignmentGraph::deduplicate_alt_alns(vector>& alt_alns, @@ -5184,7 +5185,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap function band_padding_function, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, const function(id_t)>* project, - bool allow_negative_scores) { + bool allow_negative_scores, unordered_map* left_align_strand) { // TODO: magic number // how many tails we need to have before we try the more complicated but @@ -5351,7 +5352,8 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap vector alt_alignments; aligner->align_global_banded_multi(intervening_sequence, alt_alignments, connecting_graph, num_alns_iter, - band_padding_function(intervening_sequence, connecting_graph), true); + band_padding_function(intervening_sequence, connecting_graph), true, + left_align_strand); // remove alignments with the same path deduplicated = convert_and_deduplicate(alt_alignments, false, false); diff --git a/src/multipath_alignment_graph.hpp b/src/multipath_alignment_graph.hpp index bb7abbd9889..90b3de547a7 100644 --- a/src/multipath_alignment_graph.hpp +++ b/src/multipath_alignment_graph.hpp @@ -197,7 +197,7 @@ namespace vg { size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, bool simplify_topologies, size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr, const function(id_t)>* project = nullptr, - bool allow_negative_scores = false); + bool allow_negative_scores = false, unordered_map* left_align_strand = nullptr); /// Do intervening and tail alignments between the anchoring paths and /// store the result in a multipath_alignment_t. Reachability edges must @@ -215,7 +215,8 @@ namespace vg { size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, bool simplify_topologies, size_t unmergeable_len, function band_padding_function, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr, - const function(id_t)>* project = nullptr, bool allow_negative_scores = false); + const function(id_t)>* project = nullptr, bool allow_negative_scores = false, + unordered_map* left_align_strand = nullptr); /// Converts a MultipathAlignmentGraph to a GraphViz Dot representation, output to the given ostream. /// If given the Alignment query we are working on, can produce information about subpath iterators. diff --git a/src/recombinator.cpp b/src/recombinator.cpp index 533e73f8237..b1a038884d5 100644 --- a/src/recombinator.cpp +++ b/src/recombinator.cpp @@ -220,7 +220,7 @@ void Haplotypes::TopLevelChain::simple_sds_load(std::istream& in) { void Haplotypes::TopLevelChain::load_old(std::istream& in) { this->offset = sdsl::simple_sds::load_value(in); this->job_id = sdsl::simple_sds::load_value(in); - this->contig_name = "chain_" + std::to_string(this->offset); + this->contig_name = "component_" + std::to_string(this->offset); size_t subchain_count = sdsl::simple_sds::load_value(in); this->subchains.resize(subchain_count); for (size_t i = 0; i < subchain_count; i++) { @@ -310,72 +310,70 @@ Haplotypes HaplotypePartitioner::partition_haplotypes(const Parameters& paramete Haplotypes result; result.header.k = this->minimizer_index.k(); - // Determine top-level chains. + // Determine GBWT construction jobs. double start = gbwt::readTimer(); if (this->verbosity >= Haplotypes::verbosity_basic) { std::cerr << "Determining construction jobs" << std::endl; } + size_t size_bound = this->gbz.graph.get_node_count() / parameters.approximate_jobs; + gbwtgraph::ConstructionJobs jobs = gbwtgraph::gbwt_construction_jobs(this->gbz.graph, size_bound); + result.header.construction_jobs = jobs.size(); + + // Determine the number of top-level chains and fill in basic information. size_t total_chains = 0; this->distance_index.for_each_child(this->distance_index.get_root(), [&](const handlegraph::net_handle_t&) { total_chains++; }); - size_t size_bound = this->gbz.graph.get_node_count() / parameters.approximate_jobs; - gbwtgraph::ConstructionJobs jobs = gbwtgraph::gbwt_construction_jobs(this->gbz.graph, size_bound); - if (jobs.components != total_chains) { + if (jobs.components() != total_chains) { // TODO: Could we instead identify the components with multiple top-level chains // and skip them? std::string msg = "HaplotypePartitioner::partition_haplotypes(): there are " - + std::to_string(total_chains) + " top-level chains and " + std::to_string(jobs.components) + + std::to_string(total_chains) + " top-level chains and " + std::to_string(jobs.components()) + " weakly connected components; haplotype sampling cannot be used with this graph"; throw std::runtime_error(msg); } - result.header.top_level_chains = jobs.components; - result.header.construction_jobs = jobs.size(); + result.header.top_level_chains = jobs.components(); result.chains.resize(result.components()); for (size_t chain_id = 0; chain_id < result.components(); chain_id++) { result.chains[chain_id].offset = chain_id; result.chains[chain_id].job_id = result.jobs(); // Not assigned to any job yet. - result.chains[chain_id].contig_name = "chain_" + std::to_string(chain_id); // Placeholder name. + result.chains[chain_id].contig_name = "component_" + std::to_string(chain_id); // Placeholder name. } - // Assign chains to jobs and determine contig names. + // Assign chains to jobs and fill in contig names. auto chains_by_job = gbwtgraph::partition_chains(this->distance_index, this->gbz.graph, jobs); + // We do not use a path filter, because a GBZ graph should not contain alt paths. + auto contig_names = jobs.contig_names(this->gbz.graph); for (size_t job_id = 0; job_id < result.jobs(); job_id++) { for (auto& chain : chains_by_job[job_id]) { result.chains[chain.offset].job_id = job_id; - auto da = this->r_index.decompressDA(gbwtgraph::GBWTGraph::handle_to_node(chain.handle)); - for (gbwt::size_type seq_id : da) { - gbwt::size_type path_id = gbwt::Path::id(seq_id); - auto iter = this->gbz.graph.id_to_path.find(path_id); - if (iter != this->gbz.graph.id_to_path.end()) { - // This is a generic / reference path. Use its contig name for the chain. - gbwt::PathName path_name = this->gbz.index.metadata.path(path_id); - result.chains[chain.offset].contig_name = this->gbz.index.metadata.contig(path_name.contig); - if (this->verbosity >= Haplotypes::verbosity_debug) { - std::cerr << "Using contig name " << result.chains[chain.offset].contig_name << " for chain " << chain.offset << std::endl; - } - break; - } + nid_t node_id = this->gbz.graph.get_id(chain.handle); + size_t component_id = jobs.component(node_id); + if (component_id >= contig_names.size()) { + std::string msg = "HaplotypePartitioner::partition_haplotypes(): cannot map top-level chain " + std::to_string(chain.offset) + " to a component"; + throw std::runtime_error(msg); + } + result.chains[chain.offset].contig_name = contig_names[component_id]; + if (this->verbosity >= Haplotypes::verbosity_debug) { + std::cerr << "Using contig name " << contig_names[component_id] << " for chain " << chain.offset << std::endl; } } } // Assign named and reference paths to jobs. - result.jobs_for_cached_paths.reserve(this->gbz.graph.named_paths.size()); - for (size_t i = 0; i < this->gbz.graph.named_paths.size(); i++) { - const gbwtgraph::NamedPath& path = this->gbz.graph.named_paths[i]; - if (path.from == gbwt::invalid_edge()) { - // Skip empty paths. - result.jobs_for_cached_paths.push_back(result.jobs()); - continue; - } - nid_t node_id = gbwt::Node::id(path.from.first); - auto iter = jobs.node_to_job.find(node_id); - if (iter == jobs.node_to_job.end()) { - std::string msg = "HaplotypePartitioner::partition_haplotypes(): cannot assign node " + std::to_string(node_id) + " to a job"; - throw std::runtime_error(msg); + result.jobs_for_cached_paths = std::vector(this->gbz.graph.named_paths.size(), result.jobs()); + // Again, we do not use a path filter, because a GBZ graph should not contain alt paths. + auto assignments = gbwtgraph::assign_paths(this->gbz.graph, jobs, nullptr, nullptr); + for (size_t job = 0; job < assignments.size(); job++) { + for (const path_handle_t& path : assignments[job]) { + // We know that GBWTGraph path handles for reference/generic paths are offsets in named_paths. + size_t path_id = handlegraph::as_integer(path); + if (path_id >= result.jobs_for_cached_paths.size()) { + std::string msg = "HaplotypePartitioner::partition_haplotypes(): path " + std::to_string(path_id) + " is not a reference/generic path"; + throw std::runtime_error(msg); + } + result.jobs_for_cached_paths[path_id] = job; } - result.jobs_for_cached_paths.push_back(iter->second); } jobs = gbwtgraph::ConstructionJobs(); // Save memory. @@ -779,7 +777,7 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain * GBWT metadata will be set as following: * * * Sample name is "recombination". - * * Contig name is "chain_X", where X is the chain identifier. + * * Contig name is taken from the top-level chain. * * Haplotype identifier is set during construction. * * Fragment identifier is set as necessary. */ @@ -822,19 +820,25 @@ struct RecombinatorHaplotype { * take the prefix of the original original haplotype until the start * of the subchain. */ - void extend(sequence_type sequence, const Haplotypes::Subchain& subchain, const Recombinator& recombinator, gbwt::GBWTBuilder& builder); + void extend( + sequence_type sequence, const Haplotypes::Subchain& subchain, const Recombinator& recombinator, + gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata + ); // Takes an existing haplotype from the GBWT index and inserts it into /// the builder. This is intended for fragments that do not contain /// subchains crossed by the original haplotypes. The call will fail if /// `extend()` has been called. - void take(gbwt::size_type sequence_id, const Recombinator& recombinator, gbwt::GBWTBuilder& builder); + void take( + gbwt::size_type sequence_id, const Recombinator& recombinator, + gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata + ); // Extends the original haplotype from the latest `extend()` call until // the end, inserts it into the builder, and starts a new fragment. // The call will fail if `extend()` has not been called for this // fragment. - void finish(const Recombinator& recombinator, gbwt::GBWTBuilder& builder); + void finish(const Recombinator& recombinator, gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata); private: // Extends the haplotype over a unary path from a previous subchain. @@ -847,10 +851,13 @@ struct RecombinatorHaplotype { void suffix(const gbwt::GBWT& index); // Inserts the current fragment into the builder. - void insert(gbwt::GBWTBuilder& builder); + void insert(gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata); }; -void RecombinatorHaplotype::extend(sequence_type sequence, const Haplotypes::Subchain& subchain, const Recombinator& recombinator, gbwt::GBWTBuilder& builder) { +void RecombinatorHaplotype::extend( + sequence_type sequence, const Haplotypes::Subchain& subchain, const Recombinator& recombinator, + gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata +) { if (subchain.type == Haplotypes::Subchain::full_haplotype) { throw std::runtime_error("Haplotype::extend(): cannot extend a full haplotype"); } @@ -877,7 +884,7 @@ void RecombinatorHaplotype::extend(sequence_type sequence, const Haplotypes::Sub if (subchain.type == Haplotypes::Subchain::suffix) { this->position = curr; - this->finish(recombinator, builder); + this->finish(recombinator, builder, metadata); return; } @@ -893,7 +900,10 @@ void RecombinatorHaplotype::extend(sequence_type sequence, const Haplotypes::Sub this->position = curr; } -void RecombinatorHaplotype::take(gbwt::size_type sequence_id, const Recombinator& recombinator, gbwt::GBWTBuilder& builder) { +void RecombinatorHaplotype::take( + gbwt::size_type sequence_id, const Recombinator& recombinator, + gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata +) { if (!this->path.empty()) { throw std::runtime_error("Haplotype::take(): the current fragment is not empty"); } @@ -901,19 +911,19 @@ void RecombinatorHaplotype::take(gbwt::size_type sequence_id, const Recombinator throw std::runtime_error("Haplotype::take(): the GBWT index does not contain sequence " + std::to_string(sequence_id)); } this->path = recombinator.gbz.index.extract(sequence_id); - this->insert(builder); + this->insert(builder, metadata); this->fragment++; this->sequence_id = gbwt::invalid_sequence(); this->position = gbwt::invalid_edge(); this->path.clear(); } -void RecombinatorHaplotype::finish(const Recombinator& recombinator, gbwt::GBWTBuilder& builder) { +void RecombinatorHaplotype::finish(const Recombinator& recombinator, gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata) { if (this->position == gbwt::invalid_edge()) { throw std::runtime_error("Haplotype::finish(): there is no current position"); } this->suffix(recombinator.gbz.index); - this->insert(builder); + this->insert(builder, metadata); this->fragment++; this->sequence_id = gbwt::invalid_sequence(); this->position = gbwt::invalid_edge(); @@ -966,19 +976,9 @@ void RecombinatorHaplotype::suffix(const gbwt::GBWT& index) { } } -void RecombinatorHaplotype::insert(gbwt::GBWTBuilder& builder) { +void RecombinatorHaplotype::insert(gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata) { std::string sample_name = "recombination"; - gbwt::size_type sample_id = builder.index.metadata.sample(sample_name); - if (sample_id >= builder.index.metadata.samples()) { - builder.index.metadata.addSamples({ sample_name }); - } - - gbwt::size_type contig_id = builder.index.metadata.contig(this->contig_name); - if (contig_id >= builder.index.metadata.contigs()) { - builder.index.metadata.addContigs({ this->contig_name }); - } - - builder.index.metadata.addPath(sample_id, contig_id, this->id, this->fragment); + metadata.add_haplotype(sample_name, this->contig_name, this->id, this->fragment); builder.insert(this->path, true); } @@ -1022,24 +1022,18 @@ Recombinator::Recombinator(const gbwtgraph::GBZ& gbz, Verbosity verbosity) : //------------------------------------------------------------------------------ -void add_path(const gbwt::GBWT& source, gbwt::size_type path_id, gbwt::GBWTBuilder& builder) { +void add_path(const gbwt::GBWT& source, gbwt::size_type path_id, gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata) { // We know that sufficient metadata exists, because this is a cached path. gbwt::PathName path_name = source.metadata.path(path_id); - std::string sample_name = source.metadata.sample(path_name.sample); - path_name.sample = builder.index.metadata.sample(sample_name); - if (path_name.sample >= builder.index.metadata.samples()) { - builder.index.metadata.addSamples({ sample_name }); - } - std::string contig_name = source.metadata.contig(path_name.contig); - path_name.contig = builder.index.metadata.contig(contig_name); - if (path_name.contig >= builder.index.metadata.contigs()) { - builder.index.metadata.addContigs({ contig_name }); + if (sample_name == gbwtgraph::REFERENCE_PATH_SAMPLE_NAME) { + metadata.add_generic_path(contig_name); + } else { + // Reference samples will be copied later. + metadata.add_haplotype(sample_name, contig_name, path_name.phase, path_name.count); } - builder.index.metadata.addPath(path_name); - gbwt::vector_type path = source.extract(gbwt::Path::encode(path_id, false)); builder.insert(path, true); } @@ -1170,6 +1164,8 @@ gbwt::GBWT Recombinator::generate_haplotypes(const Haplotypes& haplotypes, const } // Build partial indexes. + // We use a separate MetadataBuilder for each job, because we don't know in advance + // how many fragments there will be for each generated haplotype. double checkpoint = gbwt::readTimer(); if (this->verbosity >= Haplotypes::verbosity_basic) { std::cerr << "Running " << omp_get_max_threads() << " GBWT construction jobs in parallel" << std::endl; @@ -1179,24 +1175,28 @@ gbwt::GBWT Recombinator::generate_haplotypes(const Haplotypes& haplotypes, const #pragma omp parallel for schedule(dynamic, 1) for (size_t job = 0; job < jobs.size(); job++) { gbwt::GBWTBuilder builder(sdsl::bits::length(this->gbz.index.sigma() - 1), parameters.buffer_size); - builder.index.addMetadata(); + gbwtgraph::MetadataBuilder metadata; Statistics job_statistics; + // Add named and reference paths. + for (auto path_id : reference_paths[job]) { + add_path(this->gbz.index, path_id, builder, metadata); + job_statistics.ref_paths++; + } // Add haplotypes for each chain. for (auto chain_id : jobs[job]) { try { - Statistics chain_statistics = this->generate_haplotypes(haplotypes.chains[chain_id], counts, builder, parameters, coverage); + Statistics chain_statistics = this->generate_haplotypes( + haplotypes.chains[chain_id], counts, builder, metadata, parameters, coverage + ); job_statistics.combine(chain_statistics); } catch (const std::runtime_error& e) { std::cerr << "error: [job " << job << "]: " << e.what() << std::endl; std::exit(EXIT_FAILURE); } } - // Add named and reference paths. - for (auto path_id : reference_paths[job]) { - add_path(this->gbz.index, path_id, builder); - job_statistics.ref_paths++; - } builder.finish(); + builder.index.addMetadata(); + builder.index.metadata = metadata.get_metadata(); indexes[job] = builder.index; #pragma omp critical { @@ -1212,18 +1212,14 @@ gbwt::GBWT Recombinator::generate_haplotypes(const Haplotypes& haplotypes, const std::cerr << "Finished the jobs in " << seconds << " seconds" << std::endl; } - // Merge the partial indexes. + // Merge the partial indexes and set the reference samples tag. checkpoint = gbwt::readTimer(); if (this->verbosity >= Haplotypes::verbosity_basic) { std::cerr << "Merging the partial indexes" << std::endl; } gbwt::GBWT merged(indexes); if (parameters.include_reference) { - // If we included reference paths, set the same samples as references in the output GBWT. - std::string reference_samples = this->gbz.index.tags.get(gbwtgraph::REFERENCE_SAMPLE_LIST_GBWT_TAG); - if (!reference_samples.empty()) { - merged.tags.set(gbwtgraph::REFERENCE_SAMPLE_LIST_GBWT_TAG, reference_samples); - } + copy_reference_samples(this->gbz.index, merged); } if (this->verbosity >= Haplotypes::verbosity_basic) { double seconds = gbwt::readTimer() - checkpoint; @@ -1420,7 +1416,7 @@ std::vector> select_haplotypes( Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::TopLevelChain& chain, const hash_map& kmer_counts, - gbwt::GBWTBuilder& builder, + gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata, const Parameters& parameters, double coverage ) const { @@ -1439,7 +1435,7 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top for (size_t haplotype = 0; haplotype < haplotypes.size(); haplotype++) { assert(!subchain.sequences.empty()); size_t seq = haplotype % subchain.sequences.size(); - haplotypes[haplotype].take(subchain.sequences[seq].first, *this, builder); + haplotypes[haplotype].take(subchain.sequences[seq].first, *this, builder, metadata); } statistics.full_haplotypes = 1; } else { @@ -1491,14 +1487,14 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top size_t selected = haplotype_to_selected[haplotype]; size_t seq_offset = selected_haplotypes[selected].first; statistics.score += selected_haplotypes[selected].second; - haplotypes[haplotype].extend(subchain.sequences[seq_offset], subchain, *this, builder); + haplotypes[haplotype].extend(subchain.sequences[seq_offset], subchain, *this, builder, metadata); } have_haplotypes = subchain.has_end(); statistics.subchains++; } if (have_haplotypes) { for (size_t haplotype = 0; haplotype < haplotypes.size(); haplotype++) { - haplotypes[haplotype].finish(*this, builder); + haplotypes[haplotype].finish(*this, builder, metadata); } } statistics.fragments = haplotypes.front().fragment; diff --git a/src/recombinator.hpp b/src/recombinator.hpp index 002e61f7460..0801e4421e1 100644 --- a/src/recombinator.hpp +++ b/src/recombinator.hpp @@ -521,7 +521,7 @@ class Recombinator { // Generate haplotypes for the given chain. Statistics generate_haplotypes(const Haplotypes::TopLevelChain& chain, const hash_map& kmer_counts, - gbwt::GBWTBuilder& builder, + gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata, const Parameters& parameters, double coverage) const; }; diff --git a/src/subcommand/convert_main.cpp b/src/subcommand/convert_main.cpp index 2e798e1f69c..7474f5a521e 100644 --- a/src/subcommand/convert_main.cpp +++ b/src/subcommand/convert_main.cpp @@ -491,7 +491,7 @@ void help_convert(char** argv) { << " Not compatible with other GFA output options or non-GBWT graphs." << endl << " --vg-algorithm Always use the VG GFA algorithm. Works with all options and graph types," << endl << " but can't preserve original GFA coordinates." << endl - << " --no-translation When using the GBWTGraph algorith, convert the graph directly to GFA." << endl + << " --no-translation When using the GBWTGraph algorithm, convert the graph directly to GFA." << endl << " Do not use the translation to preserve original coordinates." << endl << "alignment options:" << endl << " -G, --gam-to-gaf FILE convert GAM FILE to GAF" << endl diff --git a/src/subcommand/gamcompare_main.cpp b/src/subcommand/gamcompare_main.cpp index 7cd27270513..3000a096806 100644 --- a/src/subcommand/gamcompare_main.cpp +++ b/src/subcommand/gamcompare_main.cpp @@ -11,6 +11,7 @@ #include "subcommand.hpp" #include "../alignment.hpp" +#include "../annotation.hpp" #include "../snarl_distance_index.hpp" #include "../vg.hpp" #include @@ -257,7 +258,7 @@ int main_gamcompare(int argc, char** argv) { // Output TSV to standard out in the format plot-qq.R needs. if (!header_printed) { // It needs a header - cout << "correct\tmq\taligner\tread" << endl; + cout << "correct\tmq\taligner\tread\teligible" << endl; header_printed = true; } @@ -266,7 +267,8 @@ int main_gamcompare(int argc, char** argv) { cout << (aln.correctly_mapped() ? "1" : "0") << "\t"; cout << aln.mapping_quality() << "\t"; cout << aligner_name << "\t"; - cout << aln.name() << endl; + cout << aln.name() << "\t"; + cout << (has_annotation(aln, "no_truth") ? "0" : "1") << endl; } text_buffer.clear(); }; @@ -341,6 +343,8 @@ int main_gamcompare(int argc, char** argv) { // Annotate it as such aln.set_correctly_mapped(correctly_mapped); + // And make sure we say it was possible to get + clear_annotation(aln, "no_truth"); if (correctly_mapped) { correct_counts.at(omp_get_thread_num()) += 1; @@ -358,15 +362,25 @@ int main_gamcompare(int argc, char** argv) { correct_count_by_mapq_by_thread.at(omp_get_thread_num()).at(mapq) += 1; } } + } else if (range != -1) { + // We are flagging reads correct/incorrect, but this read has no truth position. + // Remember that it was impossible to get. + set_annotation(aln, "no_truth", true); } #pragma omp critical { if (output_tsv) { - text_buffer.emplace_back(std::move(aln)); + if (emitter) { + // Copy the alignment since we need it twice + text_buffer.emplace_back(aln); + } else { + text_buffer.emplace_back(std::move(aln)); + } if (text_buffer.size() > 1000) { flush_text_buffer(); } - } else { + } + if (emitter) { emitter->write(std::move(aln)); } } diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index 7971f72aa94..94b16945984 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -102,6 +102,17 @@ struct GBWTConfig { static size_t default_merge_jobs() { return std::min(static_cast(gbwt::MergeParameters::MERGE_JOBS), std::max(static_cast(1), static_cast(omp_get_max_threads() / 2))); } + + gbwtgraph::PathCoverParameters path_cover_parameters() const { + gbwtgraph::PathCoverParameters parameters; + parameters.num_paths = this->num_paths; + parameters.context = this->context_length; + parameters.batch_size = this->haplotype_indexer.gbwt_buffer_size * gbwt::MILLION; + parameters.sample_interval = this->haplotype_indexer.id_interval; + parameters.parallel_jobs = this->build_jobs; + parameters.show_progress = this->show_progress; + return parameters; + } }; struct GraphHandler { @@ -112,6 +123,10 @@ struct GraphHandler { std::unique_ptr gbwt_graph = nullptr; graph_type in_use = graph_none; + // Returns a pointer to any stored `PathHandleGraph` or loads one according to + // the config if there is no such graph. + const PathHandleGraph* get_any_graph(const GBWTConfig& config); + // Load the `PathHandleGraph` specified in the config and release other graphs. // No effect if the handler already contains a `PathHandleGraph`. void get_graph(const GBWTConfig& config); @@ -245,7 +260,7 @@ void help_gbwt(char** argv) { std::cerr << " --id-interval N store path ids at one out of N positions (default " << gbwt::DynamicGBWT::SAMPLE_INTERVAL << ")" << std::endl; std::cerr << std::endl; std::cerr << "Multithreading:" << std::endl; - std::cerr << " --num-jobs N use at most N parallel build jobs (for -v and -G; default " << GBWTConfig::default_build_jobs() << ")" << std::endl; + std::cerr << " --num-jobs N use at most N parallel build jobs (for -v, -G, -l, -P; default " << GBWTConfig::default_build_jobs() << ")" << std::endl; std::cerr << " --num-threads N use N parallel search threads (for -b and -r; default " << omp_get_max_threads() << ")" << std::endl; std::cerr << std::endl; std::cerr << "Step 1: GBWT construction (requires -o and one of { -v, -G, -Z, -E, A }):" << std::endl; @@ -294,7 +309,7 @@ void help_gbwt(char** argv) { std::cerr << " --set-tag K=V set a GBWT tag (may repeat)" << std::endl; std::cerr << " --set-reference X set sample X as the reference (may repeat)" << std::endl; std::cerr << std::endl; - std::cerr << "Step 4: Path cover GBWT construction (requires -o, -x, and one of { -a, -l, -P }):" << std::endl; + std::cerr << "Step 4: Path cover GBWT construction (requires an input graph, -o, and one of { -a, -l, -P }):" << std::endl; std::cerr << " -a, --augment-gbwt add a path cover of missing components (one input GBWT)" << std::endl; std::cerr << " -l, --local-haplotypes sample local haplotypes (one input GBWT)" << std::endl; std::cerr << " -P, --path-cover build a greedy path cover (no input GBWTs)" << std::endl; @@ -1002,8 +1017,11 @@ void validate_gbwt_config(GBWTConfig& config) { } if (config.path_cover != GBWTConfig::path_cover_none) { - if (!has_gbwt_output || config.graph_name.empty()) { - std::cerr << "error: [vg gbwt] path cover options require -x and output GBWT" << std::endl; + if (!has_gbwt_output || (config.graph_name.empty() && config.build != GBWTConfig::build_gbz && config.build != GBWTConfig::build_gbwtgraph)) { + // Path cover options needs a graph. We can use the provided graph or the GBZ/GBWTGraph + // we took as an input. In the latter case, we know that the corresponding GBWT has not + // been modified and the graph is hence safe to use. + std::cerr << "error: [vg gbwt] path cover options require an input graph and output GBWT" << std::endl; std::exit(EXIT_FAILURE); } if (config.path_cover == GBWTConfig::path_cover_greedy && !config.input_filenames.empty()) { @@ -1491,54 +1509,42 @@ void step_4_path_cover(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& con if (config.show_progress) { std::cerr << "Finding a " << config.num_paths << "-path cover with context length " << config.context_length << std::endl; } - - graphs.get_graph(config); + + // Select the appropriate graph. + const PathHandleGraph* graph = graphs.get_any_graph(config); // We need to drop paths that are alt allele paths and not pass them // through from a graph that has them to the synthesized GBWT. - std::function path_filter = [&graphs](const path_handle_t& path) { - return !Paths::is_alt(graphs.path_graph->get_path_name(path)); + std::function path_filter = [&graph](const path_handle_t& path) { + return !Paths::is_alt(graph->get_path_name(path)); }; if (config.path_cover == GBWTConfig::path_cover_greedy) { if (config.show_progress) { std::cerr << "Algorithm: greedy" << std::endl; } - gbwt::GBWT cover = gbwtgraph::path_cover_gbwt(*(graphs.path_graph), - config.num_paths, - config.context_length, - config.haplotype_indexer.gbwt_buffer_size * gbwt::MILLION, - config.haplotype_indexer.id_interval, - config.include_named_paths, - &path_filter, - config.show_progress); + gbwt::GBWT cover = gbwtgraph::path_cover_gbwt( + *graph, config.path_cover_parameters(), + config.include_named_paths, &path_filter + ); + copy_reference_samples(*graph, cover); gbwts.use(cover); } else if (config.path_cover == GBWTConfig::path_cover_augment) { if (config.show_progress) { std::cerr << "Algorithm: augment" << std::endl; } gbwts.use_dynamic(); - gbwtgraph::augment_gbwt(*(graphs.path_graph), - gbwts.dynamic, - config.num_paths, - config.context_length, - config.haplotype_indexer.gbwt_buffer_size * gbwt::MILLION, - config.haplotype_indexer.id_interval, - config.show_progress); + gbwtgraph::augment_gbwt(*graph, gbwts.dynamic, config.path_cover_parameters()); } else { if (config.show_progress) { std::cerr << "Algorithm: local haplotypes" << std::endl; } gbwts.use_compressed(); - gbwt::GBWT cover = gbwtgraph::local_haplotypes(*(graphs.path_graph), - gbwts.compressed, - config.num_paths, - config.context_length, - config.haplotype_indexer.gbwt_buffer_size * gbwt::MILLION, - config.haplotype_indexer.id_interval, - config.include_named_paths, - &path_filter, - config.show_progress); + gbwt::GBWT cover = gbwtgraph::local_haplotypes( + *graph, gbwts.compressed, config.path_cover_parameters(), + config.include_named_paths, &path_filter + ); + copy_reference_samples(gbwts.compressed, cover); gbwts.use(cover); } gbwts.unbacked(); // We modified the GBWT. @@ -1712,6 +1718,15 @@ void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config) { //---------------------------------------------------------------------------- +const PathHandleGraph* GraphHandler::get_any_graph(const GBWTConfig& config) { + if (this->in_use == GraphHandler::graph_gbz || this->in_use == GraphHandler::graph_gbwtgraph) { + return this->gbwt_graph.get(); + } else { + this->get_graph(config); + return this->path_graph.get(); + } +} + void GraphHandler::get_graph(const GBWTConfig& config) { if (this->in_use == graph_path) { return; diff --git a/src/surjector.cpp b/src/surjector.cpp index 0f1f407daab..83f008ce335 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -3068,6 +3068,13 @@ using namespace std; #endif } + // left align on forward strands and right align on reverse strands + unordered_map left_align_strand; + left_align_strand.reserve(aln_graph->get_node_count()); + aln_graph->for_each_handle([&](const handle_t& handle) { + left_align_strand[handle] = node_trans.at(aln_graph->get_id(handle)).second; + }); + // align the intervening segments and store the result in a multipath alignment multipath_alignment_t mp_aln; mp_aln_graph.align(source, *aln_graph, get_aligner(), @@ -3083,7 +3090,8 @@ using namespace std; nullptr, // snarl manager nullptr, // distance index nullptr, // projector - allow_negative_scores); + allow_negative_scores, // subpath local + &left_align_strand); // strand to left align against topologically_order_subpaths(mp_aln); diff --git a/src/transcriptome.hpp b/src/transcriptome.hpp index 7d27d1de5b7..90d3ba13bad 100644 --- a/src/transcriptome.hpp +++ b/src/transcriptome.hpp @@ -14,9 +14,9 @@ #include #include -#include "../vg.hpp" -#include "../types.hpp" -#include "../gbwt_helper.hpp" +#include "vg.hpp" +#include "types.hpp" +#include "gbwt_helper.hpp" namespace vg { diff --git a/src/unittest/minimizer_mapper.cpp b/src/unittest/minimizer_mapper.cpp index bbcfdbda43f..8b60d21479e 100644 --- a/src/unittest/minimizer_mapper.cpp +++ b/src/unittest/minimizer_mapper.cpp @@ -30,6 +30,7 @@ class TestMinimizerMapper : public MinimizerMapper { using MinimizerMapper::faster_cap; using MinimizerMapper::with_dagified_local_graph; using MinimizerMapper::align_sequence_between; + using MinimizerMapper::fix_dozeu_end_deletions; }; TEST_CASE("Fragment length distribution gets reasonable value", "[giraffe][mapping]") { @@ -386,6 +387,59 @@ TEST_CASE("MinimizerMapper can extract a strand-split dagified local graph witho } +TEST_CASE("MinimizerMapper can fix up alignments with deletions on the ends", "[giraffe][mapping]") { + + gbwtgraph::GBWTGraph gbwt_graph; + gbwt::GBWT gbwt; + gbwt_graph.set_gbwt(gbwt); + gbwtgraph::DefaultMinimizerIndex minimizer_index; + SnarlDistanceIndex distance_index; + PathPositionHandleGraph* handle_graph; + TestMinimizerMapper test_mapper (gbwt_graph, minimizer_index, &distance_index, handle_graph); + + Alignment aln; + auto m1 = aln.mutable_path()->add_mapping(); + auto p1 = m1->mutable_position(); + p1->set_node_id(1); + p1->set_offset(3); + auto e1 = m1->add_edit(); + e1->set_from_length(2); + e1->set_to_length(0); + + auto m2 = aln.mutable_path()->add_mapping(); + auto p2 = m2->mutable_position(); + p2->set_node_id(2); + p2->set_offset(0); + auto e2 = m2->add_edit(); + e2->set_from_length(2); + e2->set_to_length(0); + auto e3 = m2->add_edit(); + e3->set_from_length(1); + e3->set_to_length(1); + auto e4 = m2->add_edit(); + e4->set_from_length(1); + e4->set_to_length(0); + + auto m3 = aln.mutable_path()->add_mapping(); + auto p3 = m3->mutable_position(); + p3->set_node_id(3); + p3->set_offset(0); + auto e5 = m3->add_edit(); + e5->set_from_length(1); + e5->set_to_length(0); + + test_mapper.fix_dozeu_end_deletions(aln); + + REQUIRE(aln.path().mapping_size() == 1); + REQUIRE(aln.path().mapping(0).position().node_id() == 2); + REQUIRE(aln.path().mapping(0).position().offset() == 2); + REQUIRE(aln.path().mapping(0).position().is_reverse() == false); + REQUIRE(aln.path().mapping(0).edit_size() == 1); + REQUIRE(aln.path().mapping(0).edit(0).from_length() == 1); + REQUIRE(aln.path().mapping(0).edit(0).to_length() == 1); + REQUIRE(aln.path().mapping(0).edit(0).sequence() == ""); +} + } diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 20a5ad071e8..6c11196fe7a 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 151 +plan tests 159 # Build vg graphs for two chromosomes @@ -298,10 +298,15 @@ is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes: 2 contigs" is $(vg gbwt -H xy.local.gbwt) 16 "local haplotypes: 16 haplotypes" is $(vg gbwt -S xy.local.gbwt) 16 "local haplotypes: 16 samples" -# Build GBZ from 16 paths of local haplotypes -vg gbwt -x xy-alt.xg -g xy.local.gbz --gbz-format -l -n 16 -v small/xy2.vcf.gz -is $? 0 "Local haplotypes GBZ construction" -is $(md5sum xy.local.gbz | cut -f 1 -d\ ) 65d2290f32c200ea57212cb7b71075b0 "GBZ was serialized correctly" +# Build GBZ from 16 paths of local haplotypes with a single job +vg gbwt -x xy-alt.xg -g xy.local.gbz --gbz-format -l -n 16 --num-jobs 1 -v small/xy2.vcf.gz +is $? 0 "Local haplotypes GBZ construction (single job)" +is $(md5sum xy.local.gbz | cut -f 1 -d\ ) b6540312514c4e70aa45fc65b4bd762c "GBZ was serialized correctly" + +# As above, but with two parallel jobs +vg gbwt -x xy-alt.xg -g xy.local.gbz --gbz-format -l -n 16 --num-jobs 2 -v small/xy2.vcf.gz +is $? 0 "Local haplotypes GBZ construction (two jobs)" +is $(md5sum xy.local.gbz | cut -f 1 -d\ ) b6540312514c4e70aa45fc65b4bd762c "GBZ was serialized correctly" rm -f xy.local.gg xy.local.gbwt xy.local.gbz @@ -316,6 +321,18 @@ is $(vg gbwt -S xy.local.gbwt) 17 "local haplotypes w/ paths: 17 samples" rm -f xy.local.gg xy.local.gbwt +# Build GBZ from a GFA and then build a local haplotype cover with reference paths from the GBZ +vg gbwt -G haplotype-sampling/micb-kir3dl1.gfa -g large.gbz --gbz-format +vg gbwt -Z large.gbz -l -n 16 --pass-paths -o large.local.gbwt +is $? 0 "Local haplotypes with reference paths from a larger GBZ" +is $(vg gbwt -c large.local.gbwt) 36 "local haplotypes w/ paths: 36 threads" +is $(vg gbwt -C large.local.gbwt) 2 "local haplotypes w/ paths: 2 contigs" +is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ paths: 18 haplotypes" +is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ paths: 18 samples" +is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ paths: reference_samples set" + +rm -f large.gbz large.local.gbwt + # Build GBWTGraph from an augmented GBWT vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz @@ -324,7 +341,7 @@ is $? 0 "Augmented GBWTGraph construction" is $(md5sum augmented.gg | cut -f 1 -d\ ) 00429586246711abcf1367a97d3c468c "GBWTGraph was serialized correctly" is $(vg gbwt -c augmented.gbwt) 18 "augmented: 18 threads" is $(vg gbwt -C augmented.gbwt) 2 "augmented: 2 contigs" -is $(vg gbwt -H augmented.gbwt) 2 "augmented: 2 haplotypes" +is $(vg gbwt -H augmented.gbwt) 18 "augmented: 18 haplotypes" is $(vg gbwt -S augmented.gbwt) 17 "augmented: 17 samples" rm -f x.gbwt augmented.gg augmented.gbwt @@ -383,7 +400,7 @@ is $(wc -l < ref_paths.trans) 0 "ref paths: 0 translations" rm -f gfa.gbwt rm -f gfa2.gbwt gfa2.gg gfa2.trans gfa2.gbz rm -f ref_paths.gbwt ref_paths.gg ref_paths.trans -rm -f chopping.gbwt chopping.gg chopping.trans from_gbz.trans +rm -f chopping.gbwt chopping.gg chopping.gbz chopping.trans from_gbz.trans # Build a GBZ from a graph with a reference vg gbwt -g gfa.gbz --gbz-format -G graphs/gfa_with_reference.gfa diff --git a/vgci/vgci.py b/vgci/vgci.py index 9e9480ef93f..d8eb064a3c3 100755 --- a/vgci/vgci.py +++ b/vgci/vgci.py @@ -1284,7 +1284,7 @@ def test_sim_chr21_snp1kg_trained(self): sim_opts='-p 570 -v 150 -S 4 -i 0.002 -I', # 800k 148bp reads from Genome in a Bottle NA12878 library # (placeholder while finding something better) - sim_fastq='ftp://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/NA12878/NIST_NA12878_HG001_HiSeq_300x/131219_D00360_005_BH814YADXX/Project_RM8398/Sample_U5a/U5a_AGTCAA_L002_R1_007.fastq.gz') + sim_fastq='https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/NA12878/NIST_NA12878_HG001_HiSeq_300x/131219_D00360_005_BH814YADXX/Project_RM8398/Sample_U5a/U5a_AGTCAA_L002_R1_007.fastq.gz') @skip("skipping test to keep runtime down") @timeout_decorator.timeout(2400) diff --git a/vgci/vgci.sh b/vgci/vgci.sh index 44131e0b607..192d6d19839 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0 # Should we show stdout and stderr from tests? If so, set to "-s". SHOW_OPT="" # What toil-vg should we install? -TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@64774cc76ef117aef705c1b5203450482c6a4b60" +TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@c9bd6414f935e6095574a41a34addbb8d87b41a6" # What toil should we install? # Could be something like "toil[aws,mesos]==3.20.0" # or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]"