From 3cccbc3d79cbbd343a459dfe2c58be67bbb25cf5 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 28 Feb 2025 13:10:39 -0800 Subject: [PATCH 1/3] Create deletion edits when translating from Dozeu --- src/dozeu_interface.cpp | 25 +++++++++++++++---------- src/dozeu_interface.hpp | 3 +++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/dozeu_interface.cpp b/src/dozeu_interface.cpp index 77eda8caa68..dfe713b4584 100644 --- a/src/dozeu_interface.cpp +++ b/src/dozeu_interface.cpp @@ -418,17 +418,22 @@ void DozeuInterface::calculate_and_save_alignment(Alignment &alignment, const Or // aln->query_length can be shorter than alignment.sequence().size() if we // didn't traceback from the very last base of the query, or if we didn't // pack the whole query because of an offset. - - #define _push_mapping(_id) ({ \ - handle_t n = graph.order[(_id)]; \ - Mapping *mapping = path->add_mapping(); \ - mapping->set_rank(path->mapping_size()); \ - Position *position = mapping->mutable_position(); \ - position->set_node_id(graph.graph.get_id(n)); \ + + // Add a mapping to node at index _id. Only the first mapping will be + // allowed to use offset. Also consults (and clears) ref_offset + #define _push_mapping(_id) ({ \ + handle_t n = graph.order[(_id)]; \ + Mapping *mapping = path->add_mapping(); \ + size_t s = path->mapping_size(); \ + mapping->set_rank(s); \ + Position *position = mapping->mutable_position(); \ + position->set_node_id(graph.graph.get_id(n)); \ position->set_is_reverse(graph.graph.get_is_reverse(n)); \ - position->set_offset(ref_offset); ref_offset = 0; \ - mapping; \ - }) + if (s != 1 && ref_offset > 0) { mapping->add_edit()->set_from_length(ref_offset); } \ + else { position->set_offset(ref_offset); } \ + ref_offset = 0; \ + mapping; \ + }) #define _push_op(_m, _op, _len) { \ query_offset += push_edit(_m, (_op), &query[query_offset], (_len)); \ } diff --git a/src/dozeu_interface.hpp b/src/dozeu_interface.hpp index def39d19fb4..11203ed8022 100644 --- a/src/dozeu_interface.hpp +++ b/src/dozeu_interface.hpp @@ -226,6 +226,9 @@ class DozeuInterface { * can emit it as is. If it is false, the nodes were filled right to * left, and the internal traceback comes out in right to left order, * so we need to flip it. + * + * All Mappings in the Alignment other than the first will have a zero + * offset in their Positions. */ void calculate_and_save_alignment(Alignment& alignment, const OrderedGraph& graph, const vector& head_positions, From 9dd4122d4090b61c9db81f306c980507d959ed2f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 28 Feb 2025 16:15:08 -0800 Subject: [PATCH 2/3] Instead handle fixing the leftmost mapping offset when it makes it back around to the cut node for the right anchor --- src/dozeu_interface.cpp | 25 ++++--- src/minimizer_mapper_from_chains.cpp | 97 ++++++++++++++++++++++++---- 2 files changed, 94 insertions(+), 28 deletions(-) diff --git a/src/dozeu_interface.cpp b/src/dozeu_interface.cpp index dfe713b4584..8ae3e41ffb2 100644 --- a/src/dozeu_interface.cpp +++ b/src/dozeu_interface.cpp @@ -419,21 +419,18 @@ void DozeuInterface::calculate_and_save_alignment(Alignment &alignment, const Or // didn't traceback from the very last base of the query, or if we didn't // pack the whole query because of an offset. - // Add a mapping to node at index _id. Only the first mapping will be - // allowed to use offset. Also consults (and clears) ref_offset - #define _push_mapping(_id) ({ \ - handle_t n = graph.order[(_id)]; \ - Mapping *mapping = path->add_mapping(); \ - size_t s = path->mapping_size(); \ - mapping->set_rank(s); \ - Position *position = mapping->mutable_position(); \ - position->set_node_id(graph.graph.get_id(n)); \ + // Add a mapping to node at index _id. + // Also consults (and clears) ref_offset + #define _push_mapping(_id) ({ \ + handle_t n = graph.order[(_id)]; \ + Mapping *mapping = path->add_mapping(); \ + mapping->set_rank(path->mapping_size()); \ + Position *position = mapping->mutable_position(); \ + position->set_node_id(graph.graph.get_id(n)); \ position->set_is_reverse(graph.graph.get_is_reverse(n)); \ - if (s != 1 && ref_offset > 0) { mapping->add_edit()->set_from_length(ref_offset); } \ - else { position->set_offset(ref_offset); } \ - ref_offset = 0; \ - mapping; \ - }) + position->set_offset(ref_offset); ref_offset = 0; \ + mapping; \ + }) #define _push_op(_m, _op, _len) { \ query_offset += push_edit(_m, (_op), &query[query_offset], (_len)); \ } diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index 24932b36527..fee7f57da9a 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -3777,13 +3777,21 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l #endif // Then trim off the tips that are either in the wrong orientation relative - // to whether we want them to be a source or a sink, or extraneous + // to whether we want them to be a source or a sink, or extraneous. + // Also find the unique handles for the potentially cut-down anchor nodes. + // TODO: We re-find them on every trim. std::vector tip_handles = handlegraph::algorithms::find_tips(&dagified_graph); bool trimmed; size_t trim_count = 0; + bool found_left_anchor_tip; + handle_t left_anchor_tip; + bool found_right_anchor_tip; + handle_t right_anchor_tip; do { trimmed = false; + found_left_anchor_tip = false; + found_right_anchor_tip = false; // We need to make sure to remove only one orientation of each handle // we remove. std::unordered_set to_remove_ids; @@ -3797,6 +3805,17 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l #ifdef debug std::cerr << "Dagified graph node " << dagified_graph.get_id(h) << " " << dagified_graph.get_is_reverse(h) << " is an acceptable source (" << base_coords.first << " " << base_coords.second << ")" << std::endl; #endif + if (!is_empty(left_anchor)) { + // There should only be one source tip for the left anchor node: the one we actually grabbed the graph from. + // Catch any weird bugs in the subgraph extraction + if (found_left_anchor_tip) { + std::stringstream ss; + ss << "Duplicate left anchor tip when extracting " << left_anchor << " to " << right_anchor << ". Your graph has defeated subgraph extraction and the vg developers need to look at it. Please report a bug!"; + throw std::runtime_error(ss.str()); + } + found_left_anchor_tip = true; + left_anchor_tip = h; + } } else if (dagified_graph.get_is_reverse(h) && (is_empty(right_anchor) || base_coords.first == id(right_anchor))) { // Tip is inward reverse, so it's a sink. // This is a tail in the subgraph, and either matches a right @@ -3804,6 +3823,17 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l #ifdef debug std::cerr << "Dagified graph node " << dagified_graph.get_id(h) << " " << dagified_graph.get_is_reverse(h) << " is an acceptable sink (" << base_coords.first << " " << base_coords.second << ")" << std::endl; #endif + if (!is_empty(right_anchor)) { + // There should only be one sink tip for the right anchor node: the one we actually grabbed the graph from. + // Catch any weird bugs in the subgraph extraction + if (found_right_anchor_tip) { + std::stringstream ss; + ss << "Duplicate right anchor tip when extracting " << left_anchor << " to " << right_anchor << ". Your graph has defeated subgraph extraction and the vg developers need to look at it. Please report a bug!"; + throw std::runtime_error(ss.str()); + } + found_right_anchor_tip = true; + right_anchor_tip = h; + } } else { // This is a wrong orientation of an anchoring node, or some other tip. // We don't want to keep this handle @@ -3816,6 +3846,10 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l to_remove_handles.push_back(h); } } + // TODO: There should only be one acceptable tip for each + // anchoring node! We probably should check this and have a + // bunch of unit tests with ugly cyclic graphs where you can + // get at your anchoring nodes from behind. } for (auto& h : to_remove_handles) { dagified_graph.destroy_handle(h); @@ -3910,36 +3944,58 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l } } - // And translate back into original graph space + // And translate back into original graph ID and orientation space. for (size_t i = 0; i < alignment.path().mapping_size(); i++) { // Translate each mapping's ID and orientation down to the base graph Mapping* m = alignment.mutable_path()->mutable_mapping(i); handle_t dagified_handle = dagified_graph.get_handle(m->position().node_id(), m->position().is_reverse()); auto base_coords = dagified_handle_to_base(dagified_handle); + + if (i == 0) { + // Make sure that, if the leftmost mapping is on an anchor + // node, it's on a copy with the correct length. + // + // Otherwise the changes we make to the offset of the leftmost + // mapping when it's on an anchor node won't make sense, because we + // won't know we were really on a cut copy. + if (!is_empty(left_anchor) && base_coords.first == id(left_anchor)) { + crash_unless(dagified_graph.get_length(dagified_handle) == dagified_graph.get_length(left_anchor_tip)); + } + if (!is_empty(right_anchor) && base_coords.first == id(right_anchor)) { + crash_unless(dagified_graph.get_length(dagified_handle) == dagified_graph.get_length(right_anchor_tip)); + } + } m->mutable_position()->set_node_id(base_coords.first); m->mutable_position()->set_is_reverse(base_coords.second); } - if (!is_empty(left_anchor) && alignment.path().mapping_size() > 0) { - // Get the positions of the leftmost mapping - Position* left_pos = alignment.mutable_path()->mutable_mapping(0)->mutable_position(); - - if (offset(left_anchor) != 0 && offset(left_anchor) < graph->get_length(graph->get_handle(id(left_anchor)))) { - // There is some of the left anchor's node actually in the - // extracted graph. The left anchor isn't past the end of its node. - // The alignment must actually start on the anchor node. - assert(left_pos->node_id() == id(left_anchor)); - } + if (alignment.path().mapping_size() > 0) { + // If we anchored on the left, the leftmost mapping will be on a + // cut anchor node and its offset won't reflect the offset on the + // backing graph node. + // + // If we anchored on the right only, it's *still possible* for the + // alignment to turn around so the leftmost mapping is on the same + // cut anchor node we anchored the right end on, facing the other + // direction and offsetting from the cut point. + // + // Adjust the offset on the leftmost mapping so it counts from the + // entire node and not the cut-down part used for anchoring, if + // it's on one of the cut-down nodes. + Position* left_pos = alignment.mutable_path()->mutable_mapping(0)->mutable_position(); - if (left_pos->node_id() == id(left_anchor)) { + if (!is_empty(left_anchor) && left_pos->node_id() == id(left_anchor)) { // If the alignment does start on the anchor node (even at 0 or at the past-end position) - // Add on the offset for the cut-off piece of the left anchor node left_pos->set_offset(left_pos->offset() + offset(left_anchor)); + } else if (!is_empty(right_anchor) && left_pos->node_id() == id(right_anchor)) { + // If the alignment starts on the right anchor, make sure to add the part that was cut off to the offset. + left_pos->set_offset(left_pos->offset() + graph->get_length(graph->get_handle(id(right_anchor))) - offset(right_anchor)); } } + if (alignment.path().mapping_size() > 0) { // Make sure we don't have an empty mapping on the end auto* last_mapping = alignment.mutable_path()->mutable_mapping(alignment.path().mapping_size() - 1); @@ -3992,19 +4048,32 @@ std::pair MinimizerMapper::align_sequence_between_consistently(c // Now we know a swap is required. + std::cerr << "Normally would anchor at " << left_anchor << ", " << right_anchor << std::endl; // The anchors face left to right so we need to flip their orientations in addition to swapping them. // align_sequence_between uses between-base positions for anchoring pos_t flipped_left_anchor = is_empty(right_anchor) ? empty_pos_t() : reverse(right_anchor, get_node_length(id(right_anchor))); pos_t flipped_right_anchor = is_empty(left_anchor) ? empty_pos_t() : reverse(left_anchor, get_node_length(id(left_anchor))); + std::cerr << "Actually anchoring at " << flipped_left_anchor << ", " << flipped_right_anchor << std::endl; + // Do the alignment auto result = align_sequence_between(flipped_left_anchor, flipped_right_anchor, max_path_length, max_gap_length, graph, aligner, flipped_query, alignment_name, max_dp_cells, choose_band_padding); + for (size_t i = 0; i < flipped_query.path().mapping_size(); i++) { + std::cerr << "Mapping to flip: " << pb2json(flipped_query.path().mapping(i)) << std::endl; + } + // Flip and send the answer reverse_complement_alignment_in_place(&flipped_query, get_node_length); alignment = std::move(flipped_query); + for (size_t i = 0; i < alignment.path().mapping_size(); i++) { + if (i > 0 && alignment.path().mapping(i).position().offset() != 0) { + throw std::runtime_error("Nonzero offset in " + pb2json(alignment.path().mapping(i)) + " at " + std::to_string(i)); + } + } + // Return the metadata we track return result; } From 6198e16e9c94d6735fa61777f6b571727d4cb226 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 28 Feb 2025 16:51:20 -0800 Subject: [PATCH 3/3] Add unit tests and not some remaining shortcomings --- src/minimizer_mapper_from_chains.cpp | 33 +++++------- src/unittest/minimizer_mapper.cpp | 81 ++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 20 deletions(-) diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index fee7f57da9a..bb36b885db0 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -3778,7 +3778,7 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l // Then trim off the tips that are either in the wrong orientation relative // to whether we want them to be a source or a sink, or extraneous. - // Also find the unique handles for the potentially cut-down anchor nodes. + // Also find the unique tip handles for the potentially cut-down anchor nodes. // TODO: We re-find them on every trim. std::vector tip_handles = handlegraph::algorithms::find_tips(&dagified_graph); @@ -3807,10 +3807,11 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l #endif if (!is_empty(left_anchor)) { // There should only be one source tip for the left anchor node: the one we actually grabbed the graph from. + // Other nodes for this node shouldn't be this kind of tip. // Catch any weird bugs in the subgraph extraction if (found_left_anchor_tip) { std::stringstream ss; - ss << "Duplicate left anchor tip when extracting " << left_anchor << " to " << right_anchor << ". Your graph has defeated subgraph extraction and the vg developers need to look at it. Please report a bug!"; + ss << "Duplicate left anchor tip when extracting " << left_anchor << " to " << right_anchor; throw std::runtime_error(ss.str()); } found_left_anchor_tip = true; @@ -3825,10 +3826,11 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l #endif if (!is_empty(right_anchor)) { // There should only be one sink tip for the right anchor node: the one we actually grabbed the graph from. + // Other nodes for this node shouldn't be this kind of tip. // Catch any weird bugs in the subgraph extraction if (found_right_anchor_tip) { std::stringstream ss; - ss << "Duplicate right anchor tip when extracting " << left_anchor << " to " << right_anchor << ". Your graph has defeated subgraph extraction and the vg developers need to look at it. Please report a bug!"; + ss << "Duplicate right anchor tip when extracting " << left_anchor << " to " << right_anchor; throw std::runtime_error(ss.str()); } found_right_anchor_tip = true; @@ -3846,10 +3848,6 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l to_remove_handles.push_back(h); } } - // TODO: There should only be one acceptable tip for each - // anchoring node! We probably should check this and have a - // bunch of unit tests with ugly cyclic graphs where you can - // get at your anchoring nodes from behind. } for (auto& h : to_remove_handles) { dagified_graph.destroy_handle(h); @@ -3959,6 +3957,7 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l // Otherwise the changes we make to the offset of the leftmost // mapping when it's on an anchor node won't make sense, because we // won't know we were really on a cut copy. + // TODO: This should always be true. if (!is_empty(left_anchor) && base_coords.first == id(left_anchor)) { crash_unless(dagified_graph.get_length(dagified_handle) == dagified_graph.get_length(left_anchor_tip)); } @@ -3982,8 +3981,8 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l // direction and offsetting from the cut point. // // Adjust the offset on the leftmost mapping so it counts from the - // entire node and not the cut-down part used for anchoring, if - // it's on one of the cut-down nodes. + // edge of the entire node and not the cut-down part used for + // anchoring, if it's on one of the cut-down nodes. Position* left_pos = alignment.mutable_path()->mutable_mapping(0)->mutable_position(); if (!is_empty(left_anchor) && left_pos->node_id() == id(left_anchor)) { @@ -4048,29 +4047,23 @@ std::pair MinimizerMapper::align_sequence_between_consistently(c // Now we know a swap is required. - std::cerr << "Normally would anchor at " << left_anchor << ", " << right_anchor << std::endl; - // The anchors face left to right so we need to flip their orientations in addition to swapping them. // align_sequence_between uses between-base positions for anchoring pos_t flipped_left_anchor = is_empty(right_anchor) ? empty_pos_t() : reverse(right_anchor, get_node_length(id(right_anchor))); pos_t flipped_right_anchor = is_empty(left_anchor) ? empty_pos_t() : reverse(left_anchor, get_node_length(id(left_anchor))); - std::cerr << "Actually anchoring at " << flipped_left_anchor << ", " << flipped_right_anchor << std::endl; - // Do the alignment auto result = align_sequence_between(flipped_left_anchor, flipped_right_anchor, max_path_length, max_gap_length, graph, aligner, flipped_query, alignment_name, max_dp_cells, choose_band_padding); - for (size_t i = 0; i < flipped_query.path().mapping_size(); i++) { - std::cerr << "Mapping to flip: " << pb2json(flipped_query.path().mapping(i)) << std::endl; - } - // Flip and send the answer reverse_complement_alignment_in_place(&flipped_query, get_node_length); alignment = std::move(flipped_query); - + + // We shouldn't use any nonzero offsets after the first node. We're not + // meant to be making a split alignment. for (size_t i = 0; i < alignment.path().mapping_size(); i++) { - if (i > 0 && alignment.path().mapping(i).position().offset() != 0) { - throw std::runtime_error("Nonzero offset in " + pb2json(alignment.path().mapping(i)) + " at " + std::to_string(i)); + if (i > 0) { + crash_unless(alignment.path().mapping(i).position().offset() == 0); } } diff --git a/src/unittest/minimizer_mapper.cpp b/src/unittest/minimizer_mapper.cpp index d6309380f45..e517653b1f9 100644 --- a/src/unittest/minimizer_mapper.cpp +++ b/src/unittest/minimizer_mapper.cpp @@ -347,6 +347,87 @@ TEST_CASE("MinimizerMapper can map against subgraphs between abutting points", " } } + +TEST_CASE("MinimizerMapper can map against subgraphs within a node", "[giraffe][mapping]") { + + Aligner aligner; + HashGraph graph; + + // We have a big node + auto h1 = graph.create_handle("AAAAGATTG"); + + Alignment aln; + aln.set_sequence("AGAT"); + + // Left anchor should be on start + pos_t left_anchor {graph.get_id(h1), false, 3}; + // Right anchor should be past end + pos_t right_anchor {graph.get_id(h1), false, 7}; + + TestMinimizerMapper::align_sequence_between(left_anchor, right_anchor, 100, 20, &graph, &aligner, aln); + + // Make sure we get the right alignment + REQUIRE(aln.path().mapping_size() == 1); + REQUIRE(aln.path().mapping(0).position().node_id() == graph.get_id(h1)); + REQUIRE(aln.path().mapping(0).position().is_reverse() == graph.get_is_reverse(h1)); + REQUIRE(aln.path().mapping(0).position().offset() == offset(left_anchor)); + REQUIRE(aln.path().mapping(0).edit_size() == 1); + REQUIRE(aln.path().mapping(0).edit(0).from_length() == 4); + REQUIRE(aln.path().mapping(0).edit(0).to_length() == 4); + REQUIRE(aln.path().mapping(0).edit(0).sequence() == ""); + +} + +TEST_CASE("MinimizerMapper can find mappings where the alignment turns around from the right", "[giraffe][mapping]") { + + Aligner aligner; + HashGraph graph; + + auto h1 = graph.create_handle("AAAAGAT"); + auto h_in = graph.create_handle("G"); + graph.create_edge(h_in, h1); + graph.create_edge(graph.flip(h_in), h1); + + Alignment aln; + aln.set_sequence("TTTTGAAAAGA"); + + // Left anchor is empty + pos_t left_anchor; + // Right anchor is at past-end + pos_t right_anchor {graph.get_id(h1), false, 6}; + + TestMinimizerMapper::align_sequence_between(left_anchor, right_anchor, 100, 20, &graph, &aligner, aln); + + // Make sure we get the right alignment + REQUIRE(aln.path().mapping_size() == 3); + REQUIRE(aln.path().mapping(0).position().node_id() == graph.get_id(h1)); + REQUIRE(aln.path().mapping(0).position().is_reverse() == !graph.get_is_reverse(h1)); + REQUIRE(aln.path().mapping(0).position().offset() == 3); + REQUIRE(aln.path().mapping(0).edit_size() == 1); + REQUIRE(aln.path().mapping(0).edit(0).from_length() == 4); + REQUIRE(aln.path().mapping(0).edit(0).to_length() == 4); + REQUIRE(aln.path().mapping(0).edit(0).sequence() == ""); + REQUIRE(aln.path().mapping(1).position().node_id() == graph.get_id(h_in)); + REQUIRE(aln.path().mapping(1).position().is_reverse() == graph.get_is_reverse(h_in)); + REQUIRE(aln.path().mapping(1).position().offset() == 0); + REQUIRE(aln.path().mapping(1).edit_size() == 1); + REQUIRE(aln.path().mapping(1).edit(0).from_length() == 1); + REQUIRE(aln.path().mapping(1).edit(0).to_length() == 1); + REQUIRE(aln.path().mapping(1).edit(0).sequence() == ""); + REQUIRE(aln.path().mapping(2).position().node_id() == graph.get_id(h1)); + REQUIRE(aln.path().mapping(2).position().is_reverse() == graph.get_is_reverse(h1)); + REQUIRE(aln.path().mapping(2).position().offset() == 0); + REQUIRE(aln.path().mapping(2).edit_size() == 1); + REQUIRE(aln.path().mapping(2).edit(0).from_length() == 6); + REQUIRE(aln.path().mapping(2).edit(0).to_length() == 6); + REQUIRE(aln.path().mapping(2).edit(0).sequence() == ""); + + // TODO: We can't find the same alignment from the left because we'd need + // to see more of the anchor node than we anchored on. + + // TODO: We can't handle a hairpin directly inside the anchor node. +} + TEST_CASE("MinimizerMapper can map an empty string between odd points", "[giraffe][mapping]") { Aligner aligner;