From a03ff22c68f4d09474e48bcb9420783567da5e7f Mon Sep 17 00:00:00 2001 From: parsaeskandar Date: Fri, 26 Jan 2024 14:28:28 -0800 Subject: [PATCH 01/67] Added sub-path support to vg annotate. --- src/algorithms/alignment_path_offsets.cpp | 397 +++++++++++----------- 1 file changed, 201 insertions(+), 196 deletions(-) diff --git a/src/algorithms/alignment_path_offsets.cpp b/src/algorithms/alignment_path_offsets.cpp index d50f9100818..f781b042377 100644 --- a/src/algorithms/alignment_path_offsets.cpp +++ b/src/algorithms/alignment_path_offsets.cpp @@ -3,94 +3,94 @@ //#define debug_mpaln_offsets namespace vg { -namespace algorithms { - -unordered_map > > -alignment_path_offsets(const PathPositionHandleGraph& graph, - const Alignment& aln, - bool just_min, - bool nearby, - size_t search_limit, - const std::function* path_filter) { - if (nearby && search_limit == 0) { - // Fill in the search limit - search_limit = aln.sequence().size(); - } - unordered_map > > offsets; - if (graph.get_path_count() == 0) return offsets; - for (auto& mapping : aln.path().mapping()) { - // How many bases does this Mapping cover over? - size_t mapping_width = mapping_from_length(mapping); - if (mapping_width == 0 && !nearby) { - // Just skip over this mapping; it touches no bases. - continue; - } - // We may have to consider both the starts and ends of mappings - vector end = {false}; - if (just_min && !nearby) { - // We want the min actually touched position along each path. It - // could come from the Mapping start or the Mapping end. - end.push_back(true); - } - // Find the position of this end of this mapping - pos_t mapping_pos = make_pos_t(mapping.position()); - // Find the positions for this end of this Mapping - auto pos_offs = algorithms::nearest_offsets_in_paths(&graph, mapping_pos, nearby ? 
search_limit : -1, path_filter); - for (auto look_at_end : end) { - // For the start and the end of the Mapping, as needed - for (auto& p : pos_offs) { - // For each path, splice the list of path positions for this Mapping - // onto the end of the list of positions we found in that path - auto& v = offsets[p.first]; - for (pair& y : p.second) { - v.emplace_back(y.second ? y.first - mapping_width : y.first, - y.second); + namespace algorithms { + + unordered_map > > + alignment_path_offsets(const PathPositionHandleGraph& graph, + const Alignment& aln, + bool just_min, + bool nearby, + size_t search_limit, + const std::function* path_filter) { + if (nearby && search_limit == 0) { + // Fill in the search limit + search_limit = aln.sequence().size(); + } + unordered_map > > offsets; + if (graph.get_path_count() == 0) return offsets; + for (auto& mapping : aln.path().mapping()) { + // How many bases does this Mapping cover over? + size_t mapping_width = mapping_from_length(mapping); + if (mapping_width == 0 && !nearby) { + // Just skip over this mapping; it touches no bases. + continue; + } + // We may have to consider both the starts and ends of mappings + vector end = {false}; + if (just_min && !nearby) { + // We want the min actually touched position along each path. It + // could come from the Mapping start or the Mapping end. + end.push_back(true); + } + // Find the position of this end of this mapping + pos_t mapping_pos = make_pos_t(mapping.position()); + // Find the positions for this end of this Mapping + auto pos_offs = algorithms::nearest_offsets_in_paths(&graph, mapping_pos, nearby ? search_limit : -1, path_filter); + for (auto look_at_end : end) { + // For the start and the end of the Mapping, as needed + for (auto& p : pos_offs) { + // For each path, splice the list of path positions for this Mapping + // onto the end of the list of positions we found in that path + auto& v = offsets[p.first]; + for (pair& y : p.second) { + v.emplace_back(y.second ? 
y.first - mapping_width : y.first, + y.second); + } + } } } - } - } - if (!nearby && offsets.empty()) { - // find the nearest if we couldn't find any before - return alignment_path_offsets(graph, aln, just_min, true, search_limit, path_filter); - } - if (just_min) { - // We need the minimum position for each path - for (auto& p : offsets) { - auto& v = p.second; - auto m = *min_element(v.begin(), v.end(), - [](const pair& a, - const pair& b) - { return a.first < b.first; }); - v.clear(); - v.push_back(m); - } - } - return offsets; -} - -unordered_map > > -multipath_alignment_path_offsets(const PathPositionHandleGraph& graph, - const multipath_alignment_t& mp_aln, - const std::function* path_filter) { - - using path_positions_t = unordered_map>>; - - // collect the search results for each mapping on each subpath - vector> search_results(mp_aln.subpath_size()); - for (size_t i = 0; i < mp_aln.subpath_size(); ++i) { - const subpath_t& subpath = mp_aln.subpath(i); - auto& subpath_search_results = search_results[i]; - subpath_search_results.resize(subpath.path().mapping_size()); - for (size_t j = 0; j < subpath.path().mapping_size(); ++j) { - // get the positions on paths that this mapping touches - pos_t mapping_pos = make_pos_t(subpath.path().mapping(j).position()); - subpath_search_results[j] = nearest_offsets_in_paths(&graph, mapping_pos, 0, path_filter); - // make sure that offsets are stored in increasing order - for (pair>>& search_record : subpath_search_results[j]) { - sort(search_record.second.begin(), search_record.second.end()); + if (!nearby && offsets.empty()) { + // find the nearest if we couldn't find any before + return alignment_path_offsets(graph, aln, just_min, true, search_limit, path_filter); + } + if (just_min) { + // We need the minimum position for each path + for (auto& p : offsets) { + auto& v = p.second; + auto m = *min_element(v.begin(), v.end(), + [](const pair& a, + const pair& b) + { return a.first < b.first; }); + v.clear(); + 
v.push_back(m); + } } + return offsets; + } + + unordered_map > > + multipath_alignment_path_offsets(const PathPositionHandleGraph& graph, + const multipath_alignment_t& mp_aln, + const std::function* path_filter) { + + using path_positions_t = unordered_map>>; + + // collect the search results for each mapping on each subpath + vector> search_results(mp_aln.subpath_size()); + for (size_t i = 0; i < mp_aln.subpath_size(); ++i) { + const subpath_t& subpath = mp_aln.subpath(i); + auto& subpath_search_results = search_results[i]; + subpath_search_results.resize(subpath.path().mapping_size()); + for (size_t j = 0; j < subpath.path().mapping_size(); ++j) { + // get the positions on paths that this mapping touches + pos_t mapping_pos = make_pos_t(subpath.path().mapping(j).position()); + subpath_search_results[j] = nearest_offsets_in_paths(&graph, mapping_pos, 0, path_filter); + // make sure that offsets are stored in increasing order + for (pair>>& search_record : subpath_search_results[j]) { + sort(search_record.second.begin(), search_record.second.end()); + } #ifdef debug_mpaln_offsets - cerr << "subpath " << i << ", mapping " << j << " path locations" << endl; + cerr << "subpath " << i << ", mapping " << j << " path locations" << endl; for (const auto& pps : subpath_search_results[j]) { cerr << graph.get_path_name(pps.first) << endl; for (const auto& pp : pps.second) { @@ -98,132 +98,137 @@ multipath_alignment_path_offsets(const PathPositionHandleGraph& graph, } } #endif - } - } - - path_positions_t return_val; - - // to keep track of whether we've already chosen a position on each path - // earlier in the multipath alignment in either the forward or reverse pass - vector> covered_fwd(mp_aln.subpath_size()); - vector> covered_rev(mp_aln.subpath_size()); - - // forward pass looking for positions on the forward strand of paths - for (size_t i = 0; i < mp_aln.subpath_size(); ++i) { - const auto& subpath_search_results = search_results[i]; - for (size_t j = 0; j < 
subpath_search_results.size(); ++j) { - for (const auto& path_pos : subpath_search_results[j]) { - if (!covered_fwd[i].count(path_pos.first)) { - // we haven't already covered this path at an earlier position on the alignment - for (const auto& path_offset : path_pos.second) { - if (!path_offset.second) { - // there's a position on the forward strand of this path - return_val[path_pos.first].emplace_back(path_offset); - - // we're now covering this path for future search results - covered_fwd[i].insert(path_pos.first); - + } + } + + path_positions_t return_val; + + // to keep track of whether we've already chosen a position on each path + // earlier in the multipath alignment in either the forward or reverse pass + vector> covered_fwd(mp_aln.subpath_size()); + vector> covered_rev(mp_aln.subpath_size()); + + // forward pass looking for positions on the forward strand of paths + for (size_t i = 0; i < mp_aln.subpath_size(); ++i) { + const auto& subpath_search_results = search_results[i]; + for (size_t j = 0; j < subpath_search_results.size(); ++j) { + for (const auto& path_pos : subpath_search_results[j]) { + if (!covered_fwd[i].count(path_pos.first)) { + // we haven't already covered this path at an earlier position on the alignment + for (const auto& path_offset : path_pos.second) { + if (!path_offset.second) { + // there's a position on the forward strand of this path + return_val[path_pos.first].emplace_back(path_offset); + + // we're now covering this path for future search results + covered_fwd[i].insert(path_pos.first); + #ifdef debug_mpaln_offsets - cerr << "found fwd pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first << " " << path_offset.second << endl; + cerr << "found fwd pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first << " " << path_offset.second << endl; #endif - - break; + + break; + } + } } 
} } + + // the following subpaths will be covered for any path that this + // one is covered for + for (auto n : mp_aln.subpath(i).next()) { + auto& next_coverings = covered_fwd[n]; + for (auto path_handle : covered_fwd[i]) { + next_coverings.insert(path_handle); + } + } + for (const auto& c : mp_aln.subpath(i).connection()) { + auto& next_coverings = covered_fwd[c.next()]; + for (auto path_handle : covered_fwd[i]) { + next_coverings.insert(path_handle); + } + } } - } - - // the following subpaths will be covered for any path that this - // one is covered for - for (auto n : mp_aln.subpath(i).next()) { - auto& next_coverings = covered_fwd[n]; - for (auto path_handle : covered_fwd[i]) { - next_coverings.insert(path_handle); - } - } - for (const auto& c : mp_aln.subpath(i).connection()) { - auto& next_coverings = covered_fwd[c.next()]; - for (auto path_handle : covered_fwd[i]) { - next_coverings.insert(path_handle); - } - } - } - - // now do a backward pass for the reverse strand of paths - for (int64_t i = mp_aln.subpath_size() - 1; i >= 0; --i) { - // find which paths are already covered in the reverse - for (auto n : mp_aln.subpath(i).next()) { - for (auto path_handle : covered_rev[n]) { - covered_rev[i].insert(path_handle); - } - } - for (const auto& c : mp_aln.subpath(i).connection()) { - for (auto path_handle : covered_rev[c.next()]) { - covered_rev[i].insert(path_handle); - } - } - - const auto& subpath_search_results = search_results[i]; - for (int64_t j = subpath_search_results.size() - 1; j >= 0; --j) { - for (const auto& path_pos : subpath_search_results[j]) { - if (!covered_rev[i].count(path_pos.first)) { - // we haven't already covered this path at an earlier position on the alignment - for (const auto& path_offset : path_pos.second) { - if (path_offset.second) { - // there's a position on the reverse strand of this path - auto mapping_len = mapping_from_length(mp_aln.subpath(i).path().mapping(j)); - 
return_val[path_pos.first].emplace_back(path_offset.first - mapping_len, - path_offset.second); - + + // now do a backward pass for the reverse strand of paths + for (int64_t i = mp_aln.subpath_size() - 1; i >= 0; --i) { + // find which paths are already covered in the reverse + for (auto n : mp_aln.subpath(i).next()) { + for (auto path_handle : covered_rev[n]) { + covered_rev[i].insert(path_handle); + } + } + for (const auto& c : mp_aln.subpath(i).connection()) { + for (auto path_handle : covered_rev[c.next()]) { + covered_rev[i].insert(path_handle); + } + } + + const auto& subpath_search_results = search_results[i]; + for (int64_t j = subpath_search_results.size() - 1; j >= 0; --j) { + for (const auto& path_pos : subpath_search_results[j]) { + if (!covered_rev[i].count(path_pos.first)) { + // we haven't already covered this path at an earlier position on the alignment + for (const auto& path_offset : path_pos.second) { + if (path_offset.second) { + // there's a position on the reverse strand of this path + auto mapping_len = mapping_from_length(mp_aln.subpath(i).path().mapping(j)); + return_val[path_pos.first].emplace_back(path_offset.first - mapping_len, + path_offset.second); + #ifdef debug_mpaln_offsets - cerr << "found rev pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first - mapping_len << " " << path_offset.second << endl; + cerr << "found rev pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first - mapping_len << " " << path_offset.second << endl; #endif - // we're now covering this path for future search results - covered_rev[i].insert(path_pos.first); - - break; + // we're now covering this path for future search results + covered_rev[i].insert(path_pos.first); + + break; + } + } } } } } + + return return_val; } - } - - return return_val; -} - -void annotate_with_initial_path_positions(const 
PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function* path_filter) { - annotate_with_path_positions(graph, aln, true, search_limit, path_filter); -} - -void annotate_with_node_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function* path_filter) { - annotate_with_path_positions(graph, aln, false, search_limit, path_filter); -} - -void annotate_with_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, bool just_min, size_t search_limit, const std::function* path_filter) { - if (!aln.refpos_size()) { - // Get requested path positions - unordered_map > > positions = alignment_path_offsets(graph, aln, just_min, false, search_limit, path_filter); - // emit them in order of the path handle - vector ordered; - for (auto& path : positions) { ordered.push_back(path.first); } - std::sort(ordered.begin(), ordered.end(), [](const path_handle_t& a, const path_handle_t& b) { return as_integer(a) < as_integer(b); }); - for (auto& path : ordered) { - for (auto& p : positions[path]) { - // Add each determined refpos - Position* refpos = aln.add_refpos(); - refpos->set_name(graph.get_path_name(path)); - refpos->set_offset(p.first); - refpos->set_is_reverse(p.second); + + void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function* path_filter) { + annotate_with_path_positions(graph, aln, true, search_limit, path_filter); + } + + void annotate_with_node_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function* path_filter) { + annotate_with_path_positions(graph, aln, false, search_limit, path_filter); + } + + void annotate_with_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, bool just_min, size_t search_limit, const std::function* path_filter) { + if (!aln.refpos_size()) { + // Get requested path positions + unordered_map > > 
positions = alignment_path_offsets(graph, aln, just_min, false, search_limit, path_filter); + // emit them in order of the path handle + vector ordered; + for (auto& path : positions) { ordered.push_back(path.first); } + std::sort(ordered.begin(), ordered.end(), [](const path_handle_t& a, const path_handle_t& b) { return as_integer(a) < as_integer(b); }); + for (auto& path : ordered) { + for (auto& p : positions[path]) { + // Add each determined refpos + + Position* refpos = aln.add_refpos(); + subrange_t subrange; + string path_name = graph.get_path_name(path); + path_name = Paths::strip_subrange(path_name, &subrange); + int64_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; + refpos->set_name(path_name); + refpos->set_offset(offset + p.first); + refpos->set_is_reverse(p.second); + } + } } } - } -} -void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, vector& alns, size_t search_limit, const std::function* path_filter) { - for (auto& aln : alns) annotate_with_initial_path_positions(graph, aln, search_limit, path_filter); -} + void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, vector& alns, size_t search_limit, const std::function* path_filter) { + for (auto& aln : alns) annotate_with_initial_path_positions(graph, aln, search_limit, path_filter); + } -} -} + } +} \ No newline at end of file From a96368f13bbb3c816b4d2e56f44623039a6f224d Mon Sep 17 00:00:00 2001 From: rustrover Date: Sat, 9 Mar 2024 16:57:14 +0800 Subject: [PATCH 02/67] remove repetitive words Signed-off-by: rustrover --- ontology/vg.html | 2 +- ontology/vg.ttl | 2 +- src/algorithms/k_widest_paths.cpp | 2 +- src/cactus.cpp | 2 +- src/cluster.cpp | 2 +- src/graph_caller.cpp | 2 +- src/index_registry.cpp | 2 +- src/splicing.cpp | 2 +- src/transcriptome.hpp | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ontology/vg.html b/ontology/vg.html index 982fc7a668a..586ed09aaa5 100644 --- a/ontology/vg.html +++ 
b/ontology/vg.html @@ -688,7 +688,7 @@

rdfs:comment - "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)." + "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)." xsd:string diff --git a/ontology/vg.ttl b/ontology/vg.ttl index 056e421fa01..39ef0fc795b 100644 --- a/ontology/vg.ttl +++ b/ontology/vg.ttl @@ -31,7 +31,7 @@ . :Step rdf:type owl:Class ; - rdfs:comment "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)."^^xsd:string ; + rdfs:comment "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)."^^xsd:string ; rdfs:label "Step"^^xsd:string ; rdfs:subClassOf owl:Thing ; . 
diff --git a/src/algorithms/k_widest_paths.cpp b/src/algorithms/k_widest_paths.cpp index e9cb9f303d4..2047c41cf57 100644 --- a/src/algorithms/k_widest_paths.cpp +++ b/src/algorithms/k_widest_paths.cpp @@ -233,7 +233,7 @@ vector>> yens_k_widest_paths(const HandleGraph* g, forgotten_nodes.insert(g->flip(prev_path[j])); } - // find our path from the the spur_node to the sink + // find our path from the spur_node to the sink pair> spur_path_v = widest_dijkstra(g, spur_node, sink, node_weight_callback, edge_weight_callback, [&](handle_t h) {return forgotten_nodes.count(h);}, [&](edge_t e) {return forgotten_edges.count(e);}, diff --git a/src/cactus.cpp b/src/cactus.cpp index 7d5a6bc8ac3..a66c8e90e22 100644 --- a/src/cactus.cpp +++ b/src/cactus.cpp @@ -148,7 +148,7 @@ void getReachableBridges(stCactusEdgeEnd *edgeEnd1, stList *bridgeEnds) { } /** - * Finds an arbitrary pair of telomeres in a Cactus graph, which are are either + * Finds an arbitrary pair of telomeres in a Cactus graph, which are either * a pair of bridge edge ends or a pair of chain edge ends, oriented such that * they form a pair of boundaries. 
* diff --git a/src/cluster.cpp b/src/cluster.cpp index 8c3a9b60419..ec60a525357 100644 --- a/src/cluster.cpp +++ b/src/cluster.cpp @@ -735,7 +735,7 @@ void MEMClusterer::HitGraph::prune_low_scoring_edges(vector>& com } cerr << strm.str(); #endif - // the the original component + // the original component components[component_idx] = move(new_components[0]); // add the remaining to the end for (size_t i = 1; i < new_components.size(); i++) { diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index c9a518718e0..9d1c60f42cf 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -2267,7 +2267,7 @@ string NestedFlowCaller::flatten_alt_allele(const string& nested_allele, int all // todo: passing in a single ploidy simplisitic, would need to derive from the calls when // reucrising // in practice, the results will nearly the same but still needs fixing - // we try to get the the allele from the genotype if possible, but fallback on the fallback_allele + // we try to get the allele from the genotype if possible, but fallback on the fallback_allele int trav_allele = fallback_allele >= 0 ? 
fallback_allele : record.genotype_by_ploidy[ploidy-1].first[allele]; const SnarlTraversal& traversal = record.travs[trav_allele]; string nested_snarl_allele = trav_string(graph, traversal); diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 83a9c1b4a58..8f48b15ddc6 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -780,7 +780,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { } } - // we'll partition sequences that have the same samples (chunking the the VCFs + // we'll partition sequences that have the same samples (chunking the VCFs // ultimately requires that we do this) map, vector> sample_set_contigs; for (int i = 0; i < vcf_samples.size(); ++i) { diff --git a/src/splicing.cpp b/src/splicing.cpp index 2cb7f94ffd6..c4e1727ccfe 100644 --- a/src/splicing.cpp +++ b/src/splicing.cpp @@ -1633,7 +1633,7 @@ multipath_alignment_t&& fuse_spliced_alignments(const Alignment& alignment, } } else if (!have_right_linker) { - // the splice segment on the right side is empty, make connection the the left side + // the splice segment on the right side is empty, make connection the left side auto connection = left_mp_aln.mutable_subpath(right_subpaths_begin - 1)->add_connection(); connection->set_next(left_mp_aln.subpath_size()); connection->set_score(splice_score); diff --git a/src/transcriptome.hpp b/src/transcriptome.hpp index 90d3ba13bad..42214df12cb 100644 --- a/src/transcriptome.hpp +++ b/src/transcriptome.hpp @@ -281,7 +281,7 @@ class Transcriptome { /// Parse gtf/gff3 attribute value. string parse_attribute_value(const string & attribute, const string & name) const; - /// Returns the the mean node length of the graph + /// Returns the mean node length of the graph float mean_node_length() const; /// Adds the exon coordinates to a transcript. 
From e6c62d55dca3616283e03ec0fb130a95ebda47c7 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 9 Apr 2024 10:12:49 -0400 Subject: [PATCH 03/67] Pad libdeflate for renaming on Mac This should fix #4257 --- Makefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index f77ae12c82e..e845540f65e 100644 --- a/Makefile +++ b/Makefile @@ -215,6 +215,9 @@ ifeq ($(shell uname -s),Darwin) # We don't actually do any static linking on Mac, so we leave this empty. START_STATIC = END_STATIC = + + # We need to use special flags to let us rename libraries + LD_RENAMEABLE_FLAGS = -headerpad -headerpad_max_install_names else # We are not running on OS X $(info OS is Linux) @@ -252,7 +255,8 @@ else # Note that END_STATIC is only safe to use in a mostly-dynamic build, and has to appear or we will try to statically link secret trailing libraries. END_STATIC = -Wl,-Bdynamic - + # We don't need any flags because we don't need to rename libraries with install_name_tool + LD_RENAMEABLE_FLAGS = endif # Set the C++ standard we are using @@ -656,7 +660,7 @@ ifeq ($(shell uname -s),Darwin) endif $(LIB_DIR)/libdeflate.a: $(LIBDEFLATE_DIR)/*.h $(LIBDEFLATE_DIR)/lib/*.h $(LIBDEFLATE_DIR)/lib/*/*.h $(LIBDEFLATE_DIR)/lib/*.c $(LIBDEFLATE_DIR)/lib/*/*.c - +. ./source_me.sh && cd $(LIBDEFLATE_DIR) && V=1 $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR) + +. ./source_me.sh && cd $(LIBDEFLATE_DIR) && V=1 LDFLAGS="$(LDFLAGS) $(LD_RENAMEABLE_FLAGS)" $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR) # We build htslib after libdeflate so it can use libdeflate. # We need to do some wizardry to get it to pick up the right build and target system types on modern autotools. 
From da344a8c82b12069b64a69f48c66b8d96776e06a Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 19 Apr 2024 10:27:32 -0400 Subject: [PATCH 04/67] remove debug messages --- src/subcommand/chunk_main.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index c79a5afb90c..22f6e02651e 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -715,11 +715,9 @@ int main_chunk(int argc, char** argv) { unique_ptr subgraph; map trace_thread_frequencies; if (!component_ids.empty()) { - cout << "here?" << endl; subgraph = vg::io::new_output_graph(output_format); chunker.extract_component(component_ids[i], *subgraph, false); output_regions[i] = region; - cout << "nope" << endl; } else if (id_range == false) { subgraph = vg::io::new_output_graph(output_format); From d9487ef09e1f5cee0a40768f1c4ce3dba7d30c08 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 31 Jan 2024 14:40:28 -0800 Subject: [PATCH 05/67] Make WFAExtender apply full length bonuses even though it doesn't use them internally --- src/gbwt_extender.cpp | 13 ++- src/gbwt_extender.hpp | 8 +- src/unittest/gbwt_extender.cpp | 153 ++++++++++++++++++++------------- 3 files changed, 112 insertions(+), 62 deletions(-) diff --git a/src/gbwt_extender.cpp b/src/gbwt_extender.cpp index bec78296a8d..bb5b50846ca 100644 --- a/src/gbwt_extender.cpp +++ b/src/gbwt_extender.cpp @@ -2284,7 +2284,14 @@ WFAAlignment WFAExtender::connect(std::string sequence, pos_t from, pos_t to) co } WFAAlignment WFAExtender::suffix(const std::string& sequence, pos_t from) const { - return this->connect(sequence, from, pos_t(0, false, 0)); + WFAAlignment result = this->connect(sequence, from, pos_t(0, false, 0)); + + if (!result.edits.empty() && result.length == sequence.length() && (result.edits.back().first == WFAAlignment::match || result.edits.back().first == WFAAlignment::mismatch)) { + // The alignment used all of the sequence and has a match/mismatch 
at the appropriate end + result.score += this->aligner->full_length_bonus; + } + + return result; } WFAAlignment WFAExtender::prefix(const std::string& sequence, pos_t to) const { @@ -2297,6 +2304,10 @@ WFAAlignment WFAExtender::prefix(const std::string& sequence, pos_t to) const { WFAAlignment result = this->connect(reverse_complement(sequence), to, pos_t(0, false, 0)); result.flip(*(this->graph), sequence); + if (!result.edits.empty() && result.length == sequence.length() && (result.edits.front().first == WFAAlignment::match || result.edits.front().first == WFAAlignment::mismatch)) { + result.score += this->aligner->full_length_bonus; + } + return result; } diff --git a/src/gbwt_extender.hpp b/src/gbwt_extender.hpp index e7286a284dc..33e89aed19a 100644 --- a/src/gbwt_extender.hpp +++ b/src/gbwt_extender.hpp @@ -412,9 +412,11 @@ class WFAExtender { * entire sequence with an acceptable score, returns the highest-scoring * partial alignment, which may be empty. * + * Applies the full-length bonus if the result ends with a match or mismatch. + * TODO: Use the full-length bonus to determine the optimal alignment. + * * NOTE: This creates a suffix of the full alignment by aligning a * prefix of the sequence. - * TODO: Should we use full-length bonuses? */ WFAAlignment suffix(const std::string& sequence, pos_t from) const; @@ -424,9 +426,11 @@ class WFAExtender { * sequence with an acceptable score, returns the highest-scoring partial * alignment, which may be empty. * + * Applies the full-length bonus if the result begins with a match or mismatch. + * TODO: Use the full-length bonus to determine the optimal alignment. + * * NOTE: This creates a prefix of the full alignment by aligning a suffix * of the sequence. - * TODO: Should we use full-length bonuses? 
*/ WFAAlignment prefix(const std::string& sequence, pos_t to) const; diff --git a/src/unittest/gbwt_extender.cpp b/src/unittest/gbwt_extender.cpp index d51b356a664..40180b29612 100644 --- a/src/unittest/gbwt_extender.cpp +++ b/src/unittest/gbwt_extender.cpp @@ -115,6 +115,13 @@ std::vector> normalize_seeds(std::vector Date: Mon, 6 May 2024 17:02:14 -0400 Subject: [PATCH 06/67] Protect linker arguments from Clang --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e845540f65e..6c8fd7262a1 100644 --- a/Makefile +++ b/Makefile @@ -217,7 +217,7 @@ ifeq ($(shell uname -s),Darwin) END_STATIC = # We need to use special flags to let us rename libraries - LD_RENAMEABLE_FLAGS = -headerpad -headerpad_max_install_names + LD_RENAMEABLE_FLAGS = -Wl,-headerpad -Wl,-headerpad_max_install_names else # We are not running on OS X $(info OS is Linux) From 5dc1f7d7307baea519b37b4aa0e0e12dce284e17 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 6 May 2024 17:11:54 -0400 Subject: [PATCH 07/67] Use correct target score for having full length bonus --- test/t/50_vg_giraffe.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/t/50_vg_giraffe.t b/test/t/50_vg_giraffe.t index 639f17c58d8..66e1ac75f9e 100644 --- a/test/t/50_vg_giraffe.t +++ b/test/t/50_vg_giraffe.t @@ -212,7 +212,7 @@ vg index -j 1mb1kgp.dist 1mb1kgp.vg vg autoindex -p 1mb1kgp -w giraffe -P "VG w/ Variant Paths:1mb1kgp.vg" -P "Giraffe Distance Index:1mb1kgp.dist" -r 1mb1kgp/z.fa -v 1mb1kgp/z.vcf.gz vg giraffe -Z 1mb1kgp.giraffe.gbz -f reads/1mb1kgp_longread.fq >longread.gam -U 300 --progress --track-provenance --align-from-chains # This is an 8001 bp read with 1 insert and 1 substitution -is "$(vg view -aj longread.gam | jq -r '.score')" "7989" "A long read can be correctly aligned" +is "$(vg view -aj longread.gam | jq -r '.score')" "7999" "A long read can be correctly aligned" is "$(vg view -aj longread.gam | jq -c '.path.mapping[].edit[] | 
select(.sequence)' | wc -l)" "2" "A long read has the correct edits found" is "$(vg view -aj longread.gam | jq -c '. | select(.annotation["filter_3_cluster-coverage_cluster_passed_size_total"] <= 300)' | wc -l)" "1" "Long read minimizer set is correctly restricted" From 27b48121d5125b97f6009acb7fc434a889160e25 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 15 May 2024 21:29:08 -0400 Subject: [PATCH 08/67] start decoupling TraversalFinder from protobuf --- src/clip.cpp | 2 +- src/deconstructor.cpp | 3 +- src/mcmc_caller.cpp | 2 +- src/subcommand/snarls_main.cpp | 2 +- src/traversal_finder.cpp | 152 ++++++++++++++++++++++++--------- src/traversal_finder.hpp | 22 +++-- src/unittest/snarls.cpp | 8 +- 7 files changed, 136 insertions(+), 55 deletions(-) diff --git a/src/clip.cpp b/src/clip.cpp index 0d2c5efd78d..8b078619fc9 100644 --- a/src/clip.cpp +++ b/src/clip.cpp @@ -135,7 +135,7 @@ void visit_contained_snarls(const PathPositionHandleGraph* graph, const vector ref_paths, const PathPositionHand // create the traversal finder map reads_by_name; - path_trav_finder = unique_ptr(new PathTraversalFinder(*graph, - *snarl_manager)); + path_trav_finder = unique_ptr(new PathTraversalFinder(*graph)); if (!path_restricted && !gbwt) { trav_finder = unique_ptr(new ExhaustiveTraversalFinder(*graph, diff --git a/src/mcmc_caller.cpp b/src/mcmc_caller.cpp index 956123d6f3c..de526ce3955 100644 --- a/src/mcmc_caller.cpp +++ b/src/mcmc_caller.cpp @@ -96,7 +96,7 @@ namespace vg { return false; } - PathTraversalFinder trav_finder(*path_position_handle_graph, snarl_manager, ref_paths); + PathTraversalFinder trav_finder(*path_position_handle_graph, ref_paths); auto trav_results = trav_finder.find_path_traversals(snarl); vector ref_path = trav_results.first; diff --git a/src/subcommand/snarls_main.cpp b/src/subcommand/snarls_main.cpp index 8587e93d2d2..20772eb328b 100644 --- a/src/subcommand/snarls_main.cpp +++ b/src/subcommand/snarls_main.cpp @@ -299,7 +299,7 @@ int main_snarl(int 
argc, char** argv) { if (path_traversals) { // Limit traversals to embedded paths - trav_finder.reset(new PathTraversalFinder(*graph, snarl_manager)); + trav_finder.reset(new PathTraversalFinder(*graph)); } else if (vcf_filename.empty()) { // This finder works with any backing graph trav_finder.reset(new ExhaustiveTraversalFinder(*graph, snarl_manager)); diff --git a/src/traversal_finder.cpp b/src/traversal_finder.cpp index ac5b20e7681..aaff06ab92b 100644 --- a/src/traversal_finder.cpp +++ b/src/traversal_finder.cpp @@ -815,9 +815,9 @@ vector ReadRestrictedTraversalFinder::find_traversals(const Snar return to_return; } -PathTraversalFinder::PathTraversalFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager, +PathTraversalFinder::PathTraversalFinder(const PathHandleGraph& graph, const vector& path_names) : - graph(graph), snarl_manager(snarl_manager) { + graph(graph) { for (const string& path_name : path_names) { assert(graph.has_path(path_name)); paths.insert(graph.get_path_handle(path_name)); @@ -828,15 +828,36 @@ vector PathTraversalFinder::find_traversals(const Snarl& site) { return find_path_traversals(site).first; } -pair, vector > > PathTraversalFinder::find_path_traversals(const Snarl& site) { +vector PathTraversalFinder::find_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { + return find_path_traversals(snarl_start, snarl_end).first; +} - handle_t start_handle = graph.get_handle(site.start().node_id(), site.start().backward()); - handle_t end_handle = graph.get_handle(site.end().node_id(), site.end().backward()); - - vector start_steps = graph.steps_of_handle(start_handle); - vector end_steps = graph.steps_of_handle(end_handle); +pair, vector > > find_path_traversals(const Snarl& site); + +pair, vector> PathTraversalFinder::find_path_traversals(const Snarl& site) { + + pair, vector> path_travs = this->find_path_traversals( + graph.get_handle(site.start().node_id(), site.start().backward()), + 
graph.get_handle(site.end().node_id(), site.end().backward())); + + vector travs(path_travs.first.size()); - pair, unordered_set > snarl_contents = snarl_manager.deep_contents(&site, graph, true); + for (int64_t i = 0; i < path_travs.first.size(); ++i) { + SnarlTraversal& proto_trav = travs[i]; + const Traversal& handle_trav = path_travs.first[i]; + for (int64_t j = 0; j < handle_trav.size(); ++j) { + Visit* visit = proto_trav.add_visit(); + visit->set_node_id(graph.get_id(handle_trav[j])); + visit->set_backward(graph.get_is_reverse(handle_trav[j])); + } + } + return make_pair(travs, path_travs.second); +} + +pair, vector> PathTraversalFinder::find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { + + vector start_steps = graph.steps_of_handle(snarl_start); + vector end_steps = graph.steps_of_handle(snarl_end); // use this to skip paths that don't reach the end node unordered_set end_path_handles; @@ -845,40 +866,48 @@ pair, vector > > PathT } #ifdef debug - cerr << "Finding traversals of " << pb2json(site) << " using PathTraversalFinder" << endl + cerr << "Finding traversals of " << (graph->get_is_reverse(snarl_start) ? "<" : ">") << graph->get_id(snarl_start) + << (graph->get_is_reverse(snarl_end) ? 
"<" : ">") << graph->get_id(snarl_edn) << " using PathTraversalFinder" << endl << " - there are " << start_steps.size() << " start_steps, " << end_steps.size() << " end_steps" << " and " << end_path_handles.size() << " end_path_handles" << endl; #endif - vector out_travs; + vector out_travs; vector > out_steps; + // collect edges that lead out the snarl to make sure we don't search in the wrong direction + unordered_set block_edges; + graph.follow_edges(snarl_start, true, [&](handle_t prev) { + block_edges.insert(graph.edge_handle(prev, snarl_start)); + }); + graph.follow_edges(snarl_end, false, [&](handle_t next) { + block_edges.insert(graph.edge_handle(snarl_end, next)); + }); + for (const step_handle_t& start_step : start_steps) { path_handle_t start_path_handle = graph.get_path_handle_of_step(start_step); // only crawl paths that have a chance of reaching the end if ((paths.empty() || paths.count(start_path_handle)) && end_path_handles.count(start_path_handle)) { - handle_t end_check = end_handle; + handle_t end_check = snarl_end; #ifdef debug cerr << " - considering path " << graph.get_path_name(start_path_handle) << endl; #endif // try to make a traversal by walking forward - SnarlTraversal trav; + Traversal trav; bool can_continue = true; step_handle_t step = start_step; while (can_continue) { handle_t handle = graph.get_handle_of_step(step); - Visit* start_visit = trav.add_visit(); - start_visit->set_node_id(graph.get_id(handle)); - start_visit->set_backward(graph.get_is_reverse(handle)); + trav.push_back(handle); can_continue = false; - if (graph.has_next_step(step) && handle != end_handle) { + if (graph.has_next_step(step) && handle != snarl_end) { step_handle_t next_step = graph.get_next_step(step); handle_t next_handle = graph.get_handle_of_step(next_step); - if (snarl_contents.first.count(graph.get_id(next_handle)) && - snarl_contents.second.count(graph.edge_handle(handle, next_handle))) { + // todo: we only need this check once + if 
(!block_edges.count(graph.edge_handle(handle, next_handle))) { step = next_step; can_continue = true; } @@ -890,25 +919,21 @@ pair, vector > > PathT cerr << " - failed to find forward traversal of path " << graph.get_path_name(start_path_handle) << endl; #endif // try to make a traversal by walking backward - end_check = graph.flip(end_handle); + end_check = graph.flip(snarl_end); - trav.Clear(); + trav.clear(); can_continue = true; step = start_step; while (can_continue) { handle_t handle = graph.flip(graph.get_handle_of_step(step)); - - Visit* start_visit = trav.add_visit(); - start_visit->set_node_id(graph.get_id(handle)); - start_visit->set_backward(graph.get_is_reverse(handle)); + trav.push_back(handle); can_continue = false; - if (graph.has_previous_step(step) && handle != end_handle) { + if (graph.has_previous_step(step) && handle != snarl_end) { step_handle_t prev_step = graph.get_previous_step(step); handle_t prev_handle = graph.flip(graph.get_handle_of_step(prev_step)); - - if (snarl_contents.first.count(graph.get_id(prev_handle)) && - snarl_contents.second.count(graph.edge_handle(handle, prev_handle))) { + // todo: we only need this check once + if (!block_edges.count(graph.edge_handle(prev_handle, handle))) { step = prev_step; can_continue = true; } @@ -2556,7 +2581,7 @@ VCFTraversalFinder::VCFTraversalFinder(const PathHandleGraph& graph, SnarlManage snarl_manager(snarl_manager), skip_alt(skip_alt), max_traversal_cutoff(max_traversal_cutoff), - path_finder(graph, snarl_manager, ref_path_names) { + path_finder(graph, ref_path_names) { create_variant_index(vcf, ref_fasta, ins_fasta); } @@ -3380,15 +3405,38 @@ GBWTTraversalFinder::~GBWTTraversalFinder() { pair, vector>> GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths) { + + pair, vector>> gbwt_travs = this->find_gbwt_traversals( + graph.get_handle(site.start().node_id(), site.start().backward()), + graph.get_handle(site.end().node_id(), site.end().backward()), + 
return_paths); + + vector travs(gbwt_travs.first.size()); + + for (int64_t i = 0; i < gbwt_travs.first.size(); ++i) { + SnarlTraversal& proto_trav = travs[i]; + const Traversal& handle_trav = gbwt_travs.first[i]; + for (int64_t j = 0; j < handle_trav.size(); ++j) { + Visit* visit = proto_trav.add_visit(); + visit->set_node_id(graph.get_id(handle_trav[j])); + visit->set_backward(graph.get_is_reverse(handle_trav[j])); + } + } + + return make_pair(travs, gbwt_travs.second); +} + +pair, vector>> +GBWTTraversalFinder::find_gbwt_traversals(const handle_t& snarl_start, const handle_t& snarl_end, bool return_paths) { // follow all gbwt threads from start to end vector, gbwt::SearchState> > forward_traversals = list_haplotypes( graph, gbwt, - graph.get_handle(site.start().node_id(), site.start().backward()), + snarl_start, [&] (const vector& new_thread) { - return gbwt::Node::id(new_thread.back()) == site.end().node_id() && - gbwt::Node::is_reverse(new_thread.back()) == site.end().backward(); + return gbwt::Node::id(new_thread.back()) == graph.get_id(snarl_end) && + gbwt::Node::is_reverse(new_thread.back()) == graph.get_is_reverse(snarl_end); }); // follow all gbwt threads from end to start @@ -3397,15 +3445,15 @@ GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths) backward_traversals = list_haplotypes( graph, gbwt, - graph.get_handle(site.end().node_id(), !site.end().backward()), + graph.flip(snarl_end), [&] (const vector& new_thread) { - return gbwt::Node::id(new_thread.back()) == site.start().node_id() && - gbwt::Node::is_reverse(new_thread.back()) == !site.start().backward(); + return gbwt::Node::id(new_thread.back()) == graph.get_id(snarl_start) && + gbwt::Node::is_reverse(new_thread.back()) == !graph.get_is_reverse(snarl_start); }); } // store them all as snarltraversals - vector traversals; + vector traversals; vector> gbwt_paths; traversals.reserve(forward_traversals.size() + backward_traversals.size()); @@ -3413,8 +3461,7 @@ 
GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths) for (int i = 0; i < forward_traversals.size(); ++i) { traversals.emplace_back(); for (auto j = forward_traversals[i].first.begin(); j != forward_traversals[i].first.end(); ++j) { - Visit* visit = traversals.back().add_visit(); - *visit = to_visit(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j)); + traversals.back().push_back(graph.get_handle(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j))); } if (return_paths) { gbwt_paths.push_back(gbwt.locate(forward_traversals[i].second)); @@ -3458,8 +3505,7 @@ GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths) // insert if not duplicate of existing forward traversal traversals.emplace_back(); for (auto j = backward_traversals[i].first.begin(); j != backward_traversals[i].first.end(); ++j) { - Visit* visit = traversals.back().add_visit(); - *visit = to_visit(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j)); + traversals.back().push_back(graph.get_handle(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j))); } if (return_paths) { gbwt_paths.push_back(gbwt.locate(backward_traversals[i].second)); @@ -3474,6 +3520,10 @@ vector GBWTTraversalFinder::find_traversals(const Snarl& site) { return find_gbwt_traversals(site, false).first; } +vector GBWTTraversalFinder::find_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { + return find_gbwt_traversals(snarl_start, snarl_end, false).first; +} + pair, vector> GBWTTraversalFinder::find_path_traversals(const Snarl& site) { // get the unique traversals pair, vector>> gbwt_traversals = find_gbwt_traversals(site, true); @@ -3492,6 +3542,26 @@ pair, vector> GBWTTraversalFinder::find_ return path_traversals; } +pair, vector> GBWTTraversalFinder::find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { + // get the unique traversals + pair, vector>> gbwt_traversals = this->find_gbwt_traversals(snarl_start, snarl_end, true); + + // expand them out to one per 
path (this is to be consistent with PathTraversalFinder as used in deconstruct) + pair, vector> path_traversals; + for (size_t i = 0; i < gbwt_traversals.first.size(); ++i) { + Traversal& trav = gbwt_traversals.first[i]; + vector& paths = gbwt_traversals.second[i]; + for (size_t j = 0; j < paths.size(); ++j) { + path_traversals.first.push_back(trav); + path_traversals.second.push_back(paths[j]); + } + } + + return path_traversals; +} + + + } diff --git a/src/traversal_finder.hpp b/src/traversal_finder.hpp index 47ad31e80ae..76d77ad48bf 100644 --- a/src/traversal_finder.hpp +++ b/src/traversal_finder.hpp @@ -36,6 +36,9 @@ using namespace std; class AugmentedGraph; +using Traversal = vector; +using PathInterval = pair; + /** * Represents a strategy for finding traversals of (nested) sites. Polymorphic * base class/interface. @@ -44,7 +47,12 @@ class TraversalFinder { public: virtual ~TraversalFinder() = default; - virtual vector find_traversals(const Snarl& site) = 0; + virtual vector find_traversals(const Snarl& site) = 0; + + // new, protobuf-free interface. hope is to eventually deprecate the old one. 
for now it is only supported in a few places + virtual vector find_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { + assert(false); return{}; + } }; class ExhaustiveTraversalFinder : public TraversalFinder { @@ -201,25 +209,25 @@ class PathTraversalFinder : public TraversalFinder { // our graph with indexed path positions const PathHandleGraph& graph; - SnarlManager& snarl_manager; - // restrict to these paths unordered_set paths; public: // if path_names not empty, only those paths will be considered - PathTraversalFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager, + PathTraversalFinder(const PathHandleGraph& graph, const vector& path_names = {}); /** * Return all traversals through the site that are sub-paths of embedded paths in the graph */ virtual vector find_traversals(const Snarl& site); + virtual vector find_traversals(const handle_t& snarl_start, const handle_t& snarl_end); /** * Like above, but return the path steps for the for the traversal endpoints */ - virtual pair, vector > > find_path_traversals(const Snarl& site); + virtual pair, vector> find_path_traversals(const Snarl& site); + virtual pair, vector> find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end); }; @@ -636,18 +644,22 @@ class GBWTTraversalFinder : public TraversalFinder { /* Return a traversal for every gbwt thread through the snarl */ virtual vector find_traversals(const Snarl& site); + virtual vector find_traversals(const handle_t& snarl_start, const handle_t& snarl_end); /** Return the traversals, paired with their path identifiers in the gbwt. The traversals are * unique, but there can be more than one path along each one (hence the vector) */ virtual pair, vector>> find_gbwt_traversals(const Snarl& site, bool return_paths = true); + virtual pair, vector>> + find_gbwt_traversals(const handle_t& snarl_start, const handle_t& snarl_end, bool return_paths = true); /** Return traversals paired with path identifiers from the GBWT. 
The traversals are *not* unique * (which is consistent with PathTraversalFinder) * To get the sample name from the path identifier id, use gbwtgraph::get_path_sample_name(); */ virtual pair, vector> find_path_traversals(const Snarl& site); + virtual pair, vector> find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end); const gbwt::GBWT& get_gbwt() { return gbwt; } diff --git a/src/unittest/snarls.cpp b/src/unittest/snarls.cpp index 77d06a9dd73..670de4ad76b 100644 --- a/src/unittest/snarls.cpp +++ b/src/unittest/snarls.cpp @@ -3933,7 +3933,7 @@ namespace vg { CactusSnarlFinder snarl_finder(graph); SnarlManager snarl_manager = snarl_finder.find_snarls(); - PathTraversalFinder trav_finder(graph, snarl_manager); + PathTraversalFinder trav_finder(graph); Snarl snarl; snarl.mutable_start()->set_node_id(2); @@ -3972,7 +3972,7 @@ namespace vg { CactusSnarlFinder snarl_finder(graph); SnarlManager snarl_manager = snarl_finder.find_snarls(); - PathTraversalFinder trav_finder(graph, snarl_manager); + PathTraversalFinder trav_finder(graph); Snarl snarl; snarl.mutable_start()->set_node_id(7); @@ -4013,7 +4013,7 @@ namespace vg { CactusSnarlFinder snarl_finder(graph); SnarlManager snarl_manager = snarl_finder.find_snarls(); - PathTraversalFinder trav_finder(graph, snarl_manager); + PathTraversalFinder trav_finder(graph); Snarl snarl; snarl.mutable_start()->set_node_id(8); @@ -4053,7 +4053,7 @@ namespace vg { CactusSnarlFinder snarl_finder(graph); SnarlManager snarl_manager = snarl_finder.find_snarls(); - PathTraversalFinder trav_finder(graph, snarl_manager); + PathTraversalFinder trav_finder(graph); Snarl snarl; snarl.mutable_start()->set_node_id(11); From 165ed97479629ced90beb01a00d35d3f43b33e1a Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 16 May 2024 13:21:31 -0700 Subject: [PATCH 09/67] Update to current libvgio --- deps/libvgio | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/libvgio b/deps/libvgio index 
9b0d0e11df6..def4827b903 160000 --- a/deps/libvgio +++ b/deps/libvgio @@ -1 +1 @@ -Subproject commit 9b0d0e11df6f9bd389ba4dba08d107953eabff8f +Subproject commit def4827b9034d9624179c442c8568978ca33e5b8 From bea8e9ca5748bfea1b1bd9e540b909ce3d801ec5 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 15 Mar 2024 15:31:47 -0400 Subject: [PATCH 10/67] Define PARAMS_JSON tag with the registry --- src/io/register_libvg_io.cpp | 2 ++ src/io/register_loader_params_json.cpp | 27 ++++++++++++++++++++++++++ src/io/register_loader_params_json.hpp | 21 ++++++++++++++++++++ 3 files changed, 50 insertions(+) create mode 100644 src/io/register_loader_params_json.cpp create mode 100644 src/io/register_loader_params_json.hpp diff --git a/src/io/register_libvg_io.cpp b/src/io/register_libvg_io.cpp index 94c0a643a8c..4d8fb51f603 100644 --- a/src/io/register_libvg_io.cpp +++ b/src/io/register_libvg_io.cpp @@ -20,6 +20,7 @@ #include "register_loader_saver_packed_graph.hpp" #include "register_loader_saver_hash_graph.hpp" #include "register_loader_saver_gfa.hpp" +#include "register_loader_params_json.hpp" #include "register_libvg_io.hpp" @@ -46,6 +47,7 @@ bool register_libvg_io() { register_loader_saver_xg(); register_loader_saver_packed_graph(); register_loader_saver_hash_graph(); + register_loader_params_json(); return true; } diff --git a/src/io/register_loader_params_json.cpp b/src/io/register_loader_params_json.cpp new file mode 100644 index 00000000000..e06f8f12855 --- /dev/null +++ b/src/io/register_loader_params_json.cpp @@ -0,0 +1,27 @@ +/** + * \file register_loader_params_json.cpp + * Defines IO for a VG graph from stream files of Graph objects. + */ + +#include +#include "register_loader_params_json.hpp" + + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_params_json() { + Registry::register_loader("PARAMS_JSON", wrap_bare_loader([](const std::istream& stream) -> void* { + // Read the whole stream with an iterator. 
See . + return new std::string(std::istreambuf_iterator(stream), {}); + }); +} + +} + +} + diff --git a/src/io/register_loader_params_json.hpp b/src/io/register_loader_params_json.hpp new file mode 100644 index 00000000000..9455c73b36e --- /dev/null +++ b/src/io/register_loader_params_json.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_PARAMS_JSON_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_PARAMS_JSON_HPP_INCLUDED + +/** + * \file register_loader_params_json.hpp + * Defines IO for embedded parameters. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_params_json(); + +} + +} + +#endif From 0c64463a9d321b5d7715f1b851dcc8d3e7833e63 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 15 Mar 2024 15:32:21 -0400 Subject: [PATCH 11/67] Finish implementing PARAMS_JSON loader --- src/io/register_loader_params_json.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/io/register_loader_params_json.cpp b/src/io/register_loader_params_json.cpp index e06f8f12855..bc82deb0235 100644 --- a/src/io/register_loader_params_json.cpp +++ b/src/io/register_loader_params_json.cpp @@ -6,6 +6,8 @@ #include #include "register_loader_params_json.hpp" +#include + namespace vg { @@ -15,10 +17,10 @@ using namespace std; using namespace vg::io; void register_loader_params_json() { - Registry::register_loader("PARAMS_JSON", wrap_bare_loader([](const std::istream& stream) -> void* { + Registry::register_loader("PARAMS_JSON", wrap_bare_loader([](std::istream& stream) -> void* { // Read the whole stream with an iterator. See . 
return new std::string(std::istreambuf_iterator(stream), {}); - }); + })); } } From 0b1ac8cef553b90305c6384e4c324cf8cf1c5e67 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 17 May 2024 08:13:18 -0400 Subject: [PATCH 12/67] deprecate exhaustive deconstruct option --- src/deconstructor.cpp | 151 +++++----------------------- src/deconstructor.hpp | 23 ----- src/subcommand/deconstruct_main.cpp | 25 +---- src/variant_recall.cpp | 2 +- test/t/26_deconstruct.t | 17 +--- 5 files changed, 32 insertions(+), 186 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 1e96e3944a6..3802f97d9e3 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -8,8 +8,7 @@ using namespace std; namespace vg { -Deconstructor::Deconstructor() : VCFOutputCaller(""), - exhaustive_jaccard_warning(false){ +Deconstructor::Deconstructor() : VCFOutputCaller("") { } Deconstructor::~Deconstructor(){ } @@ -293,7 +292,7 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector& name assert(names.size() == trav_to_allele.size()); // set up our variant fields v.format.push_back("GT"); - if (show_path_info && path_to_sample_phase && path_restricted) { + if (show_path_info && path_to_sample_phase) { v.format.push_back("PI"); } @@ -364,7 +363,7 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector& name } } v.samples[sample_name]["GT"] = {blank_gt}; - if (show_path_info && path_to_sample_phase && path_restricted) { + if (show_path_info && path_to_sample_phase) { v.samples[sample_name]["PI"] = {blank_gt}; } } @@ -422,9 +421,9 @@ pair, bool> Deconstructor::choose_traversals(const string& sample_na std::any_of(gbwt_phases.begin(), gbwt_phases.end(), [](int i) { return i >= 0; }); //|| path_to_sample_phase; bool phasing_conflict = false; - int sample_ploidy = ploidy; + int sample_ploidy = 1; int min_phase = 1; - int max_phase = ploidy; + int max_phase = 1; if (has_phasing || path_to_sample_phase) { if (has_phasing) { // override ploidy with 
information about all phases found in input @@ -691,36 +690,6 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { #endif return false; } - - bool exhaustive = !path_restricted && gbwt_trav_finder.get() == nullptr; - if (exhaustive) { - // add in the exhaustive traversals - vector additional_travs; - - // exhaustive traversal can't do all snarls - if (snarl->type() != ULTRABUBBLE) { - return false; - } - if (!check_max_nodes(snarl)) { -#pragma omp critical (cerr) - cerr << "Warning: Skipping site because it is too complex for exhaustive traversal enumeration: " << pb2json(*snarl) << endl << " Consider using -e to traverse embedded paths" << endl; - return false; - } - additional_travs = explicit_exhaustive_traversals(snarl); - - // happens when there was a nested non-ultrabubble snarl - if (additional_travs.empty()) { - return false; - } - path_travs.first.insert(path_travs.first.end(), additional_travs.begin(), additional_travs.end()); - for (int i = 0; i < additional_travs.size(); ++i) { - // dummy names so we can use the same code as the named path traversals above - path_trav_names.push_back(" >>" + std::to_string(i)); - // dummy handles so we can use the same code as the named path traversals above - path_travs.second.push_back(make_pair(step_handle_t(), step_handle_t())); - } - - } // there's not alt path through the snarl, so we can't make an interesting variant if (path_travs.first.size() < 2) { @@ -742,12 +711,7 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { // map from each path_trav index to the ref_trav index it best maps to vector path_trav_to_ref_trav; - if (ref_travs.size() > 1 && this->path_jaccard_window && exhaustive && !exhaustive_jaccard_warning) { -#pragma omp critical (cerr) - cerr << "warning [vg deconstruct]: Conext Jaccard logic for multiple references disabled with exhaustive traversals. Use -e, -g or GBZ input to switch to path-based traversals only (recommended)." 
<< endl; - exhaustive_jaccard_warning = true; - } - if (ref_travs.size() > 1 && this->path_jaccard_window && !exhaustive) { + if (ref_travs.size() > 1 && this->path_jaccard_window) { path_trav_to_ref_trav.resize(path_travs.first.size()); #ifdef debug #pragma omp critical (cerr) @@ -870,16 +834,12 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { prev_char, use_start); // Fill in the genotypes - if (path_restricted || gbwt_trav_finder.get()) { - get_genotypes(v, path_trav_names, trav_to_allele); - } + get_genotypes(v, path_trav_names, trav_to_allele); // we only bother printing out sites with at least 1 non-reference allele if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) { - if (path_restricted || gbwt_trav_finder.get()) { - // run vcffixup to add some basic INFO like AC - vcf_fixup(v); - } + // run vcffixup to add some basic INFO like AC + vcf_fixup(v); add_variant(v); } } @@ -892,8 +852,6 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { * Convenience wrapper function for deconstruction of multiple paths. */ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager, - bool path_restricted_traversals, - int ploidy, bool include_nested, int context_jaccard_window, bool untangle_traversals, @@ -904,8 +862,6 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand this->graph = graph; this->snarl_manager = snarl_manager; - this->path_restricted = path_restricted_traversals; - this->ploidy = ploidy; this->ref_paths = set(ref_paths.begin(), ref_paths.end()); this->include_nested = include_nested; this->path_jaccard_window = context_jaccard_window; @@ -984,6 +940,13 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand } } + if (sample_to_haps.empty()) { + cerr << "Error [vg deconstruct]: No paths found for alt alleles in the graph. 
Note that " + << "exhaustive path-free traversal finding is no longer supported, and vg deconstruct " + << "now only works on embedded paths and GBWT threads" << endl; + exit(1); + } + // find some stats about the haplotypes for each sample gbwt_sample_to_phase_range.clear(); sample_ploidys.clear(); @@ -995,10 +958,8 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand // print the VCF header stringstream stream; stream << "##fileformat=VCFv4.2" << endl; - if (path_restricted || gbwt) { - stream << "##FORMAT=" << endl; - } - if (show_path_info && path_to_sample_phase && path_restricted) { + stream << "##FORMAT=" << endl; + if (show_path_info && path_to_sample_phase) { stream << "##FORMAT=" << endl; } if (path_to_sample_phase || gbwt) { @@ -1008,12 +969,10 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand } stream << "\">" << endl; } - if (path_restricted || gbwt) { - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - } + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; if (include_nested) { stream << "##INFO=" << endl; stream << "##INFO=" << endl; @@ -1091,10 +1050,8 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand } stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; - if (path_restricted || gbwt) { - for (auto& sample_name : sample_names) { - stream << "\t" << sample_name; - } + for (auto& sample_name : sample_names) { + stream << "\t" << sample_name; } stream << endl; @@ -1105,14 +1062,7 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand // create the traversal finder map reads_by_name; path_trav_finder = unique_ptr(new PathTraversalFinder(*graph)); - - if (!path_restricted && !gbwt) { - trav_finder = unique_ptr(new ExhaustiveTraversalFinder(*graph, - *snarl_manager, - true)); - - } - + if (gbwt != 
nullptr) { gbwt_trav_finder = unique_ptr(new GBWTTraversalFinder(*graph, *gbwt)); } @@ -1158,57 +1108,6 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand write_variants(cout, snarl_manager); } -bool Deconstructor::check_max_nodes(const Snarl* snarl) const { - unordered_set nodeset = snarl_manager->deep_contents(snarl, *graph, false).first; - int node_count = 0; - for (auto node_id : nodeset) { - handle_t node = graph->get_handle(node_id); - if (graph->get_degree(node, true) > 1 || graph->get_degree(node, false) > 1) { - ++node_count; - if (node_count > max_nodes_for_exhaustive) { - return false; - } - } - } - return true; -}; - -vector Deconstructor::explicit_exhaustive_traversals(const Snarl* snarl) const { - vector out_travs; - bool ultra_all_the_way_down = true; - function extend_trav = - [&](const SnarlTraversal& trav, const Snarl& nested_snarl) { - // exhaustive traversal finder is limited. if we find something - // that's not an ultrabubble, not much we can do - if (nested_snarl.type() != ULTRABUBBLE) { - ultra_all_the_way_down = false; - return; - } - vector nested_travs = trav_finder->find_traversals(nested_snarl); - for (auto& nested_trav : nested_travs) { - SnarlTraversal extended_trav = trav; - bool is_explicit = true; - for (int i = 0; i < nested_trav.visit_size(); ++i) { - if (nested_trav.visit(i).node_id() != 0) { - Visit* visit = extended_trav.add_visit(); - *visit = nested_trav.visit(i); - } else { - extend_trav(extended_trav, nested_trav.visit(i).snarl()); - is_explicit = false; - } - } - if (is_explicit) { - out_travs.push_back(extended_trav); - } - } - }; - SnarlTraversal trav; - extend_trav(trav, *snarl); - if (!ultra_all_the_way_down) { - out_travs.clear(); - } - return out_travs; -} } diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index a8c66e369bb..cfba5ea0920 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -39,8 +39,6 @@ class Deconstructor : public VCFOutputCaller { // deconstruct the 
entire graph to cout. // Not even a little bit thread safe. void deconstruct(vector refpaths, const PathPositionHandleGraph* grpah, SnarlManager* snarl_manager, - bool path_restricted_traversals, - int ploidy, bool include_nested, int context_jaccard_window, bool untangle_traversals, @@ -74,13 +72,6 @@ class Deconstructor : public VCFOutputCaller { const vector& trav_to_name, const vector& gbwt_phases) const; - // check to see if a snarl is too big to exhaustively traverse - bool check_max_nodes(const Snarl* snarl) const; - - // get traversals from the exhaustive finder. if they have nested visits, fill them in (exhaustively) - // with node visits - vector explicit_exhaustive_traversals(const Snarl* snarl) const; - // gets a sorted node id context for a given path vector get_context( const pair, @@ -101,12 +92,6 @@ class Deconstructor : public VCFOutputCaller { const dac_vector<>& target, const vector& query) const; - // toggle between exhaustive and path restricted traversal finder - bool path_restricted = false; - - // the max ploidy we expect. - int ploidy; - // the graph const PathPositionHandleGraph* graph; @@ -115,8 +100,6 @@ class Deconstructor : public VCFOutputCaller { // the traversal finders. we always use a path traversal finder to get the reference path unique_ptr path_trav_finder; - // we optionally use another (exhaustive for now) traversal finder if we don't want to rely on paths - unique_ptr trav_finder; // we can also use a gbwt for traversals unique_ptr gbwt_trav_finder; // When using the gbwt we need some precomputed information to ask about stored paths. 
@@ -143,9 +126,6 @@ class Deconstructor : public VCFOutputCaller { // the sample ploidys given in the phases in our path names unordered_map sample_ploidys; - // upper limit of degree-2+ nodes for exhaustive traversal - int max_nodes_for_exhaustive = 100; - // target window size for determining the correct reference position for allele traversals with path jaccard int path_jaccard_window = 10000; @@ -160,9 +140,6 @@ class Deconstructor : public VCFOutputCaller { // should we keep conflicted genotypes or not bool keep_conflicted_genotypes = false; - - // warn about context jaccard not working with exhaustive traversals - mutable atomic exhaustive_jaccard_warning; }; // helpel for measuring set intersectiond and union size diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index 975336f6aa4..497922e5b70 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -42,12 +42,10 @@ void help_deconstruct(char** argv){ << " -P, --path-prefix NAME All paths [excluding GBWT threads / non-reference GBZ paths] beginning with NAME used as reference (multiple allowed)." << endl << " Other non-ref paths not considered as samples. " << endl << " -r, --snarls FILE Snarls file (from vg snarls) to avoid recomputing." << endl - << " -g, --gbwt FILE only consider alt traversals that correspond to GBWT threads FILE (not needed for GBZ graph input)." << endl + << " -g, --gbwt FILE consider alt traversals that correspond to GBWT haplotypes in FILE (not needed for GBZ graph input)." << endl << " -T, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names and AT fields in output" << endl << " -O, --gbz-translation Use the ID translation from the input gbz to apply snarl names to snarl names and AT fields in output" << endl - << " -e, --path-traversals Only consider traversals that correspond to paths in the graph." 
<< endl << " -a, --all-snarls Process all snarls, including nested snarls (by default only top-level snarls reported)." << endl - << " -d, --ploidy N Expected ploidy. If more traversals found, they will be flagged as conflicts (default: 2)" << endl << " -c, --context-jaccard N Set context mapping size used to disambiguate alleles at sites with multiple reference traversals (default: 10000)." << endl << " -u, --untangle-travs Use context mapping to determine the reference-relative positions of each step in allele traversals (AP INFO field)." << endl << " -K, --keep-conflicted Retain conflicted genotypes in output." << endl @@ -73,8 +71,6 @@ int main_deconstruct(int argc, char** argv){ bool gbz_translation = false; bool path_restricted_traversals = false; bool show_progress = false; - int ploidy = 2; - bool set_ploidy = false; bool all_snarls = false; bool keep_conflicted = false; bool strict_conflicts = false; @@ -96,7 +92,6 @@ int main_deconstruct(int argc, char** argv){ {"translation", required_argument, 0, 'T'}, {"gbz-translation", no_argument, 0, 'O'}, {"path-traversals", no_argument, 0, 'e'}, - {"ploidy", required_argument, 0, 'd'}, {"context-jaccard", required_argument, 0, 'c'}, {"untangle-travs", no_argument, 0, 'u'}, {"all-snarls", no_argument, 0, 'a'}, @@ -140,12 +135,12 @@ int main_deconstruct(int argc, char** argv){ gbz_translation = true; break; case 'e': - path_restricted_traversals = true; + cerr << "Warning [vg deconstruct]: -e is deprecated as it's now on default" << endl; break; case 'd': - ploidy = parse(optarg); - set_ploidy = true; + cerr << "Warning [vg deconstruct]: -d is deprecated - ploidy now inferred from haplotypes in path names" << endl; break; + break; case 'c': context_jaccard_window = parse(optarg); break; @@ -206,16 +201,6 @@ int main_deconstruct(int argc, char** argv){ cerr << "Error [vg deconstruct]: -O can only be used when input graph is in GBZ format" << endl; } - if (set_ploidy && !path_restricted_traversals && 
gbwt_file_name.empty() && !gbz_graph) { - cerr << "Error [vg deconstruct]: -d can only be used with -e or -g or GBZ input" << endl; - return 1; - } - - if ((!gbwt_file_name.empty() || gbz_graph) && path_restricted_traversals && !gbz_graph) { - cerr << "Error [vg deconstruct]: -e cannot be used with -g or GBZ input" << endl; - return 1; - } - if (!gbwt_file_name.empty() || gbz_graph) { // context jaccard depends on having steps for each alt traversal, which is // not something we have on hand when getting traversals from the GBWT/GBZ @@ -352,7 +337,7 @@ int main_deconstruct(int argc, char** argv){ } dd.set_translation(translation.get()); dd.set_nested(all_snarls); - dd.deconstruct(refpaths, graph, snarl_manager.get(), path_restricted_traversals, ploidy, + dd.deconstruct(refpaths, graph, snarl_manager.get(), all_snarls, context_jaccard_window, untangle_traversals, diff --git a/src/variant_recall.cpp b/src/variant_recall.cpp index 340f86c394a..e85821a3d14 100644 --- a/src/variant_recall.cpp +++ b/src/variant_recall.cpp @@ -82,7 +82,7 @@ void genotype_svs(VG* graph, Deconstructor decon; CactusSnarlFinder finder(xg_index); SnarlManager snarl_manager = finder.find_snarls(); - decon.deconstruct({refpath}, &xg_index, &snarl_manager, false, 1, false, 10000, false, false, false, false); + decon.deconstruct({refpath}, &xg_index, &snarl_manager, false, 10000, false, false, false, false); } direct_ins.clear(); diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index aa11d17d092..1b2cafb7460 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,22 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 24 - -vg construct -r tiny/tiny.fa -v tiny/tiny.vcf.gz > tiny.vg -vg index tiny.vg -x tiny.xg -vg deconstruct tiny.xg -p x -t 1 > tiny_decon.vcf -# we pop out that GC allele because it gets invented by the adjacent snps in the graph -gzip -dc tiny/tiny.vcf.gz | tail -3 | awk '{print $1 "\t" $2 "\t" $4 "\t" $5}' > tiny_orig.tsv 
-cat tiny_decon.vcf | grep -v "#" | grep -v GC | awk '{print $1 "\t" $2 "\t" $4 "\t" $5}' > tiny_dec.tsv -diff tiny_orig.tsv tiny_dec.tsv -is "$?" 0 "deconstruct retrieved original VCF (modulo adjacent snp allele)" -grep '>1>6' tiny_decon.vcf | awk '{print $8}' > allele_travs -printf "AT=>1>3>5>6,>1>3>4>6,>1>2>5>6,>1>2>4>6\n" > expected_allele_travs -diff allele_travs expected_allele_travs -is "$?" 0 "deconstruct produced expected AT field" - -rm -f tiny.vg tiny.xg tiny_decon.vcf tiny_orig.tsv tiny_dec.tsv allele_travs expected_allele_travs +plan tests 22 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg From 7a59e90627f0361c4f399d711f69bc8700fd0490 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 17 May 2024 10:10:51 -0400 Subject: [PATCH 13/67] refactor traversal finding into its own function --- src/deconstructor.cpp | 323 +++++++++++++++++++++--------------------- src/deconstructor.hpp | 21 +-- 2 files changed, 173 insertions(+), 171 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 3802f97d9e3..21dbf7f9ed3 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -22,13 +22,12 @@ Deconstructor::~Deconstructor(){ * ought to become in the VCF. If a traversal is flagged off, it gets a -1. 
*/ vector Deconstructor::get_alleles(vcflib::Variant& v, - const pair, - vector>>& path_travs, + const vector& travs, + const vector>& trav_steps, int ref_path_idx, const vector& use_trav, char prev_char, bool use_start) const { - auto& travs = path_travs.first; assert(ref_path_idx >=0 && ref_path_idx < travs.size()); // map strings to allele numbers (and their traversal) @@ -132,7 +131,7 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, if (untangle_allele_traversals) { // set up for reference position context mapping across allele traversals - path_handle_t ref_path = graph->get_path_handle_of_step(path_travs.second.at(ref_path_idx).first); + path_handle_t ref_path = graph->get_path_handle_of_step(trav_steps.at(ref_path_idx).first); unordered_map>>> ref_dup_nodes; unordered_map ref_simple_pos; { @@ -187,8 +186,8 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, for (size_t i = 0; i < allele_idx_unfolded.size(); i++) { int allele_no = i; int allele_trav_no = allele_idx_unfolded[i]; - auto start_step = path_travs.second.at(allele_trav_no).first; - auto end_step = path_travs.second.at(allele_trav_no).second; + auto start_step = trav_steps.at(allele_trav_no).first; + auto end_step = trav_steps.at(allele_trav_no).second; auto start_pos = graph->get_position_of_step(start_step); auto end_pos = graph->get_position_of_step(end_step); bool flip_path = start_pos > end_pos; @@ -563,66 +562,37 @@ vector Deconstructor::get_context( return context; } -vector Deconstructor::get_context( - const pair, - vector>>& path_travs, - const int& trav_idx) const { - step_handle_t start_step = path_travs.second[trav_idx].first; - step_handle_t end_step = path_travs.second[trav_idx].second; - return get_context(start_step, end_step); -} -bool Deconstructor::deconstruct_site(const Snarl* snarl) const { - - auto contents = snarl_manager->shallow_contents(snarl, *graph, false); - if (contents.first.empty()) { - // Nothing but the boundary nodes in this snarl +void 
Deconstructor::get_traversals(const Snarl* snarl, + vector& out_travs, + vector& out_trav_path_names, + vector>& out_trav_steps) const { + // empty snarl check + vector next_handles; + graph->follow_edges(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), false, [&](handle_t handle) { + next_handles.push_back(handle); + }); + if (next_handles.size() == 1 && next_handles.back() == graph->get_handle(snarl->end().node_id(), snarl->end().backward())) { #ifdef debug #pragma omp critical (cerr) cerr << "Skipping empty site " << pb2json(*snarl) << endl; -#endif - return false; +#endif + return; } + #ifdef debug #pragma omp crtiical (cerr) cerr << "Computing traversals of site " << pb2json(*snarl) << endl; #endif // find every traversal that runs through a path in the graph - pair, vector > > path_travs; - path_travs = path_trav_finder->find_path_traversals(*snarl); - vector path_trav_names; - for (const pair& trav_ends : path_travs.second) { - path_trav_names.push_back(graph->get_path_name(graph->get_path_handle_of_step(trav_ends.first))); - } - - // pick out the traversal corresponding to a reference path, breaking ties consistently - string ref_trav_name; - for (int i = 0; i < path_travs.first.size(); ++i) { - const string& path_trav_name = path_trav_names.at(i); -#ifdef debug -#pragma omp critical (cerr) - { - cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << path_travs.first[i].visit_size() - << ", start=" << graph->get_position_of_step(path_travs.second[i].first) - << ", end=" << graph->get_position_of_step(path_travs.second[i].second) << endl - << " trav=" << pb2json(path_travs.first[i]) << endl; - } -#endif - if (ref_paths.count(path_trav_name) && - (ref_trav_name.empty() || path_trav_name < ref_trav_name)) { - ref_trav_name = path_trav_name; -#ifdef debug -#pragma omp critical (cerr) - cerr << "Setting ref_trav_name " << ref_trav_name << endl; -#endif - } + std::tie(out_travs, out_trav_steps) = 
path_trav_finder->find_path_traversals(*snarl); + for (const pair& trav_steps : out_trav_steps) { + out_trav_path_names.push_back(graph->get_path_name(graph->get_path_handle_of_step(trav_steps.first))); } // add in the gbwt traversals // after this, all traversals are treated the same, with metadata embedded in their names - int64_t first_gbwt_trav_idx = path_trav_names.size(); - vector gbwt_path_ids; if (gbwt_trav_finder.get() != nullptr) { const gbwt::GBWT& gbwt_index = gbwt_trav_finder->get_gbwt(); pair, vector> thread_travs = gbwt_trav_finder->find_path_traversals(*snarl); @@ -634,7 +604,6 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { continue; } - gbwt_path_ids.push_back(path_id); PathSense sense = gbwtgraph::get_path_sense(gbwt_index, path_id, gbwt_reference_samples); if (sense == PathSense::HAPLOTYPE) { @@ -647,13 +616,51 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { gbwtgraph::get_path_haplotype(gbwt_index, path_id, sense), gbwtgraph::get_path_phase_block(gbwt_index, path_id, sense), gbwtgraph::get_path_subrange(gbwt_index, path_id, sense)); - path_trav_names.push_back(path_name); - path_travs.first.push_back(thread_travs.first[i]); - // dummy handles so we can use the same code as the named path traversals above - path_travs.second.push_back(make_pair(step_handle_t(), step_handle_t())); + out_trav_path_names.push_back(path_name); + out_travs.push_back(std::move(thread_travs.first[i])); } } } +} + + +bool Deconstructor::deconstruct_site(const Snarl* snarl) const { + + vector travs; + vector trav_path_names; + // note that this vector (unlike the above two) is for embedded paths only (not GBWT) + vector> trav_steps; + + // compute all the traversals from embedded paths and gbwt + this->get_traversals(snarl, travs, trav_path_names, trav_steps); + + if (travs.empty()) { + return false; + } + + // pick out the traversal corresponding to an embedded reference path, breaking ties consistently + string ref_trav_name; + 
for (int i = 0; i < trav_steps.size(); ++i) { + const string& path_trav_name = trav_path_names[i]; +#ifdef debug +#pragma omp critical (cerr) + { + cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << travs[i].visit_size() + << ", start=" << graph->get_position_of_step(trav_steps[i].first) + << ", end=" << graph->get_position_of_step(trav_steps[i].second) << endl + << " trav=" << pb2json(travs[i]) << endl; + } +#endif + if (ref_paths.count(path_trav_name) && + (ref_trav_name.empty() || path_trav_name < ref_trav_name)) { + ref_trav_name = path_trav_name; +#ifdef debug +#pragma omp critical (cerr) + cerr << "Setting ref_trav_name " << ref_trav_name << endl; +#endif + } + } + // remember all the reference traversals (there can be more than one only in the case of a // cycle in the reference path @@ -665,8 +672,8 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { // hacky subpath support -- gets added to variant on output vector ref_offsets; if (!ref_trav_name.empty()) { - for (int i = 0; i < path_travs.first.size(); ++i) { - const string& path_trav_name = path_trav_names.at(i); + for (int i = 0; i < trav_steps.size(); ++i) { + const string& path_trav_name = trav_path_names.at(i); subrange_t subrange ; Paths::strip_subrange(path_trav_name, &subrange); int64_t sub_offset = subrange == PathMetadata::NO_SUBRANGE ? 
0 : subrange.first; @@ -692,7 +699,7 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { } // there's not alt path through the snarl, so we can't make an interesting variant - if (path_travs.first.size() < 2) { + if (travs.size() < 2) { #ifdef debug #pragma omp critical (cerr) cerr << "Skipping site because to alt traversal was found " << pb2json(*snarl) << endl; @@ -712,139 +719,131 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { vector path_trav_to_ref_trav; if (ref_travs.size() > 1 && this->path_jaccard_window) { - path_trav_to_ref_trav.resize(path_travs.first.size()); + path_trav_to_ref_trav.resize(trav_steps.size()); #ifdef debug #pragma omp critical (cerr) cerr << "Multiple ref traversals!" << endl; #endif - { - vector> ref_contexts(ref_travs.size()); + vector> ref_contexts(ref_travs.size()); #pragma omp parallel for schedule(dynamic,1) - for (size_t i = 0; i < ref_travs.size(); ++i) { - auto& trav_id = ref_travs[i]; - ref_contexts[i] = get_context(path_travs, trav_id); - } - // now for each traversal, we compute and equivalent context and match it to a ref context - // using a jaccard metric over node ids + for (size_t i = 0; i < ref_travs.size(); ++i) { + ref_contexts[i] = get_context(trav_steps[ref_travs[i]].first, trav_steps[ref_travs[i]].second); + } + + // now for each traversal, we compute and equivalent context and match it to a ref context + // using a jaccard metric over node ids #pragma omp parallel for schedule(dynamic,1) - for (size_t i = 0; i < path_travs.first.size(); ++i) { - vector context = get_context(path_travs, i); - // map jaccard metric to the index of the ref_trav - vector> ref_mappings; - for (uint64_t j = 0; j < ref_travs.size(); ++j) { - ref_mappings.push_back(make_pair( - context_jaccard( - ref_contexts[j], - context), - ref_travs[j])); - } - std::sort(ref_mappings.begin(), ref_mappings.end()); - // the best is the last, which has the highest jaccard - path_trav_to_ref_trav[i] = 
ref_mappings.back().second; + for (size_t i = 0; i < trav_steps.size(); ++i) { + vector context = get_context(trav_steps[i].first, trav_steps[i].second); + // map jaccard metric to the index of the ref_trav + vector> ref_mappings; + for (uint64_t j = 0; j < ref_travs.size(); ++j) { + ref_mappings.push_back(make_pair( + context_jaccard( + ref_contexts[j], + context), + ref_travs[j])); } + std::sort(ref_mappings.begin(), ref_mappings.end()); + // the best is the last, which has the highest jaccard + path_trav_to_ref_trav[i] = ref_mappings.back().second; } } // we write a variant for every reference traversal // (optionally) selecting the subset of path traversals that are 1:1 -//#pragma omp parallel for for (size_t i = 0; i < ref_travs.size(); ++i) { -//#pragma omp task firstprivate(i) - { - auto& ref_trav_idx = ref_travs[i]; - auto& ref_trav_offset = ref_offsets[i]; + auto& ref_trav_idx = ref_travs[i]; + auto& ref_trav_offset = ref_offsets[i]; - const SnarlTraversal& ref_trav = path_travs.first[ref_trav_idx]; + const SnarlTraversal& ref_trav = travs[ref_trav_idx]; - vcflib::Variant v; - v.quality = 60; + vcflib::Variant v; + v.quality = 60; - // in VCF we usually just want the contig - string contig_name = PathMetadata::parse_locus_name(ref_trav_name); - if (contig_name == PathMetadata::NO_LOCUS_NAME) { + // in VCF we usually just want the contig + string contig_name = PathMetadata::parse_locus_name(ref_trav_name); + if (contig_name == PathMetadata::NO_LOCUS_NAME) { + contig_name = ref_trav_name; + } else if (long_ref_contig) { + // the sample name isn't unique enough, so put a full ugly name in the vcf + if (PathMetadata::parse_sense(ref_trav_name) == PathSense::GENERIC) { contig_name = ref_trav_name; - } else if (long_ref_contig) { - // the sample name isn't unique enough, so put a full ugly name in the vcf - if (PathMetadata::parse_sense(ref_trav_name) == PathSense::GENERIC) { - contig_name = ref_trav_name; - } else { - contig_name = 
PathMetadata::create_path_name(PathSense::REFERENCE, - PathMetadata::parse_sample_name(ref_trav_name), - contig_name, - PathMetadata::parse_haplotype(ref_trav_name), - PathMetadata::NO_PHASE_BLOCK, - PathMetadata::NO_SUBRANGE); - } + } else { + contig_name = PathMetadata::create_path_name(PathSense::REFERENCE, + PathMetadata::parse_sample_name(ref_trav_name), + contig_name, + PathMetadata::parse_haplotype(ref_trav_name), + PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); } + } - // write variant's sequenceName (VCF contig) - v.sequenceName = contig_name; - - // Map our snarl endpoints to oriented positions in the embedded path in the graph - handle_t first_path_handle; - size_t first_path_pos; - bool use_start; - assert(ref_trav_idx < first_gbwt_trav_idx); - step_handle_t start_step = path_travs.second[ref_trav_idx].first; - step_handle_t end_step = path_travs.second[ref_trav_idx].second; - handle_t start_handle = graph->get_handle_of_step(start_step); - handle_t end_handle = graph->get_handle_of_step(end_step); - size_t start_pos = graph->get_position_of_step(start_step); - size_t end_pos = graph->get_position_of_step(end_step); - use_start = start_pos < end_pos; - first_path_handle = use_start ? start_handle : end_handle; - first_path_pos = use_start ? start_pos : end_pos; + // write variant's sequenceName (VCF contig) + v.sequenceName = contig_name; + + // Map our snarl endpoints to oriented positions in the embedded path in the graph + handle_t first_path_handle; + size_t first_path_pos; + bool use_start; + step_handle_t start_step = trav_steps[ref_trav_idx].first; + step_handle_t end_step = trav_steps[ref_trav_idx].second; + handle_t start_handle = graph->get_handle_of_step(start_step); + handle_t end_handle = graph->get_handle_of_step(end_step); + size_t start_pos = graph->get_position_of_step(start_step); + size_t end_pos = graph->get_position_of_step(end_step); + use_start = start_pos < end_pos; + first_path_handle = use_start ? 
start_handle : end_handle; + first_path_pos = use_start ? start_pos : end_pos; - // Get the first visit of our snarl traversal - const Visit& first_trav_visit = use_start ? ref_trav.visit(0) : ref_trav.visit(ref_trav.visit_size() - 1); - - char prev_char; - if ((use_start && first_trav_visit.backward() == graph->get_is_reverse(first_path_handle)) || - (!use_start && first_trav_visit.backward() != graph->get_is_reverse(first_path_handle))) { - // Our path and traversal have consistent orientation. leave off the end of the start node going forward - first_path_pos += graph->get_length(first_path_handle); - prev_char = ::toupper(graph->get_sequence(first_path_handle)[graph->get_length(first_path_handle) - 1]); - } else { - // They are flipped: leave off the beginning of the start node going backward - prev_char = reverse_complement(::toupper(graph->get_sequence(first_path_handle)[0])); - } + // Get the first visit of our snarl traversal + const Visit& first_trav_visit = use_start ? ref_trav.visit(0) : ref_trav.visit(ref_trav.visit_size() - 1); + + char prev_char; + if ((use_start && first_trav_visit.backward() == graph->get_is_reverse(first_path_handle)) || + (!use_start && first_trav_visit.backward() != graph->get_is_reverse(first_path_handle))) { + // Our path and traversal have consistent orientation. 
leave off the end of the start node going forward + first_path_pos += graph->get_length(first_path_handle); + prev_char = ::toupper(graph->get_sequence(first_path_handle)[graph->get_length(first_path_handle) - 1]); + } else { + // They are flipped: leave off the beginning of the start node going backward + prev_char = reverse_complement(::toupper(graph->get_sequence(first_path_handle)[0])); + } - // shift from 0-based to 1-based for VCF - first_path_pos += 1; + // shift from 0-based to 1-based for VCF + first_path_pos += 1; - v.position = first_path_pos + ref_trav_offset; + v.position = first_path_pos + ref_trav_offset; - v.id = print_snarl(*snarl); + v.id = print_snarl(*snarl); - // Convert the snarl traversals to strings and add them to the variant - vector use_trav(path_travs.first.size()); - if (path_trav_to_ref_trav.size()) { - for (uint64_t i = 0; i < use_trav.size(); ++i) { - use_trav[i] = (ref_trav_idx == path_trav_to_ref_trav[i]); - } - } else { - for (uint64_t i = 0; i < use_trav.size(); ++i) { - use_trav[i] = true; - } + // Convert the snarl traversals to strings and add them to the variant + vector use_trav(travs.size()); + if (path_trav_to_ref_trav.size()) { + for (uint64_t i = 0; i < use_trav.size(); ++i) { + use_trav[i] = (ref_trav_idx == path_trav_to_ref_trav[i]); } + } else { + for (uint64_t i = 0; i < use_trav.size(); ++i) { + use_trav[i] = true; + } + } - vector trav_to_allele = get_alleles(v, path_travs, - ref_trav_idx, - use_trav, - prev_char, use_start); + vector trav_to_allele = get_alleles(v, travs, trav_steps, + ref_trav_idx, + use_trav, + prev_char, use_start); - // Fill in the genotypes - get_genotypes(v, path_trav_names, trav_to_allele); + // Fill in the genotypes + get_genotypes(v, trav_path_names, trav_to_allele); - // we only bother printing out sites with at least 1 non-reference allele - if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) { - // run vcffixup to add some basic 
INFO like AC - vcf_fixup(v); - add_variant(v); - } + // we only bother printing out sites with at least 1 non-reference allele + if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) { + // run vcffixup to add some basic INFO like AC + vcf_fixup(v); + add_variant(v); } } -//#pragma omp taskwait return true; } diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index cfba5ea0920..13a4e8b3ee3 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -38,7 +38,7 @@ class Deconstructor : public VCFOutputCaller { // deconstruct the entire graph to cout. // Not even a little bit thread safe. - void deconstruct(vector refpaths, const PathPositionHandleGraph* grpah, SnarlManager* snarl_manager, + void deconstruct(vector refpaths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager, bool include_nested, int context_jaccard_window, bool untangle_traversals, @@ -53,10 +53,19 @@ class Deconstructor : public VCFOutputCaller { // (need to have a path going through the site) bool deconstruct_site(const Snarl* site) const; + // get the traversals for a given site + // this returns a combination of embedded path traversals and gbwt traversals + // the embedded paths come first, and only they get trav_steps. + // so you can use trav_steps.size() to find the index of the first gbwt traversal... + void get_traversals(const Snarl* site, + vector& out_travs, + vector& out_trav_path_names, + vector>& out_trav_steps) const; + // convert traversals to strings. 
returns mapping of traversal (offset in travs) to allele vector get_alleles(vcflib::Variant& v, - const pair, - vector>>& path_travs, + const vector& travs, + const vector>& trav_steps, int ref_path_idx, const vector& use_trav, char prev_char, bool use_start) const; @@ -72,12 +81,6 @@ class Deconstructor : public VCFOutputCaller { const vector& trav_to_name, const vector& gbwt_phases) const; - // gets a sorted node id context for a given path - vector get_context( - const pair, - vector>>& path_travs, - const int& trav_idx) const; - // the underlying context-getter vector get_context( step_handle_t start_step, From 5aa32c3069e6295c8763a8e4dfcc82378a423d60 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 17 May 2024 11:17:13 -0400 Subject: [PATCH 14/67] Explain how to get vg on the PATH --- README.md | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 55556f6f44d..2f58c12094e 100644 --- a/README.md +++ b/README.md @@ -55,10 +55,14 @@ The easiest way to get vg is to download one of our release builds for Linux. We If you don't want to or can't use a pre-built release of vg, or if you want to become a vg developer, you can build it from source instead. +#### Linux: Clone VG + First, obtain the repo and its submodules: git clone --recursive https://github.com/vgteam/vg.git cd vg + +#### Linux: Install Dependencies Then, install VG's dependencies. You'll need the protobuf and jansson development libraries installed, and to run the tests you will need: * `jq`, `bc`, `rs`, and `parallel` @@ -85,22 +89,47 @@ Other libraries may be required. Please report any build difficulties. Note that a 64-bit OS is required. Ubuntu 20.04 should work. -When you are ready, build with `. ./source_me.sh && make`, and run with `./bin/vg`. +#### Linux: Build + +When you are ready, build with `. ./source_me.sh && make`. 
You can use `make -j16` to run 16 build threads at a time, which greatly accelerates the process. If you have more CPU cores, you can use higher numbers. Note that vg can take anywhere from 10 minutes to more than an hour to compile depending on your machine and the number of threads used. You can also produce a static binary with `make static`, assuming you have static versions of all the dependencies installed on your system. +#### Linux: Run + +Once vg is built, the binary will be at `bin/vg` inside the vg repository directory. You can run it with: + +``` +./bin/vg +``` + +You can also add its directory to your `PATH` enviornment variable, so that you can invoke `vg` from any directory. To do that on Bash, use this command from the vg repository directory: + +``` +echo 'export PATH="${PATH}:'"$(pwd)"'/bin"' >>~/.bashrc +``` + +Then close your terminal and open a new one. Run `vg` to make sure it worked. + +If it did not work, make sure that you have a `.bash_profile` file in your home directory that will run your `.bashrc`: +``` +if [ -f ~/.bashrc ]; then + source ~/.bashrc +fi +``` + ### Building on MacOS -#### Clone VG +#### Mac: Clone VG The first step is to clone the vg repository: git clone --recursive https://github.com/vgteam/vg.git cd vg -#### Install Dependencies +#### Mac: Install Dependencies VG depends on a number of packages being installed on the system where it is being built. Dependencies can be installed using either [MacPorts](https://www.macports.org/install.php) or [Homebrew](http://brew.sh/). @@ -118,17 +147,35 @@ Homebrew provides another package management solution for OSX, and may be prefer # Install all the dependencies in the Brewfile brew bundle -#### Build +#### Mac: Build With dependencies installed, VG can now be built: . ./source_me.sh && make + +As with Linux, you can add `-j16` or other numbers at the end to run multiple build tasks at once, if your computer can handle them. 
**Note that static binaries cannot yet be built for Mac.** Our team has successfully built vg on Mac with GCC versions 4.9, 5.3, 6, 7, and 7.3, as well as Clang 9.0. -#### Migrating to ARM Macs +#### Mac: Run + +Once vg is built, the binary will be at `bin/vg` inside the vg repository directory. You can run it with: + +``` +./bin/vg +``` + +You can also add its directory to your `PATH` enviornment variable, so that you can invoke `vg` from any directory. To do that on the default `zsh` Mac shell, use this command from the vg repository directory: + +``` +echo 'export PATH="${PATH}:'"$(pwd)"'/bin"' >>~/.zshrc +``` + +Then close your terminal and open a new one. Run `vg` to make sure it worked. + +##### Migrate a VG installation from x86 to ARM The Mac platform is moving to ARM, with Apple's M1, M1 Pro, M1 Max, and subsequent chip designs. The vg codebase supports ARM on Mac as well as on Linux. **The normal installation instructions work on a factory-fresh ARM Mac**. From 7a887073610ac6ba4f8451cf64d59ac0775b1f5d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 17 May 2024 11:26:20 -0400 Subject: [PATCH 15/67] get rid of protobuf traversals in deconstruct --- src/deconstructor.cpp | 64 ++++++++++++++++++++-------------------- src/deconstructor.hpp | 8 ++--- src/graph_caller.cpp | 23 +++++++++++++++ src/graph_caller.hpp | 6 ++++ src/traversal_finder.cpp | 30 +++++++++++++++++++ src/traversal_finder.hpp | 4 +++ 6 files changed, 99 insertions(+), 36 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 21dbf7f9ed3..8dfa79148a1 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -22,7 +22,7 @@ Deconstructor::~Deconstructor(){ * ought to become in the VCF. If a traversal is flagged off, it gets a -1. 
*/ vector Deconstructor::get_alleles(vcflib::Variant& v, - const vector& travs, + const vector& travs, const vector>& trav_steps, int ref_path_idx, const vector& use_trav, @@ -40,12 +40,11 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, vector trav_to_allele(travs.size()); // compute the allele as a string - auto trav_to_string = [&](const SnarlTraversal& trav) { + auto trav_to_string = [&](const Traversal& trav) { string allele; // we skip the snarl endpoints - for (int j = 1; j < trav.visit_size() - 1; ++j) { - const string& node_sequence = graph->get_sequence(graph->get_handle(trav.visit(j).node_id())); - allele += trav.visit(j).backward() ? reverse_complement(node_sequence) : node_sequence; + for (int j = 1; j < trav.size() - 1; ++j) { + allele += graph->get_sequence(trav[j]); } return toUppercase(allele); }; @@ -136,10 +135,10 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, unordered_map ref_simple_pos; { auto& trav = travs.at(ref_path_idx); - for (size_t i = 0; i < trav.visit_size(); ++i) { - size_t j = !reversed ? i : trav.visit_size() - 1 - i; - const Visit& visit = trav.visit(j); - nid_t node_id = visit.node_id(); + for (size_t i = 0; i < trav.size(); ++i) { + size_t j = !reversed ? 
i : trav.size() - 1 - i; + const handle_t& handle = trav[j]; + nid_t node_id = graph->get_id(handle); if (ref_simple_pos.find(node_id) != ref_simple_pos.end()) continue; if (ref_dup_nodes.find(node_id) != ref_dup_nodes.end()) continue; handle_t h = graph->get_handle(node_id); @@ -271,7 +270,7 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, int allele_no = ai_pair.second.first; int allele_trav_no = ai_pair.second.second; // update the traversal info - add_allele_path_to_info(v, allele_no, travs[allele_trav_no], reversed, !substitution); + add_allele_path_to_info(graph, v, allele_no, travs[allele_trav_no], reversed, !substitution); } } @@ -563,30 +562,30 @@ vector Deconstructor::get_context( } -void Deconstructor::get_traversals(const Snarl* snarl, - vector& out_travs, +void Deconstructor::get_traversals(const handle_t& snarl_start, const handle_t& snarl_end, + vector& out_travs, vector& out_trav_path_names, vector>& out_trav_steps) const { // empty snarl check vector next_handles; - graph->follow_edges(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), false, [&](handle_t handle) { + graph->follow_edges(snarl_start, false, [&](handle_t handle) { next_handles.push_back(handle); }); - if (next_handles.size() == 1 && next_handles.back() == graph->get_handle(snarl->end().node_id(), snarl->end().backward())) { + if (next_handles.size() == 1 && next_handles.back() == snarl_end) { #ifdef debug #pragma omp critical (cerr) - cerr << "Skipping empty site " << pb2json(*snarl) << endl; + cerr << "Skipping empty site " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl; #endif return; } #ifdef debug #pragma omp crtiical (cerr) - cerr << "Computing traversals of site " << pb2json(*snarl) << endl; + cerr << "Computing traversals of site " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl; #endif // find every traversal that runs through a path in the graph - std::tie(out_travs, out_trav_steps) = 
path_trav_finder->find_path_traversals(*snarl); + std::tie(out_travs, out_trav_steps) = path_trav_finder->find_path_traversals(snarl_start, snarl_end); for (const pair& trav_steps : out_trav_steps) { out_trav_path_names.push_back(graph->get_path_name(graph->get_path_handle_of_step(trav_steps.first))); } @@ -595,7 +594,7 @@ void Deconstructor::get_traversals(const Snarl* snarl, // after this, all traversals are treated the same, with metadata embedded in their names if (gbwt_trav_finder.get() != nullptr) { const gbwt::GBWT& gbwt_index = gbwt_trav_finder->get_gbwt(); - pair, vector> thread_travs = gbwt_trav_finder->find_path_traversals(*snarl); + pair, vector> thread_travs = gbwt_trav_finder->find_path_traversals(snarl_start, snarl_end); for (int i = 0; i < thread_travs.first.size(); ++i) { // We need to get a bunch of metadata about the path, but the GBWT // we have might not even have structured path names stored. @@ -624,15 +623,15 @@ void Deconstructor::get_traversals(const Snarl* snarl, } -bool Deconstructor::deconstruct_site(const Snarl* snarl) const { +bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end) const { - vector travs; + vector travs; vector trav_path_names; // note that this vector (unlike the above two) is for embedded paths only (not GBWT) vector> trav_steps; // compute all the traversals from embedded paths and gbwt - this->get_traversals(snarl, travs, trav_path_names, trav_steps); + this->get_traversals(snarl_start, snarl_end, travs, trav_path_names, trav_steps); if (travs.empty()) { return false; @@ -645,10 +644,10 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { #ifdef debug #pragma omp critical (cerr) { - cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << travs[i].visit_size() + cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << travs[i].size() << ", start=" << graph->get_position_of_step(trav_steps[i].first) << ", end=" << 
graph->get_position_of_step(trav_steps[i].second) << endl - << " trav=" << pb2json(travs[i]) << endl; + << " trav=" << traversal_to_string(travs[i]) << endl; } #endif if (ref_paths.count(path_trav_name) && @@ -693,7 +692,7 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { if (ref_travs.empty()) { #ifdef debug #pragma omp critical (cerr) - cerr << "Skipping site because no reference traversal was found " << pb2json(*snarl) << endl; + cerr << "Skipping site because no reference traversal was found " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl; #endif return false; } @@ -702,7 +701,7 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { if (travs.size() < 2) { #ifdef debug #pragma omp critical (cerr) - cerr << "Skipping site because to alt traversal was found " << pb2json(*snarl) << endl; + cerr << "Skipping site because to alt traversal was found " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl; #endif return false; } @@ -713,7 +712,7 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { // to compare with equivalent windows from the alternate allele paths // we will associate these 1:1 with reference traversals - // remember that path_travs := pair, vector > > path_travs; + // remember that path_travs := pair, vector > > path_travs; // map from each path_trav index to the ref_trav index it best maps to vector path_trav_to_ref_trav; @@ -756,7 +755,7 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { auto& ref_trav_idx = ref_travs[i]; auto& ref_trav_offset = ref_offsets[i]; - const SnarlTraversal& ref_trav = travs[ref_trav_idx]; + const Traversal& ref_trav = travs[ref_trav_idx]; vcflib::Variant v; v.quality = 60; @@ -797,11 +796,11 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { first_path_pos = use_start ? start_pos : end_pos; // Get the first visit of our snarl traversal - const Visit& first_trav_visit = use_start ? 
ref_trav.visit(0) : ref_trav.visit(ref_trav.visit_size() - 1); + const handle_t& first_trav_handle = use_start ? ref_trav.front() : ref_trav.back(); char prev_char; - if ((use_start && first_trav_visit.backward() == graph->get_is_reverse(first_path_handle)) || - (!use_start && first_trav_visit.backward() != graph->get_is_reverse(first_path_handle))) { + if ((use_start && graph->get_is_reverse(first_trav_handle) == graph->get_is_reverse(first_path_handle)) || + (!use_start && graph->get_is_reverse(first_trav_handle) != graph->get_is_reverse(first_path_handle))) { // Our path and traversal have consistent orientation. leave off the end of the start node going forward first_path_pos += graph->get_length(first_path_handle); prev_char = ::toupper(graph->get_sequence(first_path_handle)[graph->get_length(first_path_handle) - 1]); @@ -815,7 +814,7 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { v.position = first_path_pos + ref_trav_offset; - v.id = print_snarl(*snarl); + v.id = print_snarl(graph, snarl_start, snarl_end); // Convert the snarl traversals to strings and add them to the variant vector use_trav(travs.size()); @@ -1097,7 +1096,8 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand //#pragma omp task firstprivate(i) { auto& snarl = snarls_todo[i]; - deconstruct_site(snarl); + deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), + graph->get_handle(snarl->end().node_id(), snarl->end().backward())); } } } diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index 13a4e8b3ee3..d05c916e5cc 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -51,20 +51,20 @@ class Deconstructor : public VCFOutputCaller { // write a vcf record for the given site. 
returns true if a record was written // (need to have a path going through the site) - bool deconstruct_site(const Snarl* site) const; + bool deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end) const; // get the traversals for a given site // this returns a combination of embedded path traversals and gbwt traversals // the embedded paths come first, and only they get trav_steps. // so you can use trav_steps.size() to find the index of the first gbwt traversal... - void get_traversals(const Snarl* site, - vector& out_travs, + void get_traversals(const handle_t& snarl_start, const handle_t& snarl_end, + vector& out_travs, vector& out_trav_path_names, vector>& out_trav_steps) const; // convert traversals to strings. returns mapping of traversal (offset in travs) to allele vector get_alleles(vcflib::Variant& v, - const vector& travs, + const vector& travs, const vector>& trav_steps, int ref_path_idx, const vector& use_trav, diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 921c941f02e..355bfceb9ac 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -366,6 +366,17 @@ void VCFOutputCaller::set_nested(bool nested) { include_nested = nested; } +void VCFOutputCaller::add_allele_path_to_info(const HandleGraph* graph, vcflib::Variant& v, int allele, const Traversal& trav, + bool reversed, bool one_based) const { + SnarlTraversal proto_trav; + for (const handle_t& handle : trav) { + Visit* visit = proto_trav.add_visit(); + visit->set_node_id(graph->get_id(handle)); + visit->set_backward(graph->get_is_reverse(handle)); + } + this->add_allele_path_to_info(v, allele, proto_trav, reversed, one_based); +} + void VCFOutputCaller::add_allele_path_to_info(vcflib::Variant& v, int allele, const SnarlTraversal& trav, bool reversed, bool one_based) const { auto& trav_info = v.info["AT"]; @@ -726,6 +737,18 @@ void VCFOutputCaller::flatten_common_allele_ends(vcflib::Variant& variant, bool } } +string VCFOutputCaller::print_snarl(const 
HandleGraph* graph, const handle_t& snarl_start, + const handle_t& snarl_end, bool in_brackets) const { + Snarl snarl; + Visit* start = snarl.mutable_start(); + start->set_node_id(graph->get_id(snarl_start)); + start->set_backward(graph->get_is_reverse(snarl_start)); + Visit* end = snarl.mutable_end(); + end->set_node_id(graph->get_id(snarl_end)); + end->set_backward(graph->get_is_reverse(snarl_end)); + return this->print_snarl(snarl, in_brackets); +} + string VCFOutputCaller::print_snarl(const Snarl& snarl, bool in_brackets) const { // todo, should we canonicalize here by putting lexicographic lowest node first? nid_t start_node_id = snarl.start().node_id(); diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index 7bdcfe9d880..0485d03db3f 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -103,8 +103,12 @@ class VCFOutputCaller { protected: /// add a traversal to the VCF info field in the format of a GFA W-line or GAF path + void add_allele_path_to_info(const HandleGraph* graph, vcflib::Variant& v, int allele, + const Traversal& trav, bool reversed, bool one_based) const; + /// legacy version of above void add_allele_path_to_info(vcflib::Variant& v, int allele, const SnarlTraversal& trav, bool reversed, bool one_based) const; + /// convert a traversal into an allele string string trav_string(const HandleGraph& graph, const SnarlTraversal& trav) const; @@ -131,6 +135,8 @@ class VCFOutputCaller { /// print a snarl in a consistent form like >3435<12222 /// if in_brackets set to true, do (>3435<12222) instead (this is only used for nested caller) + string print_snarl(const HandleGraph* grpah, const handle_t& snarl_start, const handle_t& snarl_end, bool in_brackets = false) const; + /// legacy version of above string print_snarl(const Snarl& snarl, bool in_brackets = false) const; /// do the opposite of above diff --git a/src/traversal_finder.cpp b/src/traversal_finder.cpp index aaff06ab92b..4a2b7498f9b 100644 --- a/src/traversal_finder.cpp +++ 
b/src/traversal_finder.cpp @@ -10,6 +10,36 @@ namespace vg { using namespace std; +string traversal_to_string(const PathHandleGraph* graph, const Traversal& traversal, bool max_steps) { + string s; + function handle_to_string = [&](handle_t handle) { + string ss = graph->get_is_reverse(handle) ? "<" : ">"; + return ss + std::to_string(graph->get_id(handle)); + }; + if (traversal.size() <= max_steps) { + for (const handle_t& handle : traversal) { + s += handle_to_string(handle); + } + } else { + for (int64_t i = 0; i < max_steps / 2; ++i) { + s += handle_to_string(traversal[i]); + } + s += "..."; + for (int64_t i = 0; i < max_steps / 2; ++i) { + s += handle_to_string(traversal[traversal.size() - 1 - i]); + } + } + return s; +} + +string graph_interval_to_string(const HandleGraph* graph, const handle_t& start_handle, const handle_t& end_handle) { + function handle_to_string = [&](handle_t handle) { + string ss = graph->get_is_reverse(handle) ? "<" : ">"; + return ss + std::to_string(graph->get_id(handle)); + }; + return handle_to_string(start_handle) + handle_to_string(end_handle); +} + PathBasedTraversalFinder::PathBasedTraversalFinder(const PathHandleGraph& g, SnarlManager& sm) : graph(g), snarlmanager(sm){ } diff --git a/src/traversal_finder.hpp b/src/traversal_finder.hpp index 76d77ad48bf..349d6c27f00 100644 --- a/src/traversal_finder.hpp +++ b/src/traversal_finder.hpp @@ -36,8 +36,12 @@ using namespace std; class AugmentedGraph; +// some Protobuf replacements using Traversal = vector; using PathInterval = pair; +string traversal_to_string(const PathHandleGraph* graph, const Traversal& traversal, bool max_steps = 10); +// replaces pb2json(snarl) +string graph_interval_to_string(const HandleGraph* graph, const handle_t& start_handle, const handle_t& end_handle); /** * Represents a strategy for finding traversals of (nested) sites. 
Polymorphic From 02604b55ba07edc549b0ee453422e772f3186641 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 17 May 2024 11:29:39 -0400 Subject: [PATCH 16/67] Fix zstd package name and walk through Ubuntu bootstrap --- README.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2f58c12094e..b72f9ca1ead 100644 --- a/README.md +++ b/README.md @@ -71,15 +71,24 @@ Then, install VG's dependencies. You'll need the protobuf and jansson developmen On Ubuntu, you should be able to do: make get-deps + +If you get complaints that `sudo` is not found, install it: + + apt update + apt install sudo + +If you get a bunch of errors like `E: Unable to locate package build-essential`, make sure your package index files are up to date by running: + + sudo apt update -On other distros, you will need to perform the equivalent of: +On other distros, or if you do not have root access, you will need to perform the equivalent of: sudo apt-get install build-essential git cmake pkg-config libncurses-dev libbz2-dev \ protobuf-compiler libprotoc-dev libprotobuf-dev libjansson-dev \ automake gettext autopoint libtool jq bsdmainutils bc rs parallel \ npm curl unzip redland-utils librdf-dev bison flex gawk lzma-dev \ liblzma-dev liblz4-dev libffi-dev libcairo-dev libboost-all-dev \ - libzstd-devel pybind11-dev python3-pybind11 + libzstd-dev pybind11-dev python3-pybind11 Note that **Ubuntu 16.04** does not ship a sufficiently new Protobuf; vg requires **Protobuf 3** which will have to be manually installed. 
From 3c191704059c59cebc79af81d6b11b9dd567a672 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 17 May 2024 11:30:39 -0400 Subject: [PATCH 17/67] Fix test dependency list --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b72f9ca1ead..f4714c8e6e6 100644 --- a/README.md +++ b/README.md @@ -65,9 +65,10 @@ First, obtain the repo and its submodules: #### Linux: Install Dependencies Then, install VG's dependencies. You'll need the protobuf and jansson development libraries installed, and to run the tests you will need: - * `jq`, `bc`, `rs`, and `parallel` - * `hexdump` and `column` from `bsdmainutils` - * [`npm` for testing documentation examples](https://github.com/anko/txm)). +* `jq`, `bc`, `rs`, and `parallel` +* `hexdump` and `column` from `bsdmainutils` +* [`npm` for testing documentation examples](https://github.com/anko/txm)). + On Ubuntu, you should be able to do: make get-deps From 7b6ae3ad7469609dbb44222b91878e53b47f798b Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 17 May 2024 12:23:50 -0400 Subject: [PATCH 18/67] put vcf header into its own function --- src/deconstructor.cpp | 64 +++++++++++++++++++++++-------------------- src/deconstructor.hpp | 6 ++++ 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 8dfa79148a1..102e7bf571e 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -846,35 +846,7 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t return true; } -/** - * Convenience wrapper function for deconstruction of multiple paths. 
- */ -void Deconstructor::deconstruct(vector ref_paths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager, - bool include_nested, - int context_jaccard_window, - bool untangle_traversals, - bool keep_conflicted, - bool strict_conflicts, - bool long_ref_contig, - gbwt::GBWT* gbwt) { - - this->graph = graph; - this->snarl_manager = snarl_manager; - this->ref_paths = set(ref_paths.begin(), ref_paths.end()); - this->include_nested = include_nested; - this->path_jaccard_window = context_jaccard_window; - this->untangle_allele_traversals = untangle_traversals; - this->keep_conflicted_genotypes = keep_conflicted; - this->strict_conflict_checking = strict_conflicts; - if (gbwt) { - this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt); - } - - // the need to use nesting is due to a problem with omp tasks and shared state - // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 1) - omp_set_nested(1); - omp_set_max_active_levels(3); - +string Deconstructor::get_vcf_header() { // Keep track of the non-reference paths in the graph. They'll be our sample names ref_samples.clear(); set ref_haplotypes; @@ -1052,8 +1024,40 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand stream << "\t" << sample_name; } stream << endl; + return stream.str(); +} + +/** + * Convenience wrapper function for deconstruction of multiple paths. 
+ */ +void Deconstructor::deconstruct(vector ref_paths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager, + bool include_nested, + int context_jaccard_window, + bool untangle_traversals, + bool keep_conflicted, + bool strict_conflicts, + bool long_ref_contig, + gbwt::GBWT* gbwt) { + + this->graph = graph; + this->snarl_manager = snarl_manager; + this->ref_paths = set(ref_paths.begin(), ref_paths.end()); + this->include_nested = include_nested; + this->path_jaccard_window = context_jaccard_window; + this->untangle_allele_traversals = untangle_traversals; + this->keep_conflicted_genotypes = keep_conflicted; + this->strict_conflict_checking = strict_conflicts; + if (gbwt) { + this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt); + } + this->gbwt = gbwt; + + // the need to use nesting is due to a problem with omp tasks and shared state + // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 1) + omp_set_nested(1); + omp_set_max_active_levels(3); - string hstr = stream.str(); + string hstr = this->get_vcf_header(); assert(output_vcf.openForOutput(hstr)); cout << output_vcf.header << endl; diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index d05c916e5cc..0a62002895d 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -49,6 +49,9 @@ class Deconstructor : public VCFOutputCaller { private: + // initialize the vcf and get the header + string get_vcf_header(); + // write a vcf record for the given site. returns true if a record was written // (need to have a path going through the site) bool deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end) const; @@ -101,6 +104,9 @@ class Deconstructor : public VCFOutputCaller { // the snarl manager SnarlManager* snarl_manager; + // the gbwt + gbwt::GBWT* gbwt; + // the traversal finders. 
we always use a path traversal finder to get the reference path unique_ptr path_trav_finder; // we can also use a gbwt for traversals From f0c7874e8b6596853979a9dd917a7246718f3028 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 17 May 2024 12:32:51 -0400 Subject: [PATCH 19/67] further isolate snarl code --- src/deconstructor.cpp | 81 +++++++++++++++++++++++-------------------- src/deconstructor.hpp | 5 ++- 2 files changed, 45 insertions(+), 41 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 102e7bf571e..4d7c1a42695 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1027,6 +1027,48 @@ string Deconstructor::get_vcf_header() { return stream.str(); } +void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { + + vector snarls_todo; + // Do the top-level snarls in parallel + snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { + vector todo(1, snarl); + vector next; + while (!todo.empty()) { + for (auto next_snarl : todo) { + // if we can't make a variant from the snarl due to not finding + // paths through it, we try again on the children + // note: we may want to push the parallelism down a bit +#pragma omp critical (snarls_todo) + snarls_todo.push_back(next_snarl); + if (include_nested) { + // n.b. 
we no longer attempt to deconstruct the site to determine if we nest + const vector& children = snarl_manager->children_of(next_snarl); + next.insert(next.end(), children.begin(), children.end()); + } + } + swap(todo, next); + next.clear(); + } + }); + +//#pragma omp parallel +//#pragma omp single + { +#pragma omp parallel for schedule(dynamic,1) + for (size_t i = 0; i < snarls_todo.size(); i++) { +//#pragma omp task firstprivate(i) + { + auto& snarl = snarls_todo[i]; + deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), + graph->get_handle(snarl->end().node_id(), snarl->end().backward())); + } + } + } +//#pragma omp taskwait + +} + /** * Convenience wrapper function for deconstruction of multiple paths. */ @@ -1040,7 +1082,6 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand gbwt::GBWT* gbwt) { this->graph = graph; - this->snarl_manager = snarl_manager; this->ref_paths = set(ref_paths.begin(), ref_paths.end()); this->include_nested = include_nested; this->path_jaccard_window = context_jaccard_window; @@ -1069,43 +1110,7 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand gbwt_trav_finder = unique_ptr(new GBWTTraversalFinder(*graph, *gbwt)); } - vector snarls_todo; - // Do the top-level snarls in parallel - snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { - vector todo(1, snarl); - vector next; - while (!todo.empty()) { - for (auto next_snarl : todo) { - // if we can't make a variant from the snarl due to not finding - // paths through it, we try again on the children - // note: we may want to push the parallelism down a bit -#pragma omp critical (snarls_todo) - snarls_todo.push_back(next_snarl); - if (include_nested) { - // n.b. 
we no longer attempt to deconstruct the site to determine if we nest - const vector& children = snarl_manager->children_of(next_snarl); - next.insert(next.end(), children.begin(), children.end()); - } - } - swap(todo, next); - next.clear(); - } - }); - -//#pragma omp parallel -//#pragma omp single - { -#pragma omp parallel for schedule(dynamic,1) - for (size_t i = 0; i < snarls_todo.size(); i++) { -//#pragma omp task firstprivate(i) - { - auto& snarl = snarls_todo[i]; - deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), - graph->get_handle(snarl->end().node_id(), snarl->end().backward())); - } - } - } -//#pragma omp taskwait + deconstruct_graph(snarl_manager); // write variants in sorted order write_variants(cout, snarl_manager); diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index 0a62002895d..cac80f7c210 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -51,6 +51,8 @@ class Deconstructor : public VCFOutputCaller { // initialize the vcf and get the header string get_vcf_header(); + + void deconstruct_graph(SnarlManager* snarl_manager); // write a vcf record for the given site. 
returns true if a record was written // (need to have a path going through the site) @@ -101,9 +103,6 @@ class Deconstructor : public VCFOutputCaller { // the graph const PathPositionHandleGraph* graph; - // the snarl manager - SnarlManager* snarl_manager; - // the gbwt gbwt::GBWT* gbwt; From c32f84b710a5a58b85e75eb45b76785d0f492afc Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 17 May 2024 13:18:53 -0400 Subject: [PATCH 20/67] recurse on failed snarls (when not nesting) --- src/deconstructor.cpp | 98 ++++++++++++++++++++++++++++--------------- src/deconstructor.hpp | 8 +++- 2 files changed, 71 insertions(+), 35 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 4d7c1a42695..263191d3859 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1029,44 +1029,70 @@ string Deconstructor::get_vcf_header() { void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { - vector snarls_todo; - // Do the top-level snarls in parallel + vector snarls; + vector queue; + + // read all our snarls into a list snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { - vector todo(1, snarl); - vector next; - while (!todo.empty()) { - for (auto next_snarl : todo) { - // if we can't make a variant from the snarl due to not finding - // paths through it, we try again on the children - // note: we may want to push the parallelism down a bit -#pragma omp critical (snarls_todo) - snarls_todo.push_back(next_snarl); - if (include_nested) { - // n.b. 
we no longer attempt to deconstruct the site to determine if we nest - const vector& children = snarl_manager->children_of(next_snarl); - next.insert(next.end(), children.begin(), children.end()); - } - } - swap(todo, next); - next.clear(); - } - }); + queue.push_back(snarl); + }); + if (include_nested) { + while (!queue.empty()) { + const Snarl* snarl = queue.back(); + queue.pop_back(); + snarls.push_back(snarl); + const vector& children = snarl_manager->children_of(snarl); + snarls.insert(snarls.end(), children.begin(), children.end()); + } + } else { + swap(snarls, queue); + } -//#pragma omp parallel -//#pragma omp single - { + // process the whole shebang in parallel #pragma omp parallel for schedule(dynamic,1) - for (size_t i = 0; i < snarls_todo.size(); i++) { -//#pragma omp task firstprivate(i) - { - auto& snarl = snarls_todo[i]; - deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), - graph->get_handle(snarl->end().node_id(), snarl->end().backward())); - } - } + for (size_t i = 0; i < snarls.size(); i++) { + deconstruct_site(graph->get_handle(snarls[i]->start().node_id(), snarls[i]->start().backward()), + graph->get_handle(snarls[i]->end().node_id(), snarls[i]->end().backward())); } -//#pragma omp taskwait +} + +void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) { + // logic copied from vg call (graph_caller.cpp) + // Used to recurse on children of parents that can't be called + size_t thread_count = get_thread_count(); + vector> snarl_queue(thread_count); + + // Run the deconstructor on a snarl, and queue up the children if it fails + auto process_snarl = [&](const Snarl* snarl) { + if (!snarl_manager->is_trivial(snarl, *graph)) { + bool was_deconstructed = deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), + graph->get_handle(snarl->end().node_id(), snarl->end().backward())); + if (include_nested || !was_deconstructed) { + const vector& children = 
snarl_manager->children_of(snarl); + vector& thread_queue = snarl_queue[omp_get_thread_num()]; + thread_queue.insert(thread_queue.end(), children.begin(), children.end()); + } + } + }; + + // Start with the top level snarls + snarl_manager->for_each_top_level_snarl_parallel(process_snarl); + + // Then recurse on any children the snarl caller failed to handle + while (!std::all_of(snarl_queue.begin(), snarl_queue.end(), + [](const vector& snarl_vec) {return snarl_vec.empty();})) { + vector cur_queue; + for (vector& thread_queue : snarl_queue) { + cur_queue.reserve(cur_queue.size() + thread_queue.size()); + std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); + thread_queue.clear(); + } +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < cur_queue.size(); ++i) { + process_snarl(cur_queue[i]); + } + } } /** @@ -1110,7 +1136,11 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand gbwt_trav_finder = unique_ptr(new GBWTTraversalFinder(*graph, *gbwt)); } - deconstruct_graph(snarl_manager); + if (include_nested) { + deconstruct_graph(snarl_manager); + } else { + deconstruct_graph_top_down(snarl_manager); + } // write variants in sorted order write_variants(cout, snarl_manager); diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index cac80f7c210..3ae015b45a1 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -52,8 +52,14 @@ class Deconstructor : public VCFOutputCaller { // initialize the vcf and get the header string get_vcf_header(); + // deconstruct all snarls in parallel (ie nesting relationship ignored) void deconstruct_graph(SnarlManager* snarl_manager); - + + // deconstruct all top-level snarls in parallel + // nested snarls are processed after their parents in the same thread + // (same logic as vg call) + void deconstruct_graph_top_down(SnarlManager* snarl_manager); + // write a vcf record for the given site. 
returns true if a record was written // (need to have a path going through the site) bool deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end) const; From 446be84a06f59b8b5746261ab65fb4e8b13bf645 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 20 May 2024 16:56:16 -0400 Subject: [PATCH 21/67] remove exhaustive deconstruct from README -- thanks, tests! --- README.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/README.md b/README.md index 55556f6f44d..1769d290e59 100644 --- a/README.md +++ b/README.md @@ -396,14 +396,6 @@ vg index hla.vg -x hla.xg vg deconstruct hla.xg -e -p "gi|568815592:29791752-29792749" > hla_variants.vcf ``` -Variants can also be inferred strictly from topology by not using `-e`, though unlike the above example, cycles are not supported. "Deconstruct" the VCF variants that were used to construct the graph. The output will be similar but identical to `small/x.vcf.gz` as `vg construct` can add edges between adjacent alts and/or do some normalization: - - -```sh -# using the same graph from the `map` example -vg deconstruct x.xg -p x > x.vcf -``` - Haplotype paths from `.gbz` or `.gbwt` indexes input can be considered using `-z` and `-g', respectively. As with `vg call`, it is best to compute snarls separately and pass them in with `-r` when working with large graphs. From 948e2c13e87182b95bfe3286d58052522e77faca Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 21 May 2024 15:37:40 -0400 Subject: [PATCH 22/67] Pin back Requests to make sure Python Docker API works --- vgci/vgci.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vgci/vgci.sh b/vgci/vgci.sh index 192d6d19839..54b52f09911 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -340,12 +340,14 @@ then # Dependencies for running tests. Need numpy, scipy and sklearn for # running toil-vg mapeval, and dateutils and reqests for - # ./mins_since_last_build.py. 
Make sure to get the giant buildable modules - # as binaries only to avoid wasting CI time building some slightly nicer - # version Pip prefers. - pip3 install pytest timeout_decorator requests dateutils + # ./mins_since_last_build.py. Need exactly the right version of requests + # for the Python Docker API to work (see + # ). + pip3 install pytest timeout_decorator 'requests==2.31.0' dateutils # TODO: To upgrade to Numpy 1.24+, we need to remove usages of `numpy.int` # AKA `np.int` from toil-vg. See + # Make sure to get the giant buildable modules as binaries only to avoid + # wasting CI time building some slightly nicer version Pip prefers. pip3 install 'numpy==1.23.5' scipy scikit-learn --only-binary :all: # Install Toil From 3e766a7896258c9e2b8cacc461ecb0b0881adf7d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 22 May 2024 11:07:36 -0400 Subject: [PATCH 23/67] start trav clustering; move jaccard code alongside --- src/deconstructor.cpp | 37 +++---------------------- src/deconstructor.hpp | 24 ----------------- src/traversal_clusters.cpp | 45 +++++++++++++++++++++++++++++++ src/traversal_clusters.hpp | 55 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 58 deletions(-) create mode 100644 src/traversal_clusters.cpp create mode 100644 src/traversal_clusters.hpp diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 263191d3859..5dea7b7be50 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1,6 +1,7 @@ #include "deconstructor.hpp" #include "traversal_finder.hpp" #include +#include "traversal_clusters.hpp" //#define debug @@ -248,7 +249,7 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, for (auto& c : ref_contexts) { auto& ref_context = c.second; auto& ref_pos = c.first; - double j = context_jaccard(ref_context, path_context); + double j = jaccard_coefficient(ref_context, path_context); if (j > best_jaccard) { best_jaccard = j; best_pos = ref_pos; @@ -487,38 +488,6 @@ pair, bool> 
Deconstructor::choose_traversals(const string& sample_na return make_pair(most_frequent_travs, conflict); } - -// todo refactor if we need to reuse elsewhere in vg -// implemented inline for development -// assumes sorted input -double Deconstructor::context_jaccard( - const vector& target, - const vector& query) const { - size_t node_isec = 0; - std::set_intersection(target.begin(), target.end(), - query.begin(), query.end(), - count_back_inserter(node_isec)); - size_t node_union = 0; - std::set_union(target.begin(), target.end(), - query.begin(), query.end(), - count_back_inserter(node_union)); - return (double)node_isec / (double)node_union; -} - -double Deconstructor::context_jaccard( - const dac_vector<>& target, - const vector& query) const { - size_t node_isec = 0; - std::set_intersection(target.begin(), target.end(), - query.begin(), query.end(), - count_back_inserter(node_isec)); - size_t node_union = 0; - std::set_union(target.begin(), target.end(), - query.begin(), query.end(), - count_back_inserter(node_union)); - return (double)node_isec / (double)node_union; -} - vector Deconstructor::get_context( step_handle_t start_step, step_handle_t end_step) const { @@ -738,7 +707,7 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t vector> ref_mappings; for (uint64_t j = 0; j < ref_travs.size(); ++j) { ref_mappings.push_back(make_pair( - context_jaccard( + jaccard_coefficient( ref_contexts[j], context), ref_travs[j])); diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index 3ae015b45a1..19f8e0300ff 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -96,15 +96,6 @@ class Deconstructor : public VCFOutputCaller { vector get_context( step_handle_t start_step, step_handle_t end_step) const; - - // compares node contexts - double context_jaccard(const vector& target, - const vector& query) const; - - // specialization for enc_vectors - double context_jaccard( - const dac_vector<>& target, - const vector& query) 
const; // the graph const PathPositionHandleGraph* graph; @@ -156,21 +147,6 @@ class Deconstructor : public VCFOutputCaller { bool keep_conflicted_genotypes = false; }; -// helpel for measuring set intersectiond and union size -template -class count_back_inserter { - size_t &count; -public: - typedef void value_type; - typedef void difference_type; - typedef void pointer; - typedef void reference; - typedef std::output_iterator_tag iterator_category; - count_back_inserter(size_t &count) : count(count) {}; - void operator=(const T &){ ++count; } - count_back_inserter &operator *(){ return *this; } - count_back_inserter &operator++(){ return *this; } -}; } #endif diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp new file mode 100644 index 00000000000..c5a01e3b00e --- /dev/null +++ b/src/traversal_clusters.cpp @@ -0,0 +1,45 @@ +#include "traversal_clusters.hpp" + +namespace vg { + +vector> cluster_traversals(const PathHandleGraph* graph, + const vector& traversals, + const vector& traversal_order, + double min_jaccard) { + + assert(traversal_order.size() == traversals.size()); + + // the values are indexes in the input traversals vector + // the "reference" traversal of each cluster (to which distance is computed) + // is always its first element + vector> clusters; + + for (const int64_t& i : traversal_order) { + const Traversal& trav = traversals[i]; + double min_jaccard = numeric_limits::max(); + int64_t min_cluster_idx = -1; + for (int64_t j = 0; j < clusters.size(); ++j) { + const Traversal& cluster_trav = traversals[clusters[j][0]]; + double jac = jaccard_coefficient(trav, cluster_trav); + if (jac < min_jaccard) { + min_jaccard = jac; + min_cluster_idx = j; + if (jac == 0) { + break; + } + } + } + if (min_cluster_idx >= 0) { + // we've found a suitably similar cluster, add it do that + clusters[min_cluster_idx].push_back(i); + } else { + // there's no cluster close enough, need to start a new one + clusters.push_back({i}); + } + } + + return 
clusters; +} + + +} diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp new file mode 100644 index 00000000000..13b869a41f2 --- /dev/null +++ b/src/traversal_clusters.hpp @@ -0,0 +1,55 @@ +#include "handle.hpp" +#include "traversal_finder.hpp" + +#pragma once + + +/** \file +* Utilities for finding and clustering similar snarl traversals +*/ +namespace vg { +using namespace std; + +// +template +class count_back_inserter { + size_t &count; +public: + typedef void value_type; + typedef void difference_type; + typedef void pointer; + typedef void reference; + typedef std::output_iterator_tag iterator_category; + count_back_inserter(size_t &count) : count(count) {}; + void operator=(const T &){ ++count; } + count_back_inserter &operator *(){ return *this; } + count_back_inserter &operator++(){ return *this; } +}; + +// compute the jaccard coefficient (|intersection| / |union|) of two sets +// target and query should be sorted vectors (or something equivalent) +template +inline double jaccard_coefficient(const T& target, const U& query) { + size_t isec_size = 0; + std::set_intersection(target.begin(), target.end(), + query.begin(), query.end(), + count_back_inserter(isec_size)); + size_t union_size = 0; + std::set_union(target.begin(), target.end(), + query.begin(), query.end(), + count_back_inserter(union_size)); + return (double)isec_size / (double)union_size; + +} + +/// cluster the traversals. 
The algorithm is: +/// - visit traversals in provided order +/// - if the traversal is <= min_jaccard away from the reference traversal of cluster, add to cluster +/// - else start a new cluster, with the given traversal as a reference +vector> cluster_traversals(const PathHandleGraph* graph, + const vector& traversals, + function traversal_order, + double min_jaccard); + + +} From cb1c4d154ec0cf0042d319a760b36dc881957987 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 23 May 2024 13:58:42 -0400 Subject: [PATCH 24/67] start traversal-clustering logic --- src/deconstructor.cpp | 20 +++++--- src/deconstructor.hpp | 8 +++- src/subcommand/deconstruct_main.cpp | 13 ++++-- src/traversal_clusters.cpp | 72 +++++++++++++++++++++++------ src/traversal_clusters.hpp | 26 ++++++++++- src/traversal_finder.cpp | 2 +- src/traversal_finder.hpp | 2 +- 7 files changed, 116 insertions(+), 27 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 5dea7b7be50..7228669f51e 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -26,7 +26,7 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, const vector& travs, const vector>& trav_steps, int ref_path_idx, - const vector& use_trav, + const vector>& trav_clusters, char prev_char, bool use_start) const { assert(ref_path_idx >=0 && ref_path_idx < travs.size()); @@ -57,10 +57,11 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, bool substitution = true; // set the other alleles (they can end up as 0 alleles too if their strings match the reference) - for (int i = 0; i < travs.size(); ++i) { - if (i != ref_path_idx) { - if (use_trav[i]) { - string allele = trav_to_string(travs[i]); + // note that we have one (unique) allele per cluster, so we take advantage of that here + for (const vector& cluster : trav_clusters) { + string allele = trav_to_string(travs[cluster.front()]); + for (const int& i : cluster) { + if (i != ref_path_idx) { auto ai_it = allele_idx.find(allele); if (ai_it == 
allele_idx.end()) { // make a new allele for this string @@ -797,9 +798,15 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t } } + // Sort the traversals for clustering + vector sorted_travs = get_traversal_order(graph, travs, trav_path_names, ref_travs, use_trav); + + // jaccard clustering (using handles for now) on traversals + vector> trav_clusters = cluster_traversals(graph, travs, sorted_travs, cluster_threshold); + vector trav_to_allele = get_alleles(v, travs, trav_steps, ref_trav_idx, - use_trav, + trav_clusters, prev_char, use_start); // Fill in the genotypes @@ -1074,6 +1081,7 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand bool keep_conflicted, bool strict_conflicts, bool long_ref_contig, + double cluster_threshold, gbwt::GBWT* gbwt) { this->graph = graph; diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index 19f8e0300ff..af9668a2c41 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -45,6 +45,7 @@ class Deconstructor : public VCFOutputCaller { bool keep_conflicted, bool strict_conflicts, bool long_ref_contig, + double cluster_threshold = 1.0, gbwt::GBWT* gbwt = nullptr); private: @@ -78,7 +79,7 @@ class Deconstructor : public VCFOutputCaller { const vector& travs, const vector>& trav_steps, int ref_path_idx, - const vector& use_trav, + const vector>& trav_clusters, char prev_char, bool use_start) const; // write traversal path names as genotypes @@ -145,6 +146,11 @@ class Deconstructor : public VCFOutputCaller { // should we keep conflicted genotypes or not bool keep_conflicted_genotypes = false; + + // used to merge together similar traversals (to keep allele counts down) + // currently implemented as handle jaccard coefficient. 
So 1 means only + // merge if identical (which is what deconstruct has always done) + double cluster_threshold = 1.0; }; diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index 497922e5b70..2ccc8192786 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -51,6 +51,7 @@ void help_deconstruct(char** argv){ << " -K, --keep-conflicted Retain conflicted genotypes in output." << endl << " -S, --strict-conflicts Drop genotypes when we have more than one haplotype for any given phase (set by default when using GBWT input)." << endl << " -C, --contig-only-ref Only use the CONTIG name (and not SAMPLE#CONTIG#HAPLOTYPE etc) for the reference if possible (ie there is only one reference sample)." << endl + << " -L, --cluster F Cluster traversals whose (handle) Jaccard coefficient is >= F together (default: 1.0)" << endl << " -t, --threads N Use N threads" << endl << " -v, --verbose Print some status messages" << endl << endl; @@ -77,6 +78,7 @@ int main_deconstruct(int argc, char** argv){ int context_jaccard_window = 10000; bool untangle_traversals = false; bool contig_only_ref = false; + double cluster_threshold = 1.0; int c; optind = 2; // force optind past command positional argument @@ -97,14 +99,15 @@ int main_deconstruct(int argc, char** argv){ {"all-snarls", no_argument, 0, 'a'}, {"keep-conflicted", no_argument, 0, 'K'}, {"strict-conflicts", no_argument, 0, 'S'}, - {"contig-only-ref", no_argument, 0, 'C'}, + {"contig-only-ref", no_argument, 0, 'C'}, + {"cluster", required_argument, 0, 'L'}, {"threads", required_argument, 0, 't'}, {"verbose", no_argument, 0, 'v'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uat:v", + c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uaL:t:v", long_options, &option_index); // Detect the end of the options. 
@@ -158,7 +161,10 @@ int main_deconstruct(int argc, char** argv){ break; case 'C': contig_only_ref = true; - break; + break; + case 'L': + cluster_threshold = min(0.0, max(1.0, parse(optarg))); + break; case 't': omp_set_num_threads(parse(optarg)); break; @@ -344,6 +350,7 @@ int main_deconstruct(int argc, char** argv){ keep_conflicted, strict_conflicts, !contig_only_ref, + cluster_threshold, gbwt_index); return 0; } diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index c5a01e3b00e..0b7b26aed09 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -2,36 +2,82 @@ namespace vg { + +vector get_traversal_order(const PathHandleGraph* graph, + const vector& traversals, + const vector& trav_path_names, + const vector& ref_travs, + const vector& use_traversal) { + set ref_set(ref_travs.begin(), ref_travs.end()); + + function trav_less = [&](int i, int j) { + if (ref_set.count(i) && !ref_set.count(j)) { + return true; + } + if (!trav_path_names[i].empty() && (!trav_path_names[j].empty() || trav_path_names[i] < trav_path_names[j])) { + return true; + } + return false; + }; + + vector sorted_travs; + sorted_travs.reserve(traversals.size()); + for (int64_t i = 0; i < traversals.size(); ++i) { + if (use_traversal[i]) { + sorted_travs.push_back(i); + } + } + std::sort(sorted_travs.begin(), sorted_travs.end(), trav_less); + return sorted_travs; +} + vector> cluster_traversals(const PathHandleGraph* graph, const vector& traversals, - const vector& traversal_order, + const vector& traversal_order, double min_jaccard) { - assert(traversal_order.size() == traversals.size()); + assert(traversal_order.size() <= traversals.size()); // the values are indexes in the input traversals vector // the "reference" traversal of each cluster (to which distance is computed) // is always its first element vector> clusters; - for (const int64_t& i : traversal_order) { - const Traversal& trav = traversals[i]; - double min_jaccard = numeric_limits::max(); - 
int64_t min_cluster_idx = -1; + // need the clusters as sorted lists. we'll forget the endpoints while we're at + // it since they're always shared. note we work with multisets since we want to + // count differences between, say, cycle copy numbers. + vector> sorted_traversals; + for (const Traversal& trav : traversals) { + assert(trav.size() >=2 ); + // prune snarl nodes as they're always the same + // todo: does jaccard properly handle empty sets? + int64_t first = trav.size() == 2 ? 0 : 1; + int64_t last = trav.size() == 2 ? trav.size() : trav.size() - 1; + multiset sorted_trav; + for (int64_t i = first; i < last; ++i) { + sorted_trav.insert(trav[i]); + } + sorted_traversals.push_back(sorted_trav); + } + + for (const int& i : traversal_order) { + const auto& trav = sorted_traversals[i]; + double max_jaccard = 0; + int64_t max_cluster_idx = -1; for (int64_t j = 0; j < clusters.size(); ++j) { - const Traversal& cluster_trav = traversals[clusters[j][0]]; + const auto& cluster_trav = sorted_traversals[clusters[j][0]]; double jac = jaccard_coefficient(trav, cluster_trav); - if (jac < min_jaccard) { - min_jaccard = jac; - min_cluster_idx = j; - if (jac == 0) { + if (jac > max_jaccard) { + max_jaccard = jac; + max_cluster_idx = j; + if (jac == 1) { break; } } } - if (min_cluster_idx >= 0) { + if (max_cluster_idx >= 0 && max_jaccard >= min_jaccard) { // we've found a suitably similar cluster, add it do that - clusters[min_cluster_idx].push_back(i); + clusters[max_cluster_idx].push_back(i); } else { // there's no cluster close enough, need to start a new one clusters.push_back({i}); diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp index 13b869a41f2..b03bd779267 100644 --- a/src/traversal_clusters.hpp +++ b/src/traversal_clusters.hpp @@ -42,14 +42,36 @@ inline double jaccard_coefficient(const T& target, const U& query) { } +// the information needed from the parent traversal in order to +// genotype a child traversal +struct ParentGenotypeInfo { + 
Traversal ref_traversal; + pair ref_steps; + unordered_map sample_to_ploidy; // to check/enforce consistency +}; + + +/// sort the traversals, putting the reference first then using names +/// traversals masked out by use_traversal will be filrtered out entirely +/// (so the output vector may be smaller than the input...) +vector get_traversal_order(const PathHandleGraph* graph, + const vector& traversals, + const vector& trav_path_names, + const vector& ref_travs, + const vector& use_traversal); + + /// cluster the traversals. The algorithm is: /// - visit traversals in provided order /// - if the traversal is <= min_jaccard away from the reference traversal of cluster, add to cluster /// - else start a new cluster, with the given traversal as a reference +/// note that traversal_order can specify a subset of traversals vector> cluster_traversals(const PathHandleGraph* graph, const vector& traversals, - function traversal_order, + const vector& traversal_order, double min_jaccard); - +//int64_t find_parent_traversal(const PathHandleGraph* graph, +// const vector& traversals, + } diff --git a/src/traversal_finder.cpp b/src/traversal_finder.cpp index 4a2b7498f9b..2c1a1590c95 100644 --- a/src/traversal_finder.cpp +++ b/src/traversal_finder.cpp @@ -10,7 +10,7 @@ namespace vg { using namespace std; -string traversal_to_string(const PathHandleGraph* graph, const Traversal& traversal, bool max_steps) { +string traversal_to_string(const PathHandleGraph* graph, const Traversal& traversal, int64_t max_steps) { string s; function handle_to_string = [&](handle_t handle) { string ss = graph->get_is_reverse(handle) ? 
"<" : ">"; diff --git a/src/traversal_finder.hpp b/src/traversal_finder.hpp index 349d6c27f00..e39499cba03 100644 --- a/src/traversal_finder.hpp +++ b/src/traversal_finder.hpp @@ -39,7 +39,7 @@ class AugmentedGraph; // some Protobuf replacements using Traversal = vector; using PathInterval = pair; -string traversal_to_string(const PathHandleGraph* graph, const Traversal& traversal, bool max_steps = 10); +string traversal_to_string(const PathHandleGraph* graph, const Traversal& traversal, int64_t max_steps = 10); // replaces pb2json(snarl) string graph_interval_to_string(const HandleGraph* graph, const handle_t& start_handle, const handle_t& end_handle); From 9d011277ed88241a76817714c40ee17b63ab1253 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 24 May 2024 15:46:54 -0400 Subject: [PATCH 25/67] add some tags for alt cluster info --- src/deconstructor.cpp | 55 +++++++++++++++++++++---- src/deconstructor.hpp | 3 +- src/subcommand/deconstruct_main.cpp | 4 +- src/traversal_clusters.cpp | 62 ++++++++++++++++++++++++++--- src/traversal_clusters.hpp | 5 ++- 5 files changed, 112 insertions(+), 17 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 7228669f51e..a68486d9790 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -288,13 +288,18 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, } void Deconstructor::get_genotypes(vcflib::Variant& v, const vector& names, - const vector& trav_to_allele) const { + const vector& trav_to_allele, + const vector>& trav_to_cluster_info) const { assert(names.size() == trav_to_allele.size()); // set up our variant fields v.format.push_back("GT"); if (show_path_info && path_to_sample_phase) { v.format.push_back("PI"); } + if (this->cluster_threshold < 1.0) { + v.format.push_back("TS"); + v.format.push_back("TL"); + } // get a list of traversals for every vcf sample // (this will be 1:1 unless we're using the path_to_sample name map) @@ -344,6 +349,13 @@ void 
Deconstructor::get_genotypes(vcflib::Variant& v, const vector& name } genotype += (chosen_travs[i] != -1 && (!conflict || keep_conflicted_genotypes)) ? std::to_string(trav_to_allele[chosen_travs[i]]) : "."; + if (this->cluster_threshold < 1.0) { + ostringstream ss; + ss.precision(3); + ss << trav_to_cluster_info[chosen_travs[i]].first; + v.samples[sample_name]["TS"].push_back(ss.str()); + v.samples[sample_name]["TL"].push_back(std::to_string(trav_to_cluster_info[chosen_travs[i]].second)); + } } v.samples[sample_name]["GT"] = {genotype}; if (show_path_info && path_to_sample_phase) { @@ -609,15 +621,17 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // pick out the traversal corresponding to an embedded reference path, breaking ties consistently string ref_trav_name; - for (int i = 0; i < trav_steps.size(); ++i) { + for (int i = 0; i < travs.size(); ++i) { const string& path_trav_name = trav_path_names[i]; #ifdef debug #pragma omp critical (cerr) { - cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << travs[i].size() - << ", start=" << graph->get_position_of_step(trav_steps[i].first) - << ", end=" << graph->get_position_of_step(trav_steps[i].second) << endl - << " trav=" << traversal_to_string(travs[i]) << endl; + cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << travs[i].size(); + if (i < trav_steps.size()) { + cerr << ", start=" << graph->get_position_of_step(trav_steps[i].first) + << ", end=" << graph->get_position_of_step(trav_steps[i].second) << endl; + } + cerr << " trav=" << traversal_to_string(graph, travs[i]) << endl; } #endif if (ref_paths.count(path_trav_name) && @@ -802,7 +816,24 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t vector sorted_travs = get_traversal_order(graph, travs, trav_path_names, ref_travs, use_trav); // jaccard clustering (using handles for now) on traversals - vector> trav_clusters = cluster_traversals(graph, travs, 
sorted_travs, cluster_threshold); + vector> trav_cluster_info; + vector> trav_clusters = cluster_traversals(graph, travs, sorted_travs, cluster_threshold, + trav_cluster_info); + +#ifdef debug + cerr << "cluster priority"; + for (const auto& t: sorted_travs) { + cerr << " " << t; + } + cerr << endl; + for (const auto& tc : trav_clusters) { + cerr << "traversal cluster:"; + for (const auto& t: tc) { + cerr << t << "(" << trav_cluster_info[t].first << "," << trav_cluster_info[t].second << ") "; + } + cerr << endl; + } +#endif vector trav_to_allele = get_alleles(v, travs, trav_steps, ref_trav_idx, @@ -810,7 +841,7 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t prev_char, use_start); // Fill in the genotypes - get_genotypes(v, trav_path_names, trav_to_allele); + get_genotypes(v, trav_path_names, trav_to_allele, trav_cluster_info); // we only bother printing out sites with at least 1 non-reference allele if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) { @@ -915,6 +946,13 @@ string Deconstructor::get_vcf_header() { } stream << "\">" << endl; } + if (path_to_sample_phase && cluster_threshold < 1) { + stream << "##FORMAT" + << endl; + stream << "##FORMAT" + << endl; + + } stream << "##INFO=" << endl; stream << "##INFO=" << endl; stream << "##INFO=" << endl; @@ -1094,6 +1132,7 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand if (gbwt) { this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt); } + this->cluster_threshold = cluster_threshold; this->gbwt = gbwt; // the need to use nesting is due to a problem with omp tasks and shared state diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index af9668a2c41..bee12b540c6 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -83,7 +83,8 @@ class Deconstructor : public VCFOutputCaller { char prev_char, bool use_start) const; // write traversal path names as genotypes - 
void get_genotypes(vcflib::Variant& v, const vector& names, const vector& trav_to_allele) const; + void get_genotypes(vcflib::Variant& v, const vector& names, const vector& trav_to_allele, + const vector>& trav_to_cluster_info) const; // given a set of traversals associated with a particular sample, select a set of size for the VCF // the highest-frequency ALT traversal is chosen diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index 2ccc8192786..0eac567df23 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -51,7 +51,7 @@ void help_deconstruct(char** argv){ << " -K, --keep-conflicted Retain conflicted genotypes in output." << endl << " -S, --strict-conflicts Drop genotypes when we have more than one haplotype for any given phase (set by default when using GBWT input)." << endl << " -C, --contig-only-ref Only use the CONTIG name (and not SAMPLE#CONTIG#HAPLOTYPE etc) for the reference if possible (ie there is only one reference sample)." 
<< endl - << " -L, --cluster F Cluster traversals whose (handle) Jaccard coefficient is >= F together (default: 1.0)" << endl + << " -L, --cluster F Cluster traversals whose (handle) Jaccard coefficient is >= F together (default: 1.0) [experimental]" << endl << " -t, --threads N Use N threads" << endl << " -v, --verbose Print some status messages" << endl << endl; @@ -163,7 +163,7 @@ int main_deconstruct(int argc, char** argv){ contig_only_ref = true; break; case 'L': - cluster_threshold = min(0.0, max(1.0, parse(optarg))); + cluster_threshold = max(0.0, min(1.0, parse(optarg))); break; case 't': omp_set_num_threads(parse(optarg)); diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index 0b7b26aed09..fefe78277a3 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -1,5 +1,7 @@ #include "traversal_clusters.hpp" +//#define debug + namespace vg { @@ -11,10 +13,14 @@ vector get_traversal_order(const PathHandleGraph* graph, set ref_set(ref_travs.begin(), ref_travs.end()); function trav_less = [&](int i, int j) { - if (ref_set.count(i) && !ref_set.count(j)) { - return true; + // give reference set preference priority + bool iref = ref_set.count(i); + bool jref = ref_set.count(j); + if (iref != jref) { + return iref; } - if (!trav_path_names[i].empty() && (!trav_path_names[j].empty() || trav_path_names[i] < trav_path_names[j])) { + // fall back to name comparison + if (!trav_path_names[i].empty() && (trav_path_names[j].empty() || trav_path_names[i] < trav_path_names[j])) { return true; } return false; @@ -27,15 +33,30 @@ vector get_traversal_order(const PathHandleGraph* graph, sorted_travs.push_back(i); } } +#ifdef debug + cerr << "names:"; + for (auto x : trav_path_names) cerr << " " << x; + cerr << endl << "ref set:"; + for (auto x : ref_set) cerr << " " << x; + cerr << endl << "before sort:"; + for (auto x : sorted_travs) cerr << " " << x; +#endif std::sort(sorted_travs.begin(), sorted_travs.end(), trav_less); +#ifdef debug + 
cerr << endl << "after sort:"; + for (auto x : sorted_travs) cerr << " " << x; + cerr << endl; +#endif + assert(ref_travs.empty() || sorted_travs.empty() || ref_set.count(sorted_travs.front())); return sorted_travs; } vector> cluster_traversals(const PathHandleGraph* graph, const vector& traversals, const vector& traversal_order, - double min_jaccard) { - + double min_jaccard, + vector>& out_info) { + assert(traversal_order.size() <= traversals.size()); // the values are indexes in the input traversals vector @@ -43,6 +64,8 @@ vector> cluster_traversals(const PathHandleGraph* graph, // is always its first element vector> clusters; + out_info.resize(traversals.size(), make_pair(-1., 0)); + // need the clusters as sorted lists. we'll forget the endpoints while we're at // it since they're always shared. note we work with multisets since we want to // count differences between, say, cycle copy numbers. @@ -78,12 +101,41 @@ vector> cluster_traversals(const PathHandleGraph* graph, if (max_cluster_idx >= 0 && max_jaccard >= min_jaccard) { // we've found a suitably similar cluster, add it do that clusters[max_cluster_idx].push_back(i); + out_info[i] = make_pair(max_jaccard, 0); } else { // there's no cluster close enough, need to start a new one clusters.push_back({i}); + out_info[i] = make_pair(1.0, 0); } } + // fill in the size deltas + for (vector& cluster : clusters) { + // only non-zero for non-empty clusters + if (cluster.size() > 1) { + int64_t cluster_ref_length = -1; + for (int64_t i = 1; i < cluster.size(); ++i) { + if (out_info[cluster[i]].first < 1) { + // get the cluster reference length on-demand + if (cluster_ref_length == -1) { + cluster_ref_length = 0; + for (const handle_t& handle : traversals[cluster[0]]) { + cluster_ref_length += graph->get_length(handle); + } + } + // compute the length of the non-ref traversal + int64_t length = 0; + for (const handle_t& handle : traversals[cluster[i]]) { + length += graph->get_length(handle); + } + // store the 
delta + out_info[cluster[i]].second = length - cluster_ref_length; + } + + } + } + } + return clusters; } diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp index b03bd779267..15b35aa57f6 100644 --- a/src/traversal_clusters.hpp +++ b/src/traversal_clusters.hpp @@ -66,10 +66,13 @@ vector get_traversal_order(const PathHandleGraph* graph, /// - if the traversal is <= min_jaccard away from the reference traversal of cluster, add to cluster /// - else start a new cluster, with the given traversal as a reference /// note that traversal_order can specify a subset of traversals +/// out_info are Simlarity/length-delta pairs comparing the traversal to its cluster reference +/// (if the traversal wasn't in traversal_order, it'll get -1) vector> cluster_traversals(const PathHandleGraph* graph, const vector& traversals, const vector& traversal_order, - double min_jaccard); + double min_jaccard, + vector>& out_info); //int64_t find_parent_traversal(const PathHandleGraph* graph, // const vector& traversals, From f491fdb033b1f412b741eca98119c6bd80756709 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 27 May 2024 11:51:05 -0400 Subject: [PATCH 26/67] move to weighted jac; add small test --- src/deconstructor.cpp | 17 +++++++++++------ src/subcommand/deconstruct_main.cpp | 19 ++++++++++++++----- src/traversal_clusters.cpp | 27 ++++++++++++++++++++++++++- src/traversal_clusters.hpp | 4 +++- test/t/26_deconstruct.t | 17 ++++++++++++++++- 5 files changed, 70 insertions(+), 14 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index a68486d9790..9352e1e8364 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -350,11 +350,16 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector& name genotype += (chosen_travs[i] != -1 && (!conflict || keep_conflicted_genotypes)) ? 
std::to_string(trav_to_allele[chosen_travs[i]]) : "."; if (this->cluster_threshold < 1.0) { - ostringstream ss; - ss.precision(3); - ss << trav_to_cluster_info[chosen_travs[i]].first; - v.samples[sample_name]["TS"].push_back(ss.str()); - v.samples[sample_name]["TL"].push_back(std::to_string(trav_to_cluster_info[chosen_travs[i]].second)); + if (*genotype.rbegin() == '.') { + v.samples[sample_name]["TS"].push_back("."); + v.samples[sample_name]["TL"].push_back("."); + } else { + ostringstream ss; + ss.precision(3); + ss << trav_to_cluster_info[chosen_travs[i]].first; + v.samples[sample_name]["TS"].push_back(ss.str()); + v.samples[sample_name]["TL"].push_back(std::to_string(trav_to_cluster_info[chosen_travs[i]].second)); + } } } v.samples[sample_name]["GT"] = {genotype}; @@ -920,7 +925,7 @@ string Deconstructor::get_vcf_header() { if (sample_to_haps.empty()) { cerr << "Error [vg deconstruct]: No paths found for alt alleles in the graph. Note that " << "exhaustive path-free traversal finding is no longer supported, and vg deconstruct " - << "now only works on embedded paths and GBWT threads" << endl; + << "now only works on embedded paths and GBWT threads." 
<< endl; exit(1); } diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index 0eac567df23..e0cb36dac31 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -260,13 +260,22 @@ int main_deconstruct(int argc, char** argv){ } if (refpaths.empty() && refpath_prefixes.empty()) { + bool found_hap; // No paths specified: use them all graph->for_each_path_handle([&](path_handle_t path_handle) { - const string& name = graph->get_path_name(path_handle); - if (!Paths::is_alt(name) && PathMetadata::parse_sense(name) != PathSense::HAPLOTYPE) { - refpaths.push_back(name); - } - }); + const string& name = graph->get_path_name(path_handle); + if (!Paths::is_alt(name) && PathMetadata::parse_sense(name) != PathSense::HAPLOTYPE) { + refpaths.push_back(name); + } else { + found_hap = true; + } + }); + + if (!found_hap) { + cerr << "error [vg deconstruct]: All graph paths selected as references (leaving no alts). Please use -P/-p " + << "to narrow down the reference to a subset of paths, or GBZ/GBWT input that contains haplotype paths" << endl; + return 1; + } } // Read the translation diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index fefe78277a3..4ff358cf603 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -5,6 +5,31 @@ namespace vg { +// specialized version of jaccard_coefficient() that weights jaccard by node lenths. 
+double weighted_jaccard_coefficient(const PathHandleGraph* graph, + const multiset& target, + const multiset& query) { + vector intersection_handles; + std::set_intersection(target.begin(), target.end(), + query.begin(), query.end(), + std::back_inserter(intersection_handles)); + vector union_handles; + std::set_union(target.begin(), target.end(), + query.begin(), query.end(), + std::back_inserter(union_handles)); + + int64_t isec_size = 0; + for (const handle_t& handle : intersection_handles) { + isec_size += graph->get_length(handle); + } + int64_t union_size = 0; + for (const handle_t& handle : union_handles) { + union_size += graph->get_length(handle); + } + return (double)isec_size / (double)union_size; +} + + vector get_traversal_order(const PathHandleGraph* graph, const vector& traversals, const vector& trav_path_names, @@ -89,7 +114,7 @@ vector> cluster_traversals(const PathHandleGraph* graph, int64_t max_cluster_idx = -1; for (int64_t j = 0; j < clusters.size(); ++j) { const auto& cluster_trav = sorted_traversals[clusters[j][0]]; - double jac = jaccard_coefficient(trav, cluster_trav); + double jac = weighted_jaccard_coefficient(graph, trav, cluster_trav); if (jac > max_jaccard) { max_jaccard = jac; max_cluster_idx = j; diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp index 15b35aa57f6..31c857f553b 100644 --- a/src/traversal_clusters.hpp +++ b/src/traversal_clusters.hpp @@ -39,9 +39,11 @@ inline double jaccard_coefficient(const T& target, const U& query) { query.begin(), query.end(), count_back_inserter(union_size)); return (double)isec_size / (double)union_size; - } +// specialized version that weights jaccard by node lengths.
+double weighted_jaccard_coefficient(const PathHandleGraph* graph, const multiset& target, const multiset& query); + // the information needed from the parent traversal in order to // genotype a child traversal struct ParentGenotypeInfo { diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 1b2cafb7460..bde71f7483e 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 22 +plan tests 25 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg @@ -141,4 +141,19 @@ is "$?" 0 "gbz deconstruction gives same output as gbwt deconstruction" rm -f x.vg x.xg x.gbwt x.decon.vcf.gz x.decon.vcf.gz.tbi x.decon.vcf x.gbz.decon.vcf x.giraffe.gbz x.min x.dist small.s1.h1.fa small.s1.h2.fa decon.s1.h1.fa decon.s1.h2.fa +# todo you could argue merging shouldn't happen here because there's no child snarl +# this check should come into play with nesting support +vg construct -r small/x.fa -v small/x.vcf.gz | vg view - > small_cluster.gfa +printf "L\t1\t+\t9\t+\t0M\n" >> small_cluster.gfa +printf "P\ty\t1+,2+,4+,6+,7+,9+\t*\n" >> small_cluster.gfa +printf "P\tz\t1+,9+\t*\n" >> small_cluster.gfa +vg deconstruct small_cluster.gfa -p x > small_cluster_0.vcf +vg deconstruct small_cluster.gfa -p x -L 0.3 > small_cluster_3.vcf +is "$(tail -1 small_cluster_0.vcf | awk '{print $5}')" "GATTTGA,G" "cluster-free deconstruction finds all alt alleles" +is "$(tail -1 small_cluster_3.vcf | awk '{print $5}')" "G" "clustered deconstruction finds fewer alt alleles" +is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered deconstruction finds correct allele info" + +rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf + + From 793ab79bf1ec918ab80f36ea58dfb7dffaf31c5f Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 30 May 2024 14:19:00 -0400 Subject: [PATCH 27/67] start wiring in new 
nested deconstruction mode --- src/deconstructor.cpp | 178 ++++++++++++++++++++++++---- src/deconstructor.hpp | 31 ++++- src/subcommand/deconstruct_main.cpp | 15 ++- src/traversal_clusters.cpp | 104 +++++++++++++++- src/traversal_clusters.hpp | 20 +++- test/t/26_deconstruct.t | 11 +- 6 files changed, 320 insertions(+), 39 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 9352e1e8364..eb653eceb8f 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -43,11 +43,16 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, // compute the allele as a string auto trav_to_string = [&](const Traversal& trav) { string allele; - // we skip the snarl endpoints - for (int j = 1; j < trav.size() - 1; ++j) { - allele += graph->get_sequence(trav[j]); + // hack to support star alleles + if (trav.size() == 0) { + allele = "*"; + } else { + // we skip the snarl endpoints + for (int j = 1; j < trav.size() - 1; ++j) { + allele += toUppercase(graph->get_sequence(trav[j])); + } } - return toUppercase(allele); + return allele; }; // set the reference allele @@ -609,8 +614,65 @@ void Deconstructor::get_traversals(const handle_t& snarl_start, const handle_t& } } +unordered_map> Deconstructor::add_star_traversals(vector& travs, + vector& names, + vector>& trav_clusters, + const unordered_map>& parent_haplotypes) const { + + // todo: refactor this into general genotyping code + unordered_map> sample_to_haps; + + // find out what's in the traversals + assert(names.size() == travs.size()); + for (int64_t i = 0; i < names.size(); ++i) { + string sample_name = PathMetadata::parse_sample_name(names[i]); + // for backward compatibility + if (sample_name.empty()) { + sample_name = names[i]; + } + auto phase = PathMetadata::parse_haplotype(names[i]); + if (!sample_name.empty() && phase == PathMetadata::NO_HAPLOTYPE) { + // THis probably won't fit in an int. Use 0 instead. 
+ phase = 0; + } + sample_to_haps[sample_name].push_back(phase); + } + + // find everything that's in parent_haplotypes but not the traversals, + // and add in dummy star-alleles for them + for (const auto& parent_sample_haps : parent_haplotypes) { + for (int parent_hap : parent_sample_haps.second) { + bool found = false; + if (sample_to_haps.count(parent_sample_haps.first)) { + // note: this is brute-force search, but number of haplotypes usually tiny. + for (int hap : sample_to_haps[parent_sample_haps.first]) { + if (parent_hap == hap) { + found = true; + break; + } + } + } + if (!found) { + travs.push_back(Traversal()); + names.push_back(PathMetadata::create_path_name(PathSense::REFERENCE, + parent_sample_haps.first, + "star", + parent_hap, + PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE)); + sample_to_haps[parent_sample_haps.first].push_back(parent_hap); + trav_clusters.push_back({(int)travs.size() - 1}); + } + } + } + return sample_to_haps; +} + + +bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end, + const NestingInfo* in_nesting_info, + vector* out_nesting_infos) const { -bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end) const { vector travs; vector trav_path_names; @@ -626,6 +688,10 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // pick out the traversal corresponding to an embedded reference path, breaking ties consistently string ref_trav_name; + string parent_ref_trav_name; + if (in_nesting_info != nullptr) { + parent_ref_trav_name = graph->get_path_name(graph->get_path_handle_of_step(in_nesting_info->ref_path_interval.first)); + } for (int i = 0; i < travs.size(); ++i) { const string& path_trav_name = trav_path_names[i]; #ifdef debug @@ -639,17 +705,24 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t cerr << " trav=" << traversal_to_string(graph, travs[i]) << endl; } #endif - if
(ref_paths.count(path_trav_name) && + bool ref_path_check; + if (!parent_ref_trav_name.empty()) { + // the reference was specified by the parent + ref_path_check = path_trav_name == parent_ref_trav_name; + } else { + // the reference comes from the global options + ref_path_check = ref_paths.count(path_trav_name); + } + if (ref_path_check && (ref_trav_name.empty() || path_trav_name < ref_trav_name)) { ref_trav_name = path_trav_name; #ifdef debug #pragma omp critical (cerr) - cerr << "Setting ref_trav_name " << ref_trav_name << endl; + cerr << "Setting ref_trav_name " << ref_trav_name << (in_nesting_info ? " using nesting info" : "") << endl; #endif } } - // remember all the reference traversals (there can be more than one only in the case of a // cycle in the reference path @@ -822,8 +895,13 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // jaccard clustering (using handles for now) on traversals vector> trav_cluster_info; - vector> trav_clusters = cluster_traversals(graph, travs, sorted_travs, cluster_threshold, - trav_cluster_info); + vector child_snarl_to_trav; + vector> trav_clusters = cluster_traversals(graph, travs, sorted_travs, + (in_nesting_info ? in_nesting_info->child_snarls : + vector>()), + cluster_threshold, + trav_cluster_info, + child_snarl_to_trav); #ifdef debug cerr << "cluster priority"; @@ -840,6 +918,13 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t } #endif + unordered_map> sample_to_haps; + if (i == 0 && in_nesting_info != nullptr) { + // add in the star alleles -- these are alleles that were genotyped in the parent but not + // the current allele, and are treated as *'s in VCF. 
+ sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, in_nesting_info->sample_to_haplotypes); + } + vector trav_to_allele = get_alleles(v, travs, trav_steps, ref_trav_idx, trav_clusters, @@ -848,6 +933,26 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // Fill in the genotypes get_genotypes(v, trav_path_names, trav_to_allele, trav_cluster_info); + if (i == 0 && out_nesting_infos != nullptr) { + // we pass some information down to the children + // todo: do/can we consider all the diferent reference intervals? + // right now, the info passed is hopefully coarse-grained enough not to matter? + assert(in_nesting_info != nullptr && + in_nesting_info->child_snarls.size() == out_nesting_infos->size()); + + for (int64_t j = 0; j < out_nesting_infos->size(); ++j) { + out_nesting_infos->at(j).child_snarls.clear(); + out_nesting_infos->at(j).has_ref = false; + if (child_snarl_to_trav[j] >= 0) { + if (child_snarl_to_trav[j] < trav_steps.size()) { + out_nesting_infos->at(j).has_ref = true; + out_nesting_infos->at(j).ref_path_interval = trav_steps[child_snarl_to_trav[j]]; + out_nesting_infos->at(j).sample_to_haplotypes = sample_to_haps; + } + } + } + } + // we only bother printing out sites with at least 1 non-reference allele if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) { // run vcffixup to add some basic INFO like AC @@ -1080,36 +1185,56 @@ void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) { // Used to recurse on children of parents that can't be called size_t thread_count = get_thread_count(); - vector> snarl_queue(thread_count); + vector>> snarl_queue(thread_count); // Run the deconstructor on a snarl, and queue up the children if it fails - auto process_snarl = [&](const Snarl* snarl) { + auto process_snarl = [&](const Snarl* snarl, NestingInfo nesting_info) { if (!snarl_manager->is_trivial(snarl, *graph)) { + const vector& 
children = snarl_manager->children_of(snarl); + assert(nesting_info.child_snarls.empty()); + for (const Snarl* child : children) { + nesting_info.child_snarls.push_back(make_pair(graph->get_handle(child->start().node_id(), child->start().backward()), + graph->get_handle(child->end().node_id(), child->end().backward()))); + + } + vector out_nesting_infos(children.size()); bool was_deconstructed = deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), - graph->get_handle(snarl->end().node_id(), snarl->end().backward())); + graph->get_handle(snarl->end().node_id(), snarl->end().backward()), + include_nested ? &nesting_info : nullptr, + include_nested ? &out_nesting_infos : nullptr); if (include_nested || !was_deconstructed) { - const vector& children = snarl_manager->children_of(snarl); - vector& thread_queue = snarl_queue[omp_get_thread_num()]; - thread_queue.insert(thread_queue.end(), children.begin(), children.end()); + vector>& thread_queue = snarl_queue[omp_get_thread_num()]; + for (int64_t i = 0; i < children.size(); ++i) { + thread_queue.push_back(make_pair(children[i], out_nesting_infos[i])); + } + } } }; // Start with the top level snarls - snarl_manager->for_each_top_level_snarl_parallel(process_snarl); + // (note, can't do for_each_top_level_snarl_parallel() because interface wont take nesting info) + vector> top_level_snarls; + snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { + top_level_snarls.push_back(make_pair(snarl, NestingInfo())); + }); +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < top_level_snarls.size(); ++i) { + process_snarl(top_level_snarls[i].first, top_level_snarls[i].second); + } // Then recurse on any children the snarl caller failed to handle while (!std::all_of(snarl_queue.begin(), snarl_queue.end(), - [](const vector& snarl_vec) {return snarl_vec.empty();})) { - vector cur_queue; - for (vector& thread_queue : snarl_queue) { + [](const vector>& snarl_vec) {return 
snarl_vec.empty();})) { + vector> cur_queue; + for (vector>& thread_queue : snarl_queue) { cur_queue.reserve(cur_queue.size() + thread_queue.size()); std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); thread_queue.clear(); } #pragma omp parallel for schedule(dynamic, 1) for (int i = 0; i < cur_queue.size(); ++i) { - process_snarl(cur_queue[i]); + process_snarl(cur_queue[i].first, cur_queue[i].second); } } } @@ -1125,11 +1250,12 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand bool strict_conflicts, bool long_ref_contig, double cluster_threshold, - gbwt::GBWT* gbwt) { + gbwt::GBWT* gbwt, + bool nested_decomposition) { this->graph = graph; this->ref_paths = set(ref_paths.begin(), ref_paths.end()); - this->include_nested = include_nested; + this->include_nested = include_nested || nested_decomposition; this->path_jaccard_window = context_jaccard_window; this->untangle_allele_traversals = untangle_traversals; this->keep_conflicted_genotypes = keep_conflicted; @@ -1157,10 +1283,10 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand gbwt_trav_finder = unique_ptr(new GBWTTraversalFinder(*graph, *gbwt)); } - if (include_nested) { - deconstruct_graph(snarl_manager); - } else { + if (nested_decomposition) { deconstruct_graph_top_down(snarl_manager); + } else { + deconstruct_graph(snarl_manager); } // write variants in sorted order diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index bee12b540c6..4497ba1506b 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -46,7 +46,8 @@ class Deconstructor : public VCFOutputCaller { bool strict_conflicts, bool long_ref_contig, double cluster_threshold = 1.0, - gbwt::GBWT* gbwt = nullptr); + gbwt::GBWT* gbwt = nullptr, + bool nested_decomposition = false); private: @@ -61,9 +62,22 @@ class Deconstructor : public VCFOutputCaller { // (same logic as vg call) void deconstruct_graph_top_down(SnarlManager* snarl_manager); + // some 
information we pass from parent to child site when + // doing nested deconstruction + struct NestingInfo { + bool has_ref; + vector> child_snarls; + PathInterval ref_path_interval; + unordered_map> sample_to_haplotypes; + }; + // write a vcf record for the given site. returns true if a record was written // (need to have a path going through the site) - bool deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end) const; + // the nesting_info structs are optional and used to pass reference information through nested sites... + // the output nesting_info vector writes a record for each child snarl + bool deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end, + const NestingInfo* in_nesting_info = nullptr, + vector* out_nesting_infos = nullptr) const; // get the traversals for a given site // this returns a combination of embedded path traversals and gbwt traversals @@ -74,6 +88,19 @@ class Deconstructor : public VCFOutputCaller { vector& out_trav_path_names, vector>& out_trav_steps) const; + // this is a hack to add in * alleles -- these are haplotypes that we genotyped in the + // parent but aren't represented in any of the traversals found in the current + // site. *-alleles are represented as empty traversals. + // todo: conflicts arising from alt-cycles will be able to lead to conflicting + // results -- need to overhaul code to pass more detailed traversal information + // from parent to child to have a chance at consistently resolving + // star traversals are appended onto travs and trav_names + // this funtion returns a map containing both parent and child haploty + unordered_map> add_star_traversals(vector& travs, + vector& trav_names, + vector>& trav_clusters, + const unordered_map>& parent_haplotypes) const; + // convert traversals to strings. 
returns mapping of traversal (offset in travs) to allele vector get_alleles(vcflib::Variant& v, const vector& travs, diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index e0cb36dac31..2c3c3fb383f 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -52,6 +52,7 @@ void help_deconstruct(char** argv){ << " -S, --strict-conflicts Drop genotypes when we have more than one haplotype for any given phase (set by default when using GBWT input)." << endl << " -C, --contig-only-ref Only use the CONTIG name (and not SAMPLE#CONTIG#HAPLOTYPE etc) for the reference if possible (ie there is only one reference sample)." << endl << " -L, --cluster F Cluster traversals whose (handle) Jaccard coefficient is >= F together (default: 1.0) [experimental]" << endl + << " -n, --nested Write a nested VCF, including *-alleles and special tags. [experimental]" << endl << " -t, --threads N Use N threads" << endl << " -v, --verbose Print some status messages" << endl << endl; @@ -79,6 +80,7 @@ int main_deconstruct(int argc, char** argv){ bool untangle_traversals = false; bool contig_only_ref = false; double cluster_threshold = 1.0; + bool nested = false; int c; optind = 2; // force optind past command positional argument @@ -101,13 +103,14 @@ int main_deconstruct(int argc, char** argv){ {"strict-conflicts", no_argument, 0, 'S'}, {"contig-only-ref", no_argument, 0, 'C'}, {"cluster", required_argument, 0, 'L'}, + {"nested", no_argument, 0, 'n'}, {"threads", required_argument, 0, 't'}, {"verbose", no_argument, 0, 'v'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uaL:t:v", + c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uaL:nt:v", long_options, &option_index); // Detect the end of the options. 
@@ -165,6 +168,9 @@ int main_deconstruct(int argc, char** argv){ case 'L': cluster_threshold = max(0.0, min(1.0, parse(optarg))); break; + case 'n': + nested = true; + break; case 't': omp_set_num_threads(parse(optarg)); break; @@ -271,7 +277,7 @@ int main_deconstruct(int argc, char** argv){ } }); - if (!found_hap) { + if (!found_hap && gbwt_index == nullptr) { cerr << "error [vg deconstruct]: All graph paths selected as references (leaving no alts). Please use -P/-p " << "to narrow down the reference to a subset of paths, or GBZ/GBWT input that contains haplotype paths" << endl; return 1; @@ -351,7 +357,7 @@ int main_deconstruct(int argc, char** argv){ cerr << "Deconstructing top-level snarls" << endl; } dd.set_translation(translation.get()); - dd.set_nested(all_snarls); + dd.set_nested(all_snarls || nested); dd.deconstruct(refpaths, graph, snarl_manager.get(), all_snarls, context_jaccard_window, @@ -360,7 +366,8 @@ int main_deconstruct(int argc, char** argv){ strict_conflicts, !contig_only_ref, cluster_threshold, - gbwt_index); + gbwt_index, + nested); return 0; } diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index 4ff358cf603..7b3907c6cf0 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -79,8 +79,10 @@ vector get_traversal_order(const PathHandleGraph* graph, vector> cluster_traversals(const PathHandleGraph* graph, const vector& traversals, const vector& traversal_order, + const vector>& child_snarls, double min_jaccard, - vector>& out_info) { + vector>& out_info, + vector& out_child_snarl_to_trav) { assert(traversal_order.size() <= traversals.size()); @@ -90,6 +92,17 @@ vector> cluster_traversals(const PathHandleGraph* graph, vector> clusters; out_info.resize(traversals.size(), make_pair(-1., 0)); + out_child_snarl_to_trav.resize(child_snarls.size(), -1); + + // keep track of which traversals cover which child snarls + vector> trav_to_child_snarls = assign_child_snarls_to_traversals(graph, traversals, 
child_snarls); + // keep track of which child snarls are covered + unordered_set uncovered_child_snarls; + for (const vector& child_snarls : trav_to_child_snarls) { + for (int i : child_snarls) { + uncovered_child_snarls.insert(i); + } + } // need the clusters as sorted lists. we'll forget the endpoints while we're at // it since they're always shared. note we work with multisets since we want to @@ -131,9 +144,41 @@ vector> cluster_traversals(const PathHandleGraph* graph, // there's no cluster close enough, need to start a new one clusters.push_back({i}); out_info[i] = make_pair(1.0, 0); + // check off all the child snarls it covers + for (const int& child_snarl_idx : trav_to_child_snarls[i]) { + if (uncovered_child_snarls.count(child_snarl_idx)) { + uncovered_child_snarls.erase(child_snarl_idx); + out_child_snarl_to_trav[child_snarl_idx] = i; + } + } } } + // break up clusters until all child snarls are covered + // todo: can/should we factor child coverage into objective of original clustering?? + // todo todo: is it simpler to relax things so that a traversal can be used as a reference + // in a nested snarl, even if it doesn't have an exact allele (ie is a cluster reference) + // in the parent? 
-- for now we keep things simple -- all reference alleles are explicitly represented in vcf + vector> new_clusters; + for (int64_t i = clusters.size() - 1; i >= 0 && !uncovered_child_snarls.empty(); --i) { + for (int64_t j = clusters[i].size() - 1; j > 0 && !uncovered_child_snarls.empty(); --j) { + const vector& trav_childs = trav_to_child_snarls[clusters[i][j]]; + bool uncovered = false; + for (int k : trav_childs) { + if (uncovered_child_snarls.count(k)) { + uncovered = true; + uncovered_child_snarls.erase(k); + out_child_snarl_to_trav[k] = clusters[i][j]; + } + } + if (uncovered) { + new_clusters.push_back({clusters[i][j]}); + clusters[i].erase(clusters[i].begin() + j); + } + } + } + assert(uncovered_child_snarls.empty()); + // fill in the size deltas for (vector& cluster : clusters) { // only non-zero for non-empty clusters @@ -164,5 +209,62 @@ vector> cluster_traversals(const PathHandleGraph* graph, return clusters; } +vector> assign_child_snarls_to_traversals(const PathHandleGraph* graph, + const vector& traversals, + const vector>& child_snarls) { + + // index the child snarls + unordered_map handle_to_child; + for (int64_t i = 0; i < child_snarls.size(); ++i) { + handle_to_child[child_snarls[i].first] = i; + handle_to_child[child_snarls[i].second] = i; + } + + // use the index to find which snarls are fully contained in a given traversal + // this is a linear scan of the traversal, with above index checked (twice) for each handle + function(const Traversal&, bool)> get_contained_snarls = [&] (const Traversal& trav, bool fully_contained) { + map fw_count; + map rv_count; + for (const handle_t& handle : trav) { + if (handle_to_child.count(handle)) { + fw_count[handle_to_child[handle]] += 1; + } + handle_t rhandle = graph->flip(handle); + if (handle_to_child.count(rhandle)) { + rv_count[handle_to_child[rhandle]] += 1; + } + } + vector contained_snarls; + for (const auto& cs_count : fw_count) { + assert(cs_count.second == 1 || cs_count.second == 2); + if
(cs_count.second == 2 || (!fully_contained && cs_count.second == 1)) { + contained_snarls.push_back(cs_count.first); + } + } + for (const auto& cs_count : rv_count) { + assert(cs_count.second == 1 || cs_count.second == 2); + if (cs_count.second == 2 || (!fully_contained && cs_count.second == 1)) { + contained_snarls.push_back(cs_count.first); + } + } + return contained_snarls; + }; + + // fill in the output map + vector> trav_to_child_snarls(traversals.size()); + + if (!child_snarls.empty()) { + for (int64_t i = 0; i < traversals.size(); ++i) { + trav_to_child_snarls[i] = get_contained_snarls(traversals[i], true); + } + } + + return trav_to_child_snarls; +} + + + + + } diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp index 31c857f553b..ee850b944be 100644 --- a/src/traversal_clusters.hpp +++ b/src/traversal_clusters.hpp @@ -70,13 +70,23 @@ vector get_traversal_order(const PathHandleGraph* graph, /// note that traversal_order can specify a subset of traversals /// out_info are Simlarity/length-delta pairs comparing the traversal to its cluster reference /// (if the traversal wasn't in traversal_order, it'll get -1) +/// if child snarls are given, then we make sure that every child snarl is contained in a reference +/// traversal (this guarantees the snarl nesting structure is exactly represented in the vcf alleles!) 
+/// if child snarls are given, then we also fill in the mapping of each child snarl to its +/// "reference" traversals -- ie first traversal in a cluster that contains both its endpoints vector> cluster_traversals(const PathHandleGraph* graph, const vector& traversals, - const vector& traversal_order, + const vector& traversal_order, + const vector>& child_snarls, double min_jaccard, - vector>& out_info); + vector>& out_info, + vector& out_child_snarl_to_trav); + +/// assign a list of child snarls to traversals that fully contain them +/// the output is a list (for each traversal) of each child snarl that's contained in it +/// so output[i] = {x,y,z} means that child_snarls[x],[y],[z] are in traversals[i] +vector> assign_child_snarls_to_traversals(const PathHandleGraph* graph, + const vector& traversals, + const vector>& child_snarls); -//int64_t find_parent_traversal(const PathHandleGraph* graph, -// const vector& traversals, - } diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index bde71f7483e..1cb15d9d001 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 25 +plan tests 26 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg @@ -155,5 +155,14 @@ is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered d rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf +vg deconstruct nesting/nested_snp_in_del.gfa -p x -n | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv +printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv +printf "T\tA,*\t1|2\n" >> nested_snp_in_del_truth.tsv +diff nested_snp_in_del.tsv nested_snp_in_del_truth.tsv +is "$?"
0 "nested deconstruction gets correct star-allele for snp inside deletion" + +rm -f nested_snp_in_del.tsv nested_snp_in_del_truth.tsv + + From 4a434e24d6d99ef63a13e532d3f6d26e5a7e9c99 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 31 May 2024 11:24:37 -0400 Subject: [PATCH 28/67] basic support for nested insertions --- src/deconstructor.cpp | 65 +++++++++++++++++++++++++++++++++++++---- src/deconstructor.hpp | 1 + test/t/26_deconstruct.t | 18 +++++++++++- 3 files changed, 77 insertions(+), 7 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index eb653eceb8f..5d576798dab 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -617,6 +617,7 @@ void Deconstructor::get_traversals(const handle_t& snarl_start, const handle_t& unordered_map> Deconstructor::add_star_traversals(vector& travs, vector& names, vector>& trav_clusters, + vector>& trav_cluster_info, const unordered_map>& parent_haplotypes) const { // todo: refactor this into general genotyping code @@ -641,6 +642,14 @@ unordered_map> Deconstructor::add_star_traversals(vectorsample_names.count(parent_sample_name)) { + // dont' bother for purely reference samples -- we don't need to force and allele for them. 
+ continue; + } for (int parent_hap : parent_sample_haps.second) { bool found = false; if (sample_to_haps.count(parent_sample_haps.first)) { @@ -662,9 +671,11 @@ unordered_map> Deconstructor::add_star_traversals(vectorhas_ref) { parent_ref_trav_name = graph->get_path_name(graph->get_path_handle_of_step(in_nesting_info->ref_path_interval.first)); +#ifdef debug +#pragma omp critical (cerr) + cerr << "Using nesting information to set reference to " << parent_ref_trav_name << endl; +#endif } for (int i = 0; i < travs.size(); ++i) { const string& path_trav_name = trav_path_names[i]; @@ -743,7 +758,7 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t ref_offsets.push_back(sub_offset); #ifdef debug #pragma omp critical (cerr) - cerr << "Adding ref_tav idx=" << i << " offset=" << sub_offset << " because " << path_trav_name << " == " << ref_trav_name << endl; + cerr << "Adding ref_trav idx=" << i << " offset=" << sub_offset << " because " << path_trav_name << " == " << ref_trav_name << endl; #endif } } @@ -910,19 +925,45 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t } cerr << endl; for (const auto& tc : trav_clusters) { - cerr << "traversal cluster:"; + cerr << "traversal cluster: { "; for (const auto& t: tc) { cerr << t << "(" << trav_cluster_info[t].first << "," << trav_cluster_info[t].second << ") "; } - cerr << endl; + cerr << " }" << endl; } #endif unordered_map> sample_to_haps; if (i == 0 && in_nesting_info != nullptr) { + // if the reference traversal is also an alt traversal, we pop out an extra copy + // todo: this is a hack add add off-reference support while keeping the current + // logic where the reference traversal is always distinct from the alts. this step + // could be avoided, but it would come at the cost of some detailed refactoring of the + // allele getting code... 
+ string ref_sample_name = PathMetadata::parse_sample_name(trav_path_names[ref_trav_idx]); + if (this->sample_names.count(ref_sample_name)) { + int alt_trav_copy = travs.size(); + travs.push_back(travs[ref_trav_idx]); + trav_path_names.push_back(trav_path_names[ref_trav_idx]); + trav_cluster_info.push_back(make_pair(0, 0)); + if (trav_steps.size() == travs.size()) { + trav_steps.push_back(trav_steps[ref_trav_idx]); + } + bool found_cluster = false; + for (vector& cluster : trav_clusters) { + if (cluster[0] == ref_trav_idx) { + found_cluster =true; + cluster.push_back(alt_trav_copy); + break; + } + } + assert(found_cluster == true); + } + // add in the star alleles -- these are alleles that were genotyped in the parent but not // the current allele, and are treated as *'s in VCF. - sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, in_nesting_info->sample_to_haplotypes); + sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info, + in_nesting_info->sample_to_haplotypes); } vector trav_to_allele = get_alleles(v, travs, trav_steps, @@ -930,6 +971,16 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t trav_clusters, prev_char, use_start); + +#ifdef debug + assert(trav_to_allele.size() == travs.size()); + cerr << "trav_to_allele ="; + for (const auto& tta : trav_to_allele) { + cerr << " " << tta; + } + cerr << endl; +#endif + // Fill in the genotypes get_genotypes(v, trav_path_names, trav_to_allele, trav_cluster_info); @@ -1216,7 +1267,9 @@ void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) { // (note, can't do for_each_top_level_snarl_parallel() because interface wont take nesting info) vector> top_level_snarls; snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { - top_level_snarls.push_back(make_pair(snarl, NestingInfo())); + NestingInfo nesting_info; + nesting_info.has_ref = false; + top_level_snarls.push_back(make_pair(snarl, 
nesting_info)); }); #pragma omp parallel for schedule(dynamic, 1) for (int i = 0; i < top_level_snarls.size(); ++i) { diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index 4497ba1506b..40f032711ef 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -99,6 +99,7 @@ class Deconstructor : public VCFOutputCaller { unordered_map> add_star_traversals(vector& travs, vector& trav_names, vector>& trav_clusters, + vector>& trav_cluster_info, const unordered_map>& parent_haplotypes) const; // convert traversals to strings. returns mapping of traversal (offset in travs) to allele diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 1cb15d9d001..f8ea5967bd6 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 26 +plan tests 28 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg @@ -163,6 +163,22 @@ is "$?" 0 "nested deconstruction gets correct star-allele for snp inside deletio rm -f nested_snp_in_del.tsv nested_snp_in_del_truth.tsv +vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_ins.tsv +printf "C\tCAAG,CATG\t1|2\n" > nested_snp_in_ins_truth.tsv +printf "A\tT\t0|1\n" >> nested_snp_in_ins_truth.tsv +diff nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv +is "$?" 0 "nested deconstruction gets correct allele for snp inside insert" + +rm -f nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv + +vg deconstruct nesting/nested_snp_in_ins2.gfa -p x -n | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10 "\t" $11}' > nested_snp_in_ins2.tsv +printf "C\tCAAG,CATG\t1|2\t1|0\n" > nested_snp_in_ins2._truth.tsv +printf "A\tT,*\t0|1\t0|2\n" >> nested_snp_in_ins2._truth.tsv +diff nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv +is "$?" 
0 "nested deconstruction gets correct star allele for snp ins2.ide ins2.ert" + +rm -f nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv + From 350b1b275cf4dcde15e1d4416b911b944c8d4441 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 31 May 2024 13:09:49 -0400 Subject: [PATCH 29/67] add off-ref contigs to vcf header with -n --- src/deconstructor.cpp | 140 ++++++++++++++-------------- src/deconstructor.hpp | 9 ++ src/subcommand/deconstruct_main.cpp | 5 + test/t/26_deconstruct.t | 13 ++- 4 files changed, 92 insertions(+), 75 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 5d576798dab..60b22ef96fe 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -705,7 +705,9 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t #ifdef debug #pragma omp critical (cerr) cerr << "Using nesting information to set reference to " << parent_ref_trav_name << endl; -#endif +#endif + // remember it for the vcf header + this->off_ref_paths[omp_get_thread_num()].insert(graph->get_path_handle_of_step(in_nesting_info->ref_path_interval.first)); } for (int i = 0; i < travs.size(); ++i) { const string& path_trav_name = trav_path_names[i]; @@ -1127,72 +1129,6 @@ string Deconstructor::get_vcf_header() { } else { stream << "##INFO=" << endl; } - set gbwt_ref_paths; - map ref_path_to_length; - for(auto& refpath : ref_paths) { - if (graph->has_path(refpath)) { - int64_t path_len = 0; - path_handle_t path_handle = graph->get_path_handle(refpath); - for (handle_t handle : graph->scan_path(path_handle)) { - path_len += graph->get_length(handle); - } - string locus_name = graph->get_locus_name(path_handle); - if (locus_name == PathMetadata::NO_LOCUS_NAME) { - locus_name = refpath; - } else if (long_ref_contig) { - // the sample name isn't unique enough, so put a full ugly name in the vcf - if (graph->get_sense(path_handle) == PathSense::GENERIC) { - locus_name = graph->get_path_name(path_handle); - } else { - locus_name = 
PathMetadata::create_path_name(PathSense::REFERENCE, - graph->get_sample_name(path_handle), - locus_name, - graph->get_haplotype(path_handle), - PathMetadata::NO_PHASE_BLOCK, - PathMetadata::NO_SUBRANGE); - } - } - - subrange_t subrange = graph->get_subrange(path_handle); - int64_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; - ref_path_to_length[locus_name] = std::max(ref_path_to_length[locus_name], path_len + offset); - } else { - gbwt_ref_paths.insert(refpath); - } - } - for (auto& ref_path_len : ref_path_to_length) { - stream << "##contig=" << endl; - } - if (!gbwt_ref_paths.empty()) { - unordered_map> gbwt_name_to_ids; - for (size_t i = 0; i < gbwt->metadata.paths(); i++) { - // Collect all the GBWT path IDs for each sample and contig. - gbwt_name_to_ids[compose_short_path_name(*gbwt, i)].push_back(i); - } - for (const string& refpath : gbwt_ref_paths) { - // For each sample and contig name that is a GBWT ref path - vector& thread_ids = gbwt_name_to_ids.at(refpath); - size_t path_len = 0; - for (gbwt::size_type thread_id : thread_ids) { - // For each actual path in the GBWT for that sample-and-contig, - // we need to see how long it extends the space of the sample - // and contig. - - // TODO: These are probably all guaranteed to be haplotype sense? - PathSense sense = gbwtgraph::get_path_sense(*gbwt, thread_id, gbwt_reference_samples); - subrange_t subrange = gbwtgraph::get_path_subrange(*gbwt, thread_id, sense); - - // TODO: when importing GFAs we might cram the start of a walk - // into the GBWT count field. But we don't ever guarantee that - // we've done that so it might not be visible as a subrange - // here. Fix that somehow??? - size_t offset = subrange == PathMetadata::NO_SUBRANGE ? 
0 : subrange.first; - size_t len = path_to_length(extract_gbwt_path(*graph, *gbwt, thread_id)); - path_len = std::max(path_len, offset + len); - } - stream << "##contig=" << endl; - } - } stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; for (auto& sample_name : sample_names) { @@ -1202,6 +1138,62 @@ string Deconstructor::get_vcf_header() { return stream.str(); } +string Deconstructor::add_contigs_to_vcf_header(const string& vcf_header) const { + + vector header_lines = split_delims(vcf_header, "\n"); + + stringstream patched_header; + for (int64_t i = 0; i < header_lines.size() - 1; ++i) { + patched_header << header_lines[i] << "\n"; + } + + set all_ref_paths = this->ref_paths; + + // add in the off-ref paths that nested deconstruction may have found + for (const unordered_set& off_ref_path_set : this->off_ref_paths) { + for (const path_handle_t& off_ref_path : off_ref_path_set) { + all_ref_paths.insert(graph->get_path_name(off_ref_path)); + } + } + + map ref_path_to_length; + for(auto& refpath : all_ref_paths) { + assert(graph->has_path(refpath)); + int64_t path_len = 0; + path_handle_t path_handle = graph->get_path_handle(refpath); + for (handle_t handle : graph->scan_path(path_handle)) { + path_len += graph->get_length(handle); + } + string locus_name = graph->get_locus_name(path_handle); + if (locus_name == PathMetadata::NO_LOCUS_NAME) { + locus_name = refpath; + } else if (long_ref_contig) { + // the sample name isn't unique enough, so put a full ugly name in the vcf + if (graph->get_sense(path_handle) == PathSense::GENERIC) { + locus_name = graph->get_path_name(path_handle); + } else { + locus_name = PathMetadata::create_path_name(PathSense::REFERENCE, + graph->get_sample_name(path_handle), + locus_name, + graph->get_haplotype(path_handle), + PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); + } + } + + subrange_t subrange = graph->get_subrange(path_handle); + int64_t offset = subrange == PathMetadata::NO_SUBRANGE ? 
0 : subrange.first; + ref_path_to_length[locus_name] = std::max(ref_path_to_length[locus_name], path_len + offset); + } + for (auto& ref_path_len : ref_path_to_length) { + patched_header << "##contig=" << endl; + } + + assert(header_lines.back().substr(0, 6) == "#CHROM"); + patched_header << header_lines.back(); + return patched_header.str(); +} + void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { vector snarls; @@ -1234,8 +1226,10 @@ void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) { // logic copied from vg call (graph_caller.cpp) - // Used to recurse on children of parents that can't be called size_t thread_count = get_thread_count(); + this->off_ref_paths.clear(); + this->off_ref_paths.resize(get_thread_count()); + // Used to recurse on children of parents that can't be called vector>> snarl_queue(thread_count); // Run the deconstructor on a snarl, and queue up the children if it fails @@ -1324,10 +1318,6 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand omp_set_nested(1); omp_set_max_active_levels(3); - string hstr = this->get_vcf_header(); - assert(output_vcf.openForOutput(hstr)); - cout << output_vcf.header << endl; - // create the traversal finder map reads_by_name; path_trav_finder = unique_ptr(new PathTraversalFinder(*graph)); @@ -1336,12 +1326,18 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand gbwt_trav_finder = unique_ptr(new GBWTTraversalFinder(*graph, *gbwt)); } + string hstr = this->get_vcf_header(); + assert(output_vcf.openForOutput(hstr)); + if (nested_decomposition) { deconstruct_graph_top_down(snarl_manager); } else { deconstruct_graph(snarl_manager); } + string patched_header = this->add_contigs_to_vcf_header(output_vcf.header); + cout << patched_header << endl; + // write variants in sorted order write_variants(cout, snarl_manager); } diff --git a/src/deconstructor.hpp 
b/src/deconstructor.hpp index 40f032711ef..77f312e1ff5 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -54,6 +54,11 @@ class Deconstructor : public VCFOutputCaller { // initialize the vcf and get the header string get_vcf_header(); + // the header needs to be initialized *before* construction for vcflib + // but we don't know all the non-ref contigs (in nested mode) until *after* + // construction. end result: this hacky function to patch them in before printing + string add_contigs_to_vcf_header(const string& vcf_header) const; + // deconstruct all snarls in parallel (ie nesting relationship ignored) void deconstruct_graph(SnarlManager* snarl_manager); @@ -146,6 +151,10 @@ class Deconstructor : public VCFOutputCaller { // the ref paths set ref_paths; + // the off-ref paths that may be found during nested deconstruction + // (buffered by thread) + mutable vector> off_ref_paths; + // keep track of reference samples set ref_samples; diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index 2c3c3fb383f..aa976584d87 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -188,6 +188,11 @@ int main_deconstruct(int argc, char** argv){ } + if (nested == true && contig_only_ref == true) { + cerr << "Error [vg deconstruct]: -C cannot be used with -n" << endl; + return 1; + } + // Read the graph unique_ptr path_handle_graph_up; diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index f8ea5967bd6..1892ab8cc19 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 28 +plan tests 29 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg @@ -163,13 +163,20 @@ is "$?" 
0 "nested deconstruction gets correct star-allele for snp inside deletio rm -f nested_snp_in_del.tsv nested_snp_in_del_truth.tsv -vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_ins.tsv +vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n > nested_snp_in_ins.vcf +grep -v ^# nested_snp_in_ins.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_ins.tsv printf "C\tCAAG,CATG\t1|2\n" > nested_snp_in_ins_truth.tsv printf "A\tT\t0|1\n" >> nested_snp_in_ins_truth.tsv diff nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv is "$?" 0 "nested deconstruction gets correct allele for snp inside insert" -rm -f nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv +grep ^##contig nested_snp_in_ins.vcf > nested_snp_in_ins_contigs.tsv +printf "##contig=\n" > nested_snp_in_ins_contigs_truth.tsv +printf "##contig=\n" >> nested_snp_in_ins_contigs_truth.tsv +diff nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv +is "$?" 0 "nested deconstruction gets correct contigs in vcf header for snp inside insert" + +rm -f nested_snp_in_ins.tsv nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv vg deconstruct nesting/nested_snp_in_ins2.gfa -p x -n | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10 "\t" $11}' > nested_snp_in_ins2.tsv printf "C\tCAAG,CATG\t1|2\t1|0\n" > nested_snp_in_ins2._truth.tsv From 2c8db7967845fb8b9fa058306795e5b461dd2730 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 31 May 2024 16:21:27 -0400 Subject: [PATCH 30/67] store parent allele number in PA --- src/deconstructor.cpp | 14 ++++++++++++++ src/deconstructor.hpp | 6 ++++++ test/t/26_deconstruct.t | 12 +++++++++--- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 60b22ef96fe..8e9e65454fd 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -966,6 +966,14 @@ bool 
Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // the current allele, and are treated as *'s in VCF. sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info, in_nesting_info->sample_to_haplotypes); + + } + if (this->nested_decomposition) { + int64_t parent_allele = 0; + if (in_nesting_info != nullptr && in_nesting_info->has_ref == true) { + parent_allele = in_nesting_info->ref_path_allele; + } + v.info["PA"].push_back(std::to_string(parent_allele)); } vector trav_to_allele = get_alleles(v, travs, trav_steps, @@ -1001,6 +1009,8 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t out_nesting_infos->at(j).has_ref = true; out_nesting_infos->at(j).ref_path_interval = trav_steps[child_snarl_to_trav[j]]; out_nesting_infos->at(j).sample_to_haplotypes = sample_to_haps; + out_nesting_infos->at(j).ref_path_allele = trav_to_allele[child_snarl_to_trav[j]] >= 0 ? + trav_to_allele[child_snarl_to_trav[j]] : 0; } } } @@ -1124,6 +1134,9 @@ string Deconstructor::get_vcf_header() { stream << "##INFO=" << endl; stream << "##INFO=" << endl; } + if (this->nested_decomposition) { + stream << "##INFO=" << endl; + } if (untangle_allele_traversals) { stream << "##INFO=|<][id]_[start|.]_[end|.], with '.' indicating non-reference nodes.\">" << endl; } else { @@ -1312,6 +1325,7 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand } this->cluster_threshold = cluster_threshold; this->gbwt = gbwt; + this->nested_decomposition = nested_decomposition; // the need to use nesting is due to a problem with omp tasks and shared state // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 
1) diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index 77f312e1ff5..a557c2d91c8 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -74,6 +74,7 @@ class Deconstructor : public VCFOutputCaller { vector> child_snarls; PathInterval ref_path_interval; unordered_map> sample_to_haplotypes; + int ref_path_allele; }; // write a vcf record for the given site. returns true if a record was written @@ -189,6 +190,11 @@ class Deconstructor : public VCFOutputCaller { // currently implemented as handle jaccard coefficient. So 1 means only // merge if identical (which is what deconstruct has always done) double cluster_threshold = 1.0; + + // activate the new nested decomposition mode, which is like the old include_nested + // (which lives in vcfoutputcaller) but with more of an effort to link + // the parent and child snarls, as well as better support for nested insertions + bool nested_decomposition = false; }; diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 1892ab8cc19..b297a298379 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 29 +plan tests 32 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg @@ -155,13 +155,16 @@ is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered d rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf -vg deconstruct nesting/nested_snp_in_del.gfa -p x -n | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv +vg deconstruct nesting/nested_snp_in_del.gfa -p x -n > nested_snp_in_del.vcf +grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv printf "T\tA,*\t1|2\n" >> nested_snp_in_del_truth.tsv diff nested_snp_in_del.tsv nested_snp_in_del_truth.tsv is "$?" 
0 "nested deconstruction gets correct star-allele for snp inside deletion" -rm -f nested_snp_in_del.tsv nested_snp_in_del_truth.tsv +is $(grep PA=0 nested_snp_in_del.vcf | wc -l) 2 "PA=0 correctly set at both sites of neseted deletion" + +rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n > nested_snp_in_ins.vcf grep -v ^# nested_snp_in_ins.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_ins.tsv @@ -170,6 +173,9 @@ printf "A\tT\t0|1\n" >> nested_snp_in_ins_truth.tsv diff nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv is "$?" 0 "nested deconstruction gets correct allele for snp inside insert" +is $(grep PA= nested_snp_in_ins.vcf | head -1 | grep PA=0 | wc -l) 1 "PA=0 set for base allele of nested insertion" +is $(grep PA= nested_snp_in_ins.vcf | tail -1 | grep PA=1 | wc -l) 1 "PA=1 set for nested allele of nested insertion" + grep ^##contig nested_snp_in_ins.vcf > nested_snp_in_ins_contigs.tsv printf "##contig=\n" > nested_snp_in_ins_contigs_truth.tsv printf "##contig=\n" >> nested_snp_in_ins_contigs_truth.tsv From 2302021016c9d8e2bf27162d10a0e76cf9008a61 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 3 Jun 2024 15:00:15 -0400 Subject: [PATCH 31/67] add a bunch of new nesting tags with -n --- src/deconstructor.cpp | 71 +++++++++++++++++++++++++++++++++-------- src/deconstructor.hpp | 10 ++++-- src/graph_caller.cpp | 4 +++ test/t/26_deconstruct.t | 11 ++++++- 4 files changed, 79 insertions(+), 17 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 8e9e65454fd..60e378c609d 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -701,13 +701,13 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t string ref_trav_name; string parent_ref_trav_name; if (in_nesting_info != nullptr && in_nesting_info->has_ref) { - parent_ref_trav_name = 
graph->get_path_name(graph->get_path_handle_of_step(in_nesting_info->ref_path_interval.first)); + parent_ref_trav_name = graph->get_path_name(graph->get_path_handle_of_step(in_nesting_info->parent_path_interval.first)); #ifdef debug #pragma omp critical (cerr) cerr << "Using nesting information to set reference to " << parent_ref_trav_name << endl; #endif // remember it for the vcf header - this->off_ref_paths[omp_get_thread_num()].insert(graph->get_path_handle_of_step(in_nesting_info->ref_path_interval.first)); + this->off_ref_paths[omp_get_thread_num()].insert(graph->get_path_handle_of_step(in_nesting_info->parent_path_interval.first)); } for (int i = 0; i < travs.size(); ++i) { const string& path_trav_name = trav_path_names[i]; @@ -968,13 +968,6 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t in_nesting_info->sample_to_haplotypes); } - if (this->nested_decomposition) { - int64_t parent_allele = 0; - if (in_nesting_info != nullptr && in_nesting_info->has_ref == true) { - parent_allele = in_nesting_info->ref_path_allele; - } - v.info["PA"].push_back(std::to_string(parent_allele)); - } vector trav_to_allele = get_alleles(v, travs, trav_steps, ref_trav_idx, @@ -994,6 +987,38 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // Fill in the genotypes get_genotypes(v, trav_path_names, trav_to_allele, trav_cluster_info); + // Fill in some nesting-specific (site-level) tags + NestingInfo ref_info; // since in_nesting_info is const, we put top-level stuff here + if (this->nested_decomposition) { + if (in_nesting_info != nullptr && in_nesting_info->has_ref == true) { + // if we're a child, just take what's passed in + ref_info.parent_allele = in_nesting_info->parent_allele; + ref_info.parent_len = in_nesting_info->parent_len; + ref_info.parent_ref_len = in_nesting_info->parent_ref_len; + ref_info.lv0_ref_name = in_nesting_info->lv0_ref_name; + ref_info.lv0_ref_start = in_nesting_info->lv0_ref_start; + 
ref_info.lv0_ref_len = in_nesting_info->lv0_ref_len; + ref_info.lv0_alt_len = in_nesting_info->lv0_alt_len; + } else { + // if we're a root, compute values from the prsent site + // todo: should they just be left undefined? + ref_info.parent_allele = 0; + ref_info.parent_len = v.alleles[0].length(); + ref_info.parent_ref_len = v.alleles[0].length(); + ref_info.lv0_ref_name = v.sequenceName; + ref_info.lv0_ref_start = v.position; + ref_info.lv0_ref_len = v.alleles[0].length(); + ref_info.lv0_alt_len = v.alleles[ref_info.parent_allele].length(); + } + v.info["PA"].push_back(std::to_string(ref_info.parent_allele)); + v.info["PL"].push_back(std::to_string(ref_info.parent_len)); + v.info["PR"].push_back(std::to_string(ref_info.parent_ref_len)); + v.info["RC"].push_back(ref_info.lv0_ref_name); + v.info["RS"].push_back(std::to_string(ref_info.lv0_ref_start)); + v.info["RE"].push_back(std::to_string(ref_info.lv0_ref_len)); + v.info["RL"].push_back(std::to_string(ref_info.lv0_alt_len)); + } + if (i == 0 && out_nesting_infos != nullptr) { // we pass some information down to the children // todo: do/can we consider all the diferent reference intervals? @@ -1006,11 +1031,23 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t out_nesting_infos->at(j).has_ref = false; if (child_snarl_to_trav[j] >= 0) { if (child_snarl_to_trav[j] < trav_steps.size()) { - out_nesting_infos->at(j).has_ref = true; - out_nesting_infos->at(j).ref_path_interval = trav_steps[child_snarl_to_trav[j]]; - out_nesting_infos->at(j).sample_to_haplotypes = sample_to_haps; - out_nesting_infos->at(j).ref_path_allele = trav_to_allele[child_snarl_to_trav[j]] >= 0 ? + NestingInfo& child_info = out_nesting_infos->at(j); + child_info.has_ref = true; + child_info.parent_path_interval = trav_steps[child_snarl_to_trav[j]]; + child_info.sample_to_haplotypes = sample_to_haps; + child_info.parent_allele = trav_to_allele[child_snarl_to_trav[j]] >= 0 ? 
trav_to_allele[child_snarl_to_trav[j]] : 0; + child_info.parent_len = v.alleles[child_info.parent_allele].length(); + child_info.parent_ref_len = v.alleles[0].length(); + child_info.lv0_ref_name = ref_info.lv0_ref_name; + child_info.lv0_ref_start = ref_info.lv0_ref_start; + child_info.lv0_ref_len = ref_info.lv0_ref_len; + if (in_nesting_info == nullptr || in_nesting_info->has_ref == false) { + // we're the parent of root, so we want to set this here + child_info.lv0_alt_len = child_info.parent_len; + } else { + child_info.lv0_alt_len = ref_info.lv0_alt_len; + } } } } @@ -1135,7 +1172,13 @@ string Deconstructor::get_vcf_header() { stream << "##INFO=" << endl; } if (this->nested_decomposition) { - stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; } if (untangle_allele_traversals) { stream << "##INFO=|<][id]_[start|.]_[end|.], with '.' indicating non-reference nodes.\">" << endl; diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index a557c2d91c8..16c75d00c51 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -72,9 +72,15 @@ class Deconstructor : public VCFOutputCaller { struct NestingInfo { bool has_ref; vector> child_snarls; - PathInterval ref_path_interval; + PathInterval parent_path_interval; unordered_map> sample_to_haplotypes; - int ref_path_allele; + int parent_allele; + int64_t parent_len; + int64_t parent_ref_len; + string lv0_ref_name; + int64_t lv0_ref_start; + int64_t lv0_ref_len; + int64_t lv0_alt_len; }; // write a vcf record for the given site. 
returns true if a record was written diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 355bfceb9ac..1ce56a05765 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -419,6 +419,10 @@ void VCFOutputCaller::add_allele_path_to_info(vcflib::Variant& v, int allele, co } prev_visit = &visit; } + if (trav_info[allele].empty()) { + // note: * alleles get empty traversals + trav_info[allele] = "."; + } } string VCFOutputCaller::trav_string(const HandleGraph& graph, const SnarlTraversal& trav) const { diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index b297a298379..90366d40875 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 32 +plan tests 35 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg @@ -192,6 +192,15 @@ is "$?" 0 "nested deconstruction gets correct star allele for snp ins2.ide ins2. 
rm -f nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv +# todo: the integrated snarl finder doesnt anchor to the reference +# probably not an issue on most real graphs from vg construct / minigraph cacuts +# but seems like something that needs reviewing +vg snarls nesting/nested_snp_in_nested_ins.gfa -A cactus > nested_snp_in_nested_ins.snarls +vg deconstruct nesting/nested_snp_in_nested_ins.gfa -r nested_snp_in_nested_ins.snarls -P x -n > nested_snp_in_nested_ins.vcf +is $(grep -v ^# nested_snp_in_nested_ins.vcf | head -1 | awk '{print $8}') "AC=1,1;AF=0.5,0.5;AN=2;AT=>1>6,>1>2>3>31>33>34>35>5>6,>1>2>3>31>32>34>35>5>6;NS=1;PA=0;PL=1;PR=1;RC=x;RE=1;RL=1;RS=1;LV=0" "INFO tags correct for first site of double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | head -2 | tail -1 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>2>3>31>33>34>35>5,>2>3>31>32>34>35>5;NS=1;PA=1;PL=8;PR=1;RC=x;RE=1;RL=8;RS=1;LV=1;PS=>1>6" "INFO tags correct for second site of double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | tail -1 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>31>33>34,>31>32>34;NS=1;PA=0;PL=5;PR=5;RC=x;RE=1;RL=8;RS=1;LV=2;PS=>2>5" "INFO tags correct for second site of double-nested SNP" +rm -f nested_snp_in_nested_ins.snarls nested_snp_in_nested_ins.vcf From 8e634b5d300dbbabaa50a0901d33df8c9298dc40 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 3 Jun 2024 19:45:24 -0400 Subject: [PATCH 32/67] fix -C option regression --- src/deconstructor.cpp | 3 ++- test/t/26_deconstruct.t | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 60e378c609d..e869236e173 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1072,7 +1072,7 @@ string Deconstructor::get_vcf_header() { ref_haplotypes.insert(PathMetadata::parse_haplotype(ref_path_name)); } if (!long_ref_contig) { - long_ref_contig = ref_samples.size() > 1 || ref_haplotypes.size() > 1; + long_ref_contig 
= ref_samples.size() > 1 || ref_haplotypes.size() > 1 || nested_decomposition; } this->long_ref_contig = long_ref_contig; sample_names.clear(); @@ -1363,6 +1363,7 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand this->untangle_allele_traversals = untangle_traversals; this->keep_conflicted_genotypes = keep_conflicted; this->strict_conflict_checking = strict_conflicts; + this->long_ref_contig = long_ref_contig; if (gbwt) { this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt); } diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 90366d40875..1a0616a3ed3 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -168,25 +168,25 @@ rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n > nested_snp_in_ins.vcf grep -v ^# nested_snp_in_ins.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_ins.tsv -printf "C\tCAAG,CATG\t1|2\n" > nested_snp_in_ins_truth.tsv -printf "A\tT\t0|1\n" >> nested_snp_in_ins_truth.tsv +printf "A\tT\t0|1\n" > nested_snp_in_ins_truth.tsv +printf "C\tCAAG,CATG\t1|2\n" >> nested_snp_in_ins_truth.tsv diff nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv is "$?" 
0 "nested deconstruction gets correct allele for snp inside insert" -is $(grep PA= nested_snp_in_ins.vcf | head -1 | grep PA=0 | wc -l) 1 "PA=0 set for base allele of nested insertion" -is $(grep PA= nested_snp_in_ins.vcf | tail -1 | grep PA=1 | wc -l) 1 "PA=1 set for nested allele of nested insertion" +is $(grep LV=0 nested_snp_in_ins.vcf | head -1 | grep PA=0 | wc -l) 1 "PA=0 set for base allele of nested insertion" +is $(grep LV=1 nested_snp_in_ins.vcf | tail -1 | grep PA=1 | wc -l) 1 "PA=1 set for nested allele of nested insertion" grep ^##contig nested_snp_in_ins.vcf > nested_snp_in_ins_contigs.tsv -printf "##contig=\n" > nested_snp_in_ins_contigs_truth.tsv -printf "##contig=\n" >> nested_snp_in_ins_contigs_truth.tsv +printf "##contig=\n" > nested_snp_in_ins_contigs_truth.tsv +printf "##contig=\n" >> nested_snp_in_ins_contigs_truth.tsv diff nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv is "$?" 0 "nested deconstruction gets correct contigs in vcf header for snp inside insert" rm -f nested_snp_in_ins.tsv nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv vg deconstruct nesting/nested_snp_in_ins2.gfa -p x -n | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10 "\t" $11}' > nested_snp_in_ins2.tsv -printf "C\tCAAG,CATG\t1|2\t1|0\n" > nested_snp_in_ins2._truth.tsv -printf "A\tT,*\t0|1\t0|2\n" >> nested_snp_in_ins2._truth.tsv +printf "A\tT,*\t0|1\t0|2\n" > nested_snp_in_ins2._truth.tsv +printf "C\tCAAG,CATG\t1|2\t1|0\n" >> nested_snp_in_ins2._truth.tsv diff nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv is "$?" 
0 "nested deconstruction gets correct star allele for snp ins2.ide ins2.ert" @@ -197,9 +197,10 @@ rm -f nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv # but seems like something that needs reviewing vg snarls nesting/nested_snp_in_nested_ins.gfa -A cactus > nested_snp_in_nested_ins.snarls vg deconstruct nesting/nested_snp_in_nested_ins.gfa -r nested_snp_in_nested_ins.snarls -P x -n > nested_snp_in_nested_ins.vcf -is $(grep -v ^# nested_snp_in_nested_ins.vcf | head -1 | awk '{print $8}') "AC=1,1;AF=0.5,0.5;AN=2;AT=>1>6,>1>2>3>31>33>34>35>5>6,>1>2>3>31>32>34>35>5>6;NS=1;PA=0;PL=1;PR=1;RC=x;RE=1;RL=1;RS=1;LV=0" "INFO tags correct for first site of double-nested SNP" -is $(grep -v ^# nested_snp_in_nested_ins.vcf | head -2 | tail -1 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>2>3>31>33>34>35>5,>2>3>31>32>34>35>5;NS=1;PA=1;PL=8;PR=1;RC=x;RE=1;RL=8;RS=1;LV=1;PS=>1>6" "INFO tags correct for second site of double-nested SNP" -is $(grep -v ^# nested_snp_in_nested_ins.vcf | tail -1 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>31>33>34,>31>32>34;NS=1;PA=0;PL=5;PR=5;RC=x;RE=1;RL=8;RS=1;LV=2;PS=>2>5" "INFO tags correct for second site of double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=0 | awk '{print $8}') "AC=1,1;AF=0.5,0.5;AN=2;AT=>1>6,>1>2>3>31>33>34>35>5>6,>1>2>3>31>32>34>35>5>6;NS=1;PA=0;PL=1;PR=1;RC=x;RE=1;RL=1;RS=1;LV=0" "INFO tags correct for level-0 site of double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=1 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>2>3>31>33>34>35>5,>2>3>31>32>34>35>5;NS=1;PA=1;PL=8;PR=1;RC=x;RE=1;RL=8;RS=1;LV=1;PS=>1>6" "INFO tags correct for level-1 site of double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=2 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>31>33>34,>31>32>34;NS=1;PA=0;PL=5;PR=5;RC=x;RE=1;RL=8;RS=1;LV=2;PS=>2>5" "INFO tags correct for level-2 site of double-nested SNP" + rm -f nested_snp_in_nested_ins.snarls nested_snp_in_nested_ins.vcf From 
1fca800aecac01b2dab7d6bb82e685fd4721d65a Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 3 Jun 2024 21:27:06 -0400 Subject: [PATCH 33/67] fix loop typo gcc should warn about --- src/traversal_clusters.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index 7b3907c6cf0..a26343892ac 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -161,7 +161,7 @@ vector> cluster_traversals(const PathHandleGraph* graph, // in the parent? -- for now we keep things simple -- all reference alleles are explictly represented in vcf vector> new_clusters; for (int64_t i = clusters.size() - 1; i >= 0 && !uncovered_child_snarls.empty(); --i) { - for (int64_t j = clusters[i].size() -1 && !uncovered_child_snarls.empty(); j > 0; --j) { + for (int64_t j = clusters[i].size() -1; j > 0 && !uncovered_child_snarls.empty(); --j) { const vector& trav_childs = trav_to_child_snarls[clusters[i][j]]; bool uncovered = false; for (int k : trav_childs) { From 2f80767147298d3b689b5cf026c97a1c5cb1862c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 3 Jun 2024 22:18:04 -0400 Subject: [PATCH 34/67] fix assert bug that crashes on cycle --- src/traversal_clusters.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index a26343892ac..d45944651d2 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -236,14 +236,14 @@ vector> assign_child_snarls_to_traversals(const PathHandleGraph* gra } vector contained_snarls; for (const auto& cs_count : fw_count) { - assert(cs_count.second == 1 || cs_count.second == 2); - if (cs_count.second == 2 || (!fully_contained && cs_count.second == 1)) { + assert(cs_count.second == 1 || cs_count.second >= 2); + if (cs_count.second >= 2 || (!fully_contained && cs_count.second == 1)) { contained_snarls.push_back(cs_count.first); } } for (const auto& cs_count : rv_count) { - 
assert(cs_count.second == 1 || cs_count.second == 2); - if (cs_count.second == 2 || (!fully_contained && cs_count.second == 1)) { + assert(cs_count.second == 1 || cs_count.second >= 2); + if (cs_count.second >= 2 || (!fully_contained && cs_count.second == 1)) { contained_snarls.push_back(cs_count.first); } } From 0349d28fa82bdc17f88ae2510109fa61ec8f34d3 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 4 Jun 2024 13:35:26 -0400 Subject: [PATCH 35/67] fix more cycle-related assertion fails --- src/deconstructor.cpp | 20 +++++++++++++++----- src/traversal_clusters.cpp | 14 ++++++++------ src/traversal_clusters.hpp | 1 + 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index e869236e173..588682381aa 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -618,8 +618,7 @@ unordered_map> Deconstructor::add_star_traversals(vector& names, vector>& trav_clusters, vector>& trav_cluster_info, - const unordered_map>& parent_haplotypes) const { - + const unordered_map>& parent_haplotypes) const { // todo: refactor this into general genotyping code unordered_map> sample_to_haps; @@ -692,6 +691,8 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // compute all the traversals from embedded paths and gbwt this->get_traversals(snarl_start, snarl_end, travs, trav_path_names, trav_steps); + int64_t trav_count = travs.size(); + int64_t trav_step_count = trav_steps.size(); if (travs.empty()) { return false; @@ -745,7 +746,6 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // in case of cycles, we need our allele traversals to be associated to the correct reference position // this is done with the path jaccard metric over all overlapping reference paths the given path_jaccard_window size - vector ref_travs; // hacky subpath support -- gets added to variant on output vector ref_offsets; @@ -831,6 +831,11 @@ bool 
Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // we write a variant for every reference traversal // (optionally) selecting the subset of path traversals that are 1:1 for (size_t i = 0; i < ref_travs.size(); ++i) { + // we zap these to their original size, as the nesting logic can add + // dummy traversals and these are reference-specific (and so need to be cleaned each iteration here) + travs.resize(trav_count); + trav_path_names.resize(trav_count); + trav_steps.resize(trav_step_count); auto& ref_trav_idx = ref_travs[i]; auto& ref_trav_offset = ref_offsets[i]; @@ -907,8 +912,13 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t } } + if (std::none_of(use_trav.begin(), use_trav.end(), [](bool b) {return b;})) { + // no alts were jaccard-assigned to this reference, so abort before an assertion gets tripped + continue; + } + // Sort the traversals for clustering - vector sorted_travs = get_traversal_order(graph, travs, trav_path_names, ref_travs, use_trav); + vector sorted_travs = get_traversal_order(graph, travs, trav_path_names, ref_travs, ref_trav_idx, use_trav); // jaccard clustering (using handles for now) on traversals vector> trav_cluster_info; @@ -936,7 +946,7 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t #endif unordered_map> sample_to_haps; - if (i == 0 && in_nesting_info != nullptr) { + if (in_nesting_info != nullptr) { // if the reference traversal is also an alt traversal, we pop out an extra copy // todo: this is a hack add add off-reference support while keeping the current // logic where the reference traversal is always distinct from the alts. 
this step diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index d45944651d2..1fb020be263 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -34,6 +34,7 @@ vector get_traversal_order(const PathHandleGraph* graph, const vector& traversals, const vector& trav_path_names, const vector& ref_travs, + int64_t ref_trav_idx, const vector& use_traversal) { set ref_set(ref_travs.begin(), ref_travs.end()); @@ -54,7 +55,7 @@ vector get_traversal_order(const PathHandleGraph* graph, vector sorted_travs; sorted_travs.reserve(traversals.size()); for (int64_t i = 0; i < traversals.size(); ++i) { - if (use_traversal[i]) { + if (use_traversal[i] && (i == ref_trav_idx || !ref_set.count(i))) { sorted_travs.push_back(i); } } @@ -107,9 +108,10 @@ vector> cluster_traversals(const PathHandleGraph* graph, // need the clusters as sorted lists. we'll forget the endpoints while we're at // it since they're always shared. note we work with multisets since we want to // count differences between, say, cycle copy numbers. - vector> sorted_traversals; - for (const Traversal& trav : traversals) { - assert(trav.size() >=2 ); + vector> sorted_traversals(traversals.size()); + for (const int& i : traversal_order) { + const auto& trav = traversals[i]; + assert(trav.size() >=2); // prune snarl nodes as they're always the same // todo: does jaccard properly handle empty sets? int64_t first = trav.size() == 2 ? 
0 : 1; @@ -118,7 +120,7 @@ vector> cluster_traversals(const PathHandleGraph* graph, for (int64_t i = first; i < last; ++i) { sorted_trav.insert(trav[i]); } - sorted_traversals.push_back(sorted_trav); + sorted_traversals[i] = sorted_trav; } for (const int& i : traversal_order) { @@ -177,7 +179,7 @@ vector> cluster_traversals(const PathHandleGraph* graph, } } } - assert(uncovered_child_snarls.empty()); + assert(uncovered_child_snarls.empty() || traversal_order.size() < traversals.size()); // fill in the size deltas for (vector& cluster : clusters) { diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp index ee850b944be..a4e7710d4b5 100644 --- a/src/traversal_clusters.hpp +++ b/src/traversal_clusters.hpp @@ -60,6 +60,7 @@ vector get_traversal_order(const PathHandleGraph* graph, const vector& traversals, const vector& trav_path_names, const vector& ref_travs, + int64_t ref_trav_idx, const vector& use_traversal); From 5fc14ccab340d3054012aeefbc585b614e0ad7d1 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 4 Jun 2024 13:55:02 -0400 Subject: [PATCH 36/67] add nested cycle test --- test/nesting/nested_snp_in_ins_cycle.gfa | 21 +++++++++++++++++++++ test/t/26_deconstruct.t | 13 +++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 test/nesting/nested_snp_in_ins_cycle.gfa diff --git a/test/nesting/nested_snp_in_ins_cycle.gfa b/test/nesting/nested_snp_in_ins_cycle.gfa new file mode 100644 index 00000000000..7574da51fdc --- /dev/null +++ b/test/nesting/nested_snp_in_ins_cycle.gfa @@ -0,0 +1,21 @@ +H VN:Z:1.0 +S 1 C +S 2 A +S 3 T +S 4 A +S 5 G +S 6 T +P A#2#y1 1+,2+,3+,5+,6+ * +P A#1#y0 1+,2+,4+,5+,2+,4+,5+,6+ * +P a#2#y1 1+,2+,3+,5+,6+ * +P a#1#y0 1+,2+,4+,5+,6+ * +P x 1+,6+ * +L 1 + 2 + 0M +L 1 + 6 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 5 + 0M +L 4 + 5 + 0M +L 5 + 6 + 0M +L 5 + 2 + 0M + diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 1a0616a3ed3..521f6b9bfb3 100644 --- a/test/t/26_deconstruct.t +++ 
b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 35 +plan tests 36 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg @@ -201,7 +201,16 @@ is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=0 | awk '{print $8}') "AC is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=1 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>2>3>31>33>34>35>5,>2>3>31>32>34>35>5;NS=1;PA=1;PL=8;PR=1;RC=x;RE=1;RL=8;RS=1;LV=1;PS=>1>6" "INFO tags correct for level-1 site of double-nested SNP" is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=2 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>31>33>34,>31>32>34;NS=1;PA=0;PL=5;PR=5;RC=x;RE=1;RL=8;RS=1;LV=2;PS=>2>5" "INFO tags correct for level-2 site of double-nested SNP" - rm -f nested_snp_in_nested_ins.snarls nested_snp_in_nested_ins.vcf +vg deconstruct nesting/nested_snp_in_ins_cycle.gfa -P x -n > nested_snp_in_ins_cycle.vcf +printf "A#1#y0\t6\t>2>5\tA\tT\tAC=2;AF=0.5;AN=4;AT=>2>4>5,>2>3>5;NS=2;PA=1;PL=7;PR=1;RC=x;RE=1;RL=7;RS=1;LV=1;PS=>1>6\t0|1\t0|1\n" > nested_snp_in_ins_cycle_truth.tsv +printf "x\t1\t>1>6\tC\tCAAGAAG,CATG,CAAG\tAC=1,2,1;AF=0.25,0.5,0.25;AN=4;AT=>1>6,>1>2>4>5>2>4>5>6,>1>2>3>5>6,>1>2>4>5>6;NS=2;PA=0;PL=1;PR=1;RC=x;RE=1;RL=1;RS=1;LV=0\t1|2\t3|2\n" >> nested_snp_in_ins_cycle_truth.tsv +grep -v ^# nested_snp_in_ins_cycle.vcf | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $8 "\t" $10 "\t" $11}' > nested_snp_in_ins_cycle.tsv +diff nested_snp_in_ins_cycle.tsv nested_snp_in_ins_cycle_truth.tsv +is "$?" 0 "nested deconstruction handles cycle" + +rm -f nested_snp_in_ins_cycle.vcf nested_snp_in_ins_cycle_truth.tsv nested_snp_in_ins_cycle_truth.tsv + + From f69d5b1724f10bc0f78c06ff39e9f0a1dc32b867 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 4 Jun 2024 16:21:23 -0400 Subject: [PATCH 37/67] add nesting test graphs! 
--- test/nesting/nested_snp_in_del.gfa | 17 ++++++++++++++ test/nesting/nested_snp_in_ins.gfa | 17 ++++++++++++++ test/nesting/nested_snp_in_ins2.gfa | 19 +++++++++++++++ test/nesting/nested_snp_in_nested_ins.gfa | 28 +++++++++++++++++++++++ 4 files changed, 81 insertions(+) create mode 100644 test/nesting/nested_snp_in_del.gfa create mode 100644 test/nesting/nested_snp_in_ins.gfa create mode 100644 test/nesting/nested_snp_in_ins2.gfa create mode 100644 test/nesting/nested_snp_in_nested_ins.gfa diff --git a/test/nesting/nested_snp_in_del.gfa b/test/nesting/nested_snp_in_del.gfa new file mode 100644 index 00000000000..c122d6827be --- /dev/null +++ b/test/nesting/nested_snp_in_del.gfa @@ -0,0 +1,17 @@ +H VN:Z:1.0 +S 1 C +S 2 A +S 3 T +S 4 A +S 5 G +S 6 T +P x 1+,2+,3+,5+,6+ * +P a#1#y0 1+,2+,4+,5+,6+ * +P a#2#y1 1+,6+ * +L 1 + 2 + 0M +L 1 + 6 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 5 + 0M +L 4 + 5 + 0M +L 5 + 6 + 0M diff --git a/test/nesting/nested_snp_in_ins.gfa b/test/nesting/nested_snp_in_ins.gfa new file mode 100644 index 00000000000..0ed215bde22 --- /dev/null +++ b/test/nesting/nested_snp_in_ins.gfa @@ -0,0 +1,17 @@ +H VN:Z:1.0 +S 1 C +S 2 A +S 3 T +S 4 A +S 5 G +S 6 T +P a#2#y1 1+,2+,3+,5+,6+ * +P a#1#y0 1+,2+,4+,5+,6+ * +P x 1+,6+ * +L 1 + 2 + 0M +L 1 + 6 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 5 + 0M +L 4 + 5 + 0M +L 5 + 6 + 0M diff --git a/test/nesting/nested_snp_in_ins2.gfa b/test/nesting/nested_snp_in_ins2.gfa new file mode 100644 index 00000000000..0416c076c38 --- /dev/null +++ b/test/nesting/nested_snp_in_ins2.gfa @@ -0,0 +1,19 @@ +H VN:Z:1.0 +S 1 C +S 2 A +S 3 T +S 4 A +S 5 G +S 6 T +P a#2#y1 1+,2+,3+,5+,6+ * +P a#1#y0 1+,2+,4+,5+,6+ * +P x 1+,6+ * +P b#1#y0 1+,2+,4+,5+,6+ * +P b#2#y2 1+,6+ * +L 1 + 2 + 0M +L 1 + 6 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 5 + 0M +L 4 + 5 + 0M +L 5 + 6 + 0M diff --git a/test/nesting/nested_snp_in_nested_ins.gfa b/test/nesting/nested_snp_in_nested_ins.gfa new file mode 100644 index 00000000000..98a73015e8e --- /dev/null 
+++ b/test/nesting/nested_snp_in_nested_ins.gfa @@ -0,0 +1,28 @@ +H VN:Z:1.0 +S 1 C +S 2 A +S 3 T +S 31 T +S 32 G +S 33 C +S 34 C +S 35 T +S 4 A +S 5 G +S 6 T +P a#2#y1 1+,2+,3+,31+,32+,34+,35+,5+,6+ * +P a#1#y0 1+,2+,3+,31+,33+,34+,35+,5+,6+ * +P x 1+,6+ * +L 1 + 2 + 0M +L 1 + 6 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 31 + 0M +L 31 + 32 + 0M +L 31 + 33 + 0M +L 32 + 34 + 0M +L 33 + 34 + 0M +L 34 + 35 + 0M +L 35 + 5 + 0M +L 4 + 5 + 0M +L 5 + 6 + 0M From c3fbc95adc94315561ea0db15e54b6ff43e9f4b2 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 5 Jun 2024 06:32:12 -0400 Subject: [PATCH 38/67] try to make context selection more deterministic --- src/deconstructor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 588682381aa..3315bac1b81 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -822,7 +822,7 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t context), ref_travs[j])); } - std::sort(ref_mappings.begin(), ref_mappings.end()); + std::stable_sort(ref_mappings.begin(), ref_mappings.end()); // the best is the last, which has the highest jaccard path_trav_to_ref_trav[i] = ref_mappings.back().second; } From dcf4d23153fb0d1f64a25f11a2f82dc825bf24ba Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 5 Jun 2024 06:32:50 -0400 Subject: [PATCH 39/67] add bcftools to ci -- some tests use it apparently --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2bafca1cc8a..4f331e3dfa5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -11,7 +11,7 @@ before_script: - sudo apt-get -q -y update # Make sure we have some curl stuff for pycurl which we need for some Python stuff # And the CI report upload needs uuidgen from uuid-runtime - - sudo apt-get -q -y install --no-upgrade docker.io python3-pip python3-virtualenv libcurl4-gnutls-dev python-dev npm nodejs node-gyp uuid-runtime 
libgnutls28-dev doxygen libzstd-dev + - sudo apt-get -q -y install --no-upgrade docker.io python3-pip python3-virtualenv libcurl4-gnutls-dev python-dev npm nodejs node-gyp uuid-runtime libgnutls28-dev doxygen libzstd-dev bcftools - which junit-merge || sudo npm install -g junit-merge # Configure Docker to use a mirror for Docker Hub and restart the daemon - | From 8c3087996e744aa775c10b916f7dfafbecdff2ba Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 5 Jun 2024 07:46:41 -0400 Subject: [PATCH 40/67] try sorting steps --- src/traversal_finder.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/traversal_finder.cpp b/src/traversal_finder.cpp index 2c1a1590c95..b45f79474e8 100644 --- a/src/traversal_finder.cpp +++ b/src/traversal_finder.cpp @@ -887,6 +887,8 @@ pair, vector> PathTraversalFinder::find_pat pair, vector> PathTraversalFinder::find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { vector start_steps = graph.steps_of_handle(snarl_start); + // try to make this stable across different systems + std::sort(start_steps.begin(), start_steps.end()); vector end_steps = graph.steps_of_handle(snarl_end); // use this to skip paths that don't reach the end node From 0caa164691d1b3eccbbf101cc13e16bef9fc0ec9 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 11 Jun 2024 13:00:16 -0400 Subject: [PATCH 41/67] RE tag doesnt work with bcftools; use RD instead --- src/deconstructor.cpp | 8 ++++---- test/t/26_deconstruct.t | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 3315bac1b81..b4936a6b03d 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1025,7 +1025,7 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t v.info["PR"].push_back(std::to_string(ref_info.parent_ref_len)); v.info["RC"].push_back(ref_info.lv0_ref_name); v.info["RS"].push_back(std::to_string(ref_info.lv0_ref_start)); - 
v.info["RE"].push_back(std::to_string(ref_info.lv0_ref_len)); + v.info["RD"].push_back(std::to_string(ref_info.lv0_ref_len)); v.info["RL"].push_back(std::to_string(ref_info.lv0_alt_len)); } @@ -1167,9 +1167,9 @@ string Deconstructor::get_vcf_header() { stream << "\">" << endl; } if (path_to_sample_phase && cluster_threshold < 1) { - stream << "##FORMAT" + stream << "##FORMAT=" << endl; - stream << "##FORMAT" + stream << "##FORMAT=" << endl; } @@ -1187,7 +1187,7 @@ string Deconstructor::get_vcf_header() { stream << "##INFO=" << endl; stream << "##INFO=" << endl; stream << "##INFO=" << endl; - stream << "##INFO=" << endl; + stream << "##INFO=" << endl; stream << "##INFO=" << endl; } if (untangle_allele_traversals) { diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 521f6b9bfb3..ea7ed91754d 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -197,15 +197,15 @@ rm -f nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv # but seems like something that needs reviewing vg snarls nesting/nested_snp_in_nested_ins.gfa -A cactus > nested_snp_in_nested_ins.snarls vg deconstruct nesting/nested_snp_in_nested_ins.gfa -r nested_snp_in_nested_ins.snarls -P x -n > nested_snp_in_nested_ins.vcf -is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=0 | awk '{print $8}') "AC=1,1;AF=0.5,0.5;AN=2;AT=>1>6,>1>2>3>31>33>34>35>5>6,>1>2>3>31>32>34>35>5>6;NS=1;PA=0;PL=1;PR=1;RC=x;RE=1;RL=1;RS=1;LV=0" "INFO tags correct for level-0 site of double-nested SNP" -is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=1 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>2>3>31>33>34>35>5,>2>3>31>32>34>35>5;NS=1;PA=1;PL=8;PR=1;RC=x;RE=1;RL=8;RS=1;LV=1;PS=>1>6" "INFO tags correct for level-1 site of double-nested SNP" -is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=2 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>31>33>34,>31>32>34;NS=1;PA=0;PL=5;PR=5;RC=x;RE=1;RL=8;RS=1;LV=2;PS=>2>5" "INFO tags correct for level-2 site of double-nested SNP" +is $(grep -v ^# 
nested_snp_in_nested_ins.vcf | grep LV=0 | awk '{print $8}') "AC=1,1;AF=0.5,0.5;AN=2;AT=>1>6,>1>2>3>31>33>34>35>5>6,>1>2>3>31>32>34>35>5>6;NS=1;PA=0;PL=1;PR=1;RC=x;RD=1;RL=1;RS=1;LV=0" "INFO tags correct for level-0 site of double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=1 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>2>3>31>33>34>35>5,>2>3>31>32>34>35>5;NS=1;PA=1;PL=8;PR=1;RC=x;RD=1;RL=8;RS=1;LV=1;PS=>1>6" "INFO tags correct for level-1 site of double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=2 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>31>33>34,>31>32>34;NS=1;PA=0;PL=5;PR=5;RC=x;RD=1;RL=8;RS=1;LV=2;PS=>2>5" "INFO tags correct for level-2 site of double-nested SNP" rm -f nested_snp_in_nested_ins.snarls nested_snp_in_nested_ins.vcf vg deconstruct nesting/nested_snp_in_ins_cycle.gfa -P x -n > nested_snp_in_ins_cycle.vcf -printf "A#1#y0\t6\t>2>5\tA\tT\tAC=2;AF=0.5;AN=4;AT=>2>4>5,>2>3>5;NS=2;PA=1;PL=7;PR=1;RC=x;RE=1;RL=7;RS=1;LV=1;PS=>1>6\t0|1\t0|1\n" > nested_snp_in_ins_cycle_truth.tsv -printf "x\t1\t>1>6\tC\tCAAGAAG,CATG,CAAG\tAC=1,2,1;AF=0.25,0.5,0.25;AN=4;AT=>1>6,>1>2>4>5>2>4>5>6,>1>2>3>5>6,>1>2>4>5>6;NS=2;PA=0;PL=1;PR=1;RC=x;RE=1;RL=1;RS=1;LV=0\t1|2\t3|2\n" >> nested_snp_in_ins_cycle_truth.tsv +printf "A#1#y0\t6\t>2>5\tA\tT\tAC=2;AF=0.5;AN=4;AT=>2>4>5,>2>3>5;NS=2;PA=1;PL=7;PR=1;RC=x;RD=1;RL=7;RS=1;LV=1;PS=>1>6\t0|1\t0|1\n" > nested_snp_in_ins_cycle_truth.tsv +printf "x\t1\t>1>6\tC\tCAAGAAG,CATG,CAAG\tAC=1,2,1;AF=0.25,0.5,0.25;AN=4;AT=>1>6,>1>2>4>5>2>4>5>6,>1>2>3>5>6,>1>2>4>5>6;NS=2;PA=0;PL=1;PR=1;RC=x;RD=1;RL=1;RS=1;LV=0\t1|2\t3|2\n" >> nested_snp_in_ins_cycle_truth.tsv grep -v ^# nested_snp_in_ins_cycle.vcf | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $8 "\t" $10 "\t" $11}' > nested_snp_in_ins_cycle.tsv diff nested_snp_in_ins_cycle.tsv nested_snp_in_ins_cycle_truth.tsv is "$?" 
0 "nested deconstruction handles cycle" From cc83403552113d957217c570618b534b9bff574b Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 11 Jun 2024 13:44:27 -0400 Subject: [PATCH 42/67] make star-allele opt-in with -R --- src/deconstructor.cpp | 10 +++++++--- src/deconstructor.hpp | 8 +++++++- src/subcommand/deconstruct_main.cpp | 18 ++++++++++++++---- test/t/26_deconstruct.t | 15 ++++++++++++--- 4 files changed, 40 insertions(+), 11 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index b4936a6b03d..e6dbaff7b4d 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -974,8 +974,10 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // add in the star alleles -- these are alleles that were genotyped in the parent but not // the current allele, and are treated as *'s in VCF. - sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info, - in_nesting_info->sample_to_haplotypes); + if (this->star_allele) { + sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info, + in_nesting_info->sample_to_haplotypes); + } } @@ -1364,7 +1366,8 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand bool long_ref_contig, double cluster_threshold, gbwt::GBWT* gbwt, - bool nested_decomposition) { + bool nested_decomposition, + bool star_allele) { this->graph = graph; this->ref_paths = set(ref_paths.begin(), ref_paths.end()); @@ -1380,6 +1383,7 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand this->cluster_threshold = cluster_threshold; this->gbwt = gbwt; this->nested_decomposition = nested_decomposition; + this->star_allele = star_allele; // the need to use nesting is due to a problem with omp tasks and shared state // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 
1) diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index 16c75d00c51..ee537495a8e 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -47,7 +47,8 @@ class Deconstructor : public VCFOutputCaller { bool long_ref_contig, double cluster_threshold = 1.0, gbwt::GBWT* gbwt = nullptr, - bool nested_decomposition = false); + bool nested_decomposition = false, + bool star_allele = false); private: @@ -201,6 +202,11 @@ class Deconstructor : public VCFOutputCaller { // (which lives in vcfoutputcaller) but with more of an effort to link // the parent and child snarls, as well as better support for nested insertions bool nested_decomposition = false; + + // use *-alleles to represent spanning alleles that do not cross site but do go around it + // ex: a big containing deletion + // only works with nested_decomposition + bool star_allele = false; }; diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index aa976584d87..0d381732533 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -52,7 +52,8 @@ void help_deconstruct(char** argv){ << " -S, --strict-conflicts Drop genotypes when we have more than one haplotype for any given phase (set by default when using GBWT input)." << endl << " -C, --contig-only-ref Only use the CONTIG name (and not SAMPLE#CONTIG#HAPLOTYPE etc) for the reference if possible (ie there is only one reference sample)." << endl << " -L, --cluster F Cluster traversals whose (handle) Jaccard coefficient is >= F together (default: 1.0) [experimental]" << endl - << " -n, --nested Write a nested VCF, including *-alleles and special tags. [experimental]" << endl + << " -n, --nested Write a nested VCF, including special tags. [experimental]" << endl + << " -R, --star-allele Use *-alleles to denote alleles that span but do not cross the site. 
Only works with -n" << endl << " -t, --threads N Use N threads" << endl << " -v, --verbose Print some status messages" << endl << endl; @@ -81,6 +82,7 @@ int main_deconstruct(int argc, char** argv){ bool contig_only_ref = false; double cluster_threshold = 1.0; bool nested = false; + bool star_allele = false; int c; optind = 2; // force optind past command positional argument @@ -104,13 +106,14 @@ int main_deconstruct(int argc, char** argv){ {"contig-only-ref", no_argument, 0, 'C'}, {"cluster", required_argument, 0, 'L'}, {"nested", no_argument, 0, 'n'}, + {"start-allele", no_argument, 0, 'R'}, {"threads", required_argument, 0, 't'}, {"verbose", no_argument, 0, 'v'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uaL:nt:v", + c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uaL:nRt:v", long_options, &option_index); // Detect the end of the options. @@ -171,6 +174,9 @@ int main_deconstruct(int argc, char** argv){ case 'n': nested = true; break; + case 'R': + star_allele = true; + break; case 't': omp_set_num_threads(parse(optarg)); break; @@ -192,9 +198,12 @@ int main_deconstruct(int argc, char** argv){ cerr << "Error [vg deconstruct]: -C cannot be used with -n" << endl; return 1; } + if (star_allele == true && nested == false) { + cerr << "Error [vg deconstruct]: -R can only be used with -n" << endl; + return 1; + } // Read the graph - unique_ptr path_handle_graph_up; unique_ptr gbz_graph; gbwt::GBWT* gbwt_index = nullptr; @@ -372,7 +381,8 @@ int main_deconstruct(int argc, char** argv){ !contig_only_ref, cluster_threshold, gbwt_index, - nested); + nested, + star_allele); return 0; } diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index ea7ed91754d..7e6d0584899 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 36 +plan tests 37 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg 
mod -c - > hla.vg vg index hla.vg -x hla.xg @@ -155,7 +155,7 @@ is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered d rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf -vg deconstruct nesting/nested_snp_in_del.gfa -p x -n > nested_snp_in_del.vcf +vg deconstruct nesting/nested_snp_in_del.gfa -p x -nR > nested_snp_in_del.vcf grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv printf "T\tA,*\t1|2\n" >> nested_snp_in_del_truth.tsv @@ -166,6 +166,15 @@ is $(grep PA=0 nested_snp_in_del.vcf | wc -l) 2 "PA=0 correctly set at both site rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv +vg deconstruct nesting/nested_snp_in_del.gfa -p x -n > nested_snp_in_del.vcf +grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv +printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv +printf "T\tA\t1|.\n" >> nested_snp_in_del_truth.tsv +diff nested_snp_in_del.tsv nested_snp_in_del_truth.tsv +is "$?" 0 "nested deconstruction gets correct allele for snp inside deletion (without -R)" + +rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv + vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n > nested_snp_in_ins.vcf grep -v ^# nested_snp_in_ins.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_ins.tsv printf "A\tT\t0|1\n" > nested_snp_in_ins_truth.tsv @@ -184,7 +193,7 @@ is "$?" 
0 "nested deconstruction gets correct contigs in vcf header for snp insi rm -f nested_snp_in_ins.tsv nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv -vg deconstruct nesting/nested_snp_in_ins2.gfa -p x -n | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10 "\t" $11}' > nested_snp_in_ins2.tsv +vg deconstruct nesting/nested_snp_in_ins2.gfa -p x -nR | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10 "\t" $11}' > nested_snp_in_ins2.tsv printf "A\tT,*\t0|1\t0|2\n" > nested_snp_in_ins2._truth.tsv printf "C\tCAAG,CATG\t1|2\t1|0\n" >> nested_snp_in_ins2._truth.tsv diff nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv From abfe6479bbdf6330c8c167731384e647f411fd53 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 11 Jun 2024 16:15:58 -0400 Subject: [PATCH 43/67] typo --- src/deconstructor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index e6dbaff7b4d..db912596003 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1169,9 +1169,9 @@ string Deconstructor::get_vcf_header() { stream << "\">" << endl; } if (path_to_sample_phase && cluster_threshold < 1) { - stream << "##FORMAT=" + stream << "##FORMAT=" << endl; - stream << "##FORMAT=" + stream << "##FORMAT=" << endl; } From 5c2450cdb5a08a28202d3ff795c8abd07ef57c34 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 21 Nov 2023 13:11:16 -0800 Subject: [PATCH 44/67] Add an --exact-name/-c option to vg filter for when read names are prefixes of each other --- src/readfilter.hpp | 9 ++++++--- src/subcommand/filter_main.cpp | 13 ++++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 7c0fab58daa..4ca7f053574 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -43,6 +43,8 @@ class ReadFilter{ /// TODO: This should be a trie but I don't have one handy. /// Must be sorted for vaguely efficient search. 
vector name_prefixes; + /// Read name must not have anything in it besides the prefix + bool exact_name = false; /// Read must not have a refpos set with a contig name containing a match to any of these vector excluded_refpos_contigs; /// Read must contain at least one of these strings as a subsequence @@ -192,7 +194,8 @@ class ReadFilter{ double get_score(const Read& read) const; /** - * Does the read name have one of the indicated prefixes? + * Does the read name have one of the indicated prefixes? If exact_name is + * set, only finds complete matches of a "prefix" to the whole read name. */ bool matches_name(const Read& read) const; @@ -712,7 +715,7 @@ bool ReadFilter::matches_name(const Read& aln) const { right_match++; } - if (left_match == name_prefixes[left_bound].size() || right_match == name_prefixes[right_bound].size()) { + if (!exact_name && (left_match == name_prefixes[left_bound].size() || right_match == name_prefixes[right_bound].size())) { // We found a match already found = true; } else { @@ -729,7 +732,7 @@ bool ReadFilter::matches_name(const Read& aln) const { center_match++; } - if (center_match == name_prefixes[center].size()) { + if (center_match == name_prefixes[center].size() && (!exact_name || center_match == aln.name().size())) { // We found a hit! 
found = true; break; diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp index af1c5678cbb..f5101f83513 100644 --- a/src/subcommand/filter_main.cpp +++ b/src/subcommand/filter_main.cpp @@ -31,6 +31,7 @@ void help_filter(char** argv) { << " -M, --input-mp-alns input is multipath alignments (GAMP) rather than GAM" << endl << " -n, --name-prefix NAME keep only reads with this prefix in their names [default='']" << endl << " -N, --name-prefixes FILE keep reads with names with one of many prefixes, one per nonempty line" << endl + << " -c, --exact-name match read names exactly instead of by prefix" << endl << " -a, --subsequence NAME keep reads that contain this subsequence" << endl << " -A, --subsequences FILE keep reads that contain one of these subsequences, one per nonempty line" << endl << " -p, --proper-pairs keep reads that are annotated as being properly paired" << endl @@ -73,6 +74,7 @@ int main_filter(int argc, char** argv) { bool input_gam = true; vector name_prefixes; + bool exact_name = false; vector excluded_refpos_contigs; unordered_set excluded_features; vector subsequences; @@ -124,6 +126,7 @@ int main_filter(int argc, char** argv) { {"input-mp-alns", no_argument, 0, 'M'}, {"name-prefix", required_argument, 0, 'n'}, {"name-prefixes", required_argument, 0, 'N'}, + {"exact-name", no_argument, 0, 'c'}, {"subsequence", required_argument, 0, 'a'}, {"subsequences", required_argument, 0, 'A'}, {"proper-pairs", no_argument, 0, 'p'}, @@ -150,14 +153,14 @@ int main_filter(int argc, char** argv) { {"interleaved-all", no_argument, 0, 'I'}, {"min-base-quality", required_argument, 0, 'b'}, {"annotation", required_argument, 0, 'B'}, - {"correctly-mapped", no_argument, 0, 'c'}, + {"correctly-mapped", no_argument, 0, 'g'}, {"complement", no_argument, 0, 'U'}, {"threads", required_argument, 0, 't'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVT:q:E:D:C:d:iIb:B:cUt:", + c = getopt_long 
(argc, argv, "Mn:N:ca:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVT:q:E:D:C:d:iIb:B:gUt:", long_options, &option_index); /* Detect the end of the options. */ @@ -185,6 +188,9 @@ int main_filter(int argc, char** argv) { } }); break; + case 'c': + exact_name = true; + break; case 'a': subsequences.push_back(optarg); break; @@ -324,7 +330,7 @@ int main_filter(int argc, char** argv) { case 'B': annotation = optarg; break; - case 'c': + case 'g': correctly_mapped = true; break; case 'U': @@ -370,6 +376,7 @@ int main_filter(int argc, char** argv) { // template lambda to set parameters auto set_params = [&](auto& filter) { filter.name_prefixes = name_prefixes; + filter.exact_name = exact_name; filter.subsequences = subsequences; filter.excluded_refpos_contigs = excluded_refpos_contigs; filter.excluded_features = excluded_features; From e80da35d6eebc8ee3cd2bda07203c94800a1ebc4 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 11 Jun 2024 17:09:45 -0400 Subject: [PATCH 45/67] Move exact-name to -e and correctly-mapped back to -c --- src/subcommand/filter_main.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp index f5101f83513..1cfe6526f07 100644 --- a/src/subcommand/filter_main.cpp +++ b/src/subcommand/filter_main.cpp @@ -126,7 +126,7 @@ int main_filter(int argc, char** argv) { {"input-mp-alns", no_argument, 0, 'M'}, {"name-prefix", required_argument, 0, 'n'}, {"name-prefixes", required_argument, 0, 'N'}, - {"exact-name", no_argument, 0, 'c'}, + {"exact-name", no_argument, 0, 'e'}, {"subsequence", required_argument, 0, 'a'}, {"subsequences", required_argument, 0, 'A'}, {"proper-pairs", no_argument, 0, 'p'}, @@ -153,14 +153,14 @@ int main_filter(int argc, char** argv) { {"interleaved-all", no_argument, 0, 'I'}, {"min-base-quality", required_argument, 0, 'b'}, {"annotation", required_argument, 0, 'B'}, - {"correctly-mapped", no_argument, 0, 'g'}, + {"correctly-mapped", no_argument, 0, 'c'}, 
{"complement", no_argument, 0, 'U'}, {"threads", required_argument, 0, 't'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "Mn:N:ca:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVT:q:E:D:C:d:iIb:B:gUt:", + c = getopt_long (argc, argv, "Mn:N:ea:A:pPX:F:s:r:Od:fauo:m:Sx:vVT:q:E:D:C:d:iIb:B:cUt:", long_options, &option_index); /* Detect the end of the options. */ @@ -188,7 +188,7 @@ int main_filter(int argc, char** argv) { } }); break; - case 'c': + case 'e': exact_name = true; break; case 'a': @@ -330,7 +330,7 @@ int main_filter(int argc, char** argv) { case 'B': annotation = optarg; break; - case 'g': + case 'c': correctly_mapped = true; break; case 'U': From c2870086eaa4db6e76fedd76eabcdc0977750d44 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 22 Nov 2023 13:03:46 -0800 Subject: [PATCH 46/67] Fix exact name search to not miss matches on region ends --- src/readfilter.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 4ca7f053574..07167217a45 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -715,7 +715,8 @@ bool ReadFilter::matches_name(const Read& aln) const { right_match++; } - if (!exact_name && (left_match == name_prefixes[left_bound].size() || right_match == name_prefixes[right_bound].size())) { + if ((left_match == name_prefixes[left_bound].size() && (!exact_name || left_match == aln.name().size())) || + (right_match == name_prefixes[right_bound].size() && (!exact_name || right_match == aln.name().size()))) { // We found a match already found = true; } else { From 9c8e7209291216908c91dd6acfd6c45d519955ab Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 11 Jun 2024 17:25:11 -0400 Subject: [PATCH 47/67] Fix --exact-name short option in help --- src/subcommand/filter_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp index 1cfe6526f07..f9054aabd2c 100644 --- 
a/src/subcommand/filter_main.cpp +++ b/src/subcommand/filter_main.cpp @@ -31,7 +31,7 @@ void help_filter(char** argv) { << " -M, --input-mp-alns input is multipath alignments (GAMP) rather than GAM" << endl << " -n, --name-prefix NAME keep only reads with this prefix in their names [default='']" << endl << " -N, --name-prefixes FILE keep reads with names with one of many prefixes, one per nonempty line" << endl - << " -c, --exact-name match read names exactly instead of by prefix" << endl + << " -e, --exact-name match read names exactly instead of by prefix" << endl << " -a, --subsequence NAME keep reads that contain this subsequence" << endl << " -A, --subsequences FILE keep reads that contain one of these subsequences, one per nonempty line" << endl << " -p, --proper-pairs keep reads that are annotated as being properly paired" << endl From c15c10e3e92ab35342365af812efecd4f81c8a15 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Wed, 12 Jun 2024 15:02:02 -0700 Subject: [PATCH 48/67] remove get_previous_step from position assignment --- src/surjector.cpp | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index 83f008ce335..c17d485abaa 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -3016,7 +3016,8 @@ using namespace std; << subgraph_bases << " bp strand split subgraph for read " << source.name() << "; suppressing further warnings." << endl; } - return move(make_null_alignment(source)); + surjected = move(make_null_alignment(source)); + return surjected; } // compute the connectivity between the path chunks @@ -3034,7 +3035,8 @@ using namespace std; // we don't overlap this reference path at all or we filtered out all of the path chunks, so just make a sentinel if (mp_aln_graph.empty()) { - return move(make_null_alignment(source)); + surjected = move(make_null_alignment(source)); + return surjected; } // TODO: is this necessary in a linear graph? 
@@ -3137,22 +3139,24 @@ using namespace std; #ifdef debug_anchored_surject cerr << "looking for path range on " << (rev_strand ? "reverse" : "forward") << " strand, for " << surj_path.mapping_size() << " mappings" << endl; #endif - step_handle_t step = rev_strand ? graph->get_step_at_position(path_handle, ref_path_interval.second) - : graph->get_step_at_position(path_handle, ref_path_interval.first); - step_handle_t end = rev_strand ? graph->get_previous_step(graph->get_step_at_position(path_handle, ref_path_interval.first)) - : graph->get_next_step(graph->get_step_at_position(path_handle, ref_path_interval.second)); + + step_handle_t step = graph->get_step_at_position(path_handle, ref_path_interval.first); + step_handle_t end = graph->get_next_step(graph->get_step_at_position(path_handle, ref_path_interval.second)); // walk the identified interval - for (; step != end; step = rev_strand ? graph->get_previous_step(step) : graph->get_next_step(step)) { - const auto& pos = surj_path.mapping(mappings_matched).position(); + for (; step != end; step = graph->get_next_step(step)) { + size_t idx = rev_strand ? 
surj_path.mapping_size() - mappings_matched - 1 : mappings_matched; + const auto& pos = surj_path.mapping(idx).position(); handle_t handle = graph->get_handle_of_step(step); if (graph->get_id(handle) == pos.node_id() && ((graph->get_is_reverse(handle) != pos.is_reverse()) == rev_strand)) { // we found the next position we were expecting to - if (mappings_matched == 0) { + if (mappings_matched == 0 || rev_strand) { path_range_out.first = step; } - path_range_out.second = step; + if (mappings_matched == 0 || !rev_strand) { + path_range_out.second = step; + } ++mappings_matched; #ifdef debug_anchored_surject cerr << "\tmatch at node " << graph->get_id(handle) << " " << graph->get_is_reverse(handle) << " at position " << graph->get_position_of_step(step) << endl; @@ -3172,7 +3176,7 @@ using namespace std; mappings_matched = 0; // and go back to where we started on the path // TODO: this is potentially quadratic, there are faster algorithms - step = path_range_out.first; + step = rev_strand ? path_range_out.second : path_range_out.first; } } } @@ -3183,7 +3187,7 @@ using namespace std; mappings_matched = 0; // and go back to where we started on the path // TODO: this is potentially quadratic, there are faster algorithms - step = path_range_out.first; + step = rev_strand ? 
path_range_out.second : path_range_out.first; } #ifdef debug_anchored_surject cerr << "\tmismatch at node " << graph->get_id(handle) << " " << graph->get_is_reverse(handle) << " at position " << graph->get_position_of_step(step) << endl; @@ -3205,6 +3209,10 @@ using namespace std; cerr << "Surjected read dump: " << pb2json(surjected) << endl; exit(1); } +#ifdef debug_anchored_surject + // TODO: dump out path_range_out + cerr << "identified path range between steps on " << graph->get_id(graph->get_handle_of_step(path_range_out.first)) << " " << graph->get_position_of_step(path_range_out.first) << ", " << graph->get_id(graph->get_handle_of_step(path_range_out.second)) << " " << graph->get_position_of_step(path_range_out.second) << '\n'; +#endif } else { // sentinel to indicate that surjection is unmapped From ff7e62bf4f95241ab8fd0de09ad1e623649fad63 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Wed, 12 Jun 2024 15:41:16 -0700 Subject: [PATCH 49/67] switch to STL paths internally in surject --- src/multipath_alignment.cpp | 10 ++++---- src/multipath_alignment.hpp | 30 +++++++++++----------- src/multipath_alignment_graph.cpp | 23 +++++++++-------- src/multipath_alignment_graph.hpp | 8 +++--- src/path.hpp | 22 ++++++++-------- src/surjector.cpp | 42 +++++++++++++------------------ src/surjector.hpp | 3 ++- 7 files changed, 67 insertions(+), 71 deletions(-) diff --git a/src/multipath_alignment.cpp b/src/multipath_alignment.cpp index 3f5c0d4d51f..ecea6cf4fc9 100644 --- a/src/multipath_alignment.cpp +++ b/src/multipath_alignment.cpp @@ -23,19 +23,19 @@ using namespace vg::io; namespace vg { - multipath_alignment_t::multipath_alignment_t() : _mapping_quality(0) { + multipath_alignment_t::multipath_alignment_t() noexcept : _mapping_quality(0) { // i don't understand why this is necessary, but somehow mapq is getting defaulted to 160? 
} - multipath_alignment_t::multipath_alignment_t(const multipath_alignment_t& other) { + multipath_alignment_t::multipath_alignment_t(const multipath_alignment_t& other) noexcept { *this = other; } - multipath_alignment_t::multipath_alignment_t(multipath_alignment_t&& other) { + multipath_alignment_t::multipath_alignment_t(multipath_alignment_t&& other) noexcept { *this = move(other); } - multipath_alignment_t& multipath_alignment_t::operator=(const multipath_alignment_t& other) { + multipath_alignment_t& multipath_alignment_t::operator=(const multipath_alignment_t& other) noexcept { _sequence = other._sequence; _quality = other._quality; _subpath = other._subpath; @@ -64,7 +64,7 @@ namespace vg { return *this; } - multipath_alignment_t& multipath_alignment_t::operator=(multipath_alignment_t&& other) { + multipath_alignment_t& multipath_alignment_t::operator=(multipath_alignment_t&& other) noexcept { if (this != &other) { _sequence = move(other._sequence); _quality = move(other._quality); diff --git a/src/multipath_alignment.hpp b/src/multipath_alignment.hpp index 2efa7f27796..134473ceb44 100644 --- a/src/multipath_alignment.hpp +++ b/src/multipath_alignment.hpp @@ -28,12 +28,12 @@ namespace vg { class connection_t { public: - connection_t() = default; - connection_t(const connection_t&) = default; - connection_t(connection_t&&) = default; + connection_t() noexcept = default; + connection_t(const connection_t&) noexcept = default; + connection_t(connection_t&&) noexcept = default; ~connection_t() = default; - connection_t& operator=(const connection_t&) = default; - connection_t& operator=(connection_t&&) = default; + connection_t& operator=(const connection_t&) noexcept = default; + connection_t& operator=(connection_t&&) noexcept = default; inline int32_t next() const; inline void set_next(int32_t n); inline int32_t score() const; @@ -48,12 +48,12 @@ namespace vg { */ class subpath_t { public: - subpath_t() = default; - subpath_t(const subpath_t&) = default; - 
subpath_t(subpath_t&&) = default; + subpath_t() noexcept = default; + subpath_t(const subpath_t&) noexcept = default; + subpath_t(subpath_t&&) noexcept = default; ~subpath_t() = default; - subpath_t& operator=(const subpath_t&) = default; - subpath_t& operator=(subpath_t&&) = default; + subpath_t& operator=(const subpath_t&) noexcept = default; + subpath_t& operator=(subpath_t&&) noexcept = default; inline const path_t& path() const; inline path_t* mutable_path(); inline bool has_path() const; @@ -86,12 +86,12 @@ namespace vg { // TODO: the metadata could be removed and only added to the protobuf at serialization time class multipath_alignment_t { public: - multipath_alignment_t(); - multipath_alignment_t(const multipath_alignment_t& other); - multipath_alignment_t(multipath_alignment_t&& other); + multipath_alignment_t() noexcept; + multipath_alignment_t(const multipath_alignment_t& other) noexcept; + multipath_alignment_t(multipath_alignment_t&& other) noexcept; ~multipath_alignment_t(); - multipath_alignment_t& operator=(const multipath_alignment_t& other); - multipath_alignment_t& operator=(multipath_alignment_t&& other); + multipath_alignment_t& operator=(const multipath_alignment_t& other) noexcept; + multipath_alignment_t& operator=(multipath_alignment_t&& other) noexcept; inline const string& sequence() const; inline string* mutable_sequence(); inline void set_sequence(const string& s); diff --git a/src/multipath_alignment_graph.cpp b/src/multipath_alignment_graph.cpp index a9cf989d726..6af414160c6 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -58,7 +58,7 @@ namespace vg { } MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, - const vector, Path>>& path_chunks, + const vector, path_t>>& path_chunks, const Alignment& alignment, const function(id_t)>& project, const unordered_multimap>& injection_trans, bool realign_Ns, bool preserve_tail_anchors, vector* path_node_provenance) { @@ -75,7 
+75,7 @@ namespace vg { } MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, - const vector, Path>>& path_chunks, + const vector, path_t>>& path_chunks, const Alignment& alignment, const function(id_t)>& project, bool realign_Ns, bool preserve_tail_anchors, vector* path_node_provenance) : MultipathAlignmentGraph(graph, path_chunks, alignment, project, @@ -86,7 +86,7 @@ namespace vg { } MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, - const vector, Path>>& path_chunks, + const vector, path_t>>& path_chunks, const Alignment& alignment, const unordered_map>& projection_trans, bool realign_Ns, bool preserve_tail_anchors, vector* path_node_provenance) : MultipathAlignmentGraph(graph, path_chunks, alignment, create_projector(projection_trans), @@ -172,8 +172,9 @@ namespace vg { } // shim the aligned path into the path chunks constructor to make a node for it - vector, Path>> path_holder; - path_holder.emplace_back(make_pair(alignment.sequence().begin(), alignment.sequence().end()), alignment.path()); + vector, path_t>> path_holder; + path_holder.emplace_back(make_pair(alignment.sequence().begin(), alignment.sequence().end()), path_t()); + from_proto_path(alignment.path(), path_holder.front().second); create_path_chunk_nodes(graph, path_holder, alignment, project, injection_trans); // cut the snarls out of the aligned path so we can realign through them @@ -205,7 +206,7 @@ namespace vg { // Nothing to do } - void MultipathAlignmentGraph::create_path_chunk_nodes(const HandleGraph& graph, const vector, Path>>& path_chunks, + void MultipathAlignmentGraph::create_path_chunk_nodes(const HandleGraph& graph, const vector, path_t>>& path_chunks, const Alignment& alignment, const function(id_t)>& project, const unordered_multimap>& injection_trans, vector* path_node_provenance) { @@ -216,7 +217,7 @@ namespace vg { cerr << "performing DFS to walk out path " << pb2json(path_chunk.second) << endl; #endif - const Path& path = 
path_chunk.second; + const auto& path = path_chunk.second; auto range = injection_trans.equal_range(path.mapping(0).position().node_id()); for (auto iter = range.first; iter != range.second; iter++) { @@ -248,7 +249,7 @@ namespace vg { #endif pair projected_trav = project(graph.get_id(trav)); - const Position& pos = path.mapping(stack.size() - 1).position(); + const auto& pos = path.mapping(stack.size() - 1).position(); if (projected_trav.first == pos.node_id() && projected_trav.second == (projected_trav.second != graph.get_is_reverse(trav))) { @@ -291,8 +292,8 @@ namespace vg { // move over the path for (size_t i = 0; i < path.mapping_size(); i++) { - const Mapping& mapping = path.mapping(i); - const Position& position = mapping.position(); + const auto& mapping = path.mapping(i); + const auto& position = mapping.position(); auto& stack_record = stack[i]; handle_t& trav = stack_record.second[stack_record.first - 1]; @@ -307,7 +308,7 @@ namespace vg { new_position->set_offset(position.offset()); for (int64_t j = 0; j < mapping.edit_size(); j++) { - from_proto_edit(mapping.edit(j), *new_mapping->add_edit()); + *new_mapping->add_edit() = mapping.edit(j); } } diff --git a/src/multipath_alignment_graph.hpp b/src/multipath_alignment_graph.hpp index 90b3de547a7..752c9f56cd0 100644 --- a/src/multipath_alignment_graph.hpp +++ b/src/multipath_alignment_graph.hpp @@ -85,19 +85,19 @@ namespace vg { /// Construct a graph of the reachability between aligned chunks in a linearized /// path graph. Produces a graph with reachability edges. 
- MultipathAlignmentGraph(const HandleGraph& graph, const vector, Path>>& path_chunks, + MultipathAlignmentGraph(const HandleGraph& graph, const vector, path_t>>& path_chunks, const Alignment& alignment, const function(id_t)>& project, const unordered_multimap>& injection_trans, bool realign_Ns = true, bool preserve_tail_anchors = false, vector* path_node_provenance = nullptr); /// Same as the previous constructor, but construct injection_trans implicitly and temporarily - MultipathAlignmentGraph(const HandleGraph& graph, const vector, Path>>& path_chunks, + MultipathAlignmentGraph(const HandleGraph& graph, const vector, path_t>>& path_chunks, const Alignment& alignment, const unordered_map>& projection_trans, bool realign_Ns = true, bool preserve_tail_anchors = false, vector* path_node_provenance = nullptr); /// Same as the previous constructor, but construct injection_trans implicitly and temporarily /// and using a lambda for a projector - MultipathAlignmentGraph(const HandleGraph& graph, const vector, Path>>& path_chunks, + MultipathAlignmentGraph(const HandleGraph& graph, const vector, path_t>>& path_chunks, const Alignment& alignment, const function(id_t)>& project, bool realign_Ns = true, bool preserve_tail_anchors = false, vector* path_node_provenance = nullptr); @@ -261,7 +261,7 @@ namespace vg { int64_t* removed_end_from_length = nullptr); /// Add the path chunks as nodes to the connectivity graph - void create_path_chunk_nodes(const HandleGraph& graph, const vector, Path>>& path_chunks, + void create_path_chunk_nodes(const HandleGraph& graph, const vector, path_t>>& path_chunks, const Alignment& alignment, const function(id_t)>& project, const unordered_multimap>& injection_trans, vector* path_node_provenance = nullptr); diff --git a/src/path.hpp b/src/path.hpp index b50208186b0..7d7800bf03d 100644 --- a/src/path.hpp +++ b/src/path.hpp @@ -1,4 +1,4 @@ - #ifndef VG_PATH_HPP_INCLUDED +#ifndef VG_PATH_HPP_INCLUDED #define VG_PATH_HPP_INCLUDED #include @@ 
-403,12 +403,12 @@ class edit_t { // the mapping_t name is already taken class path_mapping_t { public: - path_mapping_t() = default; - path_mapping_t(const path_mapping_t&) = default; - path_mapping_t(path_mapping_t&&) = default; + path_mapping_t() noexcept = default; + path_mapping_t(const path_mapping_t&) noexcept = default; + path_mapping_t(path_mapping_t&&) noexcept = default; ~path_mapping_t() = default; - path_mapping_t& operator=(const path_mapping_t&) = default; - path_mapping_t& operator=(path_mapping_t&&) = default; + path_mapping_t& operator=(const path_mapping_t&) noexcept = default; + path_mapping_t& operator=(path_mapping_t&&) noexcept = default; inline const position_t& position() const; inline position_t* mutable_position(); inline const vector& edit() const; @@ -426,12 +426,12 @@ class path_mapping_t { class path_t { public: - path_t() = default; - path_t(const path_t&) = default; - path_t(path_t&&) = default; + path_t() noexcept = default; + path_t(const path_t&) noexcept = default; + path_t(path_t&&) noexcept = default; ~path_t() = default; - path_t& operator=(const path_t&) = default; - path_t& operator=(path_t&&) = default; + path_t& operator=(const path_t&) noexcept = default; + path_t& operator=(path_t&&) noexcept = default; inline const vector& mapping() const; inline const path_mapping_t& mapping(size_t i) const; inline vector* mutable_mapping(); diff --git a/src/surjector.cpp b/src/surjector.cpp index c17d485abaa..824815fcf59 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -1687,15 +1687,14 @@ using namespace std; final_position.offset() + mapping_from_length(*final_mapping) == first_position.offset()) { for (const auto& edit : first_mapping.edit()) { - *final_mapping->add_edit() = edit; + from_proto_edit(edit, *final_mapping->add_edit()); } ++k; } // copy over the rest of the mappings for (; k < n; ++k) { auto mapping = path_chunks[i].second.add_mapping(); - *mapping = aln.path().mapping(k); - 
mapping->set_rank(path_chunks[i].second.mapping_size()); + from_proto_mapping(aln.path().mapping(k), *mapping); } #ifdef debug_constrictions @@ -1735,11 +1734,10 @@ using namespace std; #endif // copy the repair alignment - Path concat_path; + path_t concat_path; for (size_t k = n; k < aln.path().mapping_size(); ++k) { auto mapping = concat_path.add_mapping(); - *mapping = aln.path().mapping(k); - mapping->set_rank(concat_path.mapping_size()); + from_proto_mapping(aln.path().mapping(k), *mapping); } // check if we need to merge the first and last mappings @@ -1761,11 +1759,10 @@ using namespace std; for (; k < path_chunks[i].second.mapping_size(); ++k) { auto mapping = concat_path.add_mapping(); *mapping = path_chunks[i].second.mapping(k); - mapping->set_rank(concat_path.mapping_size()); } // replace the original path - path_chunks[i].second = concat_path; + path_chunks[i].second = std::move(concat_path); #ifdef debug_constrictions cerr << "extended right path " << i << " to " << pb2json(path_chunks[i].second) << endl; @@ -1932,7 +1929,6 @@ using namespace std; for (size_t l = begin_idx; l < end_idx; ++l) { auto mapping = path_chunk.second.add_mapping(); *mapping = path_chunks[i].second.mapping(l); - mapping->set_rank(l - begin_idx + 1); } // identify the read interval path_chunk.first.first = read_begin; @@ -2187,12 +2183,12 @@ using namespace std; surjected.set_mapping_quality(src_mapping_quality); auto surj_subpath = surjected.add_subpath(); - from_proto_path(path_chunks.front().second, *surj_subpath->mutable_path()); + *surj_subpath->mutable_path() = move(path_chunks.front().second); Alignment aln; aln.set_sequence(src_sequence); aln.set_quality(src_quality); - *aln.mutable_path() = move(path_chunks.front().second); + to_proto_path(surj_subpath->path(), *aln.mutable_path()); surj_subpath->set_score(get_aligner(!src_quality.empty())->score_contiguous_alignment(aln)); surjected.add_start(0); @@ -2968,7 +2964,7 @@ using namespace std; // just copy it over 
surjected.set_sequence(source.sequence()); surjected.set_quality(source.quality()); - *surjected.mutable_path() = path_chunks.front().second; + to_proto_path(path_chunks.front().second, *surjected.mutable_path()); surjected.set_score(get_aligner(!source.quality().empty())->score_contiguous_alignment(surjected)); } @@ -3377,10 +3373,10 @@ using namespace std; prev_edit->set_sequence(prev_edit->sequence() + next_edit.sequence()); } else { - to_proto_edit(next_edit, *prev_mapping->add_edit()); + *prev_mapping->add_edit() = next_edit; } for (size_t k = 1; k < next_mapping.edit_size(); ++k) { - to_proto_edit(next_mapping.edit(k), *prev_mapping->add_edit()); + *prev_mapping->add_edit() = next_mapping.edit(k); } merged_mapping = true; @@ -3388,7 +3384,7 @@ using namespace std; } if (!merged_mapping) { // make a new mapping - to_proto_mapping(next_mapping, *path_chunk.add_mapping()); + *path_chunk.add_mapping() = next_mapping; } } @@ -3643,10 +3639,9 @@ using namespace std; // move the end of the sequence out aln_chunk.first.second = source.sequence().begin() + through_to_length; - Mapping* mapping = aln_chunk.second.add_mapping(); + auto mapping = aln_chunk.second.add_mapping(); // add this mapping - *mapping = path.mapping(i); - mapping->set_rank(aln_chunk.second.mapping(aln_chunk.second.mapping_size() - 2).rank() + 1); + from_proto_mapping(path.mapping(i), *mapping); // in the next iteration, this step should point into the chunk it just extended next_extending_steps[make_pair(step, path_strand)] = extending_steps[make_pair(prev_step, path_strand)]; @@ -3668,9 +3663,8 @@ using namespace std; aln_chunk.first.second = source.sequence().begin() + through_to_length; // and with the first mapping - Mapping* mapping = aln_chunk.second.add_mapping(); - *mapping = path.mapping(i); - mapping->set_rank(1); + auto mapping = aln_chunk.second.add_mapping(); + from_proto_mapping(path.mapping(i), *mapping); // keep track of where this chunk is in the vector and which step it came from 
// for the next iteration @@ -4092,7 +4086,7 @@ using namespace std; size_t right_overhang = no_right_expansion ? 0 : (get_aligner()->longest_detectable_gap(source, path_chunk.first.second) + (source.sequence().end() - path_chunk.first.second)); - const Position& first_pos = path_chunk.second.mapping(0).position(); + const auto& first_pos = path_chunk.second.mapping(0).position(); if (rev_strand) { size_t path_offset = (graph->get_position_of_step(ref_chunk.first) + graph->get_length(graph->get_handle_of_step(ref_chunk.first)) @@ -4111,8 +4105,8 @@ using namespace std; } } - const Mapping& final_mapping = path_chunk.second.mapping(path_chunk.second.mapping_size() - 1); - const Position& final_pos = final_mapping.position(); + const auto& final_mapping = path_chunk.second.mapping(path_chunk.second.mapping_size() - 1); + const auto& final_pos = final_mapping.position(); if (rev_strand) { size_t path_offset = (graph->get_position_of_step(ref_chunk.second) + graph->get_length(graph->get_handle_of_step(ref_chunk.second)) diff --git a/src/surjector.hpp b/src/surjector.hpp index c1536abb03b..d751c798b27 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -14,6 +14,7 @@ #include "aligner.hpp" #include "handle.hpp" +#include "path.hpp" #include #include "multipath_alignment.hpp" @@ -97,7 +98,7 @@ using namespace std; bool preserve_deletions = false) const; /// a local type that represents a read interval matched to a portion of the alignment path - using path_chunk_t = pair, Path>; + using path_chunk_t = pair, path_t>; /// the minimum length deletion that the spliced algorithm will interpret as a splice event int64_t min_splice_length = 20; From 2d67e4ecaf70029a4295929f900e0d79f87ea1ef Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Wed, 12 Jun 2024 19:47:01 -0700 Subject: [PATCH 50/67] use only get_next_step when getting path overlapping chunks --- src/surjector.cpp | 148 ++++++++++++++++++++++++++++------------------ 1 file changed, 89 insertions(+), 59 
deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index 824815fcf59..7a1f3ed0b92 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -239,7 +239,7 @@ using namespace std; } cerr << endl; cerr << "\tpath interval " << graph->get_position_of_step(surjection_record.second.second[i].first) << " - " << graph->get_position_of_step(surjection_record.second.second[i].second) << endl; - cerr << "\t" << pb2json(anchor.second) << endl; + cerr << "\t" << debug_string(anchor.second) << endl; } if (connections.count(surjection_record.first)) { cerr << "\tconnections" << endl; @@ -1698,7 +1698,7 @@ using namespace std; } #ifdef debug_constrictions - cerr << "extended left path " << i << " to " << pb2json(path_chunks[i].second) << endl; + cerr << "extended left path " << i << " to " << debug_string(path_chunks[i].second) << endl; #endif ++left; } @@ -1765,7 +1765,7 @@ using namespace std; path_chunks[i].second = std::move(concat_path); #ifdef debug_constrictions - cerr << "extended right path " << i << " to " << pb2json(path_chunks[i].second) << endl; + cerr << "extended right path " << i << " to " << debug_string(path_chunks[i].second) << endl; #endif ++right; } @@ -1875,7 +1875,7 @@ using namespace std; // so we record an overlap overlaps.emplace_back(j, k); #ifdef debug_spliced_surject - cerr << "path chunk " << i << " overlaps " << j << " by " << to_walk << " on read, marking an overlap to split before mapping " << k << " at " << pb2json(path_chunks[j].second.mapping(k)) << endl; + cerr << "path chunk " << i << " overlaps " << j << " by " << to_walk << " on read, marking an overlap to split before mapping " << k << " at " << debug_string(path_chunks[j].second.mapping(k)) << endl; #endif } } @@ -1948,7 +1948,7 @@ using namespace std; #ifdef debug_spliced_surject cerr << "next split for chunk " << i << " as " << split_path_chunks.size() - 1 << ", consisting of " << endl; cerr << "\t" << string(path_chunk.first.first, path_chunk.first.second) << endl; - cerr 
<< "\t" << pb2json(path_chunk.second) << endl; + cerr << "\t" << debug_string(path_chunk.second) << endl; cerr << "\t" << graph->get_position_of_step(ref_chunk.first) << " : " << graph->get_position_of_step(ref_chunk.second) << endl; #endif } @@ -1959,7 +1959,7 @@ using namespace std; #ifdef debug_spliced_surject cerr << "no splits on chunk " << i << ", add as " << split_path_chunks.size() << endl; cerr << "\t" << string(path_chunks[i].first.first, path_chunks[i].first.second) << endl; - cerr << "\t" << pb2json(path_chunks[i].second) << endl; + cerr << "\t" << debug_string(path_chunks[i].second) << endl; cerr << "\t" << graph->get_position_of_step(ref_chunks[i].first) << " : " << graph->get_position_of_step(ref_chunks[i].second) << endl; #endif split_path_chunks.emplace_back(move(path_chunks[i])); @@ -2923,7 +2923,7 @@ using namespace std; cerr << "using overlap chunks on path " << graph->get_path_name(path_handle) << " strand " << rev_strand << ", performing realigning surjection" << endl; cerr << "chunks:" << endl; for (size_t i = 0; i < path_chunks.size(); ++i) { - cerr << "\t" << string(path_chunks[i].first.first, path_chunks[i].first.second) << ", " << pb2json(path_chunks[i].second) << endl; + cerr << "\t" << string(path_chunks[i].first.first, path_chunks[i].first.second) << ", " << debug_string(path_chunks[i].second) << endl; } #endif @@ -3433,7 +3433,7 @@ using namespace std; // remember that we've already emitted all the mappings currently on the stack added_new_mappings = false; #ifdef debug_multipath_surject - cerr << "converted stack into path " << pb2json(path_chunk) << endl; + cerr << "converted stack into path " << debug_string(path_chunk) << endl; cerr << "read interval is " << (chunk.first.first - source.sequence().begin()) << ":" << (chunk.first.second - source.sequence().begin()) << " " << string(chunk.first.first, chunk.first.second) << endl; #endif } @@ -3561,7 +3561,7 @@ using namespace std; // for each path that we're extending, the previous 
step and the strand we were at on it // mapped to the index of that path chunk in the path's vector - unordered_map, size_t> extending_steps; + unordered_map fwd_extending_steps, rev_extending_steps; int64_t through_to_length = 0; for (size_t i = 0; i < path.mapping_size(); i++) { @@ -3576,7 +3576,7 @@ using namespace std; cerr << "looking for paths on mapping " << i << " at position " << make_pos_t(pos) << endl; #endif - unordered_map, size_t> next_extending_steps; + unordered_map next_fwd_extending_steps, next_rev_extending_steps; for (const step_handle_t& step : graph->steps_of_handle(handle)) { @@ -3601,84 +3601,114 @@ using namespace std; // the path does, then the read runs along the path in reverse. bool path_strand = graph->get_is_reverse(handle) != graph->get_is_reverse(graph->get_handle_of_step(step)); - step_handle_t prev_step = path_strand ? graph->get_next_step(step) : graph->get_previous_step(step); - #ifdef debug_anchored_surject - cerr << "path strand is " << (path_strand ? "rev" : "fwd") << ", prev step is "; - if (prev_step == graph->path_end(path_handle)) { - cerr << " path end"; - } - else if (prev_step == graph->path_front_end(path_handle)) { - cerr << " path front end"; - } - else { - cerr << graph->get_id(graph->get_handle_of_step(prev_step)) << (graph->get_is_reverse(graph->get_handle_of_step(prev_step)) ? "-" : "+"); - } - cerr << endl; - cerr << "possible extensions from: " << endl; - for (const auto& record : extending_steps) { - cerr << "\t" << "chunk " << record.second << " at " << graph->get_id(graph->get_handle_of_step(record.first.first)) << (graph->get_is_reverse(graph->get_handle_of_step(record.first.first)) ? "-" : "+") << " on " << graph->get_path_name(graph->get_path_handle_of_step(record.first.first)) << " " << (record.first.second ? "rev" : "fwd") << endl; - } + cerr << "path strand is rev? 
" << path_strand << endl; #endif - auto& path_chunks = to_return[make_pair(path_handle, path_strand)]; + auto& next_extending_steps = path_strand ? next_rev_extending_steps : next_fwd_extending_steps; - if (extending_steps.count(make_pair(prev_step, path_strand))) { - // we are extending from the previous step, so we continue with the extension + next_extending_steps[step] = numeric_limits::max(); + } + + // TODO: forward and reverse code is repetitive + + // extend forward strand steps + for (const auto& extending_step : fwd_extending_steps) { + auto next_step = graph->get_next_step(extending_step.first); + auto it = next_fwd_extending_steps.find(next_step); + if (it != next_fwd_extending_steps.end()) { + it->second = extending_step.second; + auto& path_chunks = to_return[make_pair(graph->get_path_handle_of_step(extending_step.first), false)]; + auto& aln_chunk = path_chunks.first[extending_step.second]; + auto& ref_chunk = path_chunks.second[extending_step.second]; - size_t chunk_idx = extending_steps[make_pair(prev_step, path_strand)]; - auto& aln_chunk = path_chunks.first[chunk_idx]; - auto& ref_chunk = path_chunks.second[chunk_idx]; + // extend the range of the path on the reference + ref_chunk.second = it->first; + // move the end of the sequence out + aln_chunk.first.second = source.sequence().begin() + through_to_length; + // add this mapping + from_proto_mapping(path.mapping(i), *aln_chunk.second.add_mapping()); #ifdef debug_anchored_surject - cerr << "comes after chunk " << chunk_idx << endl; + cerr << "step on " << graph->get_id(graph->get_handle_of_step(it->first)) << ", pos " << graph->get_position_of_step(it->first) << " comes after forward chunk " << it->second << endl; #endif + } + } + // initialize new chunks for steps that did not extend + for (pair& extended_step : next_fwd_extending_steps) { + if (extended_step.second == numeric_limits::max()) { - // extend the range of the path on the reference - ref_chunk.second = step; + auto& path_chunks = 
to_return[make_pair(graph->get_path_handle_of_step(extended_step.first), false)]; + extended_step.second = path_chunks.first.size(); + + path_chunks.first.emplace_back(); + path_chunks.second.emplace_back(); + auto& aln_chunk = path_chunks.first.back(); + auto& ref_chunk = path_chunks.second.back(); + // init the ref interval with the interval along the embedded path + ref_chunk.first = extended_step.first; + ref_chunk.second = extended_step.first; + // init the new chunk with the sequence interval + aln_chunk.first.first = source.sequence().begin() + before_to_length; + aln_chunk.first.second = source.sequence().begin() + through_to_length; + // and with the first mapping + from_proto_mapping(path.mapping(i), *aln_chunk.second.add_mapping()); + +#ifdef debug_anchored_surject + cerr << "step on " << graph->get_id(graph->get_handle_of_step(extended_step.first)) << ", pos " << graph->get_position_of_step(extended_step.first) << " has no preceeding forward chunk, so start new chunk " << path_chunks.first.size() - 1 << endl; +#endif + } + } + for (pair& extended_step : next_rev_extending_steps) { + + auto& path_chunks = to_return[make_pair(graph->get_path_handle_of_step(extended_step.first), true)]; + + auto prev_step = graph->get_next_step(extended_step.first); + auto it = rev_extending_steps.find(prev_step); + if (it != rev_extending_steps.end()) { + extended_step.second = it->second; + auto& aln_chunk = path_chunks.first[extended_step.second]; + auto& ref_chunk = path_chunks.second[extended_step.second]; + + // extend the range of the path on the reference + ref_chunk.second = extended_step.first; // move the end of the sequence out aln_chunk.first.second = source.sequence().begin() + through_to_length; - auto mapping = aln_chunk.second.add_mapping(); // add this mapping - from_proto_mapping(path.mapping(i), *mapping); + from_proto_mapping(path.mapping(i), *aln_chunk.second.add_mapping()); - // in the next iteration, this step should point into the chunk it just 
extended - next_extending_steps[make_pair(step, path_strand)] = extending_steps[make_pair(prev_step, path_strand)]; +#ifdef debug_anchored_surject + cerr << "step on " << graph->get_id(graph->get_handle_of_step(extended_step.first)) << ", pos " << graph->get_position_of_step(extended_step.first) << " comes after reverse chunk " << it->second << endl; +#endif } else { + extended_step.second = path_chunks.first.size(); - // this step does not extend a previous step, so we start a new chunk path_chunks.first.emplace_back(); path_chunks.second.emplace_back(); auto& aln_chunk = path_chunks.first.back(); auto& ref_chunk = path_chunks.second.back(); - // init the ref interval with the interval along the embedded path - ref_chunk.first = step; - ref_chunk.second = step; - + ref_chunk.first = extended_step.first; + ref_chunk.second = extended_step.first; // init the new chunk with the sequence interval aln_chunk.first.first = source.sequence().begin() + before_to_length; aln_chunk.first.second = source.sequence().begin() + through_to_length; - // and with the first mapping - auto mapping = aln_chunk.second.add_mapping(); - from_proto_mapping(path.mapping(i), *mapping); - - // keep track of where this chunk is in the vector and which step it came from - // for the next iteration - next_extending_steps[make_pair(step, path_strand)] = path_chunks.first.size() - 1; + from_proto_mapping(path.mapping(i), *aln_chunk.second.add_mapping()); #ifdef debug_anchored_surject - cerr << "no preceeding chunk so start new chunk " << path_chunks.first.size() - 1 << endl; + cerr << "step on " << graph->get_id(graph->get_handle_of_step(extended_step.first)) << ", pos " << graph->get_position_of_step(extended_step.first) << " has no preceeding reverse chunk, so start new chunk " << path_chunks.first.size() - 1 << endl; #endif + } } // we've finished extending the steps from the previous mapping, so we replace them // with the steps we found in this iteration that we want to extend on the next 
one - extending_steps = next_extending_steps; + fwd_extending_steps = std::move(next_fwd_extending_steps); + rev_extending_steps = std::move(next_rev_extending_steps); } return to_return; @@ -3717,7 +3747,7 @@ using namespace std; #ifdef debug_filter_paths cerr << "original order for chunks" << endl; for (size_t i = 0; i < path_chunks.size(); ++i) { - cerr << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << pb2json(path_chunks[i].second) << endl; + cerr << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << debug_string(path_chunks[i].second) << endl; } cerr << "connections" << endl; for (size_t i = 0; i < outward_connections.size(); ++i) { @@ -3762,7 +3792,7 @@ using namespace std; #ifdef debug_filter_paths cerr << "sort order for chunks" << endl; for (auto i : order) { - cerr << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << pb2json(path_chunks[i].second) << endl; + cerr << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << debug_string(path_chunks[i].second) << endl; } #endif @@ -3778,7 +3808,7 @@ using namespace std; auto& chunk_here = path_chunks[order[i]]; #ifdef debug_filter_paths cerr << "looking for overlapping chunks for " << order[i] << endl; - cerr << string(chunk_here.first.first, chunk_here.first.second) << " " << pb2json(chunk_here.second) << endl; + cerr << string(chunk_here.first.first, chunk_here.first.second) << " " << debug_string(chunk_here.second) << endl; #endif // remove items from the heap if they are outside the window of this read interval @@ -3811,7 +3841,7 @@ using namespace std; auto remaining = chunk_here.first.first - chunk_over.first.first; #ifdef debug_filter_paths cerr << "overlap candidate " << j << endl; - cerr << string(chunk_over.first.first, chunk_over.first.second) << " " << pb2json(chunk_over.second) << endl; + cerr << string(chunk_over.first.first, chunk_over.first.second) << 
" " << debug_string(chunk_over.second) << endl; cerr << "at relative read offset " << remaining << endl; #endif @@ -3988,7 +4018,7 @@ using namespace std; for (size_t i = 0; i < path_chunks.size(); ++i) { if (redundant[i]) { #ifdef debug_filter_paths - cerr << "filtering path chunk " << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << pb2json(path_chunks[i].second) << endl; + cerr << "filtering path chunk " << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << debug_string(path_chunks[i].second) << endl; #endif ++removed_before[i]; } From 321b5a415d7e9cae821820db5012885b280d6f4f Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Wed, 12 Jun 2024 20:00:20 -0700 Subject: [PATCH 51/67] memoize path handle queries --- src/surjector.cpp | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index 7a1f3ed0b92..f3892c2fe1f 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -3561,7 +3561,9 @@ using namespace std; // for each path that we're extending, the previous step and the strand we were at on it // mapped to the index of that path chunk in the path's vector - unordered_map fwd_extending_steps, rev_extending_steps; + // note: we also keep the path_handle_t instead of extracting it from the step because this operation + // is slow on GBZ + unordered_map> fwd_extending_steps, rev_extending_steps; int64_t through_to_length = 0; for (size_t i = 0; i < path.mapping_size(); i++) { @@ -3576,7 +3578,7 @@ using namespace std; cerr << "looking for paths on mapping " << i << " at position " << make_pos_t(pos) << endl; #endif - unordered_map next_fwd_extending_steps, next_rev_extending_steps; + unordered_map> next_fwd_extending_steps, next_rev_extending_steps; for (const step_handle_t& step : graph->steps_of_handle(handle)) { @@ -3607,7 +3609,7 @@ using namespace std; auto& next_extending_steps = path_strand ? 
next_rev_extending_steps : next_fwd_extending_steps; - next_extending_steps[step] = numeric_limits::max(); + next_extending_steps[step] = make_pair(numeric_limits::max(), path_handle); } // TODO: forward and reverse code is repetitive @@ -3617,10 +3619,10 @@ using namespace std; auto next_step = graph->get_next_step(extending_step.first); auto it = next_fwd_extending_steps.find(next_step); if (it != next_fwd_extending_steps.end()) { - it->second = extending_step.second; - auto& path_chunks = to_return[make_pair(graph->get_path_handle_of_step(extending_step.first), false)]; - auto& aln_chunk = path_chunks.first[extending_step.second]; - auto& ref_chunk = path_chunks.second[extending_step.second]; + it->second.first = extending_step.second.first; + auto& path_chunks = to_return[make_pair(it->second.second, false)]; + auto& aln_chunk = path_chunks.first[extending_step.second.first]; + auto& ref_chunk = path_chunks.second[extending_step.second.first]; // extend the range of the path on the reference ref_chunk.second = it->first; @@ -3635,12 +3637,12 @@ using namespace std; } } // initialize new chunks for steps that did not extend - for (pair& extended_step : next_fwd_extending_steps) { - if (extended_step.second == numeric_limits::max()) { + for (pair>& extended_step : next_fwd_extending_steps) { + if (extended_step.second.first == numeric_limits::max()) { - auto& path_chunks = to_return[make_pair(graph->get_path_handle_of_step(extended_step.first), false)]; + auto& path_chunks = to_return[make_pair(extended_step.second.second, false)]; - extended_step.second = path_chunks.first.size(); + extended_step.second.first = path_chunks.first.size(); path_chunks.first.emplace_back(); path_chunks.second.emplace_back(); @@ -3660,16 +3662,16 @@ using namespace std; #endif } } - for (pair& extended_step : next_rev_extending_steps) { + for (pair>& extended_step : next_rev_extending_steps) { - auto& path_chunks = 
to_return[make_pair(graph->get_path_handle_of_step(extended_step.first), true)]; + auto& path_chunks = to_return[make_pair(extended_step.second.second, true)]; auto prev_step = graph->get_next_step(extended_step.first); auto it = rev_extending_steps.find(prev_step); if (it != rev_extending_steps.end()) { - extended_step.second = it->second; - auto& aln_chunk = path_chunks.first[extended_step.second]; - auto& ref_chunk = path_chunks.second[extended_step.second]; + extended_step.second.first = it->second.first; + auto& aln_chunk = path_chunks.first[extended_step.second.first]; + auto& ref_chunk = path_chunks.second[extended_step.second.first]; // extend the range of the path on the reference ref_chunk.second = extended_step.first; @@ -3683,7 +3685,7 @@ using namespace std; #endif } else { - extended_step.second = path_chunks.first.size(); + extended_step.second.first = path_chunks.first.size(); path_chunks.first.emplace_back(); path_chunks.second.emplace_back(); From 67cca9816045e5b1e4dfe2b5a86482b4c625a03e Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Thu, 13 Jun 2024 12:27:44 -0700 Subject: [PATCH 52/67] implement overlay for subpath graph --- src/subpath_overlay.cpp | 105 +++++++++++++++++++++++++++++ src/subpath_overlay.hpp | 107 ++++++++++++++++++++++++++++++ src/unittest/overlays.cpp | 136 ++++++++++++++++++++++++++------------ 3 files changed, 307 insertions(+), 41 deletions(-) create mode 100644 src/subpath_overlay.cpp create mode 100644 src/subpath_overlay.hpp diff --git a/src/subpath_overlay.cpp b/src/subpath_overlay.cpp new file mode 100644 index 00000000000..126c6f97275 --- /dev/null +++ b/src/subpath_overlay.cpp @@ -0,0 +1,105 @@ +#include +#include "subpath_overlay.hpp" + +#include + +namespace vg { + +using namespace std; +using namespace handlegraph; + +SubpathOverlay::SubpathOverlay(const PathHandleGraph* backing, + const step_handle_t& begin, const step_handle_t& end) : backing_graph(backing) { + + for (auto step = begin; step != end; step = 
backing->get_next_step(step)) { + subpath_handles.emplace_back(backing->get_handle_of_step(step)); + } +} + +bool SubpathOverlay::has_node(nid_t node_id) const { + return node_id <= subpath_handles.size(); +} + +handle_t SubpathOverlay::get_handle(const nid_t& node_id, bool is_reverse) const { + return handlegraph::number_bool_packing::pack(node_id, is_reverse); +} + +nid_t SubpathOverlay::get_id(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_number(handle); +} + +bool SubpathOverlay::get_is_reverse(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_bit(handle); +} + +handle_t SubpathOverlay::flip(const handle_t& handle) const { + return handlegraph::number_bool_packing::toggle_bit(handle); +} + +size_t SubpathOverlay::get_length(const handle_t& handle) const { + return backing_graph->get_length(subpath_handles[get_id(handle) - 1]); +} + +std::string SubpathOverlay::get_sequence(const handle_t& handle) const { + return backing_graph->get_sequence(get_underlying_handle(handle)); +} + +size_t SubpathOverlay::get_node_count() const { + return subpath_handles.size(); +} + +nid_t SubpathOverlay::min_node_id() const { + return 1; +} + +nid_t SubpathOverlay::max_node_id() const { + return subpath_handles.size(); +} + +bool SubpathOverlay::follow_edges_impl(const handle_t& handle, bool go_left, + const std::function& iteratee) const { + if (go_left != get_is_reverse(handle)) { + if (get_id(handle) == 1) { + return true; + } + else { + return iteratee(get_handle(get_id(handle) - 1, get_is_reverse(handle))); + } + } + else { + if (get_id(handle) == subpath_handles.size()) { + return true; + } + else { + return iteratee(get_handle(get_id(handle) + 1, get_is_reverse(handle))); + } + } +} + +bool SubpathOverlay::for_each_handle_impl(const std::function& iteratee, bool parallel) const { + + if (!parallel) { + bool keep_going = true; + for (size_t i = 1; keep_going && i <= subpath_handles.size(); ++i) { + 
keep_going = iteratee(get_handle(i, false)); + } + return keep_going; + } else { + std::atomic keep_going(true); +#pragma omp parallel for + for (size_t i = 1; i <= subpath_handles.size(); ++i) { + keep_going = keep_going && iteratee(get_handle(i, false)); + } + return keep_going; + } +} + +handle_t SubpathOverlay::get_underlying_handle(const handle_t& handle) const { + handle_t underlying = subpath_handles[get_id(handle) - 1]; + if (get_is_reverse(handle)) { + underlying = backing_graph->flip(underlying); + } + return underlying; +} + +} diff --git a/src/subpath_overlay.hpp b/src/subpath_overlay.hpp new file mode 100644 index 00000000000..91444e14a43 --- /dev/null +++ b/src/subpath_overlay.hpp @@ -0,0 +1,107 @@ +#ifndef VG_SUBGRAPH_OVERLAY_HPP_INCLUDED +#define VG_SUBGRAPH_OVERLAY_HPP_INCLUDED + +/** + * \file subgraph_overlay.hpp + * + * Provides SourceSinkOverlay, a HandleGraph implementation that joins all the + * heads and tails of a backing graph to single source and sink nodes. + * + */ + + +#include "handle.hpp" + +#include + + +namespace vg { + +using namespace handlegraph; + +/** + * Overlay that presents a linear graph corresponding to a subinterval of a path + */ +class SubpathOverlay : public ExpandingOverlayGraph { + +public: + /** + * Construct new SubpathOverlay between two steps. The 'end' step is past-the-last. 
+ */ + SubpathOverlay(const PathHandleGraph* backing, + const step_handle_t& begin, const step_handle_t& end); + SubpathOverlay() = default; + + ~SubpathOverlay() = default; + + //////////////////////////////////////////////////////////////////////////// + // Handle-based interface + //////////////////////////////////////////////////////////////////////////// + + /// Method to check if a node exists by ID + bool has_node(nid_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + handle_t get_handle(const nid_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + nid_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + std::string get_sequence(const handle_t& handle) const; + + /// Return the number of nodes in the graph + size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + nid_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + nid_t max_node_id() const; + + /////////////////////////////////// + /// ExpandingOverlayGraph interface + /////////////////////////////////// + + /** + * Returns the handle in the underlying graph that corresponds to a handle in the + * overlay + */ + handle_t get_underlying_handle(const handle_t& handle) const; + +protected: + + /// Loop over all the handles to next/previous (right/left) nodes. 
Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + bool follow_edges_impl(const handle_t& handle, bool go_left, const std::function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. Returns true if we finished and false if we + /// stopped early. + bool for_each_handle_impl(const std::function& iteratee, bool parallel = false) const; + + /// the backing graph + const HandleGraph* backing_graph = nullptr; + + std::vector subpath_handles; +}; + +} + +#endif diff --git a/src/unittest/overlays.cpp b/src/unittest/overlays.cpp index 01ef1e0e227..2695f4c7c97 100644 --- a/src/unittest/overlays.cpp +++ b/src/unittest/overlays.cpp @@ -8,7 +8,9 @@ #include "vg/io/json2pb.h" #include "../split_strand_graph.hpp" +#include "../subpath_overlay.hpp" #include "../utility.hpp" +#include "../handle.hpp" #include "random_graph.hpp" #include "../vg.hpp" @@ -21,60 +23,112 @@ namespace vg { namespace unittest { using namespace std; +using namespace bdsg; - TEST_CASE("StrandSplitGraph overlay produces graphs that are single stranded", "[overlay][handle]") { +TEST_CASE("StrandSplitGraph overlay produces graphs that are single stranded", "[overlay][handle]") { + + int num_tests = 100; + int seq_size = 500; + int var_len = 10; + int var_count = 30; + + for (int i = 0; i < num_tests; ++i) { - int num_tests = 100; - int seq_size = 500; - int var_len = 10; - int var_count = 30; + bdsg::HashGraph graph; + random_graph(seq_size, var_len, var_count, &graph); + + StrandSplitGraph split(&graph); + REQUIRE(handlealgs::is_single_stranded(&split)); + + REQUIRE(split.get_node_count() == 2 * graph.get_node_count()); + 
+ unordered_map node_count; - for (int i = 0; i < num_tests; ++i) { + int split_edge_trav_count = 0; + int graph_edge_trav_count = 0; + + split.for_each_handle([&](const handle_t& h) { + REQUIRE(split.get_sequence(h) == graph.get_sequence(split.get_underlying_handle(h))); - bdsg::HashGraph graph; - random_graph(seq_size, var_len, var_count, &graph); - - StrandSplitGraph split(&graph); - REQUIRE(handlealgs::is_single_stranded(&split)); + split.follow_edges(h, false, [&](const handle_t& n) { + REQUIRE(graph.has_edge(split.get_underlying_handle(h), + split.get_underlying_handle(n))); + split_edge_trav_count++; + }); + split.follow_edges(h, true, [&](const handle_t& p) { + REQUIRE(graph.has_edge(split.get_underlying_handle(p), + split.get_underlying_handle(h))); + split_edge_trav_count++; + }); - REQUIRE(split.get_node_count() == 2 * graph.get_node_count()); + node_count[split.get_underlying_handle(h)]++; + }); + + graph.for_each_handle([&](const handle_t& h) { + REQUIRE(node_count[h] == 1); + REQUIRE(node_count[graph.flip(h)] == 1); + graph.follow_edges(h, false, [&](const handle_t& n) { + graph_edge_trav_count++; + }); + graph.follow_edges(h, true, [&](const handle_t& p) { + graph_edge_trav_count++; + }); + }); + + REQUIRE(split_edge_trav_count == 2 * graph_edge_trav_count); + } +} + +TEST_CASE("SubpathOverlay works as expected", "[overlay][handle]") { + + + HashGraph graph; + auto h1 = graph.create_handle("A"); + auto h2 = graph.create_handle("C"); + auto h3 = graph.create_handle("G"); + auto h4 = graph.create_handle("T"); + + graph.create_edge(h1, h2); + graph.create_edge(h1, h3); + graph.create_edge(h2, h4); + graph.create_edge(h3, h4); + + auto p = graph.create_path_handle("p"); + + vector steps; + steps.push_back(graph.append_step(p, h1)); + steps.push_back(graph.append_step(p, h2)); + steps.push_back(graph.append_step(p, h4)); + + for (size_t i = 0; i < steps.size(); ++i) { + for (size_t j = i; j < steps.size(); ++j) { - unordered_map node_count; + 
SubpathOverlay subpath(&graph, steps[i], graph.get_next_step(steps[j])); - int split_edge_trav_count = 0; - int graph_edge_trav_count = 0; + REQUIRE(handlealgs::is_single_stranded(&subpath)); - split.for_each_handle([&](const handle_t& h) { - REQUIRE(split.get_sequence(h) == graph.get_sequence(split.get_underlying_handle(h))); - - split.follow_edges(h, false, [&](const handle_t& n) { - REQUIRE(graph.has_edge(split.get_underlying_handle(h), - split.get_underlying_handle(n))); - split_edge_trav_count++; - }); - split.follow_edges(h, true, [&](const handle_t& p) { - REQUIRE(graph.has_edge(split.get_underlying_handle(p), - split.get_underlying_handle(h))); - split_edge_trav_count++; - }); - - node_count[split.get_underlying_handle(h)]++; + REQUIRE(subpath.get_node_count() == j - i + 1); + size_t manual_node_count = 0; + subpath.for_each_handle([&](const handle_t& h) { + ++manual_node_count; }); + REQUIRE(manual_node_count == subpath.get_node_count()); - graph.for_each_handle([&](const handle_t& h) { - REQUIRE(node_count[h] == 1); - REQUIRE(node_count[graph.flip(h)] == 1); - graph.follow_edges(h, false, [&](const handle_t& n) { - graph_edge_trav_count++; - }); - graph.follow_edges(h, true, [&](const handle_t& p) { - graph_edge_trav_count++; - }); - }); + auto nodes = handlealgs::lazier_topological_order(&subpath); - REQUIRE(split_edge_trav_count == 2 * graph_edge_trav_count); + for (size_t k = 0; k < nodes.size(); ++k) { + auto s = graph.get_handle_of_step(steps[i + k]); + REQUIRE(subpath.get_underlying_handle(nodes[k]) == s); + REQUIRE(subpath.get_sequence(nodes[k]) == graph.get_sequence(s)); + + REQUIRE(subpath.get_degree(nodes[k], true) == (k == 0 ? 0 : 1)); + REQUIRE(subpath.get_degree(nodes[k], false) == (k + 1 == nodes.size() ? 
0 : 1)); + } } } } + + +} } From 57b1349149ae2725dbbef72ea18159c7b08373f9 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Thu, 13 Jun 2024 12:45:01 -0700 Subject: [PATCH 53/67] switch to subpath overlay in surject --- src/surjector.cpp | 73 +++++++++++++---------------------------------- src/surjector.hpp | 5 ---- 2 files changed, 20 insertions(+), 58 deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index f3892c2fe1f..b3068efea1a 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -12,6 +12,8 @@ #include "memoizing_graph.hpp" #include "multipath_alignment_graph.hpp" #include "reverse_graph.hpp" +#include "subpath_overlay.hpp" +#include "identity_overlay.hpp" #include "algorithms/extract_connecting_graph.hpp" #include "algorithms/prune_to_connecting_graph.hpp" @@ -2980,22 +2982,23 @@ using namespace std; assert(ref_path_interval.first <= ref_path_interval.second); // get the path graph corresponding to this interval - bdsg::HashGraph path_graph; - unordered_map> node_trans = extract_linearized_path_graph(path_position_graph, &path_graph, path_handle, - ref_path_interval.first, ref_path_interval.second); + step_handle_t begin = graph->get_step_at_position(path_handle, ref_path_interval.first); + step_handle_t end = graph->get_step_at_position(path_handle, ref_path_interval.second); + if (graph->get_position_of_step(end) <= ref_path_interval.second && end != graph->path_end(path_handle)) { + // we actually want part of this step too, so we use the next one as the end iterator + end = graph->get_next_step(end); + } + SubpathOverlay path_graph(path_position_graph, begin, end); // choose an orientation for the path graph ReverseGraph rev_comp_path_graph(&path_graph, true); - HandleGraph* aln_graph; + IdentityOverlay identity_path_graph(&path_graph); + ExpandingOverlayGraph* aln_graph; if (rev_strand) { - // we align to the reverse strand of the path graph, and the translation chages accordingly aln_graph = &rev_comp_path_graph; - for (pair>& 
translation : node_trans) { - translation.second.second = !translation.second.second; - } } else { - aln_graph = &path_graph; + aln_graph = &identity_path_graph; } #ifdef debug_anchored_surject @@ -3016,12 +3019,17 @@ using namespace std; return surjected; } + std::function(id_t)> projection_trans = [&](id_t node_id) { + handle_t handle = path_graph.get_underlying_handle(aln_graph->get_underlying_handle(aln_graph->get_handle(node_id))); + return pair(path_position_graph->get_id(handle), path_position_graph->get_is_reverse(handle)); + }; + // compute the connectivity between the path chunks // TODO: i'm not sure if we actually need to preserve all indel anchors in either case, but i don't // want to change too much about the anchoring logic at once while i'm switching from blanket preservation // to a more targeted method bool preserve_tail_indel_anchors = (sinks_are_anchors || sources_are_anchors); - MultipathAlignmentGraph mp_aln_graph(*aln_graph, path_chunks, source, node_trans, !preserve_N_alignments, + MultipathAlignmentGraph mp_aln_graph(*aln_graph, path_chunks, source, projection_trans, !preserve_N_alignments, preserve_tail_indel_anchors); #ifdef debug_anchored_surject @@ -3070,7 +3078,7 @@ using namespace std; unordered_map left_align_strand; left_align_strand.reserve(aln_graph->get_node_count()); aln_graph->for_each_handle([&](const handle_t& handle) { - left_align_strand[handle] = node_trans.at(aln_graph->get_id(handle)).second; + left_align_strand[handle] = projection_trans(aln_graph->get_id(handle)).second; }); // align the intervening segments and store the result in a multipath alignment @@ -3101,7 +3109,7 @@ using namespace std; for (size_t i = 0; i < mp_aln.subpath_size(); i++) { // translate back into the original ID space - translate_oriented_node_ids(*mp_aln.mutable_subpath(i)->mutable_path(), node_trans); + translate_oriented_node_ids(*mp_aln.mutable_subpath(i)->mutable_path(), projection_trans); } // identify the source subpaths (necessary for 
subpath-global optimal alignment algorithm) @@ -4162,47 +4170,6 @@ using namespace std; return interval; } - - unordered_map> - Surjector::extract_linearized_path_graph(const PathPositionHandleGraph* graph, MutableHandleGraph* into, - path_handle_t path_handle, size_t first, size_t last) const { - - // TODO: we need better semantics than an unsigned interval for surjecting to circular paths - -#ifdef debug_anchored_surject - cerr << "extracting path graph for position interval " << first << ":" << last << " in path of length " << graph->get_path_length(path_handle) << endl; -#endif - - unordered_map> node_trans; - - step_handle_t begin = graph->get_step_at_position(path_handle, first); - step_handle_t end = graph->get_step_at_position(path_handle, last); - - if (graph->get_position_of_step(end) <= last && end != graph->path_end(path_handle)) { - // we actually want part of this step too, so we use the next one as the end iterator - end = graph->get_next_step(end); - } - - handle_t prev_node; - for (step_handle_t step = begin; step != end; step = graph->get_next_step(step)) { - // copy the node with the local orientation now forward - handle_t copying = graph->get_handle_of_step(step); - handle_t node_here = into->create_handle(graph->get_sequence(copying)); - - if (step != begin) { - // add an edge from the previous node - into->create_edge(prev_node, node_here); - } - - // record the translation - node_trans[into->get_id(node_here)] = pair(graph->get_id(copying), - graph->get_is_reverse(copying)); - - prev_node = node_here; - } - - return node_trans; - } void Surjector::set_path_position(const PathPositionHandleGraph* graph, const pos_t& init_surj_pos, const pos_t& final_surj_pos, const step_handle_t& range_begin, const step_handle_t& range_end, diff --git a/src/surjector.hpp b/src/surjector.hpp index d751c798b27..e3c3acec96a 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -195,11 +195,6 @@ using namespace std; const vector>& ref_chunks, bool 
no_left_expansion, bool no_right_expansion) const; - /// make a linear graph that corresponds to a path interval, possibly duplicating nodes in case of cycles - unordered_map> - extract_linearized_path_graph(const PathPositionHandleGraph* graph, MutableHandleGraph* into, - path_handle_t path_handle, size_t first, size_t last) const; - /// use the graph position bounds and the path range bounds to assign a path position to a surjected read void set_path_position(const PathPositionHandleGraph* graph, const pos_t& init_surj_pos, const pos_t& final_surj_pos, From 4739c56a046788314152137e39c77fdde0660694 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Thu, 13 Jun 2024 13:56:39 -0700 Subject: [PATCH 54/67] extend noexcept to position_t --- src/position.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/position.hpp b/src/position.hpp index 69c2154f103..295c6b2e97e 100644 --- a/src/position.hpp +++ b/src/position.hpp @@ -22,12 +22,12 @@ using namespace std; // to refactor -- can always eliminate later class position_t { public: - position_t() = default; - position_t(const position_t&) = default; - position_t(position_t&&) = default; + position_t() noexcept = default; + position_t(const position_t&) noexcept = default; + position_t(position_t&&) noexcept = default; ~position_t() = default; - position_t& operator=(const position_t&) = default; - position_t& operator=(position_t&&) = default; + position_t& operator=(const position_t&) noexcept = default; + position_t& operator=(position_t&&) noexcept = default; inline int64_t node_id() const; inline void set_node_id(int64_t i); inline int64_t offset() const; From 252af1af65cc09c8425360bb5655f4a461910edb Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Thu, 13 Jun 2024 14:58:54 -0700 Subject: [PATCH 55/67] try non-defaulting noexcept constructors --- src/path.hpp | 54 ++++++++++++++++++++++++++++++++++-------------- src/position.hpp | 20 +++++++++++++----- 2 files changed, 54 
insertions(+), 20 deletions(-) diff --git a/src/path.hpp b/src/path.hpp index 7d7800bf03d..13b1aaaec3e 100644 --- a/src/path.hpp +++ b/src/path.hpp @@ -379,12 +379,22 @@ Alignment alignment_from_path(const HandleGraph& graph, const Path& path); */ class edit_t { public: - edit_t() = default; - edit_t(const edit_t&) = default; - edit_t(edit_t&&) = default; + edit_t() noexcept : _from_length(0), _to_length(0), _sequence() {} + edit_t(const edit_t& other) noexcept : _from_length(other._from_length), _to_length(other._to_length), _sequence(other._sequence) {} + edit_t(edit_t&& other) noexcept : _from_length(other._from_length), _to_length(other._to_length), _sequence(move(other._sequence)) {} ~edit_t() = default; - edit_t& operator=(const edit_t&) = default; - edit_t& operator=(edit_t&&) = default; + edit_t& operator=(const edit_t& other) noexcept { + _from_length = other._from_length; + _to_length = other._to_length; + _sequence = other._sequence; + return *this; + } + edit_t& operator=(edit_t&& other) noexcept { + _from_length = other._from_length; + _to_length = other._to_length; + _sequence = move(other._sequence); + return *this; + } inline int32_t from_length() const; inline void set_from_length(int32_t l); inline int32_t to_length() const; @@ -403,12 +413,20 @@ class edit_t { // the mapping_t name is already taken class path_mapping_t { public: - path_mapping_t() noexcept = default; - path_mapping_t(const path_mapping_t&) noexcept = default; - path_mapping_t(path_mapping_t&&) noexcept = default; + path_mapping_t() noexcept : _position(), _edit() { } + path_mapping_t(const path_mapping_t& other) noexcept : _position(other._position), _edit(other._edit) {} + path_mapping_t(path_mapping_t&& other) noexcept : _position(move(other._position)), _edit(move(other._edit)) {} ~path_mapping_t() = default; - path_mapping_t& operator=(const path_mapping_t&) noexcept = default; - path_mapping_t& operator=(path_mapping_t&&) noexcept = default; + path_mapping_t& operator=(const 
path_mapping_t& other) noexcept { + _position = other._position; + _edit = other._edit; + return *this; + } + path_mapping_t& operator=(path_mapping_t&& other) noexcept { + _position = std::move(other._position); + _edit = std::move(other._edit); + return *this; + } inline const position_t& position() const; inline position_t* mutable_position(); inline const vector& edit() const; @@ -426,12 +444,18 @@ class path_mapping_t { class path_t { public: - path_t() noexcept = default; - path_t(const path_t&) noexcept = default; - path_t(path_t&&) noexcept = default; + path_t() noexcept : _mapping() {} + path_t(const path_t& other) noexcept : _mapping(other._mapping) {} + path_t(path_t&& other) noexcept : _mapping(move(other._mapping)) {} ~path_t() = default; - path_t& operator=(const path_t&) noexcept = default; - path_t& operator=(path_t&&) noexcept = default; + path_t& operator=(const path_t& other) noexcept { + _mapping = other._mapping; + return *this; + } + path_t& operator=(path_t&& other) noexcept { + _mapping = move(other._mapping); + return *this; + } inline const vector& mapping() const; inline const path_mapping_t& mapping(size_t i) const; inline vector* mutable_mapping(); diff --git a/src/position.hpp b/src/position.hpp index 295c6b2e97e..617ac604205 100644 --- a/src/position.hpp +++ b/src/position.hpp @@ -22,12 +22,22 @@ using namespace std; // to refactor -- can always eliminate later class position_t { public: - position_t() noexcept = default; - position_t(const position_t&) noexcept = default; - position_t(position_t&&) noexcept = default; + position_t() noexcept : _node_id(0), _offset(0), _is_reverse(false) {} + position_t(const position_t& other) noexcept : _node_id(other._node_id), _offset(other._offset), _is_reverse(other._is_reverse) {} + position_t(position_t&& other) noexcept : _node_id(other._node_id), _offset(other._offset), _is_reverse(other._is_reverse) {} ~position_t() = default; - position_t& operator=(const position_t&) noexcept = default; - 
position_t& operator=(position_t&&) noexcept = default; + position_t& operator=(const position_t& other) noexcept { + _node_id = other._node_id; + _offset = other._offset; + _offset = other._offset; + return *this; + } + position_t& operator=(position_t&& other) noexcept { + _node_id = other._node_id; + _offset = other._offset; + _offset = other._offset; + return *this; + } inline int64_t node_id() const; inline void set_node_id(int64_t i); inline int64_t offset() const; From e1619c1a17cff8ff32c58352fe482424100e2e4d Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Thu, 13 Jun 2024 15:20:14 -0700 Subject: [PATCH 56/67] try adding noexcept destructors --- src/path.hpp | 58 ++++++++++++++---------------------------------- src/position.hpp | 20 +++++------------ 2 files changed, 22 insertions(+), 56 deletions(-) diff --git a/src/path.hpp b/src/path.hpp index 13b1aaaec3e..12873c29ec2 100644 --- a/src/path.hpp +++ b/src/path.hpp @@ -379,22 +379,12 @@ Alignment alignment_from_path(const HandleGraph& graph, const Path& path); */ class edit_t { public: - edit_t() noexcept : _from_length(0), _to_length(0), _sequence() {} - edit_t(const edit_t& other) noexcept : _from_length(other._from_length), _to_length(other._to_length), _sequence(other._sequence) {} - edit_t(edit_t&& other) noexcept : _from_length(other._from_length), _to_length(other._to_length), _sequence(move(other._sequence)) {} - ~edit_t() = default; - edit_t& operator=(const edit_t& other) noexcept { - _from_length = other._from_length; - _to_length = other._to_length; - _sequence = other._sequence; - return *this; - } - edit_t& operator=(edit_t&& other) noexcept { - _from_length = other._from_length; - _to_length = other._to_length; - _sequence = move(other._sequence); - return *this; - } + edit_t() noexcept = default; + edit_t(const edit_t& other) noexcept = default; + edit_t(edit_t&& other) noexcept = default; + ~edit_t() noexcept = default; + edit_t& operator=(const edit_t& other) noexcept = default; + 
edit_t& operator=(edit_t&& other) noexcept = default; inline int32_t from_length() const; inline void set_from_length(int32_t l); inline int32_t to_length() const; @@ -413,20 +403,12 @@ class edit_t { // the mapping_t name is already taken class path_mapping_t { public: - path_mapping_t() noexcept : _position(), _edit() { } - path_mapping_t(const path_mapping_t& other) noexcept : _position(other._position), _edit(other._edit) {} - path_mapping_t(path_mapping_t&& other) noexcept : _position(move(other._position)), _edit(move(other._edit)) {} - ~path_mapping_t() = default; - path_mapping_t& operator=(const path_mapping_t& other) noexcept { - _position = other._position; - _edit = other._edit; - return *this; - } - path_mapping_t& operator=(path_mapping_t&& other) noexcept { - _position = std::move(other._position); - _edit = std::move(other._edit); - return *this; - } + path_mapping_t() noexcept = default; + path_mapping_t(const path_mapping_t& other) noexcept = default; + path_mapping_t(path_mapping_t&& other) noexcept = default; + ~path_mapping_t() noexcept = default; + path_mapping_t& operator=(const path_mapping_t& other) noexcept = default; + path_mapping_t& operator=(path_mapping_t&& other) noexcept = default; inline const position_t& position() const; inline position_t* mutable_position(); inline const vector& edit() const; @@ -444,18 +426,12 @@ class path_mapping_t { class path_t { public: - path_t() noexcept : _mapping() {} - path_t(const path_t& other) noexcept : _mapping(other._mapping) {} - path_t(path_t&& other) noexcept : _mapping(move(other._mapping)) {} + path_t() noexcept = default; + path_t(const path_t& other) noexcept = default; + path_t(path_t&& other) noexcept = default; ~path_t() = default; - path_t& operator=(const path_t& other) noexcept { - _mapping = other._mapping; - return *this; - } - path_t& operator=(path_t&& other) noexcept { - _mapping = move(other._mapping); - return *this; - } + path_t& operator=(const path_t& other) noexcept = 
default; + path_t& operator=(path_t&& other) noexcept = default; inline const vector& mapping() const; inline const path_mapping_t& mapping(size_t i) const; inline vector* mutable_mapping(); diff --git a/src/position.hpp b/src/position.hpp index 617ac604205..0c57e2247a2 100644 --- a/src/position.hpp +++ b/src/position.hpp @@ -23,21 +23,11 @@ using namespace std; class position_t { public: position_t() noexcept : _node_id(0), _offset(0), _is_reverse(false) {} - position_t(const position_t& other) noexcept : _node_id(other._node_id), _offset(other._offset), _is_reverse(other._is_reverse) {} - position_t(position_t&& other) noexcept : _node_id(other._node_id), _offset(other._offset), _is_reverse(other._is_reverse) {} - ~position_t() = default; - position_t& operator=(const position_t& other) noexcept { - _node_id = other._node_id; - _offset = other._offset; - _offset = other._offset; - return *this; - } - position_t& operator=(position_t&& other) noexcept { - _node_id = other._node_id; - _offset = other._offset; - _offset = other._offset; - return *this; - } + position_t(const position_t& other) noexcept = default; + position_t(position_t&& other) noexcept = default; + ~position_t() noexcept = default; + position_t& operator=(const position_t& other) noexcept = default; + position_t& operator=(position_t&& other) noexcept = default; inline int64_t node_id() const; inline void set_node_id(int64_t i); inline int64_t offset() const; From 7a1335e6fe845457a839aa9a25b2204a3e2a964f Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Thu, 13 Jun 2024 15:37:12 -0700 Subject: [PATCH 57/67] try removing noexcept from assignment operators --- src/multipath_alignment.cpp | 6 +++--- src/multipath_alignment.hpp | 18 +++++++++--------- src/path.hpp | 16 ++++++++-------- src/position.hpp | 4 ++-- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/multipath_alignment.cpp b/src/multipath_alignment.cpp index ecea6cf4fc9..df335ca1aa1 100644 --- 
a/src/multipath_alignment.cpp +++ b/src/multipath_alignment.cpp @@ -35,7 +35,7 @@ namespace vg { *this = move(other); } - multipath_alignment_t& multipath_alignment_t::operator=(const multipath_alignment_t& other) noexcept { + multipath_alignment_t& multipath_alignment_t::operator=(const multipath_alignment_t& other) { _sequence = other._sequence; _quality = other._quality; _subpath = other._subpath; @@ -64,7 +64,7 @@ namespace vg { return *this; } - multipath_alignment_t& multipath_alignment_t::operator=(multipath_alignment_t&& other) noexcept { + multipath_alignment_t& multipath_alignment_t::operator=(multipath_alignment_t&& other) { if (this != &other) { _sequence = move(other._sequence); _quality = move(other._quality); @@ -77,7 +77,7 @@ namespace vg { return *this; } - multipath_alignment_t::~multipath_alignment_t() { + multipath_alignment_t::~multipath_alignment_t() noexcept { while (!_annotation.empty()) { clear_annotation(_annotation.begin()->first); } diff --git a/src/multipath_alignment.hpp b/src/multipath_alignment.hpp index 134473ceb44..ce618ecd351 100644 --- a/src/multipath_alignment.hpp +++ b/src/multipath_alignment.hpp @@ -31,9 +31,9 @@ namespace vg { connection_t() noexcept = default; connection_t(const connection_t&) noexcept = default; connection_t(connection_t&&) noexcept = default; - ~connection_t() = default; - connection_t& operator=(const connection_t&) noexcept = default; - connection_t& operator=(connection_t&&) noexcept = default; + ~connection_t() noexcept = default; + connection_t& operator=(const connection_t&) = default; + connection_t& operator=(connection_t&&) = default; inline int32_t next() const; inline void set_next(int32_t n); inline int32_t score() const; @@ -51,9 +51,9 @@ namespace vg { subpath_t() noexcept = default; subpath_t(const subpath_t&) noexcept = default; subpath_t(subpath_t&&) noexcept = default; - ~subpath_t() = default; - subpath_t& operator=(const subpath_t&) noexcept = default; - subpath_t& 
operator=(subpath_t&&) noexcept = default; + ~subpath_t() noexcept = default; + subpath_t& operator=(const subpath_t&) = default; + subpath_t& operator=(subpath_t&&) = default; inline const path_t& path() const; inline path_t* mutable_path(); inline bool has_path() const; @@ -89,9 +89,9 @@ namespace vg { multipath_alignment_t() noexcept; multipath_alignment_t(const multipath_alignment_t& other) noexcept; multipath_alignment_t(multipath_alignment_t&& other) noexcept; - ~multipath_alignment_t(); - multipath_alignment_t& operator=(const multipath_alignment_t& other) noexcept; - multipath_alignment_t& operator=(multipath_alignment_t&& other) noexcept; + ~multipath_alignment_t() noexcept; + multipath_alignment_t& operator=(const multipath_alignment_t& other); + multipath_alignment_t& operator=(multipath_alignment_t&& other); inline const string& sequence() const; inline string* mutable_sequence(); inline void set_sequence(const string& s); diff --git a/src/path.hpp b/src/path.hpp index 12873c29ec2..e73014f1dc3 100644 --- a/src/path.hpp +++ b/src/path.hpp @@ -379,12 +379,12 @@ Alignment alignment_from_path(const HandleGraph& graph, const Path& path); */ class edit_t { public: - edit_t() noexcept = default; + edit_t() noexcept : _from_length(0), _to_length(0), _sequence() {} edit_t(const edit_t& other) noexcept = default; edit_t(edit_t&& other) noexcept = default; ~edit_t() noexcept = default; - edit_t& operator=(const edit_t& other) noexcept = default; - edit_t& operator=(edit_t&& other) noexcept = default; + edit_t& operator=(const edit_t& other) = default; + edit_t& operator=(edit_t&& other) = default; inline int32_t from_length() const; inline void set_from_length(int32_t l); inline int32_t to_length() const; @@ -407,8 +407,8 @@ class path_mapping_t { path_mapping_t(const path_mapping_t& other) noexcept = default; path_mapping_t(path_mapping_t&& other) noexcept = default; ~path_mapping_t() noexcept = default; - path_mapping_t& operator=(const path_mapping_t& other) 
noexcept = default; - path_mapping_t& operator=(path_mapping_t&& other) noexcept = default; + path_mapping_t& operator=(const path_mapping_t& other) = default; + path_mapping_t& operator=(path_mapping_t&& other) = default; inline const position_t& position() const; inline position_t* mutable_position(); inline const vector& edit() const; @@ -429,9 +429,9 @@ class path_t { path_t() noexcept = default; path_t(const path_t& other) noexcept = default; path_t(path_t&& other) noexcept = default; - ~path_t() = default; - path_t& operator=(const path_t& other) noexcept = default; - path_t& operator=(path_t&& other) noexcept = default; + ~path_t() noexcept = default; + path_t& operator=(const path_t& other) = default; + path_t& operator=(path_t&& other) = default; inline const vector& mapping() const; inline const path_mapping_t& mapping(size_t i) const; inline vector* mutable_mapping(); diff --git a/src/position.hpp b/src/position.hpp index 0c57e2247a2..d751a02ef8b 100644 --- a/src/position.hpp +++ b/src/position.hpp @@ -26,8 +26,8 @@ class position_t { position_t(const position_t& other) noexcept = default; position_t(position_t&& other) noexcept = default; ~position_t() noexcept = default; - position_t& operator=(const position_t& other) noexcept = default; - position_t& operator=(position_t&& other) noexcept = default; + position_t& operator=(const position_t& other) = default; + position_t& operator=(position_t&& other) = default; inline int64_t node_id() const; inline void set_node_id(int64_t i); inline int64_t offset() const; From 4bec3db549fbae1c068c46e1b9443944e84f34ca Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Thu, 13 Jun 2024 17:36:45 -0700 Subject: [PATCH 58/67] give up on noexcept for now --- src/multipath_alignment.cpp | 6 +++--- src/multipath_alignment.hpp | 24 ++++++++++++------------ src/path.hpp | 22 +++++++++++----------- src/position.hpp | 8 ++++---- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/multipath_alignment.cpp 
b/src/multipath_alignment.cpp index df335ca1aa1..d91ae71a03a 100644 --- a/src/multipath_alignment.cpp +++ b/src/multipath_alignment.cpp @@ -23,15 +23,15 @@ using namespace vg::io; namespace vg { - multipath_alignment_t::multipath_alignment_t() noexcept : _mapping_quality(0) { + multipath_alignment_t::multipath_alignment_t() : _mapping_quality(0) { // i don't understand why this is necessary, but somehow mapq is getting defaulted to 160? } - multipath_alignment_t::multipath_alignment_t(const multipath_alignment_t& other) noexcept { + multipath_alignment_t::multipath_alignment_t(const multipath_alignment_t& other) { *this = other; } - multipath_alignment_t::multipath_alignment_t(multipath_alignment_t&& other) noexcept { + multipath_alignment_t::multipath_alignment_t(multipath_alignment_t&& other) { *this = move(other); } diff --git a/src/multipath_alignment.hpp b/src/multipath_alignment.hpp index ce618ecd351..2efa7f27796 100644 --- a/src/multipath_alignment.hpp +++ b/src/multipath_alignment.hpp @@ -28,10 +28,10 @@ namespace vg { class connection_t { public: - connection_t() noexcept = default; - connection_t(const connection_t&) noexcept = default; - connection_t(connection_t&&) noexcept = default; - ~connection_t() noexcept = default; + connection_t() = default; + connection_t(const connection_t&) = default; + connection_t(connection_t&&) = default; + ~connection_t() = default; connection_t& operator=(const connection_t&) = default; connection_t& operator=(connection_t&&) = default; inline int32_t next() const; @@ -48,10 +48,10 @@ namespace vg { */ class subpath_t { public: - subpath_t() noexcept = default; - subpath_t(const subpath_t&) noexcept = default; - subpath_t(subpath_t&&) noexcept = default; - ~subpath_t() noexcept = default; + subpath_t() = default; + subpath_t(const subpath_t&) = default; + subpath_t(subpath_t&&) = default; + ~subpath_t() = default; subpath_t& operator=(const subpath_t&) = default; subpath_t& operator=(subpath_t&&) = default; inline const 
path_t& path() const; @@ -86,10 +86,10 @@ namespace vg { // TODO: the metadata could be removed and only added to the protobuf at serialization time class multipath_alignment_t { public: - multipath_alignment_t() noexcept; - multipath_alignment_t(const multipath_alignment_t& other) noexcept; - multipath_alignment_t(multipath_alignment_t&& other) noexcept; - ~multipath_alignment_t() noexcept; + multipath_alignment_t(); + multipath_alignment_t(const multipath_alignment_t& other); + multipath_alignment_t(multipath_alignment_t&& other); + ~multipath_alignment_t(); multipath_alignment_t& operator=(const multipath_alignment_t& other); multipath_alignment_t& operator=(multipath_alignment_t&& other); inline const string& sequence() const; diff --git a/src/path.hpp b/src/path.hpp index e73014f1dc3..e1039393a4b 100644 --- a/src/path.hpp +++ b/src/path.hpp @@ -380,9 +380,9 @@ Alignment alignment_from_path(const HandleGraph& graph, const Path& path); class edit_t { public: edit_t() noexcept : _from_length(0), _to_length(0), _sequence() {} - edit_t(const edit_t& other) noexcept = default; - edit_t(edit_t&& other) noexcept = default; - ~edit_t() noexcept = default; + edit_t(const edit_t& other) = default; + edit_t(edit_t&& other) = default; + ~edit_t() = default; edit_t& operator=(const edit_t& other) = default; edit_t& operator=(edit_t&& other) = default; inline int32_t from_length() const; @@ -403,10 +403,10 @@ class edit_t { // the mapping_t name is already taken class path_mapping_t { public: - path_mapping_t() noexcept = default; - path_mapping_t(const path_mapping_t& other) noexcept = default; - path_mapping_t(path_mapping_t&& other) noexcept = default; - ~path_mapping_t() noexcept = default; + path_mapping_t() = default; + path_mapping_t(const path_mapping_t& other) = default; + path_mapping_t(path_mapping_t&& other) = default; + ~path_mapping_t() = default; path_mapping_t& operator=(const path_mapping_t& other) = default; path_mapping_t& operator=(path_mapping_t&& other) 
= default; inline const position_t& position() const; @@ -426,10 +426,10 @@ class path_mapping_t { class path_t { public: - path_t() noexcept = default; - path_t(const path_t& other) noexcept = default; - path_t(path_t&& other) noexcept = default; - ~path_t() noexcept = default; + path_t() = default; + path_t(const path_t& other) = default; + path_t(path_t&& other) = default; + ~path_t() = default; path_t& operator=(const path_t& other) = default; path_t& operator=(path_t&& other) = default; inline const vector& mapping() const; diff --git a/src/position.hpp b/src/position.hpp index d751a02ef8b..4719b42c325 100644 --- a/src/position.hpp +++ b/src/position.hpp @@ -22,10 +22,10 @@ using namespace std; // to refactor -- can always eliminate later class position_t { public: - position_t() noexcept : _node_id(0), _offset(0), _is_reverse(false) {} - position_t(const position_t& other) noexcept = default; - position_t(position_t&& other) noexcept = default; - ~position_t() noexcept = default; + position_t() : _node_id(0), _offset(0), _is_reverse(false) {} + position_t(const position_t& other) = default; + position_t(position_t&& other) = default; + ~position_t() = default; position_t& operator=(const position_t& other) = default; position_t& operator=(position_t&& other) = default; inline int64_t node_id() const; From 7bb1e38bfa89e5f4c7bcc5e3dfca8053641d545c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 24 Jun 2024 11:32:49 -0400 Subject: [PATCH 59/67] disable cyclic refs inside nested insertions until all tags fleshed out to support them --- src/deconstructor.cpp | 8 ++++++++ src/traversal_clusters.cpp | 21 ++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index db912596003..0b3f42f0e1e 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -785,6 +785,14 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t return false; } + if 
(ref_travs.size() > 1 && this->nested_decomposition) { +#ifdef debug +#pragma omp critical (cerr) + cerr << "Multiple ref traversals not yet supported with nested decomposition: removing all but first" << endl; + ref_travs.resize(1); +#endif + } + // XXX CHECKME this assumes there is only one reference path here, and that multiple traversals are due to cycles // we collect windows around the reference traversals diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index 1fb020be263..b18348e16c0 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -37,7 +37,21 @@ vector get_traversal_order(const PathHandleGraph* graph, int64_t ref_trav_idx, const vector& use_traversal) { set ref_set(ref_travs.begin(), ref_travs.end()); - + + // we want to avoid cyclic references whenever possible, so we downweight in ranking + vector trav_to_occurrences(traversals.size(), 1); + unordered_map sample_to_occurrence; + for (int i = 0; i < traversals.size(); ++i) { + if (use_traversal[i] && !trav_path_names[i].empty()) { + sample_to_occurrence[PathMetadata::parse_sample_name(trav_path_names[i])] += 1; + } + } + for (int i = 0; i < traversals.size(); ++i) { + if (use_traversal[i] && !trav_path_names[i].empty()) { + trav_to_occurrences[i] = sample_to_occurrence[trav_path_names[i]]; + } + } + function trav_less = [&](int i, int j) { // give reference set preference priority bool iref = ref_set.count(i); @@ -45,8 +59,9 @@ vector get_traversal_order(const PathHandleGraph* graph, if (iref != jref) { return iref; } - // fall back to name comparison - if (!trav_path_names[i].empty() && (trav_path_names[j].empty() || trav_path_names[i] < trav_path_names[j])) { + // fall back to occurrences then name comparison + if (trav_to_occurrences[i] < trav_to_occurrences[j] || + (trav_to_occurrences[i] == trav_to_occurrences[j] && !trav_path_names[i].empty() && (trav_path_names[j].empty() || trav_path_names[i] < trav_path_names[j]))) { return true; } return false; From 
061735e760fcb470893ac5ae58315b86977e1825 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 25 Jun 2024 09:44:49 -0400 Subject: [PATCH 60/67] fix typo in previous commit --- src/deconstructor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 0b3f42f0e1e..5a1e6a9cfaa 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -789,8 +789,8 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t #ifdef debug #pragma omp critical (cerr) cerr << "Multiple ref traversals not yet supported with nested decomposition: removing all but first" << endl; - ref_travs.resize(1); #endif + ref_travs.resize(1); } // XXX CHECKME this assumes there is only one reference path here, and that multiple traversals are due to cycles From ee450fb72c4348f5a931a1313cf40634b512a843 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 26 Jun 2024 12:56:31 -0400 Subject: [PATCH 61/67] deterministically pick first ref trav when choosing one --- src/deconstructor.cpp | 14 +++++++++++++- test/t/26_deconstruct.t | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 5a1e6a9cfaa..2c1580b2fd1 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -790,7 +790,19 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t #pragma omp critical (cerr) cerr << "Multiple ref traversals not yet supported with nested decomposition: removing all but first" << endl; #endif - ref_travs.resize(1); + size_t min_start_pos = numeric_limits::max(); + int64_t first_ref_trav; + for (int64_t i = 0; i < ref_travs.size(); ++i) { + auto& ref_trav_idx = ref_travs[i]; + step_handle_t start_step = trav_steps[ref_trav_idx].first; + step_handle_t end_step = trav_steps[ref_trav_idx].second; + size_t ref_trav_pos = min(graph->get_position_of_step(start_step), graph->get_position_of_step(end_step)); + if (ref_trav_pos < 
min_start_pos) { + min_start_pos = ref_trav_pos; + first_ref_trav = i; + } + } + ref_travs = {ref_travs[first_ref_trav]}; } // XXX CHECKME this assumes there is only one reference path here, and that multiple traversals are due to cycles diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 7e6d0584899..03c7f8f3a87 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -213,7 +213,7 @@ is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=2 | awk '{print $8}') "AC rm -f nested_snp_in_nested_ins.snarls nested_snp_in_nested_ins.vcf vg deconstruct nesting/nested_snp_in_ins_cycle.gfa -P x -n > nested_snp_in_ins_cycle.vcf -printf "A#1#y0\t6\t>2>5\tA\tT\tAC=2;AF=0.5;AN=4;AT=>2>4>5,>2>3>5;NS=2;PA=1;PL=7;PR=1;RC=x;RD=1;RL=7;RS=1;LV=1;PS=>1>6\t0|1\t0|1\n" > nested_snp_in_ins_cycle_truth.tsv +printf "A#1#y0\t3\t>2>5\tA\tT\tAC=2;AF=0.5;AN=4;AT=>2>4>5,>2>3>5;NS=2;PA=1;PL=7;PR=1;RC=x;RD=1;RL=7;RS=1;LV=1;PS=>1>6\t0|1\t0|1\n" > nested_snp_in_ins_cycle_truth.tsv printf "x\t1\t>1>6\tC\tCAAGAAG,CATG,CAAG\tAC=1,2,1;AF=0.25,0.5,0.25;AN=4;AT=>1>6,>1>2>4>5>2>4>5>6,>1>2>3>5>6,>1>2>4>5>6;NS=2;PA=0;PL=1;PR=1;RC=x;RD=1;RL=1;RS=1;LV=0\t1|2\t3|2\n" >> nested_snp_in_ins_cycle_truth.tsv grep -v ^# nested_snp_in_ins_cycle.vcf | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $8 "\t" $10 "\t" $11}' > nested_snp_in_ins_cycle.tsv diff nested_snp_in_ins_cycle.tsv nested_snp_in_ins_cycle_truth.tsv From 4f4947def28d0f28f024d7cea235b15d8c0aa88e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 26 Jun 2024 12:26:01 -0400 Subject: [PATCH 62/67] check zstdlib error codes --- src/graph_caller.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 1ce56a05765..4e3a4d4c203 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -243,7 +243,8 @@ void VCFOutputCaller::add_variant(vcflib::Variant& var) const { stringstream ss; ss << var; string dest; - 
zstdutil::CompressString(ss.str(), dest); + int ret = zstdutil::CompressString(ss.str(), dest); + assert(ret == 0); // the Variant object is too big to keep in memory when there are many genotypes, so we // store it in a zstd-compressed string output_variants[omp_get_thread_num()].push_back(make_pair(make_pair(var.sequenceName, var.position), dest)); @@ -265,7 +266,8 @@ void VCFOutputCaller::write_variants(ostream& out_stream, const SnarlManager* sn }); for (auto v : all_variants) { string dest; - zstdutil::DecompressString(v.second, dest); + int ret = zstdutil::DecompressString(v.second, dest); + assert(ret == 0); out_stream << dest << endl; } } @@ -1013,7 +1015,8 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager for (auto& thread_buf : output_variants) { for (auto& output_variant_record : thread_buf) { string output_variant_string; - zstdutil::DecompressString(output_variant_record.second, output_variant_string); + int ret = zstdutil::DecompressString(output_variant_record.second, output_variant_string); + assert(ret == 0); vector toks = split_delims(output_variant_string, "\t", 4); names_in_vcf.insert(toks[2]); } @@ -1046,7 +1049,8 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager auto& thread_buf = output_variants[i]; for (auto& output_variant_record : thread_buf) { string output_variant_string; - zstdutil::DecompressString(output_variant_record.second, output_variant_string); + int ret = zstdutil::DecompressString(output_variant_record.second, output_variant_string); + assert(ret == 0); //string& output_variant_string = output_variant_record.second; vector toks = split_delims(output_variant_string, "\t", 9); const string& name = toks[2]; @@ -1070,7 +1074,8 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager } } output_variant_record.second.clear(); - zstdutil::CompressString(output_variant_string, output_variant_record.second); + int ret = 
zstdutil::CompressString(output_variant_string, output_variant_record.second); + assert(ret == 0); } } } From 8d46086ee71dca496a557026cc297d8d949cd1e2 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 26 Jun 2024 16:35:18 -0400 Subject: [PATCH 63/67] cap output vcf line lengths at 2Gi --- src/deconstructor.cpp | 10 +++++++++- src/graph_caller.cpp | 40 ++++++++++++++++++++++++++++------------ src/graph_caller.hpp | 9 +++++++-- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index db912596003..b1f640af782 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1069,7 +1069,15 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) { // run vcffixup to add some basic INFO like AC vcf_fixup(v); - add_variant(v); + bool added = add_variant(v); + if (!added) { + stringstream ss; + ss << v; + cerr << "Warning [vg deconstruct]: Skipping variant at " << v.sequenceName << ":" << v.position + << " with ID=" << v.id << " because its line length of " << ss.str().length() << " exceeds vg's limit of " + << VCFOutputCaller::max_vcf_line_length << endl; + return false; + } } } return true; diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 4e3a4d4c203..e3ce68321f2 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -238,16 +238,20 @@ string VCFOutputCaller::vcf_header(const PathHandleGraph& graph, const vector VCFOutputCaller::max_vcf_line_length) { + return false; + } int ret = zstdutil::CompressString(ss.str(), dest); assert(ret == 0); // the Variant object is too big to keep in memory when there are many genotypes, so we // store it in a zstd-compressed string output_variants[omp_get_thread_num()].push_back(make_pair(make_pair(var.sequenceName, var.position), dest)); + return true; } void VCFOutputCaller::write_variants(ostream& out_stream, const 
SnarlManager* snarl_manager) { @@ -440,7 +444,7 @@ string VCFOutputCaller::trav_string(const HandleGraph& graph, const SnarlTravers return seq; } -void VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller, +bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller, const Snarl& snarl, const vector& called_traversals, const vector& genotype, int ref_trav_idx, const unique_ptr& call_info, const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy, @@ -599,8 +603,17 @@ void VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa #endif if (genotype_snarls || !out_variant.alt.empty()) { - add_variant(out_variant); + bool added = add_variant(out_variant); + if (!added) { + stringstream ss; + ss << out_variant; + cerr << "Warning [vg call]: Skipping variant at " << out_variant.sequenceName << ":" << out_variant.position + << " with ID=" << out_variant.id << " because its line length of " << ss.str().length() << " exceeds vg's limit of " + << VCFOutputCaller::max_vcf_line_length << endl; + } + return added; } + return true; } tuple VCFOutputCaller::get_ref_interval( @@ -1074,7 +1087,7 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager } } output_variant_record.second.clear(); - int ret = zstdutil::CompressString(output_variant_string, output_variant_record.second); + ret = zstdutil::CompressString(output_variant_string, output_variant_record.second); assert(ret == 0); } } @@ -1463,10 +1476,9 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); // emit our vcf variant - emit_variant(graph, snarl_caller, snarl, called_traversals, genotype, 0, call_info, path_name, ref_offsets.find(path_name)->second, false, + was_called = emit_variant(graph, snarl_caller, snarl, called_traversals, genotype, 0, call_info, path_name, ref_offsets.find(path_name)->second, 
false, ploidy); - was_called = true; } } if (!is_vg) { @@ -1876,15 +1888,16 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { assert(trav_genotype.empty() || trav_genotype.size() == ploidy); + bool added = true; if (!gaf_output) { - emit_variant(graph, snarl_caller, snarl, travs, trav_genotype, ref_trav_idx, trav_call_info, ref_path_name, - ref_offsets[ref_path_name], genotype_snarls, ploidy); + added = emit_variant(graph, snarl_caller, snarl, travs, trav_genotype, ref_trav_idx, trav_call_info, ref_path_name, + ref_offsets[ref_path_name], genotype_snarls, ploidy); } else { pair pos_info = get_ref_position(graph, snarl, ref_path_name, ref_offsets[ref_path_name]); emit_gaf_variant(graph, print_snarl(snarl), travs, trav_genotype, ref_trav_idx, pos_info.first, pos_info.second, &support_finder); } - ret_val = trav_genotype.size() == ploidy; + ret_val = trav_genotype.size() == ploidy && added; } return ret_val; @@ -2267,10 +2280,13 @@ bool NestedFlowCaller::emit_snarl_recursive(const Snarl& managed_snarl, int ploi return flatten_alt_allele(allele_string, std::min(allele_ploidy-1, genotype_allele), allele_ploidy, call_table); } }; - + if (!gaf_output) { - emit_variant(graph, snarl_caller, managed_snarl, record.travs, genotype.first, record.ref_trav_idx, genotype.second, record.ref_path_name, - ref_offsets[record.ref_path_name], genotype_snarls, ploidy, trav_to_flat_string); + bool added = emit_variant(graph, snarl_caller, managed_snarl, record.travs, genotype.first, record.ref_trav_idx, genotype.second, record.ref_path_name, + ref_offsets[record.ref_path_name], genotype_snarls, ploidy, trav_to_flat_string); + if (!added) { + return false; + } } else { // todo: // emit_gaf_variant(graph, snarl, travs, trav_genotype); diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index 0485d03db3f..526dcbef573 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -85,7 +85,8 @@ class VCFOutputCaller { const vector& contig_length_overrides) const; /// Add a 
variant to our buffer - void add_variant(vcflib::Variant& var) const; + /// Returns false if the variant line length exceeds VCFOutputCaller::max_vcf_line_length + bool add_variant(vcflib::Variant& var) const; /// Sort then write variants in the buffer /// snarl_manager needed if include_nested is true @@ -113,7 +114,8 @@ class VCFOutputCaller { string trav_string(const HandleGraph& graph, const SnarlTraversal& trav) const; /// print a vcf variant - void emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller, + /// return value is taken from add_variant (see above) + bool emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller, const Snarl& snarl, const vector& called_traversals, const vector& genotype, int ref_trav_idx, const unique_ptr& call_info, const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy, @@ -166,6 +168,9 @@ class VCFOutputCaller { // need to write LV/PS info tags bool include_nested; + + // prevent giant variants + static const int64_t max_vcf_line_length = 2000000000; }; /** From 19bf23a8b338a43d3b9d95ed6058c8a59c4fb326 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 26 Jun 2024 20:55:54 -0400 Subject: [PATCH 64/67] terrible typo --- src/deconstructor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index b1f640af782..24a5c116228 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1285,7 +1285,7 @@ void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { queue.pop_back(); snarls.push_back(snarl); const vector& children = snarl_manager->children_of(snarl); - snarls.insert(snarls.end(), children.begin(), children.end()); + queue.insert(queue.end(), children.begin(), children.end()); } } else { swap(snarls, queue); From 761cc486660129f4bfcfae9bdaa0bb1e861b794b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 27 Jun 2024 11:55:01 -0400 Subject: [PATCH 65/67] Pad out fallback pair 
alignments This should fix #4315 --- src/minimizer_mapper.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp index 4cc4224368e..f3a4ad1d34d 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -1364,6 +1364,14 @@ pair, vector> MinimizerMapper::map_paired(Alignment // Map single-ended and bail std::array, 2> mapped_pair = {map(aln1), map(aln2)}; + // We have no way to know which mapping ought to go with each other, so pad out + // the shorter list with copies of the first item, so we can report all mappings + // we got. + for (size_t index = 0; index < 2; index++) { + while (mapped_pair[index].size() < mapped_pair[1 - index].size()) { + mapped_pair[index].push_back(mapped_pair[index].first()); + } + } pair_all(mapped_pair); return {std::move(mapped_pair[0]), std::move(mapped_pair[1])}; } From 870339b277d540918b394c961135ea80849757f9 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 28 Jun 2024 11:12:23 -0400 Subject: [PATCH 66/67] Use right method to get first element --- src/minimizer_mapper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp index f3a4ad1d34d..9def875f42e 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -1369,7 +1369,7 @@ pair, vector> MinimizerMapper::map_paired(Alignment // we got. 
for (size_t index = 0; index < 2; index++) { while (mapped_pair[index].size() < mapped_pair[1 - index].size()) { - mapped_pair[index].push_back(mapped_pair[index].first()); + mapped_pair[index].push_back(mapped_pair[index].front()); } } pair_all(mapped_pair); From af500c97b988ae90be173d44198173502200ccc2 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 28 Jun 2024 11:45:21 -0400 Subject: [PATCH 67/67] Make vg inject use AlignmentEmitter and support GAF --- src/subcommand/inject_main.cpp | 36 +++++++++++++++++++++++++--------- test/t/39_vg_inject.t | 4 +++- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/subcommand/inject_main.cpp b/src/subcommand/inject_main.cpp index a9ed8cbcb8f..99b1c8fe234 100644 --- a/src/subcommand/inject_main.cpp +++ b/src/subcommand/inject_main.cpp @@ -14,6 +14,7 @@ #include "../alignment.hpp" #include "../vg.hpp" #include "../xg.hpp" +#include "../hts_alignment_emitter.hpp" #include #include #include @@ -23,10 +24,11 @@ using namespace vg; using namespace vg::subcommand; void help_inject(char** argv) { - cerr << "usage: " << argv[0] << " inject [options] input.[bam|sam|cram] >output.gam" << endl + cerr << "usage: " << argv[0] << " inject -x graph.xg [options] input.[bam|sam|cram] >output.gam" << endl << endl << "options:" << endl << " -x, --xg-name FILE use this graph or xg index (required, non-XG formats also accepted)" << endl + << " -o, --output-format NAME output the alignments in NAME format (gam / gaf / json) [gam]" << endl << " -t, --threads N number of threads to use" << endl; } @@ -37,6 +39,8 @@ int main_inject(int argc, char** argv) { } string xg_name; + string output_format = "GAM"; + std::set output_formats = { "GAM", "GAF", "JSON" }; int threads = get_thread_count(); int c; @@ -46,12 +50,13 @@ int main_inject(int argc, char** argv) { { {"help", no_argument, 0, 'h'}, {"xg-name", required_argument, 0, 'x'}, + {"output-format", required_argument, 0, 'o'}, {"threads", required_argument, 0, 't'}, {0, 0, 0, 0} 
}; int option_index = 0; - c = getopt_long (argc, argv, "hx:t:", + c = getopt_long (argc, argv, "hx:o:t:", long_options, &option_index); // Detect the end of the options. @@ -63,6 +68,19 @@ int main_inject(int argc, char** argv) { case 'x': xg_name = optarg; break; + + case 'o': + { + output_format = optarg; + for (char& c : output_format) { + c = std::toupper(c); + } + if (output_formats.find(output_format) == output_formats.end()) { + std::cerr << "error: [vg inject] Invalid output format: " << optarg << std::endl; + std::exit(1); + } + } + break; case 't': threads = parse(optarg); @@ -90,14 +108,14 @@ int main_inject(int argc, char** argv) { } unique_ptr path_handle_graph = vg::io::VPKG::load_one(xg_name); bdsg::PathPositionOverlayHelper overlay_helper; - PathPositionHandleGraph* xgidx = overlay_helper.apply(path_handle_graph.get()); + PathPositionHandleGraph* xgidx = overlay_helper.apply(path_handle_graph.get()); - vg::io::ProtobufEmitter buf(cout); - function lambda = [&buf](Alignment& aln) { -#pragma omp critical (buf) - { - buf.write(std::move(aln)); - } + // We don't do HTS output formats but we do need an empty paths collection to make an alignment emitter + vector> paths; + unique_ptr alignment_emitter = get_alignment_emitter("-", output_format, paths, threads, xgidx); + + function lambda = [&](Alignment& aln) { + alignment_emitter->emit_mapped_single({std::move(aln)}); }; if (threads > 1) { hts_for_each_parallel(file_name, lambda, xgidx); diff --git a/test/t/39_vg_inject.t b/test/t/39_vg_inject.t index 541c2e9cbf6..f1aa362cee1 100644 --- a/test/t/39_vg_inject.t +++ b/test/t/39_vg_inject.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 12 +plan tests 13 vg construct -r small/x.fa > j.vg vg index -x j.xg j.vg @@ -52,5 +52,7 @@ cat <(samtools view -H small/x.bam) <(printf "name\t4\t*\t0\t0\t*\t*\t0\t0\tACGT is "$(vg inject -x x.xg unmapped.sam | vg view -aj - | grep "path" | wc -l)" 0 "vg inject does not make an 
alignment for an umapped read" is "$(echo $?)" 0 "vg inject does not crash on an unmapped read" +is $(vg inject -x x.xg small/x.bam -o GAF | wc -l) \ + 1000 "vg inject supports GAF output" rm j.vg j.xg x.vg x.gcsa x.gcsa.lcp x.xg unmapped.sam