diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index e869236e173..588682381aa 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -618,8 +618,7 @@ unordered_map> Deconstructor::add_star_traversals(vector& names, vector>& trav_clusters, vector>& trav_cluster_info, - const unordered_map>& parent_haplotypes) const { - + const unordered_map>& parent_haplotypes) const { // todo: refactor this into general genotyping code unordered_map> sample_to_haps; @@ -692,6 +691,8 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // compute all the traversals from embedded paths and gbwt this->get_traversals(snarl_start, snarl_end, travs, trav_path_names, trav_steps); + int64_t trav_count = travs.size(); + int64_t trav_step_count = trav_steps.size(); if (travs.empty()) { return false; @@ -745,7 +746,6 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // in case of cycles, we need our allele traversals to be associated to the correct reference position // this is done with the path jaccard metric over all overlapping reference paths the given path_jaccard_window size - vector ref_travs; // hacky subpath support -- gets added to variant on output vector ref_offsets; @@ -831,6 +831,11 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // we write a variant for every reference traversal // (optionally) selecting the subset of path traversals that are 1:1 for (size_t i = 0; i < ref_travs.size(); ++i) { + // we zap these to their original size, as the nesting logic can add + // dummy traversals and these are reference-specific (and so need to be cleaned each iteration here) + travs.resize(trav_count); + trav_path_names.resize(trav_count); + trav_steps.resize(trav_step_count); auto& ref_trav_idx = ref_travs[i]; auto& ref_trav_offset = ref_offsets[i]; @@ -907,8 +912,13 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t } } + if (std::none_of(use_trav.begin(), use_trav.end(), [](bool b) {return b;})) { + // no alts were jaccard-assigned to this reference, so abort before an assertion gets tripped + continue; + } + // Sort the traversals for clustering - vector sorted_travs = get_traversal_order(graph, travs, trav_path_names, ref_travs, use_trav); + vector sorted_travs = get_traversal_order(graph, travs, trav_path_names, ref_travs, ref_trav_idx, use_trav); // jaccard clustering (using handles for now) on traversals vector> trav_cluster_info; @@ -936,7 +946,7 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t #endif unordered_map> sample_to_haps; - if (i == 0 && in_nesting_info != nullptr) { + if (in_nesting_info != nullptr) { // if the reference traversal is also an alt traversal, we pop out an extra copy // todo: this is a hack add add off-reference support while keeping the current // logic where the reference traversal is always distinct from the alts. this step diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index d45944651d2..1fb020be263 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -34,6 +34,7 @@ vector get_traversal_order(const PathHandleGraph* graph, const vector& traversals, const vector& trav_path_names, const vector& ref_travs, + int64_t ref_trav_idx, const vector& use_traversal) { set ref_set(ref_travs.begin(), ref_travs.end()); @@ -54,7 +55,7 @@ vector get_traversal_order(const PathHandleGraph* graph, vector sorted_travs; sorted_travs.reserve(traversals.size()); for (int64_t i = 0; i < traversals.size(); ++i) { - if (use_traversal[i]) { + if (use_traversal[i] && (i == ref_trav_idx || !ref_set.count(i))) { sorted_travs.push_back(i); } } @@ -107,9 +108,10 @@ vector> cluster_traversals(const PathHandleGraph* graph, // need the clusters as sorted lists. we'll forget the endpoints while we're at // it since they're always shared. note we work with multisets since we want to // count differences between, say, cycle copy numbers. - vector> sorted_traversals; - for (const Traversal& trav : traversals) { - assert(trav.size() >=2 ); + vector> sorted_traversals(traversals.size()); + for (const int& i : traversal_order) { + const auto& trav = traversals[i]; + assert(trav.size() >=2); // prune snarl nodes as they're always the same // todo: does jaccard properly handle empty sets? int64_t first = trav.size() == 2 ? 0 : 1; @@ -118,7 +120,7 @@ vector> cluster_traversals(const PathHandleGraph* graph, for (int64_t i = first; i < last; ++i) { sorted_trav.insert(trav[i]); } - sorted_traversals.push_back(sorted_trav); + sorted_traversals[i] = sorted_trav; } for (const int& i : traversal_order) { @@ -177,7 +179,7 @@ vector> cluster_traversals(const PathHandleGraph* graph, } } } - assert(uncovered_child_snarls.empty()); + assert(uncovered_child_snarls.empty() || traversal_order.size() < traversals.size()); // fill in the size deltas for (vector& cluster : clusters) { diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp index ee850b944be..a4e7710d4b5 100644 --- a/src/traversal_clusters.hpp +++ b/src/traversal_clusters.hpp @@ -60,6 +60,7 @@ vector get_traversal_order(const PathHandleGraph* graph, const vector& traversals, const vector& trav_path_names, const vector& ref_travs, + int64_t ref_trav_idx, const vector& use_traversal);