Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sample fragmented haplotypes #4523

Merged
merged 17 commits into from
Mar 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion deps/gbwt
53 changes: 53 additions & 0 deletions src/algorithms/extract_subchain.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "extract_subchain.hpp"

#include <stack>

namespace vg {

//------------------------------------------------------------------------------

hash_set<nid_t> extract_subchain(const HandleGraph& graph, handle_t start, handle_t end) {
hash_set<nid_t> result;
nid_t start_id = graph.get_id(start);
bool start_is_reverse = graph.get_is_reverse(start);
nid_t end_id = graph.get_id(end);
bool end_is_reverse = graph.get_is_reverse(end);

std::stack<nid_t> active;
active.push(start_id);
active.push(end_id);
while (!active.empty()) {
nid_t node_id = active.top(); active.pop();
if (result.count(node_id)) {
continue;
}
result.insert(node_id);
bool go_forward = true, go_backward = true;
if (node_id == start_id) {
go_forward &= !start_is_reverse;
go_backward &= start_is_reverse;

}
if (node_id == end_id) {
go_forward &= end_is_reverse;
go_backward &= !end_is_reverse;
}
handle_t handle = graph.get_handle(node_id, false);
if (go_forward) {
graph.follow_edges(handle, false, [&](const handle_t& next) {
active.push(graph.get_id(next));
});
}
if (go_backward) {
graph.follow_edges(handle, true, [&](const handle_t& prev) {
active.push(graph.get_id(prev));
});
}
}

return result;
}

//------------------------------------------------------------------------------

} // namespace vg
36 changes: 36 additions & 0 deletions src/algorithms/extract_subchain.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#ifndef VG_ALGORITHMS_EXTRACT_SUBCHAIN_HPP_INCLUDED
#define VG_ALGORITHMS_EXTRACT_SUBCHAIN_HPP_INCLUDED

/**
* \file extract_subchain.hpp
*
* Algorithm that extracts all node ids in a subchain of the snarl decomposition.
*/

#include "../handle.hpp"
#include "../hash_map.hpp"

namespace vg {

//------------------------------------------------------------------------------

/**
* Extracts all node ids in a subchain of the snarl decomposition.
*
* The subchain is defined by two handles, start and end.
* If the corresponding nodes are on the same chain in the snarl decomposition
* and end is after start, the function returns all node ids in the subchain
* between them. This means the identifiers of start and end, as well as all
* node identifiers in the weakly connected component formed by removing start
* and end from the graph.
*
* If the nodes are not on the same chain or end is before start, the behavior
* is undefined.
*/
hash_set<nid_t> extract_subchain(const HandleGraph& graph, handle_t start, handle_t end);

//------------------------------------------------------------------------------

} // namespace vg

#endif // VG_ALGORITHMS_EXTRACT_SUBCHAIN_HPP_INCLUDED
4 changes: 3 additions & 1 deletion src/gaf_sorter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,15 @@ void GAFSorterRecord::set_key(key_type type) {
}
} else if (type == key_gbwt_pos) {
std::uint32_t node_id = std::numeric_limits<std::uint32_t>::max();
bool is_reverse = false;
std::uint32_t offset = std::numeric_limits<std::uint32_t>::max();
this->for_each_field([&](size_t i, str_view value) -> bool {
if (i == PATH_FIELD && value.size > 1) {
auto result = std::from_chars(value.data + 1, value.data + value.size, node_id);
if (result.ec != std::errc()) {
return false;
}
is_reverse = (value[0] == '<');
} else if (i >= MANDATORY_FIELDS) {
size_t tag_size = GBWT_OFFSET_TAG.size();
if (value.size > tag_size && value.substr(0, tag_size) == GBWT_OFFSET_TAG) {
Expand All @@ -77,7 +79,7 @@ void GAFSorterRecord::set_key(key_type type) {
// We either did not find both fields or failed to parse them.
this->key = MISSING_KEY;
} else {
this->key = (static_cast<std::uint64_t>(node_id) << 32) | offset;
this->key = (static_cast<std::uint64_t>(node_id) << 33) | (static_cast<std::uint64_t>(is_reverse) << 32) | offset;
}
} else if (type == key_hash) {
this->key = hasher(this->value);
Expand Down
Loading
Loading