diff --git a/deps/gbwt b/deps/gbwt
index 99daaccf4c..bde6858046 160000
--- a/deps/gbwt
+++ b/deps/gbwt
@@ -1 +1 @@
-Subproject commit 99daaccf4c8141e5e28680be031a481eb843e2c6
+Subproject commit bde6858046580d1b9dbfa54f48ab187c85998ffe
diff --git a/deps/gbwtgraph b/deps/gbwtgraph
index ebb77a18c1..d5e1955e9f 160000
--- a/deps/gbwtgraph
+++ b/deps/gbwtgraph
@@ -1 +1 @@
-Subproject commit ebb77a18c1fd5d13bb9ac3cc1f6fc75807b742f2
+Subproject commit d5e1955e9f8e478c0af02966f932f6820b78a58b
diff --git a/src/algorithms/extract_subchain.cpp b/src/algorithms/extract_subchain.cpp
new file mode 100644
index 0000000000..57898489ac
--- /dev/null
+++ b/src/algorithms/extract_subchain.cpp
@@ -0,0 +1,53 @@
+#include "extract_subchain.hpp"
+
+#include <stack>
+
+namespace vg {
+
+//------------------------------------------------------------------------------
+
+hash_set<nid_t> extract_subchain(const HandleGraph& graph, handle_t start, handle_t end) {
+    hash_set<nid_t> result;
+    nid_t start_id = graph.get_id(start);
+    bool start_is_reverse = graph.get_is_reverse(start);
+    nid_t end_id = graph.get_id(end);
+    bool end_is_reverse = graph.get_is_reverse(end);
+
+    std::stack<nid_t> active;
+    active.push(start_id);
+    active.push(end_id);
+    while (!active.empty()) {
+        nid_t node_id = active.top(); active.pop();
+        if (result.count(node_id)) {
+            continue;
+        }
+        result.insert(node_id);
+        bool go_forward = true, go_backward = true;
+        if (node_id == start_id) {
+            go_forward &= !start_is_reverse;
+            go_backward &= start_is_reverse;
+
+        }
+        if (node_id == end_id) {
+            go_forward &= end_is_reverse;
+            go_backward &= !end_is_reverse;
+        }
+        handle_t handle = graph.get_handle(node_id, false);
+        if (go_forward) {
+            graph.follow_edges(handle, false, [&](const handle_t& next) {
+                active.push(graph.get_id(next));
+            });
+        }
+        if (go_backward) {
+            graph.follow_edges(handle, true, [&](const handle_t& prev) {
+                active.push(graph.get_id(prev));
+            });
+        }
+    }
+
+    return result;
+}
+
+//------------------------------------------------------------------------------
+
+} // namespace vg
diff --git a/src/algorithms/extract_subchain.hpp b/src/algorithms/extract_subchain.hpp
new file mode 100644
index 0000000000..13da6d04a3
--- /dev/null
+++ b/src/algorithms/extract_subchain.hpp
@@ -0,0 +1,36 @@
+#ifndef VG_ALGORITHMS_EXTRACT_SUBCHAIN_HPP_INCLUDED
+#define VG_ALGORITHMS_EXTRACT_SUBCHAIN_HPP_INCLUDED
+
+/**
+ * \file extract_subchain.hpp
+ *
+ * Algorithm that extracts all node ids in a subchain of the snarl decomposition.
+ */
+
+#include "../handle.hpp"
+#include "../hash_map.hpp"
+
+namespace vg {
+
+//------------------------------------------------------------------------------
+
+/**
+ * Extracts all node ids in a subchain of the snarl decomposition.
+ *
+ * The subchain is defined by two handles, start and end.
+ * If the corresponding nodes are on the same chain in the snarl decomposition
+ * and end is after start, the function returns all node ids in the subchain
+ * between them. This means the identifiers of start and end, as well as all
+ * node identifiers in the weakly connected component formed by removing start
+ * and end from the graph.
+ *
+ * If the nodes are not on the same chain or end is before start, the behavior
+ * is undefined.
+ */ +hash_set extract_subchain(const HandleGraph& graph, handle_t start, handle_t end); + +//------------------------------------------------------------------------------ + +} // namespace vg + +#endif // VG_ALGORITHMS_EXTRACT_SUBCHAIN_HPP_INCLUDED diff --git a/src/gaf_sorter.cpp b/src/gaf_sorter.cpp index 12d158b0fd..08e2dfff0e 100644 --- a/src/gaf_sorter.cpp +++ b/src/gaf_sorter.cpp @@ -57,6 +57,7 @@ void GAFSorterRecord::set_key(key_type type) { } } else if (type == key_gbwt_pos) { std::uint32_t node_id = std::numeric_limits::max(); + bool is_reverse = false; std::uint32_t offset = std::numeric_limits::max(); this->for_each_field([&](size_t i, str_view value) -> bool { if (i == PATH_FIELD && value.size > 1) { @@ -64,6 +65,7 @@ void GAFSorterRecord::set_key(key_type type) { if (result.ec != std::errc()) { return false; } + is_reverse = (value[0] == '<'); } else if (i >= MANDATORY_FIELDS) { size_t tag_size = GBWT_OFFSET_TAG.size(); if (value.size > tag_size && value.substr(0, tag_size) == GBWT_OFFSET_TAG) { @@ -77,7 +79,7 @@ void GAFSorterRecord::set_key(key_type type) { // We either did not find both fields or failed to parse them. this->key = MISSING_KEY; } else { - this->key = (static_cast(node_id) << 32) | offset; + this->key = (static_cast(node_id) << 33) | (static_cast(is_reverse) << 32) | offset; } } else if (type == key_hash) { this->key = hasher(this->value); diff --git a/src/recombinator.cpp b/src/recombinator.cpp index 65d540b2d0..598542283b 100644 --- a/src/recombinator.cpp +++ b/src/recombinator.cpp @@ -3,6 +3,7 @@ #include "kff.hpp" #include "statistics.hpp" #include "algorithms/component.hpp" +#include "algorithms/extract_subchain.hpp" #include #include @@ -180,8 +181,8 @@ double Haplotypes::Subchain::badness(const gbwtgraph::GBZ& gbz) const { } } size_t length = this->distance(gbz, selected); - if (length < expected_length) { - result += std::log(static_cast(expected_length) / static_cast(length)); + if (length > expected_length) { + result += std::log(static_cast(length) / static_cast(expected_length)); } } @@ -366,7 +367,7 @@ void Haplotypes::simple_sds_load(std::istream& in) { this->chains.resize(this->header.top_level_chains); for (auto& chain : this->chains) { - if (this->header.version == Header::VERSION) { + if (this->header.version >= 3) { chain.simple_sds_load(in); } else if (this->header.version == 2) { chain.load_v2(in); @@ -388,16 +389,92 @@ size_t Haplotypes::simple_sds_size() const { return result; } +std::vector Haplotypes::assign_reference_paths(const gbwtgraph::GBZ& gbz, const gbwt::FragmentMap& fragment_map, Verbosity verbosity) const { + if (verbosity >= verbosity_basic) { + std::cerr << "Assigning reference paths to GBWT construction jobs" << std::endl; + } + double start = gbwt::readTimer(); + + // All paths are initially unassigned. + std::vector result (gbz.named_paths(), this->jobs()); + + auto path_id_to_named_path_offset = [&](gbwt::size_type path_id) -> size_t { + path_handle_t handle = gbz.graph.path_to_handle(path_id); + size_t as_integer = handlegraph::as_integer(handle); + if (as_integer >= gbz.named_paths()) { + return std::numeric_limits::max(); + } else { + return as_integer; + } + }; + + // We do a lot of redundant work here. We iterate over every sequence in every subchain + // and top-level chain. If the sequence is a cached (generic or reference) path, we + // assign it to the job corresponding to the top-level chain. 
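The key_gbwt_pos change in gaf_sorter.cpp above widens the sort key so that the starting node id, its orientation, and the offset within the node all fit in one 64-bit value. A minimal sketch of the packing, assuming the casts are to std::uint64_t as the 64-bit key implies (the helper name and sample values are illustrative, not from the patch):

#include <cstdint>

// Key layout: bits 33-63 hold the node id, bit 32 the orientation, bits 0-31 the offset.
std::uint64_t pack_gbwt_pos_key(std::uint32_t node_id, bool is_reverse, std::uint32_t offset) {
    return (static_cast<std::uint64_t>(node_id) << 33)
        | (static_cast<std::uint64_t>(is_reverse) << 32)
        | offset;
}

Sorting by this key groups records by starting node, then by orientation, then by offset; for example, pack_gbwt_pos_key(42, false, 7) sorts before pack_gbwt_pos_key(42, true, 0).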
+ for (const TopLevelChain& chain : this->chains) { + size_t job = chain.job_id; + for (const Subchain& subchain : chain.subchains) { + for (const compact_sequence_type& sequence : subchain.sequences) { + gbwt::size_type path_id = gbwt::Path::id(sequence.first); + size_t offset = path_id_to_named_path_offset(path_id); + if (offset < gbz.named_paths()) { + result[offset] = job; + } + } + } + } + + // TODO: Fragment map could provide an iterator. + // If some paths were left unassigned because they are located within a subchain, we + // try to use the fragment map to rescue them. + size_t unassigned = 0; + for (size_t i = 0; i < result.size(); i++) { + if (result[i] < this->jobs()) { + continue; + } + gbwt::size_type first = gbz.graph.named_paths[i].id; + while (true) { + gbwt::size_type prev = fragment_map.prev(first); + if (prev == gbwt::invalid_sequence()) { + break; + } + first = prev; + } + for (gbwt::size_type curr = first; curr != gbwt::invalid_sequence(); curr = fragment_map.next(curr)) { + size_t offset = path_id_to_named_path_offset(curr); + if (offset < gbz.named_paths() && result[offset] < this->jobs()) { + result[i] = result[offset]; + break; + } + } + if (result[i] > this->jobs()) { + unassigned++; + } + } + + if (verbosity >= verbosity_basic) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Assigned " << (result.size() - unassigned) << " reference paths (" << unassigned << " unassigned) in " << seconds << " seconds" << std::endl; + } + return result; +} + //------------------------------------------------------------------------------ -HaplotypePartitioner::HaplotypePartitioner(const gbwtgraph::GBZ& gbz, +HaplotypePartitioner::HaplotypePartitioner( + const gbwtgraph::GBZ& gbz, const gbwt::FastLocate& r_index, const SnarlDistanceIndex& distance_index, const minimizer_index_type& minimizer_index, - Verbosity verbosity) : - gbz(gbz), r_index(r_index), distance_index(distance_index), minimizer_index(minimizer_index), + Verbosity verbosity +) : + gbz(gbz), fragment_map(gbz.index.metadata, verbosity >= Haplotypes::verbosity_extra_debug), r_index(r_index), + distance_index(distance_index), minimizer_index(minimizer_index), verbosity(verbosity) { + if (this->verbosity >= Haplotypes::verbosity_detailed) { + std::cerr << "HaplotypePartitioner: " << this->gbz.index.metadata.paths() << " fragments for " << this->fragment_map.size() << " haplotype sequences" << std::endl; + } } void HaplotypePartitioner::Parameters::print(std::ostream& out) const { @@ -480,7 +557,7 @@ Haplotypes HaplotypePartitioner::partition_haplotypes(const Parameters& paramete } // Assign named and reference paths to jobs. - result.jobs_for_cached_paths = std::vector(this->gbz.graph.named_paths.size(), result.jobs()); + result.jobs_for_cached_paths = std::vector(this->gbz.named_paths(), result.jobs()); // Again, we do not use a path filter, because a GBZ graph should not contain alt paths. auto assignments = gbwtgraph::assign_paths(this->gbz.graph, jobs, nullptr, nullptr); for (size_t job = 0; job < assignments.size(); job++) { @@ -678,86 +755,205 @@ HaplotypePartitioner::get_subchains(const gbwtgraph::TopLevelChain& chain, const //------------------------------------------------------------------------------ -std::vector HaplotypePartitioner::get_sequence_visits(handle_t handle) const { - std::vector sa = this->r_index.decompressSA(gbwtgraph::GBWTGraph::handle_to_node(handle)); - std::vector result; +// Returns (SA[i], i) for the handle, ordered by i. 
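The rescue loop in assign_reference_paths() above uses a fragment-map idiom that recurs throughout this patch: rewind to the first fragment with prev(), then scan forward with next(). A minimal sketch of that idiom as a reusable helper, assuming prev() and next() accept the same path identifiers they are given above (the helper name for_each_fragment is hypothetical):

// Visits every fragment of the fragmented haplotype containing path_id, in order.
template<typename Callback>
void for_each_fragment(const gbwt::FragmentMap& fragments, gbwt::size_type path_id, Callback&& callback) {
    // Rewind to the first fragment.
    while (true) {
        gbwt::size_type prev = fragments.prev(path_id);
        if (prev == gbwt::invalid_sequence()) {
            break;
        }
        path_id = prev;
    }
    // Scan forward over the fragments.
    for (gbwt::size_type curr = path_id; curr != gbwt::invalid_sequence(); curr = fragments.next(curr)) {
        callback(curr);
    }
}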
+std::vector get_sequence_visits(handle_t handle, const gbwt::FastLocate& r_index) { + std::vector sa = r_index.decompressSA(gbwtgraph::GBWTGraph::handle_to_node(handle)); + std::vector result; result.reserve(sa.size()); for (size_t i = 0; i < sa.size(); i++) { result.push_back({ sa[i], i }); } + return result; +} + +std::vector HaplotypePartitioner::get_sequences(handle_t handle) const { + auto result = get_sequence_visits(handle, this->r_index); std::sort(result.begin(), result.end(), [&](sequence_type a, sequence_type b) -> bool { - gbwt::size_type a_id = r_index.seqId(a.first); - gbwt::size_type a_offset = r_index.seqOffset(a.first); - gbwt::size_type b_id = r_index.seqId(b.first); - gbwt::size_type b_offset = r_index.seqOffset(b.first); + gbwt::size_type a_id = this->r_index.seqId(a.first); + gbwt::size_type a_offset = this->r_index.seqOffset(a.first); + gbwt::size_type b_id = this->r_index.seqId(b.first); + gbwt::size_type b_offset = this->r_index.seqOffset(b.first); return ((a_id < b_id) || ((a_id == b_id) && (a_offset > b_offset))); }); + for (auto& sequence : result) { + sequence.first = this->r_index.seqId(sequence.first); + } return result; } -void sa_to_da(std::vector& sequences, const gbwt::FastLocate& r_index) { - for (auto& sequence : sequences) { - sequence.first = r_index.seqId(sequence.first); +// A sequence visit (SA[i], i) decomposed into a visit in a fragmented haplotype. +// Visits can be sorted by (chain, orientation, position within the chain). +struct FragmentedHaplotypeVisit { + // (DA[i], i). + gbwt::size_type sequence_id; + gbwt::size_type gbwt_node_offset; + + // Information used for ordering visits. + gbwt::size_type oriented_chain_id; + gbwt::size_type rank_in_chain; + gbwt::size_type rank_in_sequence; + + FragmentedHaplotypeVisit(HaplotypePartitioner::sequence_type sequence_visit, const HaplotypePartitioner& partitioner) { + // Sequence id is encoded in the SA value and node offset is already given. + this->sequence_id = partitioner.r_index.seqId(sequence_visit.first); + this->gbwt_node_offset = sequence_visit.second; + + // We get the chain id from the fragment map and the orientation from the sequence id. + // Then we abuse the path/sequence name encoding to encode the orientation in the id. + gbwt::size_type path_id = gbwt::Path::id(this->sequence_id); + bool is_reverse = gbwt::Path::is_reverse(this->sequence_id); + this->oriented_chain_id = gbwt::Path::encode(partitioner.fragment_map.chain(path_id), is_reverse); + + // Rank in chain is based on the count/fragment/starting offset field in path name. + // But we must consider the orientation of the chain. + gbwt::PathName path_name = partitioner.gbz.index.metadata.path(path_id); + this->rank_in_chain = path_name.count; + if (is_reverse) { + this->rank_in_chain = std::numeric_limits::max() - this->rank_in_chain; + } + + // Sequence offset in the SA value is the distance to the end of the sequence in nodes. + this->rank_in_sequence = std::numeric_limits::max() - partitioner.r_index.seqOffset(sequence_visit.first); } -} -std::vector HaplotypePartitioner::get_sequences(handle_t handle) const { - auto result = this->get_sequence_visits(handle); - sa_to_da(result, this->r_index); - return result; -} + // Returns true if the visits are in the same chain in the same orientation. 
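To make the ordering defined by the FragmentedHaplotypeVisit constructor above concrete (illustrative numbers only): with max denoting the largest gbwt::size_type, a reverse-oriented visit to fragment 2 of a chain gets rank_in_chain = max - 2, which is smaller than max - 1 for fragment 1, so fragment 2 sorts first, exactly the order in which a reverse traversal of the chain meets the fragments. Similarly, rank_in_sequence = max - seqOffset makes visits closer to the start of a fragment (larger distance to the end) sort before later visits within the same fragment.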
+ bool same_chain(const FragmentedHaplotypeVisit& another) const { + return (this->oriented_chain_id == another.oriented_chain_id); + } -std::vector HaplotypePartitioner::get_sequences(Subchain subchain) const { - if (subchain.type == Haplotypes::Subchain::prefix) { - return this->get_sequences(subchain.end); + // Orders the visits by chain and orientation, and visits within the chain + // by their position in the chain. + bool operator<(const FragmentedHaplotypeVisit& another) const { + if (this->oriented_chain_id != another.oriented_chain_id) { + return (this->oriented_chain_id < another.oriented_chain_id); + } else if (this->rank_in_chain != another.rank_in_chain) { + return (this->rank_in_chain < another.rank_in_chain); + } else { + return (this->rank_in_sequence < another.rank_in_sequence); + } } - if (subchain.type == Haplotypes::Subchain::suffix) { - return this->get_sequences(subchain.start); + + // Returns (DA[i], i). + HaplotypePartitioner::sequence_type to_sequence() const { + return HaplotypePartitioner::sequence_type(this->sequence_id, this->gbwt_node_offset); } - auto from = this->get_sequence_visits(subchain.start); - auto to = this->get_sequence_visits(subchain.end); +}; + +// Matches the minimal sequence visits in `from` and `to` by the same oriented chain of haplotype sequences. +// Requires that the fragments remain within the subchain defined by start and end. +// Returns (DA[i], i) for the matching visits in `from`, sorted by rank in the chain. +std::vector match_visits( + handle_t start, handle_t end, + const std::vector& from_visits, + const std::vector& to_visits, + const HaplotypePartitioner& partitioner +) { + std::vector from, to; + for (auto& visit : from_visits) { + from.push_back(FragmentedHaplotypeVisit(visit, partitioner)); + } + std::sort(from.begin(), from.end()); + for (auto& visit : to_visits) { + to.push_back(FragmentedHaplotypeVisit(visit, partitioner)); + } + std::sort(to.begin(), to.end()); + + // When there are multiple fragments, some of them could be outside the subchain. + // We want to avoid that to keep the kmers we use specific to the subchain. + hash_set node_ids = extract_subchain(partitioner.gbz.graph, start, end); + auto within_subchain = [&](const FragmentedHaplotypeVisit& from_visit, const FragmentedHaplotypeVisit& to_visit) -> bool { + gbwt::size_type sequence_id = from_visit.sequence_id; + gbwt::edge_type pos(gbwtgraph::GBWTGraph::handle_to_node(start), from_visit.gbwt_node_offset); + while (sequence_id != to_visit.sequence_id) { + while (pos.first != gbwt::ENDMARKER) { + if (node_ids.find(gbwt::Node::id(pos.first)) == node_ids.end()) { + // The fragment is outside the subchain. + return false; + } + pos = partitioner.gbz.index.LF(pos); + } + sequence_id = partitioner.fragment_map.oriented_next(sequence_id); + if (sequence_id == gbwt::invalid_sequence()) { + // No more fragments in the chain. + return false; + } + pos = partitioner.gbz.index.start(sequence_id); + if (pos.first != gbwt::ENDMARKER && node_ids.find(gbwt::Node::id(pos.first)) == node_ids.end()) { + // The fragment starts outside the subchain. + return false; + } + } + + // We have ensured that this is a minimal end-to-end visit. Once the sequence ids match + // and we know that the position is within the subchain, we cannot leave the subchain and + // later reach the end without re-entering the subchain. 
+ return true; + }; + std::vector result; auto from_iter = from.begin(); auto to_iter = to.begin(); - std::vector result; while (from_iter != from.end() && to_iter != to.end()) { - gbwt::size_type from_id = this->r_index.seqId(from_iter->first); - gbwt::size_type to_id = this->r_index.seqId(to_iter->first); - if (from_id == to_id) { - // If a haplotype crosses the subchain multiple times, we take the last entry before - // each exit. - gbwt::size_type to_offset = this->r_index.seqOffset(to_iter->first); - if (this->r_index.seqOffset(from_iter->first) >= to_offset) { - auto peek = from_iter +1; - while (peek != from.end() && this->r_index.seqId(peek->first) == from_id && this->r_index.seqOffset(peek->first) >= to_offset) { + if (*from_iter < *to_iter) { + if (from_iter->same_chain(*to_iter)) { + auto peek = from_iter + 1; + while (peek != from.end() && *peek < *to_iter) { from_iter = peek; ++peek; } - result.push_back(*from_iter); + if (within_subchain(*from_iter, *to_iter)) { + result.push_back(from_iter->to_sequence()); + } ++from_iter; ++to_iter; } else { - ++to_iter; + ++from_iter; } - } else if (from_id < to_id) { - ++from_iter; - } else if (from_id > to_id) { + } else { ++to_iter; } } - sa_to_da(result, this->r_index); return result; } +std::vector HaplotypePartitioner::get_sequences(Subchain subchain) const { + if (subchain.type == Haplotypes::Subchain::prefix) { + // NOTE: If a sequence flips after this subchain and returns to it, we will + // take the initial prefix that exits the subchain multiple times. + return this->get_sequences(subchain.end); + } + if (subchain.type == Haplotypes::Subchain::suffix) { + // NOTE: If the sequence flips, exits the subchain, and later returns to it, + // some of the haplotypes will overlap the preceding subchain(s). + return this->get_sequences(subchain.start); + } + auto from = get_sequence_visits(subchain.start, this->r_index); + auto to = get_sequence_visits(subchain.end, this->r_index); + return match_visits(subchain.start, subchain.end, from, to, *this); +} + //------------------------------------------------------------------------------ // Generate a haplotype over the closed range from `pos` to `end`. +// The haplotype may consist of multiple fragments. // Take at most start_max and end_max characters from the initial and the final -// node, respectively +// node, respectively. // Returns an empty haplotype if there is only one node. +// Set `pos.first = gbwt::ENDMARKER` to start from the beginning. // Set `end = empty_gbwtgraph_handle()` to continue until the end without a final node. -std::string generate_haplotype(gbwt::edge_type pos, handle_t end, size_t start_max, size_t end_max, const gbwtgraph::GBWTGraph& graph) { - std::string haplotype; +std::vector generate_haplotype( + gbwt::size_type sequence_id, + gbwt::edge_type pos, handle_t end, size_t start_max, size_t end_max, + const HaplotypePartitioner& partitioner +) { + // Determine the start for a prefix. + bool single_fragment = (pos.first == gbwt::ENDMARKER || end == empty_gbwtgraph_handle()); + if (pos.first == gbwt::ENDMARKER) { + pos = partitioner.gbz.index.start(sequence_id); + } + + // Empty haplotype. 
+ std::vector haplotype; if (pos == gbwt::invalid_edge() || pos.first == gbwt::ENDMARKER) { return haplotype; } @@ -767,37 +963,55 @@ std::string generate_haplotype(gbwt::edge_type pos, handle_t end, size_t start_m if (curr == end) { return haplotype; } - gbwtgraph::view_type view = graph.get_sequence_view(curr); + gbwtgraph::view_type view = partitioner.gbz.graph.get_sequence_view(curr); size_t offset = (view.second > start_max ? view.second - start_max : 0); - haplotype.append(view.first + offset, view.second - offset); + haplotype.emplace_back(); + haplotype.back().append(view.first + offset, view.second - offset); while (true) { - pos = graph.index->LF(pos); + pos = partitioner.gbz.index.LF(pos); if (pos.first == gbwt::ENDMARKER) { - break; + if (single_fragment) { + break; + } + // Try to continue with the next non-empty fragment. + while (pos.first == gbwt::ENDMARKER) { + sequence_id = partitioner.fragment_map.oriented_next(sequence_id); + if (sequence_id == gbwt::invalid_sequence()) { + break; + } + pos = partitioner.gbz.index.start(sequence_id); + } + if (sequence_id == gbwt::invalid_sequence()) { + break; + } + haplotype.emplace_back(); } curr = gbwtgraph::GBWTGraph::node_to_handle(pos.first); - view = graph.get_sequence_view(curr); + view = partitioner.gbz.graph.get_sequence_view(curr); if (curr == end) { - haplotype.append(view.first, std::min(view.second, end_max)); + haplotype.back().append(view.first, std::min(view.second, end_max)); break; } else { - haplotype.append(view.first, view.second); + haplotype.back().append(view.first, view.second); } } return haplotype; } -// Return the sorted set of kmers that are minimizers in the sequence and have a single +// Return the sorted set of kmers that are minimizers in the sequences and have a single // occurrence in the graph. 
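generate_haplotype() above now returns one string per haplotype fragment instead of a single string. A trivial sketch of consuming that form (hypothetical helper, assuming the return type is std::vector<std::string>):

// Total sequence length over all fragments of a generated haplotype.
size_t total_haplotype_length(const std::vector<std::string>& haplotype) {
    size_t result = 0;
    for (const std::string& fragment : haplotype) {
        result += fragment.size();
    }
    return result;
}

Note that take_unique_minimizers() below runs the minimizer index on each fragment separately, so kmers that would span a fragment boundary are never collected.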
-std::vector take_unique_minimizers(const std::string& sequence, const HaplotypePartitioner::minimizer_index_type& minimizer_index) { +std::vector take_unique_minimizers( + const std::vector& sequences, const HaplotypePartitioner::minimizer_index_type& minimizer_index +) { std::vector result; - auto minimizers = minimizer_index.minimizers(sequence); - result.reserve(minimizers.size()); - for (auto& minimizer : minimizers) { - if (minimizer_index.count(minimizer) == 1) { - result.push_back(minimizer.key.get_key()); + for (const std::string& sequence : sequences) { + auto minimizers = minimizer_index.minimizers(sequence); + for (auto& minimizer : minimizers) { + if (minimizer_index.count(minimizer) == 1) { + result.push_back(minimizer.key.get_key()); + } } } gbwt::removeDuplicates(result, false); @@ -805,22 +1019,28 @@ std::vector take_unique_minimizers(const std::s } std::vector HaplotypePartitioner::unique_minimizers(gbwt::size_type sequence_id) const { - gbwt::edge_type pos = this->gbz.index.start(sequence_id); size_t limit = std::numeric_limits::max(); - std::string haplotype = generate_haplotype(pos, empty_gbwtgraph_handle(), limit, limit, this->gbz.graph); + std::vector haplotype = generate_haplotype( + sequence_id, + gbwt::edge_type(gbwt::ENDMARKER, 0), empty_gbwtgraph_handle(), limit, limit, + *this + ); return take_unique_minimizers(haplotype, this->minimizer_index); } -std::vector HaplotypePartitioner::unique_minimizers(sequence_type sequence, Subchain subchain) const { - gbwt::edge_type pos; +std::vector HaplotypePartitioner::unique_minimizers(sequence_type sequence, Subchain subchain, size_t& fragments) const { + gbwt::edge_type pos(gbwt::ENDMARKER, 0); size_t start_max = std::numeric_limits::max(), end_max = this->minimizer_index.k() - 1; if (subchain.has_start()) { pos = gbwt::edge_type(gbwtgraph::GBWTGraph::handle_to_node(subchain.start), sequence.second); start_max = this->minimizer_index.k() - 1; - } else { - pos = this->gbz.index.start(sequence.first); } - std::string haplotype = generate_haplotype(pos, subchain.end, start_max, end_max, this->gbz.graph); + std::vector haplotype = generate_haplotype( + sequence.first, + pos, subchain.end, start_max, end_max, + *this + ); + fragments = haplotype.size(); return take_unique_minimizers(haplotype, this->minimizer_index); } @@ -898,20 +1118,25 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain extra_snarls += subchain.extra_snarls; } } - #pragma omp critical - { - std::cerr << "Chain " << chain.offset << ": " << long_subchains << " long subchains (" - << with_extra_snarls << " with " << extra_snarls << " extra snarls)" << std::endl; + // The second condition should be redundant. + if (long_subchains > 0 || with_extra_snarls > 0) { + #pragma omp critical + { + std::cerr << "Chain " << chain.offset << " (" << output.contig_name << "): " << long_subchains << " long subchains (" + << with_extra_snarls << " with " << extra_snarls << " additional snarls)" << std::endl; + } } } // Convert the subchains to actual subchains. + size_t fragmented_subchains = 0, fragmented_sequences = 0, additional_fragments = 0; for (const Subchain& subchain : subchains) { std::vector>> to_process; auto sequences = this->get_sequences(subchain); if (sequences.empty()) { // There are no haplotypes crossing the subchain, so we break it into // a suffix and a prefix. + // NOTE: See the general get_sequences() for the asymmetry in handling prefixes and suffixes. 
to_process.push_back({ { Haplotypes::Subchain::suffix, subchain.start, empty_gbwtgraph_handle(), 0, 0 }, this->get_sequences(subchain.start) @@ -924,6 +1149,7 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain to_process.push_back({ subchain, std::move(sequences) }); } for (auto iter = to_process.begin(); iter != to_process.end(); ++iter) { + bool fragmented = false; output.subchains.push_back({ iter->first.type, gbwtgraph::GBWTGraph::handle_to_node(iter->first.start), gbwtgraph::GBWTGraph::handle_to_node(iter->first.end), @@ -933,13 +1159,30 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain std::vector> kmers_by_sequence; kmers_by_sequence.reserve(iter->second.size()); for (sequence_type sequence : iter->second) { - kmers_by_sequence.emplace_back(this->unique_minimizers(sequence, iter->first)); + size_t fragments = 0; + kmers_by_sequence.emplace_back(this->unique_minimizers(sequence, iter->first, fragments)); + if (fragments > 1) { + fragmented = true; + fragmented_sequences++; + additional_fragments += fragments - 1; + } } present_kmers(kmers_by_sequence, subchain.kmers, subchain.kmer_counts, subchain.kmers_present); subchain.sequences = std::vector(iter->second.size()); for (size_t i = 0; i < iter->second.size(); i++) { subchain.sequences[i] = Haplotypes::compact_sequence_type(iter->second[i].first, iter->second[i].second); } + if (fragmented) { + fragmented_subchains++; + } + } + } + + if (fragmented_subchains > 0 && this->verbosity >= Haplotypes::verbosity_debug) { + #pragma omp critical + { + std::cerr << "Chain " << chain.offset << " (" << output.contig_name << "): " << fragmented_subchains << " fragmented subchains (" + << fragmented_sequences << " fragmented sequences with " << additional_fragments << " additional fragments)" << std::endl; } } @@ -983,6 +1226,13 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain struct RecombinatorHaplotype { typedef Recombinator::sequence_type sequence_type; + // Recombinator instance we are using. + const Recombinator& recombinator; + + // GBWT builder and metadata builder used by this job. + gbwt::GBWTBuilder& builder; + gbwtgraph::MetadataBuilder& metadata; + // Contig name in GBWT metadata. const std::string& contig_name; @@ -1005,9 +1255,21 @@ struct RecombinatorHaplotype { // The path being generated. gbwt::vector_type path; + // Constructor that starts a new haplotype with the given contig name and identifier. + RecombinatorHaplotype( + const Recombinator& recombinator, + gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata, + const std::string& contig_name, size_t id + ) : + recombinator(recombinator), + builder(builder), metadata(metadata), + contig_name(contig_name), id(id), fragment(0), + sequence_id(gbwt::invalid_sequence()), position(gbwt::invalid_edge()) {} + /* * Extends the haplotype over the given subchain by using the given - * original haplotype. + * original haplotype. If the haplotype is fragmented, this may + * finish the current fragment and start a new one. * * This assumes that the original haplotype crosses the subchain. * @@ -1019,117 +1281,120 @@ struct RecombinatorHaplotype { * take the prefix of the original original haplotype until the start * of the subchain. 
*/ - void extend( - sequence_type sequence, const Haplotypes::Subchain& subchain, const Recombinator& recombinator, - gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata - ); + void extend(sequence_type sequence, const Haplotypes::Subchain& subchain); // Takes an existing haplotype from the GBWT index and inserts it into - /// the builder. This is intended for fragments that do not contain - /// subchains crossed by the original haplotypes. The call will fail if - /// `extend()` has been called. - void take( - gbwt::size_type sequence_id, const Recombinator& recombinator, - gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata - ); + // the builder. This is intended for fragments that do not contain + // subchains crossed by the original haplotypes. The call will fail if + // `extend()` has been called. + void take(gbwt::size_type sequence_id); - // Extends the original haplotype from the latest `extend()` call until - // the end, inserts it into the builder, and starts a new fragment. - // The call will fail if `extend()` has not been called for this - // fragment. - void finish(const Recombinator& recombinator, gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata); + // Finishes the current haplotype fragment and starts a new one. + // If `with_suffix` is true, the current fragment will be extended + // until the end of the corresponding path. In that case, the call will + // fail if `extend()` has not been called for this fragment. + void finish(bool with_suffix); private: // Extends the haplotype over a unary path from a previous subchain. - void connect(gbwt::node_type until, const gbwtgraph::GBWTGraph& graph); + void connect(gbwt::node_type until); - // Takes a prefix of a sequence. - void prefix(gbwt::size_type sequence_id, gbwt::node_type until, const gbwt::GBWT& index); + // Takes a prefix of the current sequence until `until`, assuming that + // `sequence_id` has been set. + void prefix(gbwt::node_type until); // Extends the haplotype from the previous subchain until the end. - void suffix(const gbwt::GBWT& index); + void suffix(); // Inserts the current fragment into the builder. - void insert(gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata); + void insert(); }; -void RecombinatorHaplotype::extend( - sequence_type sequence, const Haplotypes::Subchain& subchain, const Recombinator& recombinator, - gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata -) { +void RecombinatorHaplotype::extend(sequence_type sequence, const Haplotypes::Subchain& subchain) { if (subchain.type == Haplotypes::Subchain::full_haplotype) { throw std::runtime_error("Haplotype::extend(): cannot extend a full haplotype"); } + if (sequence.first == gbwt::invalid_sequence()) { + throw std::runtime_error("Haplotype::extend(): invalid sequence id"); + } + this->sequence_id = sequence.first; + if (subchain.type == Haplotypes::Subchain::prefix) { if (!this->path.empty()) { throw std::runtime_error("Haplotype::extend(): got a prefix subchain after the start of a fragment"); } - this->prefix(sequence.first, subchain.end, recombinator.gbz.index); + this->prefix(subchain.end); return; } // Suffixes and normal subchains have a start node, so we must reach it first. 
if (!this->path.empty()) { - this->connect(subchain.start, recombinator.gbz.graph); + this->connect(subchain.start); } else { - this->prefix(sequence.first, subchain.start, recombinator.gbz.index); + this->prefix(subchain.start); } - gbwt::edge_type curr(subchain.start, sequence.second); - if (!recombinator.gbz.index.contains(curr)) { - throw std::runtime_error("Haplotype::extend(): the GBWT index does not contain position (" + std::to_string(curr.first) + ", " + std::to_string(curr.second) + ")"); + const gbwt::GBWT& index = this->recombinator.gbz.index; + this->position = gbwt::edge_type(subchain.start, sequence.second); + if (!index.contains(this->position)) { + throw std::runtime_error("Haplotype::extend(): the GBWT index does not contain position (" + std::to_string(this->position.first) + ", " + std::to_string(this->position.second) + ")"); } if (subchain.type == Haplotypes::Subchain::suffix) { - this->position = curr; - this->finish(recombinator, builder, metadata); + this->finish(true); return; } // This is a normal subchain. - while (curr.first != subchain.end) { - curr = recombinator.gbz.index.LF(curr); - if (curr.first == gbwt::ENDMARKER) { - throw std::runtime_error("Haplotype::extend(): the sequence did not reach the end of the subchain at GBWT node " + std::to_string(subchain.end)); + while (this->position.first != subchain.end) { + this->position = index.LF(this->position); + if (this->position.first == gbwt::ENDMARKER) { + gbwt::size_type prev = this->sequence_id; + this->finish(false); + do { + // Find the next non-empty fragment. + this->sequence_id = this->recombinator.fragment_map.oriented_next(prev); + if (this->sequence_id == gbwt::invalid_sequence()) { + std::string msg = "Haplotype::extend(): no successor for GBWT sequence " + std::to_string(prev); + throw std::runtime_error(msg); + } + this->position = index.start(this->sequence_id); + prev = this->sequence_id; + } while (this->position.first == gbwt::ENDMARKER); } - this->path.push_back(curr.first); + this->path.push_back(this->position.first); } - this->sequence_id = sequence.first; - this->position = curr; } -void RecombinatorHaplotype::take( - gbwt::size_type sequence_id, const Recombinator& recombinator, - gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata -) { +void RecombinatorHaplotype::take(gbwt::size_type sequence_id) { + const gbwt::GBWT& index = this->recombinator.gbz.index; if (!this->path.empty()) { throw std::runtime_error("Haplotype::take(): the current fragment is not empty"); } - if (sequence_id >= recombinator.gbz.index.sequences()) { + if (sequence_id >= index.sequences()) { throw std::runtime_error("Haplotype::take(): the GBWT index does not contain sequence " + std::to_string(sequence_id)); } - this->path = recombinator.gbz.index.extract(sequence_id); - this->insert(builder, metadata); - this->fragment++; - this->sequence_id = gbwt::invalid_sequence(); - this->position = gbwt::invalid_edge(); - this->path.clear(); + this->path = index.extract(sequence_id); + this->finish(false); } -void RecombinatorHaplotype::finish(const Recombinator& recombinator, gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata) { - if (this->position == gbwt::invalid_edge()) { - throw std::runtime_error("Haplotype::finish(): there is no current position"); +void RecombinatorHaplotype::finish(bool with_suffix) { + if (with_suffix) { + if (this->position == gbwt::invalid_edge()) { + throw std::runtime_error("Haplotype::finish(): there is no current position"); + } + this->suffix(); } - 
this->suffix(recombinator.gbz.index); - this->insert(builder, metadata); + this->insert(); this->fragment++; this->sequence_id = gbwt::invalid_sequence(); this->position = gbwt::invalid_edge(); this->path.clear(); } -void RecombinatorHaplotype::connect(gbwt::node_type until, const gbwtgraph::GBWTGraph& graph) { +void RecombinatorHaplotype::connect(gbwt::node_type until) { + const gbwtgraph::GBWTGraph& graph = this->recombinator.gbz.graph; handle_t curr = gbwtgraph::GBWTGraph::node_to_handle(this->position.first); handle_t end = gbwtgraph::GBWTGraph::node_to_handle(until); this->position = gbwt::invalid_edge(); @@ -1153,32 +1418,32 @@ void RecombinatorHaplotype::connect(gbwt::node_type until, const gbwtgraph::GBWT } } -void RecombinatorHaplotype::prefix(gbwt::size_type sequence_id, gbwt::node_type until, const gbwt::GBWT& index) { +void RecombinatorHaplotype::prefix(gbwt::node_type until) { + const gbwt::GBWT& index = this->recombinator.gbz.index; this->position = gbwt::invalid_edge(); - if (sequence_id >= index.sequences()) { + if (this->sequence_id >= index.sequences()) { throw std::runtime_error("Haplotype::prefix(): invalid GBWT sequence id " + std::to_string(sequence_id)); } - this->sequence_id = sequence_id; - for (gbwt::edge_type curr = index.start(sequence_id); curr.first != gbwt::ENDMARKER; curr = index.LF(curr)) { - this->path.push_back(curr.first); - if (curr.first == until) { - this->position = curr; + for (this->position = index.start(this->sequence_id); this->position.first != gbwt::ENDMARKER; this->position = index.LF(this->position)) { + this->path.push_back(this->position.first); + if (this->position.first == until) { return; } } - throw std::runtime_error("Haplotype::prefix(): GBWT sequence " + std::to_string(sequence_id) + " did not reach GBWT node " + std::to_string(until)); + throw std::runtime_error("Haplotype::prefix(): GBWT sequence " + std::to_string(this->sequence_id) + " did not reach GBWT node " + std::to_string(until)); } -void RecombinatorHaplotype::suffix(const gbwt::GBWT& index) { +void RecombinatorHaplotype::suffix() { + const gbwt::GBWT& index = this->recombinator.gbz.index; for (gbwt::edge_type curr = index.LF(this->position); curr.first != gbwt::ENDMARKER; curr = index.LF(curr)) { this->path.push_back(curr.first); } } -void RecombinatorHaplotype::insert(gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata) { +void RecombinatorHaplotype::insert() { std::string sample_name = "recombination"; // TODO: Make this a static class variable. - metadata.add_haplotype(sample_name, this->contig_name, this->id, this->fragment); - builder.insert(this->path, true); + this->metadata.add_haplotype(sample_name, this->contig_name, this->id, this->fragment); + this->builder.insert(this->path, true); } //------------------------------------------------------------------------------ @@ -1189,50 +1454,83 @@ void RecombinatorHaplotype::insert(gbwt::GBWTBuilder& builder, gbwtgraph::Metada * GBWT metadata will be set as following: * * * Sample name is "fragment". - * * Contig name is taken from the top-level chain. + * * Contig name is "contig_N", with the name taken from the top-level chain and the number from the subchain. * * Haplotype identifier is set during construction. - * * Fragment identifier is the identifier of the subchain. + * * Fragment identifier is used if the fragment itself is fragmented. */ struct RecombinatorFragment { + // Sequence identifier for the start of the fragment. + gbwt::size_type sequence_id; + // GBWT starting position (inclusive). 
gbwt::edge_type from; // GBWT ending position (inclusive). gbwt::node_type to; - // Haplotype identifier. - size_t haplotype; + // Identifier of the subchain. + size_t subchain_id; - // Fragment / subchain identifier. - size_t fragment; + // Haplotype identifier. + size_t haplotype_id; - RecombinatorFragment(gbwt::edge_type from, gbwt::node_type to, size_t haplotype, size_t fragment) : - from(from), to(to), haplotype(haplotype), fragment(fragment) {} + RecombinatorFragment(gbwt::size_type sequence_id, gbwt::edge_type from, gbwt::node_type to, size_t subchain_id, size_t haplotype_id) : + sequence_id(sequence_id), from(from), to(to), subchain_id(subchain_id), haplotype_id(haplotype_id) {} - // Generates the GBWT path and the metadata for the fragment. - void generate( - const gbwt::GBWT& index, + // Generates the GBWT path(s) and the metadata for the fragment. + // Returns the number of paths generated. + size_t generate( + const gbwt::GBWT& index, const gbwt::FragmentMap& fragment_map, gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata, const std::string& contig_name ) const; }; -void RecombinatorFragment::generate( - const gbwt::GBWT& index, +size_t RecombinatorFragment::generate( + const gbwt::GBWT& index, const gbwt::FragmentMap& fragment_map, gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata, const std::string& contig_name ) const { gbwt::vector_type path; - for (gbwt::edge_type curr = this->from; curr.first != gbwt::ENDMARKER; curr = index.LF(curr)) { - path.push_back(curr.first); - if (curr.first == this->to) { + std::string sample = "fragment"; // TODO: Make this a static class variable. + std::string contig = contig_name + "_" + std::to_string(this->subchain_id); + size_t fragment = 0; + auto finish_fragment = [&]() { + if (!path.empty()) { + metadata.add_haplotype(sample, contig, this->haplotype_id, fragment); + builder.insert(path, true); + fragment++; + path.clear(); + } + }; + + size_t sequence = this->sequence_id; + gbwt::edge_type pos = this->from; + while (true) { + if (pos.first == gbwt::ENDMARKER) { + if (this->to == gbwt::ENDMARKER) { + // We are in a suffix subchain, so we are not interested in subsequent fragments. + break; + } + do { + finish_fragment(); + sequence = fragment_map.oriented_next(sequence); + if (sequence == gbwt::invalid_sequence()) { + std::string msg = "RecombinatorFragment::generate(): no successor for GBWT sequence " + std::to_string(this->sequence_id); + throw std::runtime_error(msg); + } + pos = index.start(sequence); + } while (pos.first == gbwt::ENDMARKER); + } + path.push_back(pos.first); + if (pos.first == this->to) { break; } + pos = index.LF(pos); } + finish_fragment(); - std::string sample_name = "fragment"; // TODO: Make this a static class variable. 
- metadata.add_haplotype(sample_name, contig_name, this->haplotype, this->fragment); - builder.insert(path, true); + return fragment; } //------------------------------------------------------------------------------ @@ -1274,8 +1572,13 @@ std::ostream& Recombinator::Statistics::print(std::ostream& out) const { //------------------------------------------------------------------------------ Recombinator::Recombinator(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, Verbosity verbosity) : - gbz(gbz), haplotypes(haplotypes), verbosity(verbosity) + gbz(gbz), haplotypes(haplotypes), fragment_map(gbz.index.metadata, verbosity >= Haplotypes::verbosity_extra_debug), verbosity(verbosity) { + if (this->verbosity >= Haplotypes::verbosity_detailed) { + std::cerr << "Recombinator: " << this->gbz.index.metadata.paths() << " fragments for " << this->fragment_map.size() << " haplotype sequences" << std::endl; + } + + this->jobs_for_cached_paths = this->haplotypes.assign_reference_paths(this->gbz, this->fragment_map, this->verbosity); } //------------------------------------------------------------------------------ @@ -1459,7 +1762,7 @@ gbwt::GBWT Recombinator::generate_haplotypes(const std::string& kff_file, const std::vector> reference_paths(this->haplotypes.jobs()); if (parameters.include_reference) { for (size_t i = 0; i < this->gbz.graph.named_paths.size(); i++) { - size_t job_id = this->haplotypes.jobs_for_cached_paths[i]; + size_t job_id = this->jobs_for_cached_paths[i]; if (job_id < this->haplotypes.jobs()) { reference_paths[job_id].push_back(this->gbz.graph.named_paths[i].id); } @@ -1580,43 +1883,6 @@ std::vector> classify_kmers( return kmer_types; } -std::vector Recombinator::classify_kmers( - const std::string& kff_file, const Recombinator::Parameters& parameters -) const { - // Get kmer counts (may throw) and determine coverage. - hash_map counts = this->haplotypes.kmer_counts(kff_file, this->verbosity); - double coverage = get_or_estimate_coverage(counts, parameters, this->verbosity); - - // Classify the kmers in each subchain. - std::vector classifications; - classifications.reserve(this->haplotypes.kmers()); - for (const auto& chain : this->haplotypes.chains) { - for (const auto& subchain : chain.subchains) { - std::vector> kmer_types = vg::classify_kmers( - subchain, counts, coverage, nullptr, parameters - ); - for (const auto& type : kmer_types) { - switch (type.first) { - case Recombinator::absent: - classifications.push_back('A'); - break; - case Recombinator::heterozygous: - classifications.push_back('H'); - break; - case Recombinator::present: - classifications.push_back('P'); - break; - case Recombinator::frequent: - classifications.push_back('F'); - break; - } - } - } - } - - return classifications; -} - //------------------------------------------------------------------------------ // Select the best pair of haplotypes from the candidates. Each haplotype gets @@ -1628,15 +1894,17 @@ std::vector> select_diploid( const Haplotypes::Subchain& subchain, const std::vector>& candidates, const std::vector>& kmer_types, - size_t original_selected, const Recombinator::Parameters& parameters ) { std::int64_t best_score = std::numeric_limits::min(); - size_t best_left = 0, best_right = 1; + size_t best_left = 0, best_right = 0; + // We now consider taking the same haplotype twice. If this is a bad subchain + // and we sample extra fragments, the number of fragments will depend on + // whether we take the same haplotype twice. 
for (size_t left = 0; left < candidates.size(); left++) { size_t left_offset = candidates[left].first * subchain.kmers.size(); - for (size_t right = left + 1; right < candidates.size(); right++) { + for (size_t right = left; right < candidates.size(); right++) { std::int64_t score = 0; size_t right_offset = candidates[right].first * subchain.kmers.size(); for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) { @@ -1664,7 +1932,7 @@ std::vector> select_diploid( } // If this is a bad subchain, we move the selected haplotypes to the front - // and return the remaining non-duplicated haplotypes as extra fragments. + // and return the rest as extra fragments. // Otherwise we return only the selected haplotypes. if (parameters.extra_fragments) { double badness = subchain.badness(gbz); @@ -1672,7 +1940,7 @@ std::vector> select_diploid( std::vector> result; result.push_back(candidates[best_left]); result.push_back(candidates[best_right]); - for (size_t i = 0; i < original_selected; i++) { + for (size_t i = 0; i < candidates.size(); i++) { if (i != best_left && i != best_right) { result.push_back(candidates[i]); } @@ -1762,19 +2030,17 @@ std::vector> select_haplotypes( } } - // If we did not have enough haplotypes in the subchain, repeat them as necessary. - size_t original_selected = selected_haplotypes.size(); - for (size_t i = original_selected; i < parameters.num_haplotypes; i++) { - auto next = selected_haplotypes[i % original_selected]; - selected_haplotypes.push_back(next); - } - - // Do diploid sampling. If this is a bad subchain, we also return the - // extra fragments starting from the third haplotype. But we don't return - // duplicated ones. if (parameters.diploid_sampling) { - return select_diploid(gbz, subchain, selected_haplotypes, kmer_types, original_selected, parameters); + // Do diploid sampling. If this is a bad subchain, we also return the + // extra fragments starting from the third haplotype. + return select_diploid(gbz, subchain, selected_haplotypes, kmer_types, parameters); } else { + // If we did not have enough haplotypes in the subchain, repeat them as necessary. + size_t original_selected = selected_haplotypes.size(); + for (size_t i = original_selected; i < parameters.num_haplotypes; i++) { + auto next = selected_haplotypes[i % original_selected]; + selected_haplotypes.push_back(next); + } return selected_haplotypes; } } @@ -1788,7 +2054,7 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top size_t final_haplotypes = (parameters.diploid_sampling ? 
2 : parameters.num_haplotypes); std::vector haplotypes; for (size_t i = 0; i < final_haplotypes; i++) { - haplotypes.push_back({ chain.contig_name, i + 1, 0, gbwt::invalid_sequence(), gbwt::invalid_edge(), {} }); + haplotypes.emplace_back(*this, builder, metadata, chain.contig_name, i + 1); } Statistics statistics; @@ -1800,7 +2066,7 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top for (size_t haplotype = 0; haplotype < haplotypes.size(); haplotype++) { assert(!subchain.sequences.empty()); size_t seq = haplotype % subchain.sequences.size(); - haplotypes[haplotype].take(subchain.sequences[seq].first, *this, builder, metadata); + haplotypes[haplotype].take(subchain.sequences[seq].first); } statistics.full_haplotypes = 1; } else { @@ -1820,11 +2086,17 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top ); if (parameters.diploid_sampling && selected_haplotypes.size() > 2) { for (size_t i = 2; i < selected_haplotypes.size(); i++) { - gbwt::edge_type start(subchain.start, subchain.sequences[selected_haplotypes[i].first].second); - extra_fragments.emplace_back(start, subchain.end, i - 1, subchain_id); + gbwt::size_type sequence_id = subchain.sequences[selected_haplotypes[i].first].first; + gbwt::edge_type start; + if (subchain.has_start()) { + start = gbwt::edge_type(subchain.start, subchain.sequences[selected_haplotypes[i].first].second); + } else { + // This is a prefix. + start = this->gbz.index.start(sequence_id); + } + extra_fragments.emplace_back(sequence_id, start, subchain.end, subchain_id, i - 1); } statistics.bad_subchains++; - statistics.extra_fragments += selected_haplotypes.size() - 2; selected_haplotypes.resize(2); } @@ -1864,21 +2136,27 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top size_t selected = haplotype_to_selected[haplotype]; size_t seq_offset = selected_haplotypes[selected].first; statistics.score += selected_haplotypes[selected].second; - haplotypes[haplotype].extend(subchain.sequences[seq_offset], subchain, *this, builder, metadata); + haplotypes[haplotype].extend(subchain.sequences[seq_offset], subchain); } have_haplotypes = subchain.has_end(); statistics.subchains++; } if (have_haplotypes) { for (size_t haplotype = 0; haplotype < haplotypes.size(); haplotype++) { - haplotypes[haplotype].finish(*this, builder, metadata); + haplotypes[haplotype].finish(true); } } - statistics.fragments = haplotypes.front().fragment; + + // Each haplotype should consist of at least one fragment. + statistics.fragments = 0; + for (const RecombinatorHaplotype& haplotype : haplotypes) { + statistics.fragments += haplotype.fragment; + } // Add the extra fragments as separate paths. + statistics.extra_fragments = 0; for (const RecombinatorFragment& fragment : extra_fragments) { - fragment.generate(this->gbz.index, builder, metadata, chain.contig_name); + statistics.extra_fragments += fragment.generate(this->gbz.index, fragment_map, builder, metadata, chain.contig_name); } } @@ -1887,50 +2165,4 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top //------------------------------------------------------------------------------ -std::vector Recombinator::extract_sequences( - const std::string& kff_file, size_t chain_id, size_t subchain_id, const Parameters& parameters -) const { - // Sanity checks. 
- if (chain_id >= this->haplotypes.chains.size()) { - std::string msg = "Recombinator::extract_sequences(): invalid chain id " + std::to_string(chain_id); - throw std::runtime_error(msg); - } - if (subchain_id >= this->haplotypes.chains[chain_id].subchains.size()) { - std::string msg = "Recombinator::extract_sequences(): invalid subchain id " + std::to_string(subchain_id) + - " in chain " + std::to_string(chain_id); - throw std::runtime_error(msg); - } - recombinator_sanity_checks(parameters); - - // Extract the haplotypes. - const Haplotypes::Subchain& subchain = this->haplotypes.chains[chain_id].subchains[subchain_id]; - std::vector result(subchain.sequences.size()); - for (size_t i = 0; i < subchain.sequences.size(); i++) { - size_t path_id = gbwt::Path::id(subchain.sequences[i].first); - path_handle_t path_handle = this->gbz.graph.path_to_handle(path_id); - result[i].name = this->gbz.graph.get_path_name(path_handle); - - gbwt::edge_type pos; - if (subchain.has_start()) { - pos = gbwt::edge_type(subchain.start, subchain.sequences[i].second); - } else { - pos = this->gbz.index.start(subchain.sequences[i].first); - } - handle_t until = gbwtgraph::GBWTGraph::node_to_handle(subchain.end); - size_t limit = std::numeric_limits::max(); - result[i].sequence = generate_haplotype(pos, until, limit, limit, this->gbz.graph); - } - - // Get kmer counts (may throw) and determine coverage. - hash_map counts = this->haplotypes.kmer_counts(kff_file, this->verbosity); - double coverage = get_or_estimate_coverage(counts, parameters, this->verbosity); - - // Fill in the scores. - select_haplotypes(this->gbz, subchain, counts, coverage, nullptr, &result, parameters); - - return result; -} - -//------------------------------------------------------------------------------ - } // namespace vg diff --git a/src/recombinator.hpp b/src/recombinator.hpp index b8969917fd..6799ee8f66 100644 --- a/src/recombinator.hpp +++ b/src/recombinator.hpp @@ -1,7 +1,7 @@ #ifndef VG_RECOMBINATOR_HPP_INCLUDED #define VG_RECOMBINATOR_HPP_INCLUDED -/** \file +/** \file recombinator.hpp * Tools for generating synthetic haplotypes as recombinations of existing * haplotypes. */ @@ -37,6 +37,9 @@ namespace vg { * * Versions: * + * * Version 4: Subchains can have fragmented haplotypes instead of a single + * GBWT sequence always crossing from start to end. Compatible with version 3. + * * * Version 3: Subchains use smaller integers when possible. Compatible with * version 2. * @@ -58,13 +61,16 @@ class Haplotypes { verbosity_detailed = 2, /// Basic information, detailed statistics, and debug information. - verbosity_debug = 3 + verbosity_debug = 3, + + /// Hidden level; potentially tens of thousands of lines of debugging information. + verbosity_extra_debug = 4 }; /// Header of the serialized file. struct Header { constexpr static std::uint32_t MAGIC_NUMBER = 0x4C504148; // "HAPL" - constexpr static std::uint32_t VERSION = 3; + constexpr static std::uint32_t VERSION = 4; constexpr static std::uint32_t MIN_VERSION = 1; constexpr static std::uint64_t DEFAULT_K = 29; @@ -132,7 +138,7 @@ class Haplotypes { /// Sequences as (GBWT sequence id, offset in the relevant node). std::vector sequences; - // TODO v3: Use an extra bit for each sequence to mark whether the presence for that sequence + // TODO v5: Use an extra bit for each sequence to mark whether the presence for that sequence // is stored explicitly or relative to the last explicit sequence. // We need to cluster the sequences by similarity and store the clusters consecutively. 
// And then use sd_vector for the sequences with relative presence. @@ -231,7 +237,9 @@ class Haplotypes { Header header; + // TODO v5: Remove this. // Job ids for each cached path in the GBWTGraph, or `jobs()` if the path is empty. + // This is no longer in use. std::vector jobs_for_cached_paths; std::vector chains; @@ -254,6 +262,14 @@ class Haplotypes { /// Returns the size of the object in elements. size_t simple_sds_size() const; + + /** + * Assigns each reference and generic path in the graph to a GBWT construction job. + * + * For each path handle from 0 to gbz.named_paths() - 1, we assign the path to + * the given construction job, or jobs() if the path is empty. + */ + std::vector assign_reference_paths(const gbwtgraph::GBZ& gbz, const gbwt::FragmentMap& fragment_map, Verbosity verbosity) const; }; //------------------------------------------------------------------------------ @@ -355,10 +371,12 @@ class HaplotypePartitioner { * Each top-level chain is partitioned into subchains that consist of one or * more snarls. Multiple snarls are combined into the same subchain if the * minimum distance over the subchain is at most the target length and there - * are GBWT haplotypes that cross the subchain. We also keep extending the - * subchain if a haplotype would cross the end in both directions. By doing - * this, we can avoid sequence loss with haplotypes reversing their direction, - * while keeping kmers specific to each subchain. + * are GBWT haplotypes that cross the subchain. + * + * With the right option, we keep extending the subchain if a haplotype would + * cross the end in both directions. By doing this, we can avoid sequence loss + * with haplotypes reversing their direction, while keeping kmers specific to + * each subchain. * * If there are no snarls in a top-level chain, it is represented as a single * subchain without boundary nodes. @@ -372,6 +390,7 @@ class HaplotypePartitioner { Haplotypes partition_haplotypes(const Parameters& parameters) const; const gbwtgraph::GBZ& gbz; + gbwt::FragmentMap fragment_map; const gbwt::FastLocate& r_index; const SnarlDistanceIndex& distance_index; const minimizer_index_type& minimizer_index; @@ -388,29 +407,34 @@ class HaplotypePartitioner { // Partition the top-level chain into subchains. std::vector get_subchains(const gbwtgraph::TopLevelChain& chain, const Parameters& parameters) const; - // Return (SA[i], i) for all GBWT sequences visiting a handle, sorted by sequence id - // and the number of the visit. - std::vector get_sequence_visits(handle_t handle) const; - - // Return (DA[i], i) for all GBWT sequences visiting a handle, sorted by sequence id. + // Return (DA[i], i) for all GBWT sequences visiting a handle, sorted by sequence id + // and the rank of the visit for the same sequence. std::vector get_sequences(handle_t handle) const; - // Get all GBWT sequences crossing the subchain. The sequences will be at - // start for normal subchains and suffixes and at end for prefixes. + // Get all GBWT sequences crossing the subchain. + // + // * If the subchain is a prefix (suffix), the sequences will be at the end + // (start) of the subchain. + // * If the subchain is normal, the sequences will be at the start and + // correspond to minimal end-to-end visits to the subchain. A sequence + // that ends within the subchain may be selected if subsequent fragments + // of the same haplotype remain within the subchain and reach the end. 
std::vector get_sequences(Subchain subchain) const; // Return the sorted set of kmers that are minimizers in the sequence and have // a single occurrence in the graph. std::vector unique_minimizers(gbwt::size_type sequence_id) const; - // Count the number of minimizers in the sequence over the subchain with a single - // occurrence in the graph. Return the sorted set of kmers that are minimizers in - // the sequence over the subchain and have a single occurrence in the graph. + // Returns the sorted set of kmers that are minimizers in the sequence over the + // subchain and have a single occurrence in the graph. If the sequence does not + // reach the end of the subchain, this will try to continue with the next fragment(s). + // + // Also reports the number of fragments that were used to generate the kmers. // // To avoid using kmers shared between all haplotypes in the subchain, and // potentially with neighboring subchains, this does not include kmers contained // entirely in the shared initial/final nodes. - std::vector unique_minimizers(sequence_type sequence, Subchain subchain) const; + std::vector unique_minimizers(sequence_type sequence, Subchain subchain, size_t& fragments) const; // Build subchains for a specific top-level chain. void build_subchains(const gbwtgraph::TopLevelChain& chain, Haplotypes::TopLevelChain& output, const Parameters& parameters) const; @@ -470,10 +494,11 @@ class Recombinator { /// Number of subchains exceeding the badness threshold. size_t bad_subchains = 0; - /// Number of fragments. + /// Total number of fragments in the generated haplotypes. size_t fragments = 0; /// Number of top-level chains where full haplotypes were taken. + /// These are not counted as fragments. size_t full_haplotypes = 0; /// Number of haplotypes generated. @@ -482,7 +507,7 @@ class Recombinator { /// Number of additional haplotype fragments in bad subchains. size_t extra_fragments = 0; - /// Number of times a haplotype was extended from a subchain to the next subchain. + /// Number of times the same haplotype was extended from a subchain to the next subchain. size_t connections = 0; /// Number of reference paths included. @@ -549,7 +574,7 @@ class Recombinator { /// Include named and reference paths. bool include_reference = false; - // TODO: Should be use extra_fragments? + // TODO: Should we use extra_fragments? /// Preset parameters for common use cases. enum preset_t { /// Default parameters. @@ -574,9 +599,10 @@ class Recombinator { * (component). * * Each generated haplotype has a single source haplotype in each subchain. - * The subchains are connected by unary paths. Suffix / prefix subchains in - * the middle of a chain create fragment breaks. If the chain starts without - * a prefix (ends without a suffix), the haplotype chosen for the first (last) + * The source haplotype may consist of multiple fragments. Subchains are + * connected by unary paths. Suffix / prefix subchains in the middle of a chain create + * fragment breaks in every haplotype. If the chain starts without a prefix + * (ends without a suffix), the haplotype chosen for the first (last) + * subchain is used from the start (continued until the end). * * Throws `std::runtime_error` on error in single-threaded parts and exits @@ -600,32 +626,15 @@ class Recombinator { /// Kmer classification. enum kmer_presence { absent, heterozygous, present, frequent }; - /** - * Classifies the kmers used for describing the haplotypes according to - * their frequency in the KFF file.
Uses `A`, `H`, `P`, and `F` to represent - * absent, heterozygous, present, and frequent kmers, respectively. - * - * Throws `std::runtime_error` on error. - */ - std::vector classify_kmers(const std::string& kff_file, const Parameters& parameters) const; - - /** - * Extracts the local haplotypes in the given subchain. In addition to the - * haplotype sequence, this also reports the name of the corresponding path - * as well as (rank, score) for the haplotype in each round of haplotype - * selection. The number of rounds is `parameters.num_haplotypes`, but if - * the haplotype is selected earlier, it will not get further scores. - * - * Throws `std::runtime_error` on error. - */ - std::vector extract_sequences( - const std::string& kff_file, size_t chain_id, size_t subchain_id, const Parameters& parameters - ) const; - const gbwtgraph::GBZ& gbz; const Haplotypes& haplotypes; + gbwt::FragmentMap fragment_map; Verbosity verbosity; + // This used to be in the Haplotypes object. For each path handle from 0 to gbz.named_paths() - 1, + // this stores the job id for the corresponding generic/reference path. + std::vector jobs_for_cached_paths; + private: // Generate haplotypes for the given chain. Statistics generate_haplotypes(const Haplotypes::TopLevelChain& chain, diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index 4eca4e47a8..e7d2609187 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -158,7 +158,7 @@ void validate_gbwt_config(GBWTConfig& config); void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); void step_2_merge_gbwts(GBWTHandler& gbwts, GBWTConfig& config); -void step_3_alter_gbwt(GBWTHandler& gbwts, GBWTConfig& config); +void step_3_alter_gbwt(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); void step_4_path_cover(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); void step_5_gbwtgraph(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); void step_6_r_index(GBWTHandler& gbwts, GBWTConfig& config); @@ -195,7 +195,7 @@ int main_gbwt(int argc, char** argv) { // Edit the GBWT (remove samples, apply tags). if (!config.to_remove.empty() || !config.tags_to_set.empty()) { - step_3_alter_gbwt(gbwts, config); + step_3_alter_gbwt(gbwts, graphs, config); } // Path cover construction. @@ -1501,28 +1501,38 @@ void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) { report_time_memory("Samples removed", start, config); } -void set_tags(GBWTHandler& gbwts, GBWTConfig& config) { +void set_tags(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config) { double start = gbwt::readTimer(); if (config.show_progress) { std::cerr << "Setting " << config.tags_to_set.size() << " tags on the GBWT" << std::endl; } gbwts.use_compressed(); + bool set_reference_samples = false; for (auto& kv : config.tags_to_set) { - gbwts.compressed.tags.set(kv.first, kv.second); + gbwts.compressed.tags.set(kv.first, kv.second); + if (kv.first == gbwtgraph::REFERENCE_SAMPLE_LIST_GBWT_TAG) { + set_reference_samples = true; + } } // We modified the GBWT (we assume some tags got set) gbwts.unbacked(); - + + // If we updated reference samples and have an existing GBWTGraph, we need to recache the named paths. + // This is because we may want to include the reference paths in a path cover GBWT. 
+ if (set_reference_samples && (graphs.in_use == GraphHandler::graph_gbwtgraph || graphs.in_use == GraphHandler::graph_gbz)) { + graphs.gbwt_graph->set_gbwt(gbwts.compressed); + } + report_time_memory("Tags set", start, config); } -void step_3_alter_gbwt(GBWTHandler& gbwts, GBWTConfig& config) { +void step_3_alter_gbwt(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config) { if (!config.to_remove.empty()) { remove_samples(gbwts, config); } if (!config.tags_to_set.empty()) { - set_tags(gbwts, config); + set_tags(gbwts, graphs, config); } } diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 05a204fdf8..e1395d8e64 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -665,7 +665,12 @@ std::string strip_suffixes(std::string filename, const std::vector& } // Returns the name of the sampled GBZ. -string sample_haplotypes(const vector>& indexes, string& basename, string& sample_name, string& haplotype_file, string& kff_file, bool progress); +std::string sample_haplotypes( + const std::vector>& indexes, + const std::unordered_set& reference_samples, + const std::string& basename, const std::string& sample_name, const std::string& haplotype_file, const std::string& kff_file, + bool progress +); //---------------------------------------------------------------------------- @@ -711,7 +716,8 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map reference_samples; string output_basename; string report_name; @@ -1116,6 +1124,7 @@ int main_giraffe(int argc, char** argv) { {"haplotype-name", required_argument, 0, OPT_HAPLOTYPE_NAME}, {"kff-name", required_argument, 0, OPT_KFF_NAME}, {"index-basename", required_argument, 0, OPT_INDEX_BASENAME}, + {"set-reference", required_argument, 0, OPT_SET_REFERENCE}, {"gam-in", required_argument, 0, 'G'}, {"fastq-in", required_argument, 0, 'f'}, {"interleaved", no_argument, 0, 'i'}, @@ -1176,11 +1185,11 @@ int main_giraffe(int argc, char** argv) { { case 'Z': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide GBZ file with -Z." << endl; + cerr << "error: [vg giraffe] Must provide GBZ file with -Z." << endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open GBZ file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open GBZ file " << optarg << endl; exit(1); } provided_indexes.emplace_back("Giraffe GBZ", optarg); @@ -1193,11 +1202,11 @@ int main_giraffe(int argc, char** argv) { case 'x': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide graph file with -x." << endl; + cerr << "error: [vg giraffe] Must provide graph file with -x." << endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open graph file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open graph file " << optarg << endl; exit(1); } provided_indexes.emplace_back("XG", optarg); @@ -1210,11 +1219,11 @@ int main_giraffe(int argc, char** argv) { case 'g': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide GBWTGraph file with -g." << endl; + cerr << "error: [vg giraffe] Must provide GBWTGraph file with -g." 
<< endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open GBWTGraph file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open GBWTGraph file " << optarg << endl; exit(1); } provided_indexes.emplace_back("GBWTGraph", optarg); @@ -1227,11 +1236,11 @@ int main_giraffe(int argc, char** argv) { case 'H': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide GBWT file with -H." << endl; + cerr << "error: [vg giraffe] Must provide GBWT file with -H." << endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open GBWT file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open GBWT file " << optarg << endl; exit(1); } provided_indexes.emplace_back("Giraffe GBWT", optarg); @@ -1239,11 +1248,11 @@ int main_giraffe(int argc, char** argv) { case 'm': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide minimizer file with -m." << endl; + cerr << "error: [vg giraffe] Must provide minimizer file with -m." << endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open minimizer file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open minimizer file " << optarg << endl; exit(1); } provided_indexes.emplace_back("Long Read Minimizers", optarg); @@ -1252,11 +1261,11 @@ int main_giraffe(int argc, char** argv) { case 'z': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide zipcode index file with -z." << endl; + cerr << "error: [vg giraffe] Must provide zipcode index file with -z." << endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open zipcode index file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open zipcode index file " << optarg << endl; exit(1); } provided_indexes.emplace_back("Long Read Zipcodes", optarg); @@ -1264,11 +1273,11 @@ int main_giraffe(int argc, char** argv) { break; case 'd': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide distance index file with -d." << endl; + cerr << "error: [vg giraffe] Must provide distance index file with -d." << endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open distance index file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open distance index file " << optarg << endl; exit(1); } provided_indexes.emplace_back("Giraffe Distance Index", optarg); @@ -1291,7 +1300,7 @@ int main_giraffe(int argc, char** argv) { case 'G': gam_filename = optarg; if (gam_filename.empty()) { - cerr << "error:[vg giraffe] Must provide GAM file with -G." << endl; + cerr << "error: [vg giraffe] Must provide GAM file with -G." << endl; exit(1); } break; @@ -1300,19 +1309,19 @@ int main_giraffe(int argc, char** argv) { if (fastq_filename_1.empty()) { fastq_filename_1 = optarg; if (fastq_filename_1.empty()) { - cerr << "error:[vg giraffe] Must provide FASTQ file with -f." << endl; + cerr << "error: [vg giraffe] Must provide FASTQ file with -f." << endl; exit(1); } } else if (fastq_filename_2.empty()) { fastq_filename_2 = optarg; if (fastq_filename_2.empty()) { - cerr << "error:[vg giraffe] Must provide FASTQ file with -f." << endl; + cerr << "error: [vg giraffe] Must provide FASTQ file with -f." << endl; exit(1); } paired = true; } else { - cerr << "error:[vg giraffe] Cannot specify more than two FASTQ files." << endl; + cerr << "error: [vg giraffe] Cannot specify more than two FASTQ files." 
<< endl; exit(1); } break; @@ -1366,7 +1375,11 @@ int main_giraffe(int argc, char** argv) { case OPT_OUTPUT_BASENAME: output_basename = optarg; break; - + + case OPT_SET_REFERENCE: + reference_samples.insert(optarg); + break; + case OPT_REPORT_NAME: report_name = optarg; break; @@ -1442,7 +1455,7 @@ int main_giraffe(int argc, char** argv) { { int num_threads = parse(optarg); if (num_threads <= 0) { - cerr << "error:[vg giraffe] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + cerr << "error: [vg giraffe] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; exit(1); } omp_set_num_threads(num_threads); @@ -1470,7 +1483,7 @@ int main_giraffe(int argc, char** argv) { fasta_parts = split_ext(fasta_parts.first); } if (fasta_parts.second != "fa" && fasta_parts.second != "fasta" && fasta_parts.second != "fna") { - cerr << "error:[vg giraffe] FASTA file " << fasta_filename << " is not named like a FASTA" << endl; + cerr << "error: [vg giraffe] FASTA file " << fasta_filename << " is not named like a FASTA" << endl; exit(1); } @@ -1489,7 +1502,7 @@ int main_giraffe(int argc, char** argv) { vcf_parts = split_ext(vcf_parts.first); } if (vcf_parts.second != "vcf") { - cerr << "error:[vg giraffe] VCF file " << vcf_filename << " is not named like a VCF" << endl; + cerr << "error: [vg giraffe] VCF file " << vcf_filename << " is not named like a VCF" << endl; exit(1); } @@ -1514,33 +1527,33 @@ int main_giraffe(int argc, char** argv) { bool hts_output = (output_format == "SAM" || output_format == "BAM" || output_format == "CRAM"); if (!ref_paths_name.empty() && !hts_output) { - cerr << "warning:[vg giraffe] Reference path file (--ref-paths) is only used when output format (-o) is SAM, BAM, or CRAM." << endl; + cerr << "warning: [vg giraffe] Reference path file (--ref-paths) is only used when output format (-o) is SAM, BAM, or CRAM." << endl; ref_paths_name = ""; } if (output_format != "GAM" && !output_basename.empty()) { - cerr << "error:[vg giraffe] Using an output basename (--output-basename) only makes sense for GAM format (-o)" << endl; + cerr << "error: [vg giraffe] Using an output basename (--output-basename) only makes sense for GAM format (-o)" << endl; exit(1); } if (interleaved && !fastq_filename_2.empty()) { - cerr << "error:[vg giraffe] Cannot designate both interleaved paired ends (-i) and separate paired end file (-f)." << endl; + cerr << "error: [vg giraffe] Cannot designate both interleaved paired ends (-i) and separate paired end file (-f)." << endl; exit(1); } if (!fastq_filename_1.empty() && !gam_filename.empty()) { - cerr << "error:[vg giraffe] Cannot designate both FASTQ input (-f) and GAM input (-G) in same run." << endl; + cerr << "error: [vg giraffe] Cannot designate both FASTQ input (-f) and GAM input (-G) in same run." << endl; exit(1); } if (have_input_file(optind, argc, argv)) { // TODO: work out how to interpret additional files as reads. - cerr << "error:[vg giraffe] Extraneous input file: " << get_input_file_name(optind, argc, argv) << endl; + cerr << "error: [vg giraffe] Extraneous input file: " << get_input_file_name(optind, argc, argv) << endl; exit(1); } if ((forced_mean && ! 
forced_stdev) || (!forced_mean && forced_stdev)) { - cerr << "warning:[vg giraffe] Both a mean and standard deviation must be specified for the fragment length distribution" << endl; + cerr << "warning: [vg giraffe] Both a mean and standard deviation must be specified for the fragment length distribution" << endl; cerr << " Detecting fragment length distribution automatically" << endl; forced_mean = false; forced_stdev = false; @@ -1548,12 +1561,12 @@ int main_giraffe(int argc, char** argv) { fragment_stdev = 0.0; } if ((forced_mean || forced_stdev || forced_rescue_attempts) && (!paired)) { - cerr << "warning:[vg giraffe] Attempting to set paired-end parameters but running in single-end mode" << endl; + cerr << "warning: [vg giraffe] Attempting to set paired-end parameters but running in single-end mode" << endl; } if (parser->get_option_value("align-from-chains") && paired) { // TODO: Implement chaining for paired-end alignment - cerr << "error:[vg giraffe] Paired-end alignment is not yet implemented for --align-from-chains or chaining-based presets" << endl; + cerr << "error: [vg giraffe] Paired-end alignment is not yet implemented for --align-from-chains or chaining-based presets" << endl; exit(1); } @@ -1563,7 +1576,7 @@ int main_giraffe(int argc, char** argv) { } if (haplotype_sampling) { // If we do haplotype sampling, we get a new GBZ and later build indexes for it. - string gbz_name = sample_haplotypes(provided_indexes, index_basename, sample_name, haplotype_name, kff_name, show_progress); + string gbz_name = sample_haplotypes(provided_indexes, reference_samples, index_basename, sample_name, haplotype_name, kff_name, show_progress); registry.provide("Giraffe GBZ", gbz_name); index_basename = split_ext(gbz_name).first; } else { @@ -1604,7 +1617,7 @@ int main_giraffe(int argc, char** argv) { // A file with the appropriate name exists and we can read it. if (haplotype_sampling) { // If we did haplotype sampling, we are going to overwrite existing indexes. - cerr << "warning:[vg giraffe] " << inferred_filename << " exists and will be overwritten" << endl; + cerr << "warning: [vg giraffe] " << inferred_filename << " exists and will be overwritten" << endl; } else { // Report it because this may not be desired behavior. cerr << "Guessing that " << inferred_filename << " is " << index_and_extensions.first << endl; @@ -1639,7 +1652,7 @@ int main_giraffe(int argc, char** argv) { registry.make_indexes(index_targets); } catch (InsufficientInputException ex) { - cerr << "error:[vg giraffe] Input is not sufficient to create indexes" << endl; + cerr << "error: [vg giraffe] Input is not sufficient to create indexes" << endl; cerr << ex.what(); return 1; } @@ -2226,12 +2239,12 @@ int main_giraffe(int argc, char** argv) { long long thread_instructions; if (read(perf_fd, &thread_instructions, sizeof(long long)) != sizeof(long long)) { // Read failed for some reason. 
- cerr << "warning:[vg giraffe] Could not count CPU instructions executed" << endl; + cerr << "warning: [vg giraffe] Could not count CPU instructions executed" << endl; thread_instructions = 0; } if (close(perf_fd)) { int problem = errno; - cerr << "warning:[vg giraffe] Error closing perf event instruction counter: " << strerror(problem) << endl; + cerr << "warning: [vg giraffe] Error closing perf event instruction counter: " << strerror(problem) << endl; } total_instructions += thread_instructions; } @@ -2285,15 +2298,19 @@ int main_giraffe(int argc, char** argv) { //---------------------------------------------------------------------------- -string sample_haplotypes(const vector>& indexes, string& basename, string& sample_name, string& haplotype_file, string& kff_file, bool progress) { - +std::string sample_haplotypes( + const std::vector>& indexes, + const std::unordered_set& reference_samples, + const std::string& basename, const std::string& sample_name, const std::string& haplotype_file, const std::string& kff_file, + bool progress +) { if (progress) { std::cerr << "Sampling haplotypes" << std::endl; } // Sanity checks. if (haplotype_file.empty() || kff_file.empty()) { - std::cerr << "error:[vg giraffe] Haplotype sampling requires --haplotype-name and --kff-name." << std::endl; + std::cerr << "error: [vg giraffe] Haplotype sampling requires --haplotype-name and --kff-name." << std::endl; std::exit(EXIT_FAILURE); } @@ -2306,7 +2323,7 @@ string sample_haplotypes(const vector>& indexes, string& ba } } if (sample == "giraffe") { - std::cerr << "warning:[vg giraffe] Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; + std::cerr << "warning: [vg giraffe] Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; } std::string output_name = basename + "." + sample + ".gbz"; @@ -2319,10 +2336,21 @@ string sample_haplotypes(const vector>& indexes, string& ba } else if (indexes.size() == 2 && indexes[0].first == "GBWTGraph" && indexes[1].first == "Giraffe GBWT") { load_gbz(gbz, indexes[1].second, indexes[0].second, progress); } else { - std::cerr << "error:[vg giraffe] Haplotype sampling requires either -Z or -g and -H with no other indexes." << std::endl; + std::cerr << "error: [vg giraffe] Haplotype sampling requires either -Z or (-g and -H) with no other indexes." << std::endl; std::exit(EXIT_FAILURE); } + // Override reference samples if necessary. + if (!reference_samples.empty()) { + if (progress) { + std::cerr << "Updating reference samples" << std::endl; + } + size_t present = gbz.set_reference_samples(reference_samples); + if (present != reference_samples.size()) { + std::cerr << "warning: [vg giraffe] Only " << present << " out of " << reference_samples.size() << " reference samples are present" << std::endl; + } + } + // Load haplotype information. 
if (progress) { std::cerr << "Loading haplotype information from " << haplotype_file << std::endl; @@ -2338,7 +2366,7 @@ string sample_haplotypes(const vector>& indexes, string& ba try { sampled_gbwt = recombinator.generate_haplotypes(kff_file, parameters); } catch (const std::runtime_error& e) { - std::cerr << "error:[vg giraffe] Haplotype sampling failed: " << e.what() << std::endl; + std::cerr << "error: [vg giraffe] Haplotype sampling failed: " << e.what() << std::endl; std::exit(EXIT_FAILURE); } diff --git a/src/subcommand/haplotypes_main.cpp b/src/subcommand/haplotypes_main.cpp index ec7a9f97bf..3a73d928d2 100644 --- a/src/subcommand/haplotypes_main.cpp +++ b/src/subcommand/haplotypes_main.cpp @@ -2,19 +2,14 @@ * * Defines the "vg haplotypes" subcommand, which samples haplotypes by kmer counts in the reads. * - * TODO: - * - * 1. Option to sample non-contiguous haplotypes. This may be important in large - * snarls. Select a suffix of a fragment, all fragments fully within the snarl, - * and the prefix of the fragment exiting the snarl. - * - * 2. Tests for --linear-structure, --extra-fragments, and #1. + * TODO: Tests for --linear-structure, --extra-fragments, and fragmented haplotypes. */ #include "subcommand.hpp" #include "../hash_map.hpp" #include "../recombinator.hpp" +#include "../algorithms/extract_subchain.hpp" #include #include @@ -95,9 +90,6 @@ struct HaplotypesConfig { mode_sample_graph, mode_preprocess, mode_sample_haplotypes, - mode_map_variants, - mode_extract, - mode_classify, mode_statistics, }; @@ -106,21 +98,15 @@ struct HaplotypesConfig { // File names. std::string graph_name; - std::string gbz_output, haplotype_output, score_output, kmer_output; + std::string gbz_output, haplotype_output; std::string distance_name, r_index_name; - std::string haplotype_input, kmer_input, vcf_input; + std::string haplotype_input, kmer_input; // Computational parameters. size_t k = haplotypes_defaults::k(), w = haplotypes_defaults::w(); HaplotypePartitioner::Parameters partitioner_parameters; Recombinator::Parameters recombinator_parameters; - - // A prefix to add to VCF contig names to get GBWT contig names. - std::string contig_prefix; - - // For extracting local haplotypes. - size_t chain_id = std::numeric_limits::max(); - size_t subchain_id = std::numeric_limits::max(); + std::unordered_set reference_samples; // Overrides those specified in the graph. // For subchain statistics. std::string ref_sample; @@ -134,13 +120,9 @@ struct HaplotypesConfig { void preprocess_graph(const gbwtgraph::GBZ& gbz, Haplotypes& haplotypes, HaplotypesConfig& config); -void sample_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); +void set_reference_samples(gbwtgraph::GBZ& gbz, const HaplotypesConfig& config); -void map_variants(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); - -void extract_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); - -void classify_kmers(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); +void sample_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); void subchain_statistics(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); @@ -185,24 +167,13 @@ int main_haplotypes(int argc, char** argv) { // Sample the haplotypes. 
if (config.mode == HaplotypesConfig::mode_sample_graph || config.mode == HaplotypesConfig::mode_sample_haplotypes) { + // Update reference samples if necessary. + if (!config.reference_samples.empty()) { + set_reference_samples(gbz, config); + } sample_haplotypes(gbz, haplotypes, config); } - // Map variants to subchains. - if (config.mode == HaplotypesConfig::mode_map_variants) { - map_variants(gbz, haplotypes, config); - } - - // Extract local haplotypes in FASTA format. - if (config.mode == HaplotypesConfig::mode_extract) { - extract_haplotypes(gbz, haplotypes, config); - } - - // Classify kmers. - if (config.mode == HaplotypesConfig::mode_classify) { - classify_kmers(gbz, haplotypes, config); - } - // Output statistics on subchain lengths and kmer counts. if (config.mode == HaplotypesConfig::mode_statistics) { subchain_statistics(gbz, haplotypes, config); @@ -227,9 +198,6 @@ void help_haplotypes(char** argv, bool developer_options) { std::cerr << usage << "-H output.hapl graph.gbz" << std::endl; std::cerr << usage << "-i graph.hapl -k kmers.kff -g output.gbz graph.gbz" << std::endl; if (developer_options) { - std::cerr << usage << "-i graph.hapl --vcf-input variants.vcf graph.gbz > output.tsv" << std::endl; - std::cerr << usage << "-i graph.hapl -k kmers.kff --extract M:N graph.gbz > output.fa" << std::endl; - std::cerr << usage << "-i graph.hapl -k kmers.kff --classify output.kmers graph.gbz" << std::endl; std::cerr << usage << "-i graph.hapl --statistics ref_sample graph.gbz > output.tsv" << std::endl; } std::cerr << std::endl; @@ -265,6 +233,7 @@ void help_haplotypes(char** argv, bool developer_options) { std::cerr << " --extra-fragments in diploid sampling, select all candidates in bad subchains" << std::endl; std::cerr << " --badness F threshold for the badness of a subchain (default: " << haplotypes_defaults::badness() << ")" << std::endl; std::cerr << " --include-reference include named and reference paths in the output" << std::endl; + std::cerr << " --set-reference X use sample X as a reference sample (may repeat)" << std::endl; std::cerr << std::endl; std::cerr << "Other options:" << std::endl; std::cerr << " -v, --verbosity N verbosity level (0 = silent, 1 = basic, 2 = detailed, 3 = debug; default: 0)" << std::endl; @@ -273,11 +242,6 @@ void help_haplotypes(char** argv, bool developer_options) { if (developer_options) { std::cerr << "Developer options:" << std::endl; std::cerr << " --validate validate the generated information (may be slow)" << std::endl; - std::cerr << " --vcf-input X map the variants in VCF file X to subchains" << std::endl; - std::cerr << " --contig-prefix X a prefix for transforming VCF contig names into GBWT contig names" << std::endl; - std::cerr << " --extract M:N extract haplotypes in chain M, subchain N in FASTA format" << std::endl; - std::cerr << " --score-output X write haplotype scores to X (use with --extract)" << std::endl; - std::cerr << " --classify X classify kmers and write output to X" << std::endl; std::cerr << " --statistics X output subchain statistics over reference sample X" << std::endl; std::cerr << std::endl; } @@ -301,13 +265,9 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { constexpr int OPT_EXTRA_FRAGMENTS = 1308; constexpr int OPT_BADNESS = 1309; constexpr int OPT_INCLUDE_REFERENCE = 1310; + constexpr int OPT_SET_REFERENCE = 1311; constexpr int OPT_VALIDATE = 1400; - constexpr int OPT_VCF_INPUT = 1500; - constexpr int OPT_CONTIG_PREFIX = 1501; - constexpr int OPT_EXTRACT = 1600; - constexpr 
int OPT_SCORE_OUTPUT = 1601; - constexpr int OPT_CLASSIFY = 1602; - constexpr int OPT_STATISTICS = 1603; + constexpr int OPT_STATISTICS = 1500; static struct option long_options[] = { @@ -332,14 +292,10 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { { "extra-fragments", no_argument, 0, OPT_EXTRA_FRAGMENTS }, { "badness", required_argument, 0, OPT_BADNESS }, { "include-reference", no_argument, 0, OPT_INCLUDE_REFERENCE }, + { "set-reference", required_argument, 0, OPT_SET_REFERENCE }, { "verbosity", required_argument, 0, 'v' }, { "threads", required_argument, 0, 't' }, { "validate", no_argument, 0, OPT_VALIDATE }, - { "vcf-input", required_argument, 0, OPT_VCF_INPUT }, - { "contig-prefix", required_argument, 0, OPT_CONTIG_PREFIX }, - { "extract", required_argument, 0, OPT_EXTRACT }, - { "score-output", required_argument, 0, OPT_SCORE_OUTPUT }, - { "classify", required_argument, 0, OPT_CLASSIFY }, { "statistics", required_argument, 0, OPT_STATISTICS }, { 0, 0, 0, 0 } }; @@ -468,6 +424,9 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { case OPT_INCLUDE_REFERENCE: this->recombinator_parameters.include_reference = true; break; + case OPT_SET_REFERENCE: + this->reference_samples.insert(optarg); + break; case 'v': { @@ -491,30 +450,6 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { case OPT_VALIDATE: this->validate = true; break; - case OPT_VCF_INPUT: - this->vcf_input = optarg; - break; - case OPT_CONTIG_PREFIX: - this->contig_prefix = optarg; - break; - case OPT_EXTRACT: - { - std::string arg = optarg; - size_t offset = arg.find(':'); - if (offset == 0 || offset == std::string::npos || offset + 1 >= arg.length()) { - std::cerr << "error: [vg haplotypes] cannot parse chain:subchain from " << arg << std::endl; - std::exit(EXIT_FAILURE); - } - this->chain_id = parse(arg.substr(0, offset)); - this->subchain_id = parse(arg.substr(offset + 1)); - } - break; - case OPT_SCORE_OUTPUT: - this->score_output = optarg; - break; - case OPT_CLASSIFY: - this->kmer_output = optarg; - break; case OPT_STATISTICS: this->ref_sample = optarg; break; @@ -540,13 +475,6 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { this->mode = mode_preprocess; } else if (!this->haplotype_input.empty() && !this->kmer_input.empty() && !this->gbz_output.empty()) { this->mode = mode_sample_haplotypes; - } else if (!this->haplotype_input.empty() && !this->vcf_input.empty()) { - this->mode = mode_map_variants; - } else if (!this->haplotype_input.empty() && !this->kmer_input.empty() && - this->chain_id < std::numeric_limits::max() && this->subchain_id < std::numeric_limits::max()) { - this->mode = mode_extract; - } else if (!this->haplotype_input.empty() && !this->kmer_input.empty() && !this->kmer_output.empty()) { - this->mode = mode_classify; } else if (!this->haplotype_input.empty() && !this->ref_sample.empty()) { this->mode = mode_statistics; } @@ -655,6 +583,33 @@ void preprocess_graph(const gbwtgraph::GBZ& gbz, Haplotypes& haplotypes, Haploty //---------------------------------------------------------------------------- +void set_reference_samples(gbwtgraph::GBZ& gbz, const HaplotypesConfig& config) { + omp_set_num_threads(config.threads); + if (config.verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Updating reference samples" << std::endl; + } + if (config.verbosity >= Haplotypes::verbosity_debug) { + std::cerr << "Reference samples:"; + for (const std::string& sample : 
config.reference_samples) { + std::cerr << " " << sample; + } + std::cerr << std::endl; + } + + double start = gbwt::readTimer(); + size_t present = gbz.set_reference_samples(config.reference_samples); + if (present < config.reference_samples.size()) { + std::cerr << "warning: [vg haplotypes] only " << present << " out of " << config.reference_samples.size() << " reference samples are present" << std::endl; + } + + if (config.verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Updated reference samples in " << seconds << " seconds" << std::endl; + } +} + +//---------------------------------------------------------------------------- + size_t threads_to_jobs(size_t threads) { size_t jobs = std::round(0.85 * threads); return std::max(jobs, size_t(1)); @@ -695,23 +650,6 @@ void sample_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, //---------------------------------------------------------------------------- -gbwt::size_type path_for_contig(const gbwtgraph::GBZ& gbz, gbwt::size_type contig_id, const std::string& contig_name) { - gbwt::size_type path_id = gbz.index.metadata.paths(); - size_t found_paths = 0; - for (size_t i = 0; i < gbz.graph.named_paths.size(); i++) { - gbwt::size_type candidate = gbz.graph.named_paths[i].id; - if (gbz.index.metadata.path(candidate).contig == contig_id) { - path_id = candidate; - found_paths++; - } - } - if (found_paths != 1) { - std::cerr << "error: [vg haplotypes] found " << found_paths << " named/reference paths for contig " << contig_name << std::endl; - std::exit(EXIT_FAILURE); - } - return path_id; -} - // Returns the GBWT sequence id for the given path in the orientation it appears in this top-level chain. // Returns `gbwt::invalid_sequence()` if the path is not found. gbwt::size_type seq_for_chain(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type path_id, size_t chain_id) { @@ -730,24 +668,6 @@ gbwt::size_type seq_for_chain(const gbwtgraph::GBZ& gbz, const Haplotypes& haplo return gbwt::invalid_sequence(); } -std::pair seq_chain_for_path(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type path_id, const std::string& contig_name) { - size_t found_chains = 0; - std::pair result(gbwt::invalid_sequence(), haplotypes.components()); - for (size_t chain_id = 0; chain_id < haplotypes.components(); chain_id++) { - gbwt::size_type seq_id = seq_for_chain(gbz, haplotypes, path_id, chain_id); - if (seq_id != gbwt::invalid_sequence()) { - result.first = seq_id; - result.second = chain_id; - found_chains++; - } - } - if (found_chains != 1) { - std::cerr << "error: [vg haplotypes] found " << found_chains << " top-level chains for contig " << contig_name << std::endl; - std::exit(EXIT_FAILURE); - } - return result; -} - struct ReferenceInterval { enum order { before, overlap, after }; @@ -864,202 +784,6 @@ std::pair, size_t> subchain_intervals(const gbwtg return { result, total_length }; } -void map_variants(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { - if (!gbz.index.metadata.hasContigNames()) { - std::cerr << "error: [vg haplotypes] cannot map variant positions without contig names in the GBWT index" << std::endl; - } - - // Read variants from the VCF file. - if (config.verbosity >= Haplotypes::verbosity_basic) { - std::cerr << "Reading VCF file " << config.vcf_input << std::endl; - } - vcflib::VariantCallFile variant_file; - variant_file.parseSamples = false; // Just in case there are many samples. 
- std::string temp_filename = config.vcf_input; - variant_file.open(temp_filename); - if (!variant_file.is_open()) { - std::cerr << "error: [vg haplotypes] cannot open VCF file " << config.vcf_input << std::endl; - std::exit(EXIT_FAILURE); - } - std::unordered_map contig_to_offset; // VCF contig name to offset in `variant positions`. - std::vector>> variant_positions; // Semiopen 0-based ranges of sequence positions. - vcflib::Variant var(variant_file); - size_t total_variants = 0; - while (variant_file.is_open() && variant_file.getNextVariant(var)) { - size_t offset; - auto iter = contig_to_offset.find(var.sequenceName); - if (iter == contig_to_offset.end()) { - offset = variant_positions.size(); - contig_to_offset[var.sequenceName] = offset; - variant_positions.push_back({}); - } else { - offset = iter->second; - } - size_t start = var.zeroBasedPosition(); - variant_positions[offset].push_back({ start, start + var.ref.length() }); - total_variants++; - } - for (auto& positions : variant_positions) { - std::sort(positions.begin(), positions.end()); - } - if (config.verbosity >= Haplotypes::verbosity_detailed) { - std::cerr << "Read " << total_variants << " variants over " << variant_positions.size() << " contigs" << std::endl; - } - - // Map VCF contig names to GBWT sequence ids for named/reference paths and top-level chain. - std::vector contig_names(contig_to_offset.size(), ""); - std::vector> offset_to_seq_chain(contig_to_offset.size(), { gbwt::invalid_sequence(), haplotypes.components() }); - for (auto iter = contig_to_offset.begin(); iter != contig_to_offset.end(); ++iter) { - std::string contig_name = config.contig_prefix + iter->first; - gbwt::size_type contig_id = gbz.index.metadata.contig(contig_name); - if (contig_id >= gbz.index.metadata.contigs()) { - std::cerr << "error: [vg haplotypes] no contig " << contig_name << " in the GBWT index" << std::endl; - std::exit(EXIT_FAILURE); - } - contig_names[iter->second] = contig_name; - gbwt::size_type path_id = path_for_contig(gbz, contig_id, contig_name); - std::pair seq_chain = seq_chain_for_path(gbz, haplotypes, path_id, contig_name); - offset_to_seq_chain[iter->second] = seq_chain; - if (config.verbosity >= Haplotypes::verbosity_debug) { - std::cerr << "VCF contig " << iter->first << ", GBWT contig " << contig_name - << ": contig id " << contig_id - << ", path id " << path_id - << ", reverse " << gbwt::Path::is_reverse(seq_chain.first) - << ", chain " << seq_chain.second << std::endl; - } - } - - // Output (contig[interval], top-level chain, subchains, subchain lengths) - for (auto iter = contig_to_offset.begin(); iter != contig_to_offset.end(); ++iter) { - std::string contig_name = config.contig_prefix + iter->first; - size_t offset = iter->second; - gbwt::size_type sequence_id = offset_to_seq_chain[offset].first; - gbwt::size_type chain_id = offset_to_seq_chain[offset].second; - std::vector ref_intervals; - size_t total_length; - std::tie(ref_intervals, total_length) = - subchain_intervals(gbz, haplotypes, sequence_id, chain_id); - for (auto interval : variant_positions[offset]) { - size_t low = 0, high = ref_intervals.size(); - bool found = false; - while (!found && low < high) { - size_t mid = low + (high - low) / 2; - switch (ref_intervals[mid].compare(interval)) { - case ReferenceInterval::before: - low = mid + 1; - break; - case ReferenceInterval::overlap: - low = mid; - while (low > 0 && ref_intervals[low - 1].compare(interval) == ReferenceInterval::overlap) { - low--; - } - high = mid + 1; - while (high < 
ref_intervals.size() && ref_intervals[high].compare(interval) == ReferenceInterval::overlap) { - high++; - } - found = true; - break; - case ReferenceInterval::after: - high = mid; - break; - } - } - std::cout << iter->first << "[" << interval.first << ".." << interval.second << "]\t" << chain_id << "\t"; - if (low >= high) { - if (low > 0) { - std::cout << ref_intervals[low - 1].to_string(); - } - std::cout << ".."; - if (low < ref_intervals.size()) { - std::cout << ref_intervals[low].to_string(); - } - } else { - for (size_t i = low; i < high; i++) { - if (i > low) { - std::cout << ","; - } - std::cout << ref_intervals[i].to_string(); - } - } - std::cout << "\t"; - for (size_t i = low; i < high; i++) { - if (i > low) { - std::cout << ","; - } - std::cout << ref_intervals[i].length(); - } - std::cout << std::endl; - } - } -} -//---------------------------------------------------------------------------- - -void extract_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { - if (config.verbosity >= Haplotypes::verbosity_basic) { - std::cerr << "Extracting haplotypes from chain " << config.chain_id << ", subchain " << config.subchain_id << std::endl; - } - - Recombinator recombinator(gbz, haplotypes, config.verbosity); - std::vector result; - try { - result = recombinator.extract_sequences( - config.kmer_input, config.chain_id, config.subchain_id, config.recombinator_parameters - ); - } catch (const std::runtime_error& e) { - std::cerr << "error: [vg haplotypes] " << e.what() << std::endl; - std::exit(EXIT_FAILURE); - } - if (config.verbosity >= Haplotypes::verbosity_detailed) { - std::cerr << "Found " << result.size() << " haplotypes" << std::endl; - } - for (auto& sequence : result) { - write_fasta_sequence(sequence.name, sequence.sequence, std::cout); - } - - if (!config.score_output.empty()) { - std::ofstream out(config.score_output, std::ios_base::binary); - if (!out) { - std::cerr << "error: [vg haplotypes] cannot open score file " << config.score_output << " for writing" << std::endl; - return; - } - for (auto& sequence : result) { - out << sequence.name; - for (size_t i = 0; i < config.recombinator_parameters.num_haplotypes; i++) { - if (i < sequence.scores.size()) { - out << "\t" << sequence.scores[i].first << "\t" << sequence.scores[i].second; - } else { - out << "\t-\t-"; - } - } - out << "\n"; - } - } -} - -//---------------------------------------------------------------------------- - -void classify_kmers(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { - if (config.verbosity >= Haplotypes::verbosity_basic) { - std::cerr << "Classifying kmers" << std::endl; - } - Recombinator recombinator(gbz, haplotypes, config.verbosity); - std::vector classifications = recombinator.classify_kmers(config.kmer_input, config.recombinator_parameters); - - if (config.verbosity >= Haplotypes::verbosity_basic) { - std::cerr << "Writing " << classifications.size() << " classifications to " << config.kmer_output << std::endl; - } - std::ofstream out(config.kmer_output, std::ios_base::binary); - if (!out) { - std::cerr << "error: [vg haplotypes] cannot open kmer classification file " << config.kmer_output << " for writing" << std::endl; - std::exit(EXIT_FAILURE); - } - // Multi-gigabyte writes do not work in every environment, but we assume that - // there are only at most a few hundred million kmers. 
- out.write(classifications.data(), classifications.size()); -} - -//---------------------------------------------------------------------------- - gbwt::size_type path_for_sample_contig( const gbwtgraph::GBZ& gbz, const std::string& sample_name, const std::string& contig_name @@ -1083,6 +807,8 @@ gbwt::size_type path_for_sample_contig( return paths.front(); } +//---------------------------------------------------------------------------- + void subchain_statistics(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { gbwt::size_type sample_id = gbz.index.metadata.sample(config.ref_sample); if (sample_id >= gbz.index.metadata.samples()) { @@ -1180,50 +906,82 @@ std::string validate_unary_path(const HandleGraph& graph, handle_t from, handle_ return ""; } -// Returns true if the path from (start, offset) reaches end without revisiting start. -bool trace_path(const gbwt::GBWT& index, gbwt::node_type start, gbwt::size_type offset, gbwt::node_type end) { +// Returns true if the path from (start, offset) reaches the end without revisiting start or leaving the subchain. +// The path may continue in subsequent fragments. +bool trace_path( + const gbwt::GBWT& index, const gbwt::FragmentMap& fragment_map, const hash_set& subchain_nodes, + gbwt::size_type sequence_id, gbwt::node_type start, gbwt::size_type offset, gbwt::node_type end +) { gbwt::edge_type pos(start, offset); while (pos.first != end) { pos = index.LF(pos); - if (pos.first == gbwt::ENDMARKER || pos.first == start) { + while (pos.first == gbwt::ENDMARKER) { + sequence_id = fragment_map.oriented_next(sequence_id); + if (sequence_id == gbwt::invalid_sequence()) { + // No more fragments in the chain. + return false; + } + pos = index.start(sequence_id); + } + if (pos.first == start) { + // This was not a minimal end-to-end visit. + return false; + } + if (subchain_nodes.find(gbwt::Node::id(pos.first)) == subchain_nodes.end()) { + // We are outside the subchain. return false; } } return true; } -// Returns the given haplotype over the given subchain. -std::string get_haplotype(const gbwtgraph::GBWTGraph& graph, Haplotypes::sequence_type sequence, - gbwt::node_type from, gbwt::node_type to, size_t k) { - std::string haplotype; +// Returns the given (possibly fragmented) haplotype over the given subchain. +std::vector get_haplotype( + const gbwtgraph::GBWTGraph& graph, const gbwt::FragmentMap& fragment_map, + Haplotypes::sequence_type sequence, gbwt::node_type from, gbwt::node_type to, size_t k +) { + std::vector haplotype; gbwt::edge_type pos; + bool multiple_fragments = (from != gbwt::ENDMARKER && to != gbwt::ENDMARKER); + haplotype.emplace_back(); // Initial node with three cases (from start, suffix of a long `from`, short `from`). 
if (from == gbwt::ENDMARKER) { pos = graph.index->start(sequence.first); gbwtgraph::view_type view = graph.get_sequence_view(gbwtgraph::GBWTGraph::node_to_handle(pos.first)); - haplotype.append(view.first, view.second); + haplotype.back().append(view.first, view.second); } else { pos = gbwt::edge_type(from, sequence.second); gbwtgraph::view_type view = graph.get_sequence_view(gbwtgraph::GBWTGraph::node_to_handle(pos.first)); if (view.second >= k) { - haplotype.append(view.first + view.second - (k - 1), k - 1); + haplotype.back().append(view.first + view.second - (k - 1), k - 1); } else { - haplotype.append(view.first, view.second); + haplotype.back().append(view.first, view.second); } } while (true) { pos = graph.index->LF(pos); if (pos.first == gbwt::ENDMARKER) { - break; + if (!multiple_fragments) { + break; + } + do { + sequence.first = fragment_map.oriented_next(sequence.first); + if (sequence.first == gbwt::invalid_sequence()) { + break; + } + pos = graph.index->start(sequence.first); + } + while (pos.first == gbwt::ENDMARKER); + haplotype.emplace_back(); } gbwtgraph::view_type view = graph.get_sequence_view(gbwtgraph::GBWTGraph::node_to_handle(pos.first)); if (pos.first == to) { - haplotype.append(view.first, std::min(view.second, k - 1)); + haplotype.back().append(view.first, std::min(view.second, k - 1)); break; } else { - haplotype.append(view.first, view.second); + haplotype.back().append(view.first, view.second); } } @@ -1232,6 +990,7 @@ std::string get_haplotype(const gbwtgraph::GBWTGraph& graph, Haplotypes::sequenc void validate_chain(const Haplotypes::TopLevelChain& chain, const gbwtgraph::GBWTGraph& graph, + const gbwt::FragmentMap& fragment_map, const gbwt::FastLocate& r_index, const HaplotypePartitioner::minimizer_index_type& minimizer_index, size_t chain_id, @@ -1292,9 +1051,10 @@ void validate_chain(const Haplotypes::TopLevelChain& chain, // Sequences: normal subchains. 
if (subchain.type == Haplotypes::Subchain::normal) { std::vector da = r_index.decompressDA(subchain.start); + hash_set nodes = extract_subchain(graph, gbwtgraph::GBWTGraph::node_to_handle(subchain.start), gbwtgraph::GBWTGraph::node_to_handle(subchain.end)); hash_set selected; for (size_t i = 0; i < da.size(); i++) { - if (trace_path(*(graph.index), subchain.start, i, subchain.end)) { + if (trace_path(*(graph.index), fragment_map, nodes, da[i], subchain.start, i, subchain.end)) { selected.insert(Haplotypes::sequence_type(da[i], i)); } } @@ -1350,12 +1110,17 @@ void validate_chain(const Haplotypes::TopLevelChain& chain, hash_map used_kmers; // (kmer used in haplotypes, number of sequences that contain it) hash_map missing_kmers; // (kmer not used in haplotypes, number of sequences that contain it) for (size_t i = 0; i < subchain.sequences.size(); i++) { - std::string haplotype = get_haplotype(graph, subchain.sequences[i], subchain.start, subchain.end, minimizer_index.k()); - auto minimizers = minimizer_index.minimizers(haplotype); + std::vector haplotype = get_haplotype( + graph, fragment_map, + subchain.sequences[i], subchain.start, subchain.end, minimizer_index.k() + ); hash_map unique_minimizers; // (kmer, used in the sequence) - for (auto& minimizer : minimizers) { - if (minimizer_index.count(minimizer) == 1) { - unique_minimizers[minimizer.key.get_key()] = false; + for (const std::string& sequence : haplotype) { + auto minimizers = minimizer_index.minimizers(sequence); + for (auto& minimizer : minimizers) { + if (minimizer_index.count(minimizer) == 1) { + unique_minimizers[minimizer.key.get_key()] = false; + } } } for (size_t j = 0, offset = i * subchain.kmers.size(); j < subchain.kmers.size(); j++, offset++) { @@ -1389,7 +1154,7 @@ void validate_chain(const Haplotypes::TopLevelChain& chain, } } if (invalid_count > 0) { - std::string message = "invalid occurrence count for "+ std::to_string(invalid_count) + " kmers"; + std::string message = "invalid occurrence count for " + std::to_string(invalid_count) + " kmers"; validate_error_subchain(chain_id, subchain_id, message); } size_t missing_informative_kmers = 0; @@ -1456,16 +1221,20 @@ void validate_haplotypes(const Haplotypes& haplotypes, if (verbosity >= HaplotypePartitioner::Verbosity::verbosity_detailed) { std::cerr << "Validating subchains, sequences, and kmers" << std::endl; } + gbwt::FragmentMap fragment_map(graph.index->metadata, false); #pragma omp parallel for schedule(dynamic, 1) for (size_t chain = 0; chain < haplotypes.components(); chain++) { - validate_chain(haplotypes.chains[chain], graph, r_index, minimizer_index, chain, verbosity); + validate_chain(haplotypes.chains[chain], graph, fragment_map, r_index, minimizer_index, chain, verbosity); } - // Kmers are globally unique. + // Kmers should be globally unique. But if there are fragmented haplotypes with 3 or more fragments + // in a subchain, the middle fragment(s) may actually overlap other subchains. Additionally, + // because the kmers we use are minimizers, they could occur elsewhere as non-minimizers. 
if (verbosity >= HaplotypePartitioner::Verbosity::verbosity_detailed) { std::cerr << "Validating kmer specificity" << std::endl; } hash_map> kmers; + size_t collisions = 0, total_kmers = 0; for (size_t chain_id = 0; chain_id < haplotypes.components(); chain_id++) { const Haplotypes::TopLevelChain& chain = haplotypes.chains[chain_id]; for (size_t subchain_id = 0; subchain_id < chain.subchains.size(); subchain_id++) { @@ -1478,15 +1247,18 @@ void validate_haplotypes(const Haplotypes& haplotypes, // A prefix subchain may overlap the preceding suffix subchain and // contain the same kmers. } else { - std::string message = subchain.to_string() + ": kmer " + std::to_string(i) + " also found in " + subchain_to_string(iter->second.first, iter->second.second, prev); - validate_error_subchain(chain_id, subchain_id, message); + collisions++; } } kmers[subchain.kmers[i]] = { chain_id, subchain_id }; } + total_kmers += subchain.kmers.size(); } } kmers.clear(); + if (collisions > 0) { + std::cerr << "warning: [vg haplotypes] found " << collisions << " kmer collisions out of " << total_kmers << std::endl; + } if (verbosity >= Haplotypes::verbosity_basic) { double seconds = gbwt::readTimer() - start; diff --git a/src/unittest/gaf_sorter.cpp b/src/unittest/gaf_sorter.cpp index 7122ba25e1..288e1aa479 100644 --- a/src/unittest/gaf_sorter.cpp +++ b/src/unittest/gaf_sorter.cpp @@ -24,11 +24,13 @@ struct GAFInfo { size_t id; std::uint32_t min_node, max_node; std::uint32_t first_node, gbwt_offset; + bool first_is_reverse; GAFInfo(size_t id, size_t nodes, bool with_gbwt_offset) : id(id), min_node(std::numeric_limits::max()), max_node(0), - first_node(std::numeric_limits::max()), gbwt_offset(std::numeric_limits::max()) { + first_node(std::numeric_limits::max()), gbwt_offset(std::numeric_limits::max()), + first_is_reverse(false) { // Name, query length, query start, query end, strand; std::string nd = std::to_string(nodes); @@ -46,6 +48,7 @@ struct GAFInfo { if (with_gbwt_offset) { this->gbwt_offset = rng() % 1000; } + this->first_is_reverse = reverse; } this->line += (reverse ? "<" : ">") + std::to_string(node); } @@ -74,7 +77,8 @@ struct GAFInfo { if (this->gbwt_offset == std::numeric_limits::max()) { return GAFSorterRecord::MISSING_KEY; } else { - return (static_cast(this->first_node) << 32) | this->gbwt_offset; + return (static_cast(this->first_node) << 33) | + (static_cast(this->first_is_reverse) << 32) | this->gbwt_offset; } } else if (type == GAFSorterRecord::key_hash) { return GAFSorterRecord::hasher(this->line); diff --git a/src/unittest/graph_algorithms.cpp b/src/unittest/graph_algorithms.cpp new file mode 100644 index 0000000000..bfdc848938 --- /dev/null +++ b/src/unittest/graph_algorithms.cpp @@ -0,0 +1,158 @@ +/** \file + * + * Unit tests for various graph algorithms. + */ + +// These are the algorithms we want to test. +#include "../algorithms/extract_subchain.hpp" + +// Other includes. +#include + +#include + +#include "catch.hpp" + +namespace vg { + +namespace unittest { + +//------------------------------------------------------------------------------ + +namespace { + +// Generates a random graph with the given number of snarls, each containing the given number of nodes. +// There is one shared node between each pair of adjacent snarls. +// Also returns the list of ids for the nodes between the snarls. +// If a snarl is bounded by nodes u and w, then every node v with u < v < w is in the snarl. 
+std::pair> random_graph(size_t snarls, size_t snarl_size, std::uint32_t seed = 0x12345678) { + std::vector labels { "A", "C", "G", "T" }; + bdsg::HashGraph graph; + std::vector shared_nodes; + auto create_node = [&](nid_t id, bool shared) { + graph.create_handle(labels[id % labels.size()], id); + if (shared) { + shared_nodes.push_back(id); + } + }; + + create_node(1, true); + std::mt19937 rng(seed); + nid_t next = 2; + for (size_t i = 0; i < snarls; i++) { + for (nid_t id = next; id < next + snarl_size; id++) { + create_node(id, false); + } + create_node(next + snarl_size, true); + + // Chain of even offsets. + nid_t prev = next - 1; + for (nid_t id = next; id < next + snarl_size; id += 2) { + graph.create_edge(graph.get_handle(prev), graph.get_handle(id)); + prev = id; + } + graph.create_edge(graph.get_handle(prev), graph.get_handle(next + snarl_size)); + + // Chain of odd offsets. + prev = next - 1; + for (nid_t id = next + 1; id < next + snarl_size; id += 2) { + graph.create_edge(graph.get_handle(prev), graph.get_handle(id)); + prev = id; + } + graph.create_edge(graph.get_handle(prev), graph.get_handle(next + snarl_size)); + + // Every other edge has a 5% chance of existing in random orientation. + for (nid_t from = next - 1; from <= next + snarl_size; from++) { + for (nid_t to = from + 1; to <= next + snarl_size; to++) { + if (rng() % 20 == 0) { + bool from_is_reverse = (from == next - 1 ? false : rng() % 2 == 0); + bool to_is_reverse = (to == next + snarl_size ? false : rng() % 2 == 0); + graph.create_edge(graph.get_handle(from, from_is_reverse), graph.get_handle(to, to_is_reverse)); + } + } + } + + next += snarl_size + 1; + } + + return std::make_pair(graph, shared_nodes); +} + +} // anonymous namespace + +//------------------------------------------------------------------------------ + +TEST_CASE("Extract subchain", "[graph_algorithms]") { + bdsg::HashGraph graph; + std::vector shared_nodes; + std::tie(graph, shared_nodes) = random_graph(10, 10); + + auto check_subchain = [&](const hash_set& nodes, nid_t min, nid_t max) { + size_t expected_size = max - min + 1; + REQUIRE(nodes.size() == expected_size); + for (nid_t id : nodes) { + REQUIRE(min <= id); + REQUIRE(id <= max); + } + }; + + SECTION("Empty subchain") { + nid_t start = 1, end = 1; + hash_set subchain = extract_subchain(graph, graph.get_handle(start), graph.get_handle(end)); + check_subchain(subchain, start, end); + } + + SECTION("Empty subchain, reverse") { + nid_t start = 1, end = 1; + handle_t from = graph.get_handle(end, true); + handle_t to = graph.get_handle(start, true); + hash_set subchain = extract_subchain(graph, from, to); + check_subchain(subchain, start, end); + } + + SECTION("Single snarl") { + for (size_t i = 0; i + 1 < shared_nodes.size(); i++) { + nid_t start = shared_nodes[i], end = shared_nodes[i + 1]; + hash_set subchain = extract_subchain(graph, graph.get_handle(start), graph.get_handle(end)); + check_subchain(subchain, start, end); + } + } + + SECTION("Single snarl, reverse") { + for (size_t i = 0; i + 1 < shared_nodes.size(); i++) { + nid_t start = shared_nodes[i], end = shared_nodes[i + 1]; + handle_t from = graph.get_handle(end, true); + handle_t to = graph.get_handle(start, true); + hash_set subchain = extract_subchain(graph, from, to); + check_subchain(subchain, start, end); + } + } + + SECTION("Multiple snarls") { + for (size_t i = 0; i < shared_nodes.size(); i++) { + for (size_t j = i + 1; j < shared_nodes.size(); j++) { + nid_t start = shared_nodes[i], end = shared_nodes[j]; + hash_set 
+                check_subchain(subchain, start, end);
+            }
+        }
+    }
+
+    SECTION("Multiple snarls, reverse") {
+        for (size_t i = 0; i < shared_nodes.size(); i++) {
+            for (size_t j = i + 1; j < shared_nodes.size(); j++) {
+                nid_t start = shared_nodes[i], end = shared_nodes[j];
+                handle_t from = graph.get_handle(end, true);
+                handle_t to = graph.get_handle(start, true);
+                hash_set<nid_t> subchain = extract_subchain(graph, from, to);
+                check_subchain(subchain, start, end);
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+
+} // namespace unittest
+
+} // namespace vg
diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t
index a5849bc47a..4260f5f596 100644
--- a/test/t/37_vg_gbwt.t
+++ b/test/t/37_vg_gbwt.t
@@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap
 PATH=../bin:$PATH # for vg
 
-plan tests 155
+plan tests 161
 
 # Build vg graphs for two chromosomes
 
@@ -330,7 +330,16 @@ is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 haplotypes"
 is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 samples"
 is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ reference paths: reference_samples set"
 
-rm -f large.gbz large.local.gbwt
+# Now the same but with a specified reference sample
+vg gbwt -Z large.gbz -l -n 16 --pass-paths --set-reference GRCh38 -o large.local.GRCh38.gbwt
+is $? 0 "Local haplotypes with a specified reference sample"
+is $(vg gbwt -c large.local.GRCh38.gbwt) 34 "local haplotypes w/ specified reference: 34 paths"
+is $(vg gbwt -C large.local.GRCh38.gbwt) 2 "local haplotypes w/ specified reference: 2 contigs"
+is $(vg gbwt -H large.local.GRCh38.gbwt) 17 "local haplotypes w/ specified reference: 17 haplotypes"
+is $(vg gbwt -S large.local.GRCh38.gbwt) 17 "local haplotypes w/ specified reference: 17 samples"
+is $(vg gbwt --tags large.local.GRCh38.gbwt | grep -c reference_samples) 1 "local haplotypes w/ specified reference: reference_samples set"
+
+rm -f large.gbz large.local.gbwt large.local.GRCh38.gbwt
 
 # Build GBWTGraph from an augmented GBWT
 
diff --git a/test/t/54_vg_haplotypes.t b/test/t/54_vg_haplotypes.t
index 888c524602..3bf66004b8 100644
--- a/test/t/54_vg_haplotypes.t
+++ b/test/t/54_vg_haplotypes.t
@@ -5,12 +5,12 @@ BASH_TAP_ROOT=../deps/bash-tap
 PATH=../bin:$PATH # for vg
 
-plan tests 21
+plan tests 27
 
 # The test graph consists of two subgraphs of the HPRC Minigraph-Cactus v1.1 graph:
 # - GRCh38#chr6:31498145-31511124 (micb)
 # - GRCh38#chr19:54816468-54830778 (kir3dl1)
-# The reads are 50x novaseq reads mapping to those regions.
+# The reads are 50x novaseq reads for HG003 mapping to those regions.
 
 # Build indexes for the full graph.
 vg gbwt -G haplotype-sampling/micb-kir3dl1.gfa --gbz-format -g full.gbz -r full.ri
@@ -51,6 +51,13 @@ is $? 0 "diploid sampling using a preset"
 cmp diploid.gbz diploid2.gbz
 is $? 0 "the outputs are identical"
 
+# Diploid sampling with a specified reference sample
+vg haplotypes -i full.hapl -k haplotype-sampling/HG003.kff --include-reference --diploid-sampling --set-reference GRCh38 -g diploid3.gbz full.gbz
+is $? 
0 "diploid sampling with a specified reference sample" +is $(vg gbwt -S -Z diploid3.gbz) 2 "1 generated + 1 reference samples" +is $(vg gbwt -C -Z diploid3.gbz) 2 "2 contigs" +is $(vg gbwt -H -Z diploid3.gbz) 3 "2 generated + 1 reference haplotypes" + # Giraffe integration, guessed output name vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ -f haplotype-sampling/HG003.fq.gz > default.gam 2> /dev/null @@ -66,9 +73,18 @@ is $? 0 "Giraffe integration with a specified output name" cmp full.HG003.gbz sampled.003HG.gbz is $? 0 "the sampled graphs are identical" +# Giraffe integration, specified reference sample +vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ + --index-basename GRCh38 -N HG003 --set-reference GRCh38 \ + -f haplotype-sampling/HG003.fq.gz > HG003_GRCh38.gam 2> /dev/null +is $? 0 "Giraffe integration with a specified reference sample" +cmp diploid3.gbz GRCh38.HG003.gbz +is $? 0 "the sampled graph is identical to a manually sampled one" + # Cleanup rm -r full.gbz full.ri full.dist full.hapl rm -f indirect.gbz direct.gbz no_ref.gbz -rm -f diploid.gbz diploid2.gbz -rm -f full.HG003.gbz full.HG003.dist full.HG003.min default.gam -rm -f sampled.003HG.gbz sampled.003HG.dist sampled.003HG.min specified.gam +rm -f diploid.gbz diploid2.gbz diploid3.gbz +rm -f full.HG003.* default.gam +rm -f sampled.003HG.* specified.gam +rm -f GRCh38.HG003.* HG003_GRCh38.gam