diff --git a/deps/gbwt b/deps/gbwt
index 99daaccf4c..bde6858046 160000
--- a/deps/gbwt
+++ b/deps/gbwt
@@ -1 +1 @@
-Subproject commit 99daaccf4c8141e5e28680be031a481eb843e2c6
+Subproject commit bde6858046580d1b9dbfa54f48ab187c85998ffe
diff --git a/deps/gbwtgraph b/deps/gbwtgraph
index ebb77a18c1..d5e1955e9f 160000
--- a/deps/gbwtgraph
+++ b/deps/gbwtgraph
@@ -1 +1 @@
-Subproject commit ebb77a18c1fd5d13bb9ac3cc1f6fc75807b742f2
+Subproject commit d5e1955e9f8e478c0af02966f932f6820b78a58b
diff --git a/src/algorithms/extract_subchain.cpp b/src/algorithms/extract_subchain.cpp
new file mode 100644
index 0000000000..57898489ac
--- /dev/null
+++ b/src/algorithms/extract_subchain.cpp
@@ -0,0 +1,53 @@
+#include "extract_subchain.hpp"
+
+#include <stack>
+
+namespace vg {
+
+//------------------------------------------------------------------------------
+
+hash_set<nid_t> extract_subchain(const HandleGraph& graph, handle_t start, handle_t end) {
+    hash_set<nid_t> result;
+    nid_t start_id = graph.get_id(start);
+    bool start_is_reverse = graph.get_is_reverse(start);
+    nid_t end_id = graph.get_id(end);
+    bool end_is_reverse = graph.get_is_reverse(end);
+
+    std::stack<nid_t> active;
+    active.push(start_id);
+    active.push(end_id);
+    while (!active.empty()) {
+        nid_t node_id = active.top(); active.pop();
+        if (result.count(node_id)) {
+            continue;
+        }
+        result.insert(node_id);
+        bool go_forward = true, go_backward = true;
+        if (node_id == start_id) {
+            go_forward &= !start_is_reverse;
+            go_backward &= start_is_reverse;
+
+        }
+        if (node_id == end_id) {
+            go_forward &= end_is_reverse;
+            go_backward &= !end_is_reverse;
+        }
+        handle_t handle = graph.get_handle(node_id, false);
+        if (go_forward) {
+            graph.follow_edges(handle, false, [&](const handle_t& next) {
+                active.push(graph.get_id(next));
+            });
+        }
+        if (go_backward) {
+            graph.follow_edges(handle, true, [&](const handle_t& prev) {
+                active.push(graph.get_id(prev));
+            });
+        }
+    }
+
+    return result;
+}
+
+//------------------------------------------------------------------------------
+
+} // namespace vg
diff --git a/src/algorithms/extract_subchain.hpp b/src/algorithms/extract_subchain.hpp
new file mode 100644
index 0000000000..13da6d04a3
--- /dev/null
+++ b/src/algorithms/extract_subchain.hpp
@@ -0,0 +1,36 @@
+#ifndef VG_ALGORITHMS_EXTRACT_SUBCHAIN_HPP_INCLUDED
+#define VG_ALGORITHMS_EXTRACT_SUBCHAIN_HPP_INCLUDED
+
+/**
+ * \file extract_subchain.hpp
+ *
+ * Algorithm that extracts all node ids in a subchain of the snarl decomposition.
+ */
+
+#include "../handle.hpp"
+#include "../hash_map.hpp"
+
+namespace vg {
+
+//------------------------------------------------------------------------------
+
+/**
+ * Extracts all node ids in a subchain of the snarl decomposition.
+ *
+ * The subchain is defined by two handles, start and end.
+ * If the corresponding nodes are on the same chain in the snarl decomposition
+ * and end is after start, the function returns all node ids in the subchain
+ * between them. This means the identifiers of start and end, as well as all
+ * node identifiers in the weakly connected component formed by removing start
+ * and end from the graph.
+ *
+ * If the nodes are not on the same chain or end is before start, the behavior
+ * is undefined.
+ */ +hash_set extract_subchain(const HandleGraph& graph, handle_t start, handle_t end); + +//------------------------------------------------------------------------------ + +} // namespace vg + +#endif // VG_ALGORITHMS_EXTRACT_SUBCHAIN_HPP_INCLUDED diff --git a/src/gaf_sorter.cpp b/src/gaf_sorter.cpp index 12d158b0fd..08e2dfff0e 100644 --- a/src/gaf_sorter.cpp +++ b/src/gaf_sorter.cpp @@ -57,6 +57,7 @@ void GAFSorterRecord::set_key(key_type type) { } } else if (type == key_gbwt_pos) { std::uint32_t node_id = std::numeric_limits::max(); + bool is_reverse = false; std::uint32_t offset = std::numeric_limits::max(); this->for_each_field([&](size_t i, str_view value) -> bool { if (i == PATH_FIELD && value.size > 1) { @@ -64,6 +65,7 @@ void GAFSorterRecord::set_key(key_type type) { if (result.ec != std::errc()) { return false; } + is_reverse = (value[0] == '<'); } else if (i >= MANDATORY_FIELDS) { size_t tag_size = GBWT_OFFSET_TAG.size(); if (value.size > tag_size && value.substr(0, tag_size) == GBWT_OFFSET_TAG) { @@ -77,7 +79,7 @@ void GAFSorterRecord::set_key(key_type type) { // We either did not find both fields or failed to parse them. this->key = MISSING_KEY; } else { - this->key = (static_cast(node_id) << 32) | offset; + this->key = (static_cast(node_id) << 33) | (static_cast(is_reverse) << 32) | offset; } } else if (type == key_hash) { this->key = hasher(this->value); diff --git a/src/recombinator.cpp b/src/recombinator.cpp index 65d540b2d0..598542283b 100644 --- a/src/recombinator.cpp +++ b/src/recombinator.cpp @@ -3,6 +3,7 @@ #include "kff.hpp" #include "statistics.hpp" #include "algorithms/component.hpp" +#include "algorithms/extract_subchain.hpp" #include #include @@ -180,8 +181,8 @@ double Haplotypes::Subchain::badness(const gbwtgraph::GBZ& gbz) const { } } size_t length = this->distance(gbz, selected); - if (length < expected_length) { - result += std::log(static_cast(expected_length) / static_cast(length)); + if (length > expected_length) { + result += std::log(static_cast(length) / static_cast(expected_length)); } } @@ -366,7 +367,7 @@ void Haplotypes::simple_sds_load(std::istream& in) { this->chains.resize(this->header.top_level_chains); for (auto& chain : this->chains) { - if (this->header.version == Header::VERSION) { + if (this->header.version >= 3) { chain.simple_sds_load(in); } else if (this->header.version == 2) { chain.load_v2(in); @@ -388,16 +389,92 @@ size_t Haplotypes::simple_sds_size() const { return result; } +std::vector Haplotypes::assign_reference_paths(const gbwtgraph::GBZ& gbz, const gbwt::FragmentMap& fragment_map, Verbosity verbosity) const { + if (verbosity >= verbosity_basic) { + std::cerr << "Assigning reference paths to GBWT construction jobs" << std::endl; + } + double start = gbwt::readTimer(); + + // All paths are initially unassigned. + std::vector result (gbz.named_paths(), this->jobs()); + + auto path_id_to_named_path_offset = [&](gbwt::size_type path_id) -> size_t { + path_handle_t handle = gbz.graph.path_to_handle(path_id); + size_t as_integer = handlegraph::as_integer(handle); + if (as_integer >= gbz.named_paths()) { + return std::numeric_limits::max(); + } else { + return as_integer; + } + }; + + // We do a lot of redundant work here. We iterate over every sequence in every subchain + // and top-level chain. If the sequence is a cached (generic or reference) path, we + // assign it to the job corresponding to the top-level chain. 
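The key_gbwt_pos change in gaf_sorter.cpp above widens the sort key so that the starting node id, its orientation, and the offset within the node all fit in one 64-bit value. A minimal sketch of the packing, assuming the casts are to std::uint64_t as the 64-bit key implies (the helper name and sample values are illustrative, not from the patch):

#include <cstdint>

// Key layout: bits 33-63 hold the node id, bit 32 the orientation, bits 0-31 the offset.
std::uint64_t pack_gbwt_pos_key(std::uint32_t node_id, bool is_reverse, std::uint32_t offset) {
    return (static_cast<std::uint64_t>(node_id) << 33)
        | (static_cast<std::uint64_t>(is_reverse) << 32)
        | offset;
}

Sorting by this key groups records by starting node, then by orientation, then by offset; for example, pack_gbwt_pos_key(42, false, 7) sorts before pack_gbwt_pos_key(42, true, 0).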
+ for (const TopLevelChain& chain : this->chains) { + size_t job = chain.job_id; + for (const Subchain& subchain : chain.subchains) { + for (const compact_sequence_type& sequence : subchain.sequences) { + gbwt::size_type path_id = gbwt::Path::id(sequence.first); + size_t offset = path_id_to_named_path_offset(path_id); + if (offset < gbz.named_paths()) { + result[offset] = job; + } + } + } + } + + // TODO: Fragment map could provide an iterator. + // If some paths were left unassigned because they are located within a subchain, we + // try to use the fragment map to rescue them. + size_t unassigned = 0; + for (size_t i = 0; i < result.size(); i++) { + if (result[i] < this->jobs()) { + continue; + } + gbwt::size_type first = gbz.graph.named_paths[i].id; + while (true) { + gbwt::size_type prev = fragment_map.prev(first); + if (prev == gbwt::invalid_sequence()) { + break; + } + first = prev; + } + for (gbwt::size_type curr = first; curr != gbwt::invalid_sequence(); curr = fragment_map.next(curr)) { + size_t offset = path_id_to_named_path_offset(curr); + if (offset < gbz.named_paths() && result[offset] < this->jobs()) { + result[i] = result[offset]; + break; + } + } + if (result[i] > this->jobs()) { + unassigned++; + } + } + + if (verbosity >= verbosity_basic) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Assigned " << (result.size() - unassigned) << " reference paths (" << unassigned << " unassigned) in " << seconds << " seconds" << std::endl; + } + return result; +} + //------------------------------------------------------------------------------ -HaplotypePartitioner::HaplotypePartitioner(const gbwtgraph::GBZ& gbz, +HaplotypePartitioner::HaplotypePartitioner( + const gbwtgraph::GBZ& gbz, const gbwt::FastLocate& r_index, const SnarlDistanceIndex& distance_index, const minimizer_index_type& minimizer_index, - Verbosity verbosity) : - gbz(gbz), r_index(r_index), distance_index(distance_index), minimizer_index(minimizer_index), + Verbosity verbosity +) : + gbz(gbz), fragment_map(gbz.index.metadata, verbosity >= Haplotypes::verbosity_extra_debug), r_index(r_index), + distance_index(distance_index), minimizer_index(minimizer_index), verbosity(verbosity) { + if (this->verbosity >= Haplotypes::verbosity_detailed) { + std::cerr << "HaplotypePartitioner: " << this->gbz.index.metadata.paths() << " fragments for " << this->fragment_map.size() << " haplotype sequences" << std::endl; + } } void HaplotypePartitioner::Parameters::print(std::ostream& out) const { @@ -480,7 +557,7 @@ Haplotypes HaplotypePartitioner::partition_haplotypes(const Parameters& paramete } // Assign named and reference paths to jobs. - result.jobs_for_cached_paths = std::vector(this->gbz.graph.named_paths.size(), result.jobs()); + result.jobs_for_cached_paths = std::vector(this->gbz.named_paths(), result.jobs()); // Again, we do not use a path filter, because a GBZ graph should not contain alt paths. auto assignments = gbwtgraph::assign_paths(this->gbz.graph, jobs, nullptr, nullptr); for (size_t job = 0; job < assignments.size(); job++) { @@ -678,86 +755,205 @@ HaplotypePartitioner::get_subchains(const gbwtgraph::TopLevelChain& chain, const //------------------------------------------------------------------------------ -std::vector HaplotypePartitioner::get_sequence_visits(handle_t handle) const { - std::vector sa = this->r_index.decompressSA(gbwtgraph::GBWTGraph::handle_to_node(handle)); - std::vector result; +// Returns (SA[i], i) for the handle, ordered by i. 
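The rescue loop in assign_reference_paths() above uses a fragment-map idiom that recurs throughout this patch: rewind to the first fragment with prev(), then scan forward with next(). A minimal sketch of that idiom as a reusable helper, assuming prev() and next() accept the same path identifiers they are given above (the helper name for_each_fragment is hypothetical):

// Visits every fragment of the fragmented haplotype containing path_id, in order.
template<typename Callback>
void for_each_fragment(const gbwt::FragmentMap& fragments, gbwt::size_type path_id, Callback&& callback) {
    // Rewind to the first fragment.
    while (true) {
        gbwt::size_type prev = fragments.prev(path_id);
        if (prev == gbwt::invalid_sequence()) {
            break;
        }
        path_id = prev;
    }
    // Scan forward over the fragments.
    for (gbwt::size_type curr = path_id; curr != gbwt::invalid_sequence(); curr = fragments.next(curr)) {
        callback(curr);
    }
}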
+std::vector get_sequence_visits(handle_t handle, const gbwt::FastLocate& r_index) { + std::vector sa = r_index.decompressSA(gbwtgraph::GBWTGraph::handle_to_node(handle)); + std::vector result; result.reserve(sa.size()); for (size_t i = 0; i < sa.size(); i++) { result.push_back({ sa[i], i }); } + return result; +} + +std::vector HaplotypePartitioner::get_sequences(handle_t handle) const { + auto result = get_sequence_visits(handle, this->r_index); std::sort(result.begin(), result.end(), [&](sequence_type a, sequence_type b) -> bool { - gbwt::size_type a_id = r_index.seqId(a.first); - gbwt::size_type a_offset = r_index.seqOffset(a.first); - gbwt::size_type b_id = r_index.seqId(b.first); - gbwt::size_type b_offset = r_index.seqOffset(b.first); + gbwt::size_type a_id = this->r_index.seqId(a.first); + gbwt::size_type a_offset = this->r_index.seqOffset(a.first); + gbwt::size_type b_id = this->r_index.seqId(b.first); + gbwt::size_type b_offset = this->r_index.seqOffset(b.first); return ((a_id < b_id) || ((a_id == b_id) && (a_offset > b_offset))); }); + for (auto& sequence : result) { + sequence.first = this->r_index.seqId(sequence.first); + } return result; } -void sa_to_da(std::vector& sequences, const gbwt::FastLocate& r_index) { - for (auto& sequence : sequences) { - sequence.first = r_index.seqId(sequence.first); +// A sequence visit (SA[i], i) decomposed into a visit in a fragmented haplotype. +// Visits can be sorted by (chain, orientation, position within the chain). +struct FragmentedHaplotypeVisit { + // (DA[i], i). + gbwt::size_type sequence_id; + gbwt::size_type gbwt_node_offset; + + // Information used for ordering visits. + gbwt::size_type oriented_chain_id; + gbwt::size_type rank_in_chain; + gbwt::size_type rank_in_sequence; + + FragmentedHaplotypeVisit(HaplotypePartitioner::sequence_type sequence_visit, const HaplotypePartitioner& partitioner) { + // Sequence id is encoded in the SA value and node offset is already given. + this->sequence_id = partitioner.r_index.seqId(sequence_visit.first); + this->gbwt_node_offset = sequence_visit.second; + + // We get the chain id from the fragment map and the orientation from the sequence id. + // Then we abuse the path/sequence name encoding to encode the orientation in the id. + gbwt::size_type path_id = gbwt::Path::id(this->sequence_id); + bool is_reverse = gbwt::Path::is_reverse(this->sequence_id); + this->oriented_chain_id = gbwt::Path::encode(partitioner.fragment_map.chain(path_id), is_reverse); + + // Rank in chain is based on the count/fragment/starting offset field in path name. + // But we must consider the orientation of the chain. + gbwt::PathName path_name = partitioner.gbz.index.metadata.path(path_id); + this->rank_in_chain = path_name.count; + if (is_reverse) { + this->rank_in_chain = std::numeric_limits::max() - this->rank_in_chain; + } + + // Sequence offset in the SA value is the distance to the end of the sequence in nodes. + this->rank_in_sequence = std::numeric_limits::max() - partitioner.r_index.seqOffset(sequence_visit.first); } -} -std::vector HaplotypePartitioner::get_sequences(handle_t handle) const { - auto result = this->get_sequence_visits(handle); - sa_to_da(result, this->r_index); - return result; -} + // Returns true if the visits are in the same chain in the same orientation. 
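To make the ordering defined by the FragmentedHaplotypeVisit constructor above concrete (illustrative numbers only): with max denoting the largest gbwt::size_type, a reverse-oriented visit to fragment 2 of a chain gets rank_in_chain = max - 2, which is smaller than max - 1 for fragment 1, so fragment 2 sorts first, exactly the order in which a reverse traversal of the chain meets the fragments. Similarly, rank_in_sequence = max - seqOffset makes visits closer to the start of a fragment (larger distance to the end) sort before later visits within the same fragment.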
+ bool same_chain(const FragmentedHaplotypeVisit& another) const { + return (this->oriented_chain_id == another.oriented_chain_id); + } -std::vector HaplotypePartitioner::get_sequences(Subchain subchain) const { - if (subchain.type == Haplotypes::Subchain::prefix) { - return this->get_sequences(subchain.end); + // Orders the visits by chain and orientation, and visits within the chain + // by their position in the chain. + bool operator<(const FragmentedHaplotypeVisit& another) const { + if (this->oriented_chain_id != another.oriented_chain_id) { + return (this->oriented_chain_id < another.oriented_chain_id); + } else if (this->rank_in_chain != another.rank_in_chain) { + return (this->rank_in_chain < another.rank_in_chain); + } else { + return (this->rank_in_sequence < another.rank_in_sequence); + } } - if (subchain.type == Haplotypes::Subchain::suffix) { - return this->get_sequences(subchain.start); + + // Returns (DA[i], i). + HaplotypePartitioner::sequence_type to_sequence() const { + return HaplotypePartitioner::sequence_type(this->sequence_id, this->gbwt_node_offset); } - auto from = this->get_sequence_visits(subchain.start); - auto to = this->get_sequence_visits(subchain.end); +}; + +// Matches the minimal sequence visits in `from` and `to` by the same oriented chain of haplotype sequences. +// Requires that the fragments remain within the subchain defined by start and end. +// Returns (DA[i], i) for the matching visits in `from`, sorted by rank in the chain. +std::vector match_visits( + handle_t start, handle_t end, + const std::vector& from_visits, + const std::vector& to_visits, + const HaplotypePartitioner& partitioner +) { + std::vector from, to; + for (auto& visit : from_visits) { + from.push_back(FragmentedHaplotypeVisit(visit, partitioner)); + } + std::sort(from.begin(), from.end()); + for (auto& visit : to_visits) { + to.push_back(FragmentedHaplotypeVisit(visit, partitioner)); + } + std::sort(to.begin(), to.end()); + + // When there are multiple fragments, some of them could be outside the subchain. + // We want to avoid that to keep the kmers we use specific to the subchain. + hash_set node_ids = extract_subchain(partitioner.gbz.graph, start, end); + auto within_subchain = [&](const FragmentedHaplotypeVisit& from_visit, const FragmentedHaplotypeVisit& to_visit) -> bool { + gbwt::size_type sequence_id = from_visit.sequence_id; + gbwt::edge_type pos(gbwtgraph::GBWTGraph::handle_to_node(start), from_visit.gbwt_node_offset); + while (sequence_id != to_visit.sequence_id) { + while (pos.first != gbwt::ENDMARKER) { + if (node_ids.find(gbwt::Node::id(pos.first)) == node_ids.end()) { + // The fragment is outside the subchain. + return false; + } + pos = partitioner.gbz.index.LF(pos); + } + sequence_id = partitioner.fragment_map.oriented_next(sequence_id); + if (sequence_id == gbwt::invalid_sequence()) { + // No more fragments in the chain. + return false; + } + pos = partitioner.gbz.index.start(sequence_id); + if (pos.first != gbwt::ENDMARKER && node_ids.find(gbwt::Node::id(pos.first)) == node_ids.end()) { + // The fragment starts outside the subchain. + return false; + } + } + + // We have ensured that this is a minimal end-to-end visit. Once the sequence ids match + // and we know that the position is within the subchain, we cannot leave the subchain and + // later reach the end without re-entering the subchain. 
+ return true; + }; + std::vector result; auto from_iter = from.begin(); auto to_iter = to.begin(); - std::vector result; while (from_iter != from.end() && to_iter != to.end()) { - gbwt::size_type from_id = this->r_index.seqId(from_iter->first); - gbwt::size_type to_id = this->r_index.seqId(to_iter->first); - if (from_id == to_id) { - // If a haplotype crosses the subchain multiple times, we take the last entry before - // each exit. - gbwt::size_type to_offset = this->r_index.seqOffset(to_iter->first); - if (this->r_index.seqOffset(from_iter->first) >= to_offset) { - auto peek = from_iter +1; - while (peek != from.end() && this->r_index.seqId(peek->first) == from_id && this->r_index.seqOffset(peek->first) >= to_offset) { + if (*from_iter < *to_iter) { + if (from_iter->same_chain(*to_iter)) { + auto peek = from_iter + 1; + while (peek != from.end() && *peek < *to_iter) { from_iter = peek; ++peek; } - result.push_back(*from_iter); + if (within_subchain(*from_iter, *to_iter)) { + result.push_back(from_iter->to_sequence()); + } ++from_iter; ++to_iter; } else { - ++to_iter; + ++from_iter; } - } else if (from_id < to_id) { - ++from_iter; - } else if (from_id > to_id) { + } else { ++to_iter; } } - sa_to_da(result, this->r_index); return result; } +std::vector HaplotypePartitioner::get_sequences(Subchain subchain) const { + if (subchain.type == Haplotypes::Subchain::prefix) { + // NOTE: If a sequence flips after this subchain and returns to it, we will + // take the initial prefix that exits the subchain multiple times. + return this->get_sequences(subchain.end); + } + if (subchain.type == Haplotypes::Subchain::suffix) { + // NOTE: If the sequence flips, exits the subchain, and later returns to it, + // some of the haplotypes will overlap the preceding subchain(s). + return this->get_sequences(subchain.start); + } + auto from = get_sequence_visits(subchain.start, this->r_index); + auto to = get_sequence_visits(subchain.end, this->r_index); + return match_visits(subchain.start, subchain.end, from, to, *this); +} + //------------------------------------------------------------------------------ // Generate a haplotype over the closed range from `pos` to `end`. +// The haplotype may consist of multiple fragments. // Take at most start_max and end_max characters from the initial and the final -// node, respectively +// node, respectively. // Returns an empty haplotype if there is only one node. +// Set `pos.first = gbwt::ENDMARKER` to start from the beginning. // Set `end = empty_gbwtgraph_handle()` to continue until the end without a final node. -std::string generate_haplotype(gbwt::edge_type pos, handle_t end, size_t start_max, size_t end_max, const gbwtgraph::GBWTGraph& graph) { - std::string haplotype; +std::vector generate_haplotype( + gbwt::size_type sequence_id, + gbwt::edge_type pos, handle_t end, size_t start_max, size_t end_max, + const HaplotypePartitioner& partitioner +) { + // Determine the start for a prefix. + bool single_fragment = (pos.first == gbwt::ENDMARKER || end == empty_gbwtgraph_handle()); + if (pos.first == gbwt::ENDMARKER) { + pos = partitioner.gbz.index.start(sequence_id); + } + + // Empty haplotype. 
+ std::vector haplotype; if (pos == gbwt::invalid_edge() || pos.first == gbwt::ENDMARKER) { return haplotype; } @@ -767,37 +963,55 @@ std::string generate_haplotype(gbwt::edge_type pos, handle_t end, size_t start_m if (curr == end) { return haplotype; } - gbwtgraph::view_type view = graph.get_sequence_view(curr); + gbwtgraph::view_type view = partitioner.gbz.graph.get_sequence_view(curr); size_t offset = (view.second > start_max ? view.second - start_max : 0); - haplotype.append(view.first + offset, view.second - offset); + haplotype.emplace_back(); + haplotype.back().append(view.first + offset, view.second - offset); while (true) { - pos = graph.index->LF(pos); + pos = partitioner.gbz.index.LF(pos); if (pos.first == gbwt::ENDMARKER) { - break; + if (single_fragment) { + break; + } + // Try to continue with the next non-empty fragment. + while (pos.first == gbwt::ENDMARKER) { + sequence_id = partitioner.fragment_map.oriented_next(sequence_id); + if (sequence_id == gbwt::invalid_sequence()) { + break; + } + pos = partitioner.gbz.index.start(sequence_id); + } + if (sequence_id == gbwt::invalid_sequence()) { + break; + } + haplotype.emplace_back(); } curr = gbwtgraph::GBWTGraph::node_to_handle(pos.first); - view = graph.get_sequence_view(curr); + view = partitioner.gbz.graph.get_sequence_view(curr); if (curr == end) { - haplotype.append(view.first, std::min(view.second, end_max)); + haplotype.back().append(view.first, std::min(view.second, end_max)); break; } else { - haplotype.append(view.first, view.second); + haplotype.back().append(view.first, view.second); } } return haplotype; } -// Return the sorted set of kmers that are minimizers in the sequence and have a single +// Return the sorted set of kmers that are minimizers in the sequences and have a single // occurrence in the graph. 
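generate_haplotype() above now returns one string per haplotype fragment instead of a single string. A trivial sketch of consuming that form (hypothetical helper, assuming the return type is std::vector<std::string>):

// Total sequence length over all fragments of a generated haplotype.
size_t total_haplotype_length(const std::vector<std::string>& haplotype) {
    size_t result = 0;
    for (const std::string& fragment : haplotype) {
        result += fragment.size();
    }
    return result;
}

Note that take_unique_minimizers() below runs the minimizer index on each fragment separately, so kmers that would span a fragment boundary are never collected.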
-std::vector take_unique_minimizers(const std::string& sequence, const HaplotypePartitioner::minimizer_index_type& minimizer_index) { +std::vector take_unique_minimizers( + const std::vector& sequences, const HaplotypePartitioner::minimizer_index_type& minimizer_index +) { std::vector result; - auto minimizers = minimizer_index.minimizers(sequence); - result.reserve(minimizers.size()); - for (auto& minimizer : minimizers) { - if (minimizer_index.count(minimizer) == 1) { - result.push_back(minimizer.key.get_key()); + for (const std::string& sequence : sequences) { + auto minimizers = minimizer_index.minimizers(sequence); + for (auto& minimizer : minimizers) { + if (minimizer_index.count(minimizer) == 1) { + result.push_back(minimizer.key.get_key()); + } } } gbwt::removeDuplicates(result, false); @@ -805,22 +1019,28 @@ std::vector take_unique_minimizers(const std::s } std::vector HaplotypePartitioner::unique_minimizers(gbwt::size_type sequence_id) const { - gbwt::edge_type pos = this->gbz.index.start(sequence_id); size_t limit = std::numeric_limits::max(); - std::string haplotype = generate_haplotype(pos, empty_gbwtgraph_handle(), limit, limit, this->gbz.graph); + std::vector haplotype = generate_haplotype( + sequence_id, + gbwt::edge_type(gbwt::ENDMARKER, 0), empty_gbwtgraph_handle(), limit, limit, + *this + ); return take_unique_minimizers(haplotype, this->minimizer_index); } -std::vector HaplotypePartitioner::unique_minimizers(sequence_type sequence, Subchain subchain) const { - gbwt::edge_type pos; +std::vector HaplotypePartitioner::unique_minimizers(sequence_type sequence, Subchain subchain, size_t& fragments) const { + gbwt::edge_type pos(gbwt::ENDMARKER, 0); size_t start_max = std::numeric_limits::max(), end_max = this->minimizer_index.k() - 1; if (subchain.has_start()) { pos = gbwt::edge_type(gbwtgraph::GBWTGraph::handle_to_node(subchain.start), sequence.second); start_max = this->minimizer_index.k() - 1; - } else { - pos = this->gbz.index.start(sequence.first); } - std::string haplotype = generate_haplotype(pos, subchain.end, start_max, end_max, this->gbz.graph); + std::vector haplotype = generate_haplotype( + sequence.first, + pos, subchain.end, start_max, end_max, + *this + ); + fragments = haplotype.size(); return take_unique_minimizers(haplotype, this->minimizer_index); } @@ -898,20 +1118,25 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain extra_snarls += subchain.extra_snarls; } } - #pragma omp critical - { - std::cerr << "Chain " << chain.offset << ": " << long_subchains << " long subchains (" - << with_extra_snarls << " with " << extra_snarls << " extra snarls)" << std::endl; + // The second condition should be redundant. + if (long_subchains > 0 || with_extra_snarls > 0) { + #pragma omp critical + { + std::cerr << "Chain " << chain.offset << " (" << output.contig_name << "): " << long_subchains << " long subchains (" + << with_extra_snarls << " with " << extra_snarls << " additional snarls)" << std::endl; + } } } // Convert the subchains to actual subchains. + size_t fragmented_subchains = 0, fragmented_sequences = 0, additional_fragments = 0; for (const Subchain& subchain : subchains) { std::vector>> to_process; auto sequences = this->get_sequences(subchain); if (sequences.empty()) { // There are no haplotypes crossing the subchain, so we break it into // a suffix and a prefix. + // NOTE: See the general get_sequences() for the asymmetry in handling prefixes and suffixes. 
to_process.push_back({ { Haplotypes::Subchain::suffix, subchain.start, empty_gbwtgraph_handle(), 0, 0 }, this->get_sequences(subchain.start) @@ -924,6 +1149,7 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain to_process.push_back({ subchain, std::move(sequences) }); } for (auto iter = to_process.begin(); iter != to_process.end(); ++iter) { + bool fragmented = false; output.subchains.push_back({ iter->first.type, gbwtgraph::GBWTGraph::handle_to_node(iter->first.start), gbwtgraph::GBWTGraph::handle_to_node(iter->first.end), @@ -933,13 +1159,30 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain std::vector> kmers_by_sequence; kmers_by_sequence.reserve(iter->second.size()); for (sequence_type sequence : iter->second) { - kmers_by_sequence.emplace_back(this->unique_minimizers(sequence, iter->first)); + size_t fragments = 0; + kmers_by_sequence.emplace_back(this->unique_minimizers(sequence, iter->first, fragments)); + if (fragments > 1) { + fragmented = true; + fragmented_sequences++; + additional_fragments += fragments - 1; + } } present_kmers(kmers_by_sequence, subchain.kmers, subchain.kmer_counts, subchain.kmers_present); subchain.sequences = std::vector(iter->second.size()); for (size_t i = 0; i < iter->second.size(); i++) { subchain.sequences[i] = Haplotypes::compact_sequence_type(iter->second[i].first, iter->second[i].second); } + if (fragmented) { + fragmented_subchains++; + } + } + } + + if (fragmented_subchains > 0 && this->verbosity >= Haplotypes::verbosity_debug) { + #pragma omp critical + { + std::cerr << "Chain " << chain.offset << " (" << output.contig_name << "): " << fragmented_subchains << " fragmented subchains (" + << fragmented_sequences << " fragmented sequences with " << additional_fragments << " additional fragments)" << std::endl; } } @@ -983,6 +1226,13 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain struct RecombinatorHaplotype { typedef Recombinator::sequence_type sequence_type; + // Recombinator instance we are using. + const Recombinator& recombinator; + + // GBWT builder and metadata builder used by this job. + gbwt::GBWTBuilder& builder; + gbwtgraph::MetadataBuilder& metadata; + // Contig name in GBWT metadata. const std::string& contig_name; @@ -1005,9 +1255,21 @@ struct RecombinatorHaplotype { // The path being generated. gbwt::vector_type path; + // Constructor that starts a new haplotype with the given contig name and identifier. + RecombinatorHaplotype( + const Recombinator& recombinator, + gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata, + const std::string& contig_name, size_t id + ) : + recombinator(recombinator), + builder(builder), metadata(metadata), + contig_name(contig_name), id(id), fragment(0), + sequence_id(gbwt::invalid_sequence()), position(gbwt::invalid_edge()) {} + /* * Extends the haplotype over the given subchain by using the given - * original haplotype. + * original haplotype. If the haplotype is fragmented, this may + * finish the current fragment and start a new one. * * This assumes that the original haplotype crosses the subchain. * @@ -1019,117 +1281,120 @@ struct RecombinatorHaplotype { * take the prefix of the original original haplotype until the start * of the subchain. 
*/ - void extend( - sequence_type sequence, const Haplotypes::Subchain& subchain, const Recombinator& recombinator, - gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata - ); + void extend(sequence_type sequence, const Haplotypes::Subchain& subchain); // Takes an existing haplotype from the GBWT index and inserts it into - /// the builder. This is intended for fragments that do not contain - /// subchains crossed by the original haplotypes. The call will fail if - /// `extend()` has been called. - void take( - gbwt::size_type sequence_id, const Recombinator& recombinator, - gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata - ); + // the builder. This is intended for fragments that do not contain + // subchains crossed by the original haplotypes. The call will fail if + // `extend()` has been called. + void take(gbwt::size_type sequence_id); - // Extends the original haplotype from the latest `extend()` call until - // the end, inserts it into the builder, and starts a new fragment. - // The call will fail if `extend()` has not been called for this - // fragment. - void finish(const Recombinator& recombinator, gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata); + // Finishes the current haplotype fragment and starts a new one. + // If `with_suffix` is true, the current fragment will be extended + // until the end of the corresponding path. In that case, the call will + // fail if `extend()` has not been called for this fragment. + void finish(bool with_suffix); private: // Extends the haplotype over a unary path from a previous subchain. - void connect(gbwt::node_type until, const gbwtgraph::GBWTGraph& graph); + void connect(gbwt::node_type until); - // Takes a prefix of a sequence. - void prefix(gbwt::size_type sequence_id, gbwt::node_type until, const gbwt::GBWT& index); + // Takes a prefix of the current sequence until `until`, assuming that + // `sequence_id` has been set. + void prefix(gbwt::node_type until); // Extends the haplotype from the previous subchain until the end. - void suffix(const gbwt::GBWT& index); + void suffix(); // Inserts the current fragment into the builder. - void insert(gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata); + void insert(); }; -void RecombinatorHaplotype::extend( - sequence_type sequence, const Haplotypes::Subchain& subchain, const Recombinator& recombinator, - gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata -) { +void RecombinatorHaplotype::extend(sequence_type sequence, const Haplotypes::Subchain& subchain) { if (subchain.type == Haplotypes::Subchain::full_haplotype) { throw std::runtime_error("Haplotype::extend(): cannot extend a full haplotype"); } + if (sequence.first == gbwt::invalid_sequence()) { + throw std::runtime_error("Haplotype::extend(): invalid sequence id"); + } + this->sequence_id = sequence.first; + if (subchain.type == Haplotypes::Subchain::prefix) { if (!this->path.empty()) { throw std::runtime_error("Haplotype::extend(): got a prefix subchain after the start of a fragment"); } - this->prefix(sequence.first, subchain.end, recombinator.gbz.index); + this->prefix(subchain.end); return; } // Suffixes and normal subchains have a start node, so we must reach it first. 
if (!this->path.empty()) { - this->connect(subchain.start, recombinator.gbz.graph); + this->connect(subchain.start); } else { - this->prefix(sequence.first, subchain.start, recombinator.gbz.index); + this->prefix(subchain.start); } - gbwt::edge_type curr(subchain.start, sequence.second); - if (!recombinator.gbz.index.contains(curr)) { - throw std::runtime_error("Haplotype::extend(): the GBWT index does not contain position (" + std::to_string(curr.first) + ", " + std::to_string(curr.second) + ")"); + const gbwt::GBWT& index = this->recombinator.gbz.index; + this->position = gbwt::edge_type(subchain.start, sequence.second); + if (!index.contains(this->position)) { + throw std::runtime_error("Haplotype::extend(): the GBWT index does not contain position (" + std::to_string(this->position.first) + ", " + std::to_string(this->position.second) + ")"); } if (subchain.type == Haplotypes::Subchain::suffix) { - this->position = curr; - this->finish(recombinator, builder, metadata); + this->finish(true); return; } // This is a normal subchain. - while (curr.first != subchain.end) { - curr = recombinator.gbz.index.LF(curr); - if (curr.first == gbwt::ENDMARKER) { - throw std::runtime_error("Haplotype::extend(): the sequence did not reach the end of the subchain at GBWT node " + std::to_string(subchain.end)); + while (this->position.first != subchain.end) { + this->position = index.LF(this->position); + if (this->position.first == gbwt::ENDMARKER) { + gbwt::size_type prev = this->sequence_id; + this->finish(false); + do { + // Find the next non-empty fragment. + this->sequence_id = this->recombinator.fragment_map.oriented_next(prev); + if (this->sequence_id == gbwt::invalid_sequence()) { + std::string msg = "Haplotype::extend(): no successor for GBWT sequence " + std::to_string(prev); + throw std::runtime_error(msg); + } + this->position = index.start(this->sequence_id); + prev = this->sequence_id; + } while (this->position.first == gbwt::ENDMARKER); } - this->path.push_back(curr.first); + this->path.push_back(this->position.first); } - this->sequence_id = sequence.first; - this->position = curr; } -void RecombinatorHaplotype::take( - gbwt::size_type sequence_id, const Recombinator& recombinator, - gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata -) { +void RecombinatorHaplotype::take(gbwt::size_type sequence_id) { + const gbwt::GBWT& index = this->recombinator.gbz.index; if (!this->path.empty()) { throw std::runtime_error("Haplotype::take(): the current fragment is not empty"); } - if (sequence_id >= recombinator.gbz.index.sequences()) { + if (sequence_id >= index.sequences()) { throw std::runtime_error("Haplotype::take(): the GBWT index does not contain sequence " + std::to_string(sequence_id)); } - this->path = recombinator.gbz.index.extract(sequence_id); - this->insert(builder, metadata); - this->fragment++; - this->sequence_id = gbwt::invalid_sequence(); - this->position = gbwt::invalid_edge(); - this->path.clear(); + this->path = index.extract(sequence_id); + this->finish(false); } -void RecombinatorHaplotype::finish(const Recombinator& recombinator, gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata) { - if (this->position == gbwt::invalid_edge()) { - throw std::runtime_error("Haplotype::finish(): there is no current position"); +void RecombinatorHaplotype::finish(bool with_suffix) { + if (with_suffix) { + if (this->position == gbwt::invalid_edge()) { + throw std::runtime_error("Haplotype::finish(): there is no current position"); + } + this->suffix(); } - 
this->suffix(recombinator.gbz.index); - this->insert(builder, metadata); + this->insert(); this->fragment++; this->sequence_id = gbwt::invalid_sequence(); this->position = gbwt::invalid_edge(); this->path.clear(); } -void RecombinatorHaplotype::connect(gbwt::node_type until, const gbwtgraph::GBWTGraph& graph) { +void RecombinatorHaplotype::connect(gbwt::node_type until) { + const gbwtgraph::GBWTGraph& graph = this->recombinator.gbz.graph; handle_t curr = gbwtgraph::GBWTGraph::node_to_handle(this->position.first); handle_t end = gbwtgraph::GBWTGraph::node_to_handle(until); this->position = gbwt::invalid_edge(); @@ -1153,32 +1418,32 @@ void RecombinatorHaplotype::connect(gbwt::node_type until, const gbwtgraph::GBWT } } -void RecombinatorHaplotype::prefix(gbwt::size_type sequence_id, gbwt::node_type until, const gbwt::GBWT& index) { +void RecombinatorHaplotype::prefix(gbwt::node_type until) { + const gbwt::GBWT& index = this->recombinator.gbz.index; this->position = gbwt::invalid_edge(); - if (sequence_id >= index.sequences()) { + if (this->sequence_id >= index.sequences()) { throw std::runtime_error("Haplotype::prefix(): invalid GBWT sequence id " + std::to_string(sequence_id)); } - this->sequence_id = sequence_id; - for (gbwt::edge_type curr = index.start(sequence_id); curr.first != gbwt::ENDMARKER; curr = index.LF(curr)) { - this->path.push_back(curr.first); - if (curr.first == until) { - this->position = curr; + for (this->position = index.start(this->sequence_id); this->position.first != gbwt::ENDMARKER; this->position = index.LF(this->position)) { + this->path.push_back(this->position.first); + if (this->position.first == until) { return; } } - throw std::runtime_error("Haplotype::prefix(): GBWT sequence " + std::to_string(sequence_id) + " did not reach GBWT node " + std::to_string(until)); + throw std::runtime_error("Haplotype::prefix(): GBWT sequence " + std::to_string(this->sequence_id) + " did not reach GBWT node " + std::to_string(until)); } -void RecombinatorHaplotype::suffix(const gbwt::GBWT& index) { +void RecombinatorHaplotype::suffix() { + const gbwt::GBWT& index = this->recombinator.gbz.index; for (gbwt::edge_type curr = index.LF(this->position); curr.first != gbwt::ENDMARKER; curr = index.LF(curr)) { this->path.push_back(curr.first); } } -void RecombinatorHaplotype::insert(gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata) { +void RecombinatorHaplotype::insert() { std::string sample_name = "recombination"; // TODO: Make this a static class variable. - metadata.add_haplotype(sample_name, this->contig_name, this->id, this->fragment); - builder.insert(this->path, true); + this->metadata.add_haplotype(sample_name, this->contig_name, this->id, this->fragment); + this->builder.insert(this->path, true); } //------------------------------------------------------------------------------ @@ -1189,50 +1454,83 @@ void RecombinatorHaplotype::insert(gbwt::GBWTBuilder& builder, gbwtgraph::Metada * GBWT metadata will be set as following: * * * Sample name is "fragment". - * * Contig name is taken from the top-level chain. + * * Contig name is "contig_N", with the name taken from the top-level chain and the number from the subchain. * * Haplotype identifier is set during construction. - * * Fragment identifier is the identifier of the subchain. + * * Fragment identifier is used if the fragment itself is fragmented. */ struct RecombinatorFragment { + // Sequence identifier for the start of the fragment. + gbwt::size_type sequence_id; + // GBWT starting position (inclusive). 
gbwt::edge_type from; // GBWT ending position (inclusive). gbwt::node_type to; - // Haplotype identifier. - size_t haplotype; + // Identifier of the subchain. + size_t subchain_id; - // Fragment / subchain identifier. - size_t fragment; + // Haplotype identifier. + size_t haplotype_id; - RecombinatorFragment(gbwt::edge_type from, gbwt::node_type to, size_t haplotype, size_t fragment) : - from(from), to(to), haplotype(haplotype), fragment(fragment) {} + RecombinatorFragment(gbwt::size_type sequence_id, gbwt::edge_type from, gbwt::node_type to, size_t subchain_id, size_t haplotype_id) : + sequence_id(sequence_id), from(from), to(to), subchain_id(subchain_id), haplotype_id(haplotype_id) {} - // Generates the GBWT path and the metadata for the fragment. - void generate( - const gbwt::GBWT& index, + // Generates the GBWT path(s) and the metadata for the fragment. + // Returns the number of paths generated. + size_t generate( + const gbwt::GBWT& index, const gbwt::FragmentMap& fragment_map, gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata, const std::string& contig_name ) const; }; -void RecombinatorFragment::generate( - const gbwt::GBWT& index, +size_t RecombinatorFragment::generate( + const gbwt::GBWT& index, const gbwt::FragmentMap& fragment_map, gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata, const std::string& contig_name ) const { gbwt::vector_type path; - for (gbwt::edge_type curr = this->from; curr.first != gbwt::ENDMARKER; curr = index.LF(curr)) { - path.push_back(curr.first); - if (curr.first == this->to) { + std::string sample = "fragment"; // TODO: Make this a static class variable. + std::string contig = contig_name + "_" + std::to_string(this->subchain_id); + size_t fragment = 0; + auto finish_fragment = [&]() { + if (!path.empty()) { + metadata.add_haplotype(sample, contig, this->haplotype_id, fragment); + builder.insert(path, true); + fragment++; + path.clear(); + } + }; + + size_t sequence = this->sequence_id; + gbwt::edge_type pos = this->from; + while (true) { + if (pos.first == gbwt::ENDMARKER) { + if (this->to == gbwt::ENDMARKER) { + // We are in a suffix subchain, so we are not interested in subsequent fragments. + break; + } + do { + finish_fragment(); + sequence = fragment_map.oriented_next(sequence); + if (sequence == gbwt::invalid_sequence()) { + std::string msg = "RecombinatorFragment::generate(): no successor for GBWT sequence " + std::to_string(this->sequence_id); + throw std::runtime_error(msg); + } + pos = index.start(sequence); + } while (pos.first == gbwt::ENDMARKER); + } + path.push_back(pos.first); + if (pos.first == this->to) { break; } + pos = index.LF(pos); } + finish_fragment(); - std::string sample_name = "fragment"; // TODO: Make this a static class variable. 
- metadata.add_haplotype(sample_name, contig_name, this->haplotype, this->fragment); - builder.insert(path, true); + return fragment; } //------------------------------------------------------------------------------ @@ -1274,8 +1572,13 @@ std::ostream& Recombinator::Statistics::print(std::ostream& out) const { //------------------------------------------------------------------------------ Recombinator::Recombinator(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, Verbosity verbosity) : - gbz(gbz), haplotypes(haplotypes), verbosity(verbosity) + gbz(gbz), haplotypes(haplotypes), fragment_map(gbz.index.metadata, verbosity >= Haplotypes::verbosity_extra_debug), verbosity(verbosity) { + if (this->verbosity >= Haplotypes::verbosity_detailed) { + std::cerr << "Recombinator: " << this->gbz.index.metadata.paths() << " fragments for " << this->fragment_map.size() << " haplotype sequences" << std::endl; + } + + this->jobs_for_cached_paths = this->haplotypes.assign_reference_paths(this->gbz, this->fragment_map, this->verbosity); } //------------------------------------------------------------------------------ @@ -1459,7 +1762,7 @@ gbwt::GBWT Recombinator::generate_haplotypes(const std::string& kff_file, const std::vector> reference_paths(this->haplotypes.jobs()); if (parameters.include_reference) { for (size_t i = 0; i < this->gbz.graph.named_paths.size(); i++) { - size_t job_id = this->haplotypes.jobs_for_cached_paths[i]; + size_t job_id = this->jobs_for_cached_paths[i]; if (job_id < this->haplotypes.jobs()) { reference_paths[job_id].push_back(this->gbz.graph.named_paths[i].id); } @@ -1580,43 +1883,6 @@ std::vector> classify_kmers( return kmer_types; } -std::vector Recombinator::classify_kmers( - const std::string& kff_file, const Recombinator::Parameters& parameters -) const { - // Get kmer counts (may throw) and determine coverage. - hash_map counts = this->haplotypes.kmer_counts(kff_file, this->verbosity); - double coverage = get_or_estimate_coverage(counts, parameters, this->verbosity); - - // Classify the kmers in each subchain. - std::vector classifications; - classifications.reserve(this->haplotypes.kmers()); - for (const auto& chain : this->haplotypes.chains) { - for (const auto& subchain : chain.subchains) { - std::vector> kmer_types = vg::classify_kmers( - subchain, counts, coverage, nullptr, parameters - ); - for (const auto& type : kmer_types) { - switch (type.first) { - case Recombinator::absent: - classifications.push_back('A'); - break; - case Recombinator::heterozygous: - classifications.push_back('H'); - break; - case Recombinator::present: - classifications.push_back('P'); - break; - case Recombinator::frequent: - classifications.push_back('F'); - break; - } - } - } - } - - return classifications; -} - //------------------------------------------------------------------------------ // Select the best pair of haplotypes from the candidates. Each haplotype gets @@ -1628,15 +1894,17 @@ std::vector> select_diploid( const Haplotypes::Subchain& subchain, const std::vector>& candidates, const std::vector>& kmer_types, - size_t original_selected, const Recombinator::Parameters& parameters ) { std::int64_t best_score = std::numeric_limits::min(); - size_t best_left = 0, best_right = 1; + size_t best_left = 0, best_right = 0; + // We now consider taking the same haplotype twice. If this is a bad subchain + // and we sample extra fragments, the number of fragments will depend on + // whether we take the same haplotype twice. 
for (size_t left = 0; left < candidates.size(); left++) { size_t left_offset = candidates[left].first * subchain.kmers.size(); - for (size_t right = left + 1; right < candidates.size(); right++) { + for (size_t right = left; right < candidates.size(); right++) { std::int64_t score = 0; size_t right_offset = candidates[right].first * subchain.kmers.size(); for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) { @@ -1664,7 +1932,7 @@ std::vector> select_diploid( } // If this is a bad subchain, we move the selected haplotypes to the front - // and return the remaining non-duplicated haplotypes as extra fragments. + // and return the rest as extra fragments. // Otherwise we return only the selected haplotypes. if (parameters.extra_fragments) { double badness = subchain.badness(gbz); @@ -1672,7 +1940,7 @@ std::vector> select_diploid( std::vector> result; result.push_back(candidates[best_left]); result.push_back(candidates[best_right]); - for (size_t i = 0; i < original_selected; i++) { + for (size_t i = 0; i < candidates.size(); i++) { if (i != best_left && i != best_right) { result.push_back(candidates[i]); } @@ -1762,19 +2030,17 @@ std::vector> select_haplotypes( } } - // If we did not have enough haplotypes in the subchain, repeat them as necessary. - size_t original_selected = selected_haplotypes.size(); - for (size_t i = original_selected; i < parameters.num_haplotypes; i++) { - auto next = selected_haplotypes[i % original_selected]; - selected_haplotypes.push_back(next); - } - - // Do diploid sampling. If this is a bad subchain, we also return the - // extra fragments starting from the third haplotype. But we don't return - // duplicated ones. if (parameters.diploid_sampling) { - return select_diploid(gbz, subchain, selected_haplotypes, kmer_types, original_selected, parameters); + // Do diploid sampling. If this is a bad subchain, we also return the + // extra fragments starting from the third haplotype. + return select_diploid(gbz, subchain, selected_haplotypes, kmer_types, parameters); } else { + // If we did not have enough haplotypes in the subchain, repeat them as necessary. + size_t original_selected = selected_haplotypes.size(); + for (size_t i = original_selected; i < parameters.num_haplotypes; i++) { + auto next = selected_haplotypes[i % original_selected]; + selected_haplotypes.push_back(next); + } return selected_haplotypes; } } @@ -1788,7 +2054,7 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top size_t final_haplotypes = (parameters.diploid_sampling ? 
2 : parameters.num_haplotypes); std::vector haplotypes; for (size_t i = 0; i < final_haplotypes; i++) { - haplotypes.push_back({ chain.contig_name, i + 1, 0, gbwt::invalid_sequence(), gbwt::invalid_edge(), {} }); + haplotypes.emplace_back(*this, builder, metadata, chain.contig_name, i + 1); } Statistics statistics; @@ -1800,7 +2066,7 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top for (size_t haplotype = 0; haplotype < haplotypes.size(); haplotype++) { assert(!subchain.sequences.empty()); size_t seq = haplotype % subchain.sequences.size(); - haplotypes[haplotype].take(subchain.sequences[seq].first, *this, builder, metadata); + haplotypes[haplotype].take(subchain.sequences[seq].first); } statistics.full_haplotypes = 1; } else { @@ -1820,11 +2086,17 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top ); if (parameters.diploid_sampling && selected_haplotypes.size() > 2) { for (size_t i = 2; i < selected_haplotypes.size(); i++) { - gbwt::edge_type start(subchain.start, subchain.sequences[selected_haplotypes[i].first].second); - extra_fragments.emplace_back(start, subchain.end, i - 1, subchain_id); + gbwt::size_type sequence_id = subchain.sequences[selected_haplotypes[i].first].first; + gbwt::edge_type start; + if (subchain.has_start()) { + start = gbwt::edge_type(subchain.start, subchain.sequences[selected_haplotypes[i].first].second); + } else { + // This is a prefix. + start = this->gbz.index.start(sequence_id); + } + extra_fragments.emplace_back(sequence_id, start, subchain.end, subchain_id, i - 1); } statistics.bad_subchains++; - statistics.extra_fragments += selected_haplotypes.size() - 2; selected_haplotypes.resize(2); } @@ -1864,21 +2136,27 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top size_t selected = haplotype_to_selected[haplotype]; size_t seq_offset = selected_haplotypes[selected].first; statistics.score += selected_haplotypes[selected].second; - haplotypes[haplotype].extend(subchain.sequences[seq_offset], subchain, *this, builder, metadata); + haplotypes[haplotype].extend(subchain.sequences[seq_offset], subchain); } have_haplotypes = subchain.has_end(); statistics.subchains++; } if (have_haplotypes) { for (size_t haplotype = 0; haplotype < haplotypes.size(); haplotype++) { - haplotypes[haplotype].finish(*this, builder, metadata); + haplotypes[haplotype].finish(true); } } - statistics.fragments = haplotypes.front().fragment; + + // Each haplotype should consist of at least one fragment. + statistics.fragments = 0; + for (const RecombinatorHaplotype& haplotype : haplotypes) { + statistics.fragments += haplotype.fragment; + } // Add the extra fragments as separate paths. + statistics.extra_fragments = 0; for (const RecombinatorFragment& fragment : extra_fragments) { - fragment.generate(this->gbz.index, builder, metadata, chain.contig_name); + statistics.extra_fragments += fragment.generate(this->gbz.index, fragment_map, builder, metadata, chain.contig_name); } } @@ -1887,50 +2165,4 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top //------------------------------------------------------------------------------ -std::vector Recombinator::extract_sequences( - const std::string& kff_file, size_t chain_id, size_t subchain_id, const Parameters& parameters -) const { - // Sanity checks. 
- if (chain_id >= this->haplotypes.chains.size()) { - std::string msg = "Recombinator::extract_sequences(): invalid chain id " + std::to_string(chain_id); - throw std::runtime_error(msg); - } - if (subchain_id >= this->haplotypes.chains[chain_id].subchains.size()) { - std::string msg = "Recombinator::extract_sequences(): invalid subchain id " + std::to_string(subchain_id) + - " in chain " + std::to_string(chain_id); - throw std::runtime_error(msg); - } - recombinator_sanity_checks(parameters); - - // Extract the haplotypes. - const Haplotypes::Subchain& subchain = this->haplotypes.chains[chain_id].subchains[subchain_id]; - std::vector result(subchain.sequences.size()); - for (size_t i = 0; i < subchain.sequences.size(); i++) { - size_t path_id = gbwt::Path::id(subchain.sequences[i].first); - path_handle_t path_handle = this->gbz.graph.path_to_handle(path_id); - result[i].name = this->gbz.graph.get_path_name(path_handle); - - gbwt::edge_type pos; - if (subchain.has_start()) { - pos = gbwt::edge_type(subchain.start, subchain.sequences[i].second); - } else { - pos = this->gbz.index.start(subchain.sequences[i].first); - } - handle_t until = gbwtgraph::GBWTGraph::node_to_handle(subchain.end); - size_t limit = std::numeric_limits::max(); - result[i].sequence = generate_haplotype(pos, until, limit, limit, this->gbz.graph); - } - - // Get kmer counts (may throw) and determine coverage. - hash_map counts = this->haplotypes.kmer_counts(kff_file, this->verbosity); - double coverage = get_or_estimate_coverage(counts, parameters, this->verbosity); - - // Fill in the scores. - select_haplotypes(this->gbz, subchain, counts, coverage, nullptr, &result, parameters); - - return result; -} - -//------------------------------------------------------------------------------ - } // namespace vg diff --git a/src/recombinator.hpp b/src/recombinator.hpp index b8969917fd..6799ee8f66 100644 --- a/src/recombinator.hpp +++ b/src/recombinator.hpp @@ -1,7 +1,7 @@ #ifndef VG_RECOMBINATOR_HPP_INCLUDED #define VG_RECOMBINATOR_HPP_INCLUDED -/** \file +/** \file recombinator.hpp * Tools for generating synthetic haplotypes as recombinations of existing * haplotypes. */ @@ -37,6 +37,9 @@ namespace vg { * * Versions: * + * * Version 4: Subchains can have fragmented haplotypes instead of a single + * GBWT sequence always crossing from start to end. Compatible with version 3. + * * * Version 3: Subchains use smaller integers when possible. Compatible with * version 2. * @@ -58,13 +61,16 @@ class Haplotypes { verbosity_detailed = 2, /// Basic information, detailed statistics, and debug information. - verbosity_debug = 3 + verbosity_debug = 3, + + /// Hidden level; potentially tens of thousands of lines of debugging information. + verbosity_extra_debug = 4 }; /// Header of the serialized file. struct Header { constexpr static std::uint32_t MAGIC_NUMBER = 0x4C504148; // "HAPL" - constexpr static std::uint32_t VERSION = 3; + constexpr static std::uint32_t VERSION = 4; constexpr static std::uint32_t MIN_VERSION = 1; constexpr static std::uint64_t DEFAULT_K = 29; @@ -132,7 +138,7 @@ class Haplotypes { /// Sequences as (GBWT sequence id, offset in the relevant node). std::vector sequences; - // TODO v3: Use an extra bit for each sequence to mark whether the presence for that sequence + // TODO v5: Use an extra bit for each sequence to mark whether the presence for that sequence // is stored explicitly or relative to the last explicit sequence. // We need to cluster the sequences by similarity and store the clusters consecutively. 
// And then use sd_vector for the sequences with relative presence. @@ -231,7 +237,9 @@ class Haplotypes { Header header; + // TODO v5: Remove this. // Job ids for each cached path in the GBWTGraph, or `jobs()` if the path is empty. + // This is no longer in use. std::vector jobs_for_cached_paths; std::vector chains; @@ -254,6 +262,14 @@ class Haplotypes { /// Returns the size of the object in elements. size_t simple_sds_size() const; + + /** + * Assigns each reference and generic path in the graph to a GBWT construction job. + * + * For each path handle from 0 to gbz.named_paths() - 1, we assign the path to + * the given construction job, or jobs() if the path is empty. + */ + std::vector assign_reference_paths(const gbwtgraph::GBZ& gbz, const gbwt::FragmentMap& fragment_map, Verbosity verbosity) const; }; //------------------------------------------------------------------------------ @@ -355,10 +371,12 @@ class HaplotypePartitioner { * Each top-level chain is partitioned into subchains that consist of one or * more snarls. Multiple snarls are combined into the same subchain if the * minimum distance over the subchain is at most the target length and there - * are GBWT haplotypes that cross the subchain. We also keep extending the - * subchain if a haplotype would cross the end in both directions. By doing - * this, we can avoid sequence loss with haplotypes reversing their direction, - * while keeping kmers specific to each subchain. + * are GBWT haplotypes that cross the subchain. + * + * With the right option, we keep extending the subchain if a haplotype would + * cross the end in both directions. By doing this, we can avoid sequence loss + * with haplotypes reversing their direction, while keeping kmers specific to + * each subchain. * * If there are no snarls in a top-level chain, it is represented as a single * subchain without boundary nodes. @@ -372,6 +390,7 @@ class HaplotypePartitioner { Haplotypes partition_haplotypes(const Parameters& parameters) const; const gbwtgraph::GBZ& gbz; + gbwt::FragmentMap fragment_map; const gbwt::FastLocate& r_index; const SnarlDistanceIndex& distance_index; const minimizer_index_type& minimizer_index; @@ -388,29 +407,34 @@ class HaplotypePartitioner { // Partition the top-level chain into subchains. std::vector get_subchains(const gbwtgraph::TopLevelChain& chain, const Parameters& parameters) const; - // Return (SA[i], i) for all GBWT sequences visiting a handle, sorted by sequence id - // and the number of the visit. - std::vector get_sequence_visits(handle_t handle) const; - - // Return (DA[i], i) for all GBWT sequences visiting a handle, sorted by sequence id. + // Return (DA[i], i) for all GBWT sequences visiting a handle, sorted by sequence id + // and the rank of the visit for the same sequence. std::vector get_sequences(handle_t handle) const; - // Get all GBWT sequences crossing the subchain. The sequences will be at - // start for normal subchains and suffixes and at end for prefixes. + // Get all GBWT sequences crossing the subchain. + // + // * If the subchain is a prefix (suffix), the sequences will be at the end + // (start) of the subchain. + // * If the subchain is normal, the sequences will be at the start and + // correspond to minimal end-to-end visits to the subchain. A sequence + // that ends within the subchain may be selected if subsequent fragments + // of the same haplotype remain within the subchain and reach the end. 
std::vector get_sequences(Subchain subchain) const; // Return the sorted set of kmers that are minimizers in the sequence and have // a single occurrence in the graph. std::vector unique_minimizers(gbwt::size_type sequence_id) const; - // Count the number of minimizers in the sequence over the subchain with a single - // occurrence in the graph. Return the sorted set of kmers that are minimizers in - // the sequence over the subchain and have a single occurrence in the graph. + // Returns the sorted set of kmers that are minimizers in the sequence over the + // subchain and have a single occurrence in the graph. If the sequence does not + // reach the end of the subchain, this will try to continue with the next fragment(s). + // + // Also reports the number of fragments that were used to generate the kmers. // // To avoid using kmers shared between all haplotypes in the subchain, and // potentially with neighboring subchains, this does not include kmers contained // entirely in the shared initial/final nodes. - std::vector unique_minimizers(sequence_type sequence, Subchain subchain) const; + std::vector unique_minimizers(sequence_type sequence, Subchain subchain, size_t& fragments) const; // Build subchains for a specific top-level chain. void build_subchains(const gbwtgraph::TopLevelChain& chain, Haplotypes::TopLevelChain& output, const Parameters& parameters) const; @@ -470,10 +494,11 @@ class Recombinator { /// Number of subchains exceeding the badness threshold. size_t bad_subchains = 0; - /// Number of fragments. + /// Total number of fragments in the generated haplotypes. size_t fragments = 0; /// Number of top-level chains where full haplotypes were taken. + /// These are not counted as fragments. size_t full_haplotypes = 0; /// Number of haplotypes generated. @@ -482,7 +507,7 @@ class Recombinator { /// Number of additional haplotype fragments in bad subchains. size_t extra_fragments = 0; - /// Number of times a haplotype was extended from a subchain to the next subchain. + /// Number of times the same haplotype was extended from a subchain to the next subchain. size_t connections = 0; /// Number of reference paths included. @@ -549,7 +574,7 @@ class Recombinator { /// Include named and reference paths. bool include_reference = false; - // TODO: Should be use extra_fragments? + // TODO: Should we use extra_fragments? /// Preset parameters for common use cases. enum preset_t { /// Default parameters. @@ -574,9 +599,10 @@ class Recombinator { * (component). * * Each generated haplotype has a single source haplotype in each subchain. - * The subchains are connected by unary paths. Suffix / prefix subchains in - * the middle of a chain create fragment breaks. If the chain starts without - * a prefix (ends without a suffix), the haplotype chosen for the first (last) + * The source haplotype may consist of multiple fragments. Subchains are + * connected by unary paths. Suffix / prefix subchains in the middle of a chain create + * fragment breaks in every haplotype. If the chain starts without a prefix + * (ends without a suffix), the haplotype chosen for the first (last) + * subchain is used from the start (continued until the end). * * Throws `std::runtime_error` on error in single-threaded parts and exits @@ -600,32 +626,15 @@ class Recombinator { /// Kmer classification. enum kmer_presence { absent, heterozygous, present, frequent }; - /** - * Classifies the kmers used for describing the haplotypes according to - * their frequency in the KFF file.
Uses `A`, `H`, `P`, and `F` to represent - * absent, heterozygous, present, and frequent kmers, respectively. - * - * Throws `std::runtime_error` on error. - */ - std::vector classify_kmers(const std::string& kff_file, const Parameters& parameters) const; - - /** - * Extracts the local haplotypes in the given subchain. In addition to the - * haplotype sequence, this also reports the name of the corresponding path - * as well as (rank, score) for the haplotype in each round of haplotype - * selection. The number of rounds is `parameters.num_haplotypes`, but if - * the haplotype is selected earlier, it will not get further scores. - * - * Throws `std::runtime_error` on error. - */ - std::vector extract_sequences( - const std::string& kff_file, size_t chain_id, size_t subchain_id, const Parameters& parameters - ) const; - const gbwtgraph::GBZ& gbz; const Haplotypes& haplotypes; + gbwt::FragmentMap fragment_map; Verbosity verbosity; + // This used to be in the Haplotypes object. For each path handle from 0 to gbz.named_paths() - 1, + // this stores the job id for the corresponding generic/reference path. + std::vector jobs_for_cached_paths; + private: // Generate haplotypes for the given chain. Statistics generate_haplotypes(const Haplotypes::TopLevelChain& chain, diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index 4eca4e47a8..e7d2609187 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -158,7 +158,7 @@ void validate_gbwt_config(GBWTConfig& config); void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); void step_2_merge_gbwts(GBWTHandler& gbwts, GBWTConfig& config); -void step_3_alter_gbwt(GBWTHandler& gbwts, GBWTConfig& config); +void step_3_alter_gbwt(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); void step_4_path_cover(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); void step_5_gbwtgraph(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); void step_6_r_index(GBWTHandler& gbwts, GBWTConfig& config); @@ -195,7 +195,7 @@ int main_gbwt(int argc, char** argv) { // Edit the GBWT (remove samples, apply tags). if (!config.to_remove.empty() || !config.tags_to_set.empty()) { - step_3_alter_gbwt(gbwts, config); + step_3_alter_gbwt(gbwts, graphs, config); } // Path cover construction. @@ -1501,28 +1501,38 @@ void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) { report_time_memory("Samples removed", start, config); } -void set_tags(GBWTHandler& gbwts, GBWTConfig& config) { +void set_tags(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config) { double start = gbwt::readTimer(); if (config.show_progress) { std::cerr << "Setting " << config.tags_to_set.size() << " tags on the GBWT" << std::endl; } gbwts.use_compressed(); + bool set_reference_samples = false; for (auto& kv : config.tags_to_set) { - gbwts.compressed.tags.set(kv.first, kv.second); + gbwts.compressed.tags.set(kv.first, kv.second); + if (kv.first == gbwtgraph::REFERENCE_SAMPLE_LIST_GBWT_TAG) { + set_reference_samples = true; + } } // We modified the GBWT (we assume some tags got set) gbwts.unbacked(); - + + // If we updated reference samples and have an existing GBWTGraph, we need to recache the named paths. + // This is because we may want to include the reference paths in a path cover GBWT. 
+ if (set_reference_samples && (graphs.in_use == GraphHandler::graph_gbwtgraph || graphs.in_use == GraphHandler::graph_gbz)) { + graphs.gbwt_graph->set_gbwt(gbwts.compressed); + } + report_time_memory("Tags set", start, config); } -void step_3_alter_gbwt(GBWTHandler& gbwts, GBWTConfig& config) { +void step_3_alter_gbwt(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config) { if (!config.to_remove.empty()) { remove_samples(gbwts, config); } if (!config.tags_to_set.empty()) { - set_tags(gbwts, config); + set_tags(gbwts, graphs, config); } } diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 05a204fdf8..e1395d8e64 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -665,7 +665,12 @@ std::string strip_suffixes(std::string filename, const std::vector& } // Returns the name of the sampled GBZ. -string sample_haplotypes(const vector>& indexes, string& basename, string& sample_name, string& haplotype_file, string& kff_file, bool progress); +std::string sample_haplotypes( + const std::vector>& indexes, + const std::unordered_set& reference_samples, + const std::string& basename, const std::string& sample_name, const std::string& haplotype_file, const std::string& kff_file, + bool progress +); //---------------------------------------------------------------------------- @@ -711,7 +716,8 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map reference_samples; string output_basename; string report_name; @@ -1116,6 +1124,7 @@ int main_giraffe(int argc, char** argv) { {"haplotype-name", required_argument, 0, OPT_HAPLOTYPE_NAME}, {"kff-name", required_argument, 0, OPT_KFF_NAME}, {"index-basename", required_argument, 0, OPT_INDEX_BASENAME}, + {"set-reference", required_argument, 0, OPT_SET_REFERENCE}, {"gam-in", required_argument, 0, 'G'}, {"fastq-in", required_argument, 0, 'f'}, {"interleaved", no_argument, 0, 'i'}, @@ -1176,11 +1185,11 @@ int main_giraffe(int argc, char** argv) { { case 'Z': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide GBZ file with -Z." << endl; + cerr << "error: [vg giraffe] Must provide GBZ file with -Z." << endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open GBZ file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open GBZ file " << optarg << endl; exit(1); } provided_indexes.emplace_back("Giraffe GBZ", optarg); @@ -1193,11 +1202,11 @@ int main_giraffe(int argc, char** argv) { case 'x': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide graph file with -x." << endl; + cerr << "error: [vg giraffe] Must provide graph file with -x." << endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open graph file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open graph file " << optarg << endl; exit(1); } provided_indexes.emplace_back("XG", optarg); @@ -1210,11 +1219,11 @@ int main_giraffe(int argc, char** argv) { case 'g': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide GBWTGraph file with -g." << endl; + cerr << "error: [vg giraffe] Must provide GBWTGraph file with -g." 
<< endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open GBWTGraph file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open GBWTGraph file " << optarg << endl; exit(1); } provided_indexes.emplace_back("GBWTGraph", optarg); @@ -1227,11 +1236,11 @@ int main_giraffe(int argc, char** argv) { case 'H': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide GBWT file with -H." << endl; + cerr << "error: [vg giraffe] Must provide GBWT file with -H." << endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open GBWT file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open GBWT file " << optarg << endl; exit(1); } provided_indexes.emplace_back("Giraffe GBWT", optarg); @@ -1239,11 +1248,11 @@ int main_giraffe(int argc, char** argv) { case 'm': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide minimizer file with -m." << endl; + cerr << "error: [vg giraffe] Must provide minimizer file with -m." << endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open minimizer file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open minimizer file " << optarg << endl; exit(1); } provided_indexes.emplace_back("Long Read Minimizers", optarg); @@ -1252,11 +1261,11 @@ int main_giraffe(int argc, char** argv) { case 'z': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide zipcode index file with -z." << endl; + cerr << "error: [vg giraffe] Must provide zipcode index file with -z." << endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open zipcode index file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open zipcode index file " << optarg << endl; exit(1); } provided_indexes.emplace_back("Long Read Zipcodes", optarg); @@ -1264,11 +1273,11 @@ int main_giraffe(int argc, char** argv) { break; case 'd': if (!optarg || !*optarg) { - cerr << "error:[vg giraffe] Must provide distance index file with -d." << endl; + cerr << "error: [vg giraffe] Must provide distance index file with -d." << endl; exit(1); } if (!std::ifstream(optarg).is_open()) { - cerr << "error:[vg giraffe] Couldn't open distance index file " << optarg << endl; + cerr << "error: [vg giraffe] Couldn't open distance index file " << optarg << endl; exit(1); } provided_indexes.emplace_back("Giraffe Distance Index", optarg); @@ -1291,7 +1300,7 @@ int main_giraffe(int argc, char** argv) { case 'G': gam_filename = optarg; if (gam_filename.empty()) { - cerr << "error:[vg giraffe] Must provide GAM file with -G." << endl; + cerr << "error: [vg giraffe] Must provide GAM file with -G." << endl; exit(1); } break; @@ -1300,19 +1309,19 @@ int main_giraffe(int argc, char** argv) { if (fastq_filename_1.empty()) { fastq_filename_1 = optarg; if (fastq_filename_1.empty()) { - cerr << "error:[vg giraffe] Must provide FASTQ file with -f." << endl; + cerr << "error: [vg giraffe] Must provide FASTQ file with -f." << endl; exit(1); } } else if (fastq_filename_2.empty()) { fastq_filename_2 = optarg; if (fastq_filename_2.empty()) { - cerr << "error:[vg giraffe] Must provide FASTQ file with -f." << endl; + cerr << "error: [vg giraffe] Must provide FASTQ file with -f." << endl; exit(1); } paired = true; } else { - cerr << "error:[vg giraffe] Cannot specify more than two FASTQ files." << endl; + cerr << "error: [vg giraffe] Cannot specify more than two FASTQ files." 
<< endl; exit(1); } break; @@ -1366,7 +1375,11 @@ int main_giraffe(int argc, char** argv) { case OPT_OUTPUT_BASENAME: output_basename = optarg; break; - + + case OPT_SET_REFERENCE: + reference_samples.insert(optarg); + break; + case OPT_REPORT_NAME: report_name = optarg; break; @@ -1442,7 +1455,7 @@ int main_giraffe(int argc, char** argv) { { int num_threads = parse(optarg); if (num_threads <= 0) { - cerr << "error:[vg giraffe] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + cerr << "error: [vg giraffe] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; exit(1); } omp_set_num_threads(num_threads); @@ -1470,7 +1483,7 @@ int main_giraffe(int argc, char** argv) { fasta_parts = split_ext(fasta_parts.first); } if (fasta_parts.second != "fa" && fasta_parts.second != "fasta" && fasta_parts.second != "fna") { - cerr << "error:[vg giraffe] FASTA file " << fasta_filename << " is not named like a FASTA" << endl; + cerr << "error: [vg giraffe] FASTA file " << fasta_filename << " is not named like a FASTA" << endl; exit(1); } @@ -1489,7 +1502,7 @@ int main_giraffe(int argc, char** argv) { vcf_parts = split_ext(vcf_parts.first); } if (vcf_parts.second != "vcf") { - cerr << "error:[vg giraffe] VCF file " << vcf_filename << " is not named like a VCF" << endl; + cerr << "error: [vg giraffe] VCF file " << vcf_filename << " is not named like a VCF" << endl; exit(1); } @@ -1514,33 +1527,33 @@ int main_giraffe(int argc, char** argv) { bool hts_output = (output_format == "SAM" || output_format == "BAM" || output_format == "CRAM"); if (!ref_paths_name.empty() && !hts_output) { - cerr << "warning:[vg giraffe] Reference path file (--ref-paths) is only used when output format (-o) is SAM, BAM, or CRAM." << endl; + cerr << "warning: [vg giraffe] Reference path file (--ref-paths) is only used when output format (-o) is SAM, BAM, or CRAM." << endl; ref_paths_name = ""; } if (output_format != "GAM" && !output_basename.empty()) { - cerr << "error:[vg giraffe] Using an output basename (--output-basename) only makes sense for GAM format (-o)" << endl; + cerr << "error: [vg giraffe] Using an output basename (--output-basename) only makes sense for GAM format (-o)" << endl; exit(1); } if (interleaved && !fastq_filename_2.empty()) { - cerr << "error:[vg giraffe] Cannot designate both interleaved paired ends (-i) and separate paired end file (-f)." << endl; + cerr << "error: [vg giraffe] Cannot designate both interleaved paired ends (-i) and separate paired end file (-f)." << endl; exit(1); } if (!fastq_filename_1.empty() && !gam_filename.empty()) { - cerr << "error:[vg giraffe] Cannot designate both FASTQ input (-f) and GAM input (-G) in same run." << endl; + cerr << "error: [vg giraffe] Cannot designate both FASTQ input (-f) and GAM input (-G) in same run." << endl; exit(1); } if (have_input_file(optind, argc, argv)) { // TODO: work out how to interpret additional files as reads. - cerr << "error:[vg giraffe] Extraneous input file: " << get_input_file_name(optind, argc, argv) << endl; + cerr << "error: [vg giraffe] Extraneous input file: " << get_input_file_name(optind, argc, argv) << endl; exit(1); } if ((forced_mean && ! 
forced_stdev) || (!forced_mean && forced_stdev)) { - cerr << "warning:[vg giraffe] Both a mean and standard deviation must be specified for the fragment length distribution" << endl; + cerr << "warning: [vg giraffe] Both a mean and standard deviation must be specified for the fragment length distribution" << endl; cerr << " Detecting fragment length distribution automatically" << endl; forced_mean = false; forced_stdev = false; @@ -1548,12 +1561,12 @@ int main_giraffe(int argc, char** argv) { fragment_stdev = 0.0; } if ((forced_mean || forced_stdev || forced_rescue_attempts) && (!paired)) { - cerr << "warning:[vg giraffe] Attempting to set paired-end parameters but running in single-end mode" << endl; + cerr << "warning: [vg giraffe] Attempting to set paired-end parameters but running in single-end mode" << endl; } if (parser->get_option_value("align-from-chains") && paired) { // TODO: Implement chaining for paired-end alignment - cerr << "error:[vg giraffe] Paired-end alignment is not yet implemented for --align-from-chains or chaining-based presets" << endl; + cerr << "error: [vg giraffe] Paired-end alignment is not yet implemented for --align-from-chains or chaining-based presets" << endl; exit(1); } @@ -1563,7 +1576,7 @@ int main_giraffe(int argc, char** argv) { } if (haplotype_sampling) { // If we do haplotype sampling, we get a new GBZ and later build indexes for it. - string gbz_name = sample_haplotypes(provided_indexes, index_basename, sample_name, haplotype_name, kff_name, show_progress); + string gbz_name = sample_haplotypes(provided_indexes, reference_samples, index_basename, sample_name, haplotype_name, kff_name, show_progress); registry.provide("Giraffe GBZ", gbz_name); index_basename = split_ext(gbz_name).first; } else { @@ -1604,7 +1617,7 @@ int main_giraffe(int argc, char** argv) { // A file with the appropriate name exists and we can read it. if (haplotype_sampling) { // If we did haplotype sampling, we are going to overwrite existing indexes. - cerr << "warning:[vg giraffe] " << inferred_filename << " exists and will be overwritten" << endl; + cerr << "warning: [vg giraffe] " << inferred_filename << " exists and will be overwritten" << endl; } else { // Report it because this may not be desired behavior. cerr << "Guessing that " << inferred_filename << " is " << index_and_extensions.first << endl; @@ -1639,7 +1652,7 @@ int main_giraffe(int argc, char** argv) { registry.make_indexes(index_targets); } catch (InsufficientInputException ex) { - cerr << "error:[vg giraffe] Input is not sufficient to create indexes" << endl; + cerr << "error: [vg giraffe] Input is not sufficient to create indexes" << endl; cerr << ex.what(); return 1; } @@ -2226,12 +2239,12 @@ int main_giraffe(int argc, char** argv) { long long thread_instructions; if (read(perf_fd, &thread_instructions, sizeof(long long)) != sizeof(long long)) { // Read failed for some reason. 
- cerr << "warning:[vg giraffe] Could not count CPU instructions executed" << endl; + cerr << "warning: [vg giraffe] Could not count CPU instructions executed" << endl; thread_instructions = 0; } if (close(perf_fd)) { int problem = errno; - cerr << "warning:[vg giraffe] Error closing perf event instruction counter: " << strerror(problem) << endl; + cerr << "warning: [vg giraffe] Error closing perf event instruction counter: " << strerror(problem) << endl; } total_instructions += thread_instructions; } @@ -2285,15 +2298,19 @@ int main_giraffe(int argc, char** argv) { //---------------------------------------------------------------------------- -string sample_haplotypes(const vector>& indexes, string& basename, string& sample_name, string& haplotype_file, string& kff_file, bool progress) { - +std::string sample_haplotypes( + const std::vector>& indexes, + const std::unordered_set& reference_samples, + const std::string& basename, const std::string& sample_name, const std::string& haplotype_file, const std::string& kff_file, + bool progress +) { if (progress) { std::cerr << "Sampling haplotypes" << std::endl; } // Sanity checks. if (haplotype_file.empty() || kff_file.empty()) { - std::cerr << "error:[vg giraffe] Haplotype sampling requires --haplotype-name and --kff-name." << std::endl; + std::cerr << "error: [vg giraffe] Haplotype sampling requires --haplotype-name and --kff-name." << std::endl; std::exit(EXIT_FAILURE); } @@ -2306,7 +2323,7 @@ string sample_haplotypes(const vector>& indexes, string& ba } } if (sample == "giraffe") { - std::cerr << "warning:[vg giraffe] Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; + std::cerr << "warning: [vg giraffe] Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; } std::string output_name = basename + "." + sample + ".gbz"; @@ -2319,10 +2336,21 @@ string sample_haplotypes(const vector>& indexes, string& ba } else if (indexes.size() == 2 && indexes[0].first == "GBWTGraph" && indexes[1].first == "Giraffe GBWT") { load_gbz(gbz, indexes[1].second, indexes[0].second, progress); } else { - std::cerr << "error:[vg giraffe] Haplotype sampling requires either -Z or -g and -H with no other indexes." << std::endl; + std::cerr << "error: [vg giraffe] Haplotype sampling requires either -Z or (-g and -H) with no other indexes." << std::endl; std::exit(EXIT_FAILURE); } + // Override reference samples if necessary. + if (!reference_samples.empty()) { + if (progress) { + std::cerr << "Updating reference samples" << std::endl; + } + size_t present = gbz.set_reference_samples(reference_samples); + if (present != reference_samples.size()) { + std::cerr << "warning: [vg giraffe] Only " << present << " out of " << reference_samples.size() << " reference samples are present" << std::endl; + } + } + // Load haplotype information. 
if (progress) { std::cerr << "Loading haplotype information from " << haplotype_file << std::endl; @@ -2338,7 +2366,7 @@ string sample_haplotypes(const vector>& indexes, string& ba try { sampled_gbwt = recombinator.generate_haplotypes(kff_file, parameters); } catch (const std::runtime_error& e) { - std::cerr << "error:[vg giraffe] Haplotype sampling failed: " << e.what() << std::endl; + std::cerr << "error: [vg giraffe] Haplotype sampling failed: " << e.what() << std::endl; std::exit(EXIT_FAILURE); } diff --git a/src/subcommand/haplotypes_main.cpp b/src/subcommand/haplotypes_main.cpp index ec7a9f97bf..3a73d928d2 100644 --- a/src/subcommand/haplotypes_main.cpp +++ b/src/subcommand/haplotypes_main.cpp @@ -2,19 +2,14 @@ * * Defines the "vg haplotypes" subcommand, which samples haplotypes by kmer counts in the reads. * - * TODO: - * - * 1. Option to sample non-contiguous haplotypes. This may be important in large - * snarls. Select a suffix of a fragment, all fragments fully within the snarl, - * and the prefix of the fragment exiting the snarl. - * - * 2. Tests for --linear-structure, --extra-fragments, and #1. + * TODO: Tests for --linear-structure, --extra-fragments, and fragmented haplotypes. */ #include "subcommand.hpp" #include "../hash_map.hpp" #include "../recombinator.hpp" +#include "../algorithms/extract_subchain.hpp" #include #include @@ -95,9 +90,6 @@ struct HaplotypesConfig { mode_sample_graph, mode_preprocess, mode_sample_haplotypes, - mode_map_variants, - mode_extract, - mode_classify, mode_statistics, }; @@ -106,21 +98,15 @@ struct HaplotypesConfig { // File names. std::string graph_name; - std::string gbz_output, haplotype_output, score_output, kmer_output; + std::string gbz_output, haplotype_output; std::string distance_name, r_index_name; - std::string haplotype_input, kmer_input, vcf_input; + std::string haplotype_input, kmer_input; // Computational parameters. size_t k = haplotypes_defaults::k(), w = haplotypes_defaults::w(); HaplotypePartitioner::Parameters partitioner_parameters; Recombinator::Parameters recombinator_parameters; - - // A prefix to add to VCF contig names to get GBWT contig names. - std::string contig_prefix; - - // For extracting local haplotypes. - size_t chain_id = std::numeric_limits::max(); - size_t subchain_id = std::numeric_limits::max(); + std::unordered_set reference_samples; // Overrides those specified in the graph. // For subchain statistics. std::string ref_sample; @@ -134,13 +120,9 @@ struct HaplotypesConfig { void preprocess_graph(const gbwtgraph::GBZ& gbz, Haplotypes& haplotypes, HaplotypesConfig& config); -void sample_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); +void set_reference_samples(gbwtgraph::GBZ& gbz, const HaplotypesConfig& config); -void map_variants(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); - -void extract_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); - -void classify_kmers(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); +void sample_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); void subchain_statistics(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); @@ -185,24 +167,13 @@ int main_haplotypes(int argc, char** argv) { // Sample the haplotypes. 
if (config.mode == HaplotypesConfig::mode_sample_graph || config.mode == HaplotypesConfig::mode_sample_haplotypes) { + // Update reference samples if necessary. + if (!config.reference_samples.empty()) { + set_reference_samples(gbz, config); + } sample_haplotypes(gbz, haplotypes, config); } - // Map variants to subchains. - if (config.mode == HaplotypesConfig::mode_map_variants) { - map_variants(gbz, haplotypes, config); - } - - // Extract local haplotypes in FASTA format. - if (config.mode == HaplotypesConfig::mode_extract) { - extract_haplotypes(gbz, haplotypes, config); - } - - // Classify kmers. - if (config.mode == HaplotypesConfig::mode_classify) { - classify_kmers(gbz, haplotypes, config); - } - // Output statistics on subchain lengths and kmer counts. if (config.mode == HaplotypesConfig::mode_statistics) { subchain_statistics(gbz, haplotypes, config); @@ -227,9 +198,6 @@ void help_haplotypes(char** argv, bool developer_options) { std::cerr << usage << "-H output.hapl graph.gbz" << std::endl; std::cerr << usage << "-i graph.hapl -k kmers.kff -g output.gbz graph.gbz" << std::endl; if (developer_options) { - std::cerr << usage << "-i graph.hapl --vcf-input variants.vcf graph.gbz > output.tsv" << std::endl; - std::cerr << usage << "-i graph.hapl -k kmers.kff --extract M:N graph.gbz > output.fa" << std::endl; - std::cerr << usage << "-i graph.hapl -k kmers.kff --classify output.kmers graph.gbz" << std::endl; std::cerr << usage << "-i graph.hapl --statistics ref_sample graph.gbz > output.tsv" << std::endl; } std::cerr << std::endl; @@ -265,6 +233,7 @@ void help_haplotypes(char** argv, bool developer_options) { std::cerr << " --extra-fragments in diploid sampling, select all candidates in bad subchains" << std::endl; std::cerr << " --badness F threshold for the badness of a subchain (default: " << haplotypes_defaults::badness() << ")" << std::endl; std::cerr << " --include-reference include named and reference paths in the output" << std::endl; + std::cerr << " --set-reference X use sample X as a reference sample (may repeat)" << std::endl; std::cerr << std::endl; std::cerr << "Other options:" << std::endl; std::cerr << " -v, --verbosity N verbosity level (0 = silent, 1 = basic, 2 = detailed, 3 = debug; default: 0)" << std::endl; @@ -273,11 +242,6 @@ void help_haplotypes(char** argv, bool developer_options) { if (developer_options) { std::cerr << "Developer options:" << std::endl; std::cerr << " --validate validate the generated information (may be slow)" << std::endl; - std::cerr << " --vcf-input X map the variants in VCF file X to subchains" << std::endl; - std::cerr << " --contig-prefix X a prefix for transforming VCF contig names into GBWT contig names" << std::endl; - std::cerr << " --extract M:N extract haplotypes in chain M, subchain N in FASTA format" << std::endl; - std::cerr << " --score-output X write haplotype scores to X (use with --extract)" << std::endl; - std::cerr << " --classify X classify kmers and write output to X" << std::endl; std::cerr << " --statistics X output subchain statistics over reference sample X" << std::endl; std::cerr << std::endl; } @@ -301,13 +265,9 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { constexpr int OPT_EXTRA_FRAGMENTS = 1308; constexpr int OPT_BADNESS = 1309; constexpr int OPT_INCLUDE_REFERENCE = 1310; + constexpr int OPT_SET_REFERENCE = 1311; constexpr int OPT_VALIDATE = 1400; - constexpr int OPT_VCF_INPUT = 1500; - constexpr int OPT_CONTIG_PREFIX = 1501; - constexpr int OPT_EXTRACT = 1600; - constexpr 
int OPT_SCORE_OUTPUT = 1601; - constexpr int OPT_CLASSIFY = 1602; - constexpr int OPT_STATISTICS = 1603; + constexpr int OPT_STATISTICS = 1500; static struct option long_options[] = { @@ -332,14 +292,10 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { { "extra-fragments", no_argument, 0, OPT_EXTRA_FRAGMENTS }, { "badness", required_argument, 0, OPT_BADNESS }, { "include-reference", no_argument, 0, OPT_INCLUDE_REFERENCE }, + { "set-reference", required_argument, 0, OPT_SET_REFERENCE }, { "verbosity", required_argument, 0, 'v' }, { "threads", required_argument, 0, 't' }, { "validate", no_argument, 0, OPT_VALIDATE }, - { "vcf-input", required_argument, 0, OPT_VCF_INPUT }, - { "contig-prefix", required_argument, 0, OPT_CONTIG_PREFIX }, - { "extract", required_argument, 0, OPT_EXTRACT }, - { "score-output", required_argument, 0, OPT_SCORE_OUTPUT }, - { "classify", required_argument, 0, OPT_CLASSIFY }, { "statistics", required_argument, 0, OPT_STATISTICS }, { 0, 0, 0, 0 } }; @@ -468,6 +424,9 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { case OPT_INCLUDE_REFERENCE: this->recombinator_parameters.include_reference = true; break; + case OPT_SET_REFERENCE: + this->reference_samples.insert(optarg); + break; case 'v': { @@ -491,30 +450,6 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { case OPT_VALIDATE: this->validate = true; break; - case OPT_VCF_INPUT: - this->vcf_input = optarg; - break; - case OPT_CONTIG_PREFIX: - this->contig_prefix = optarg; - break; - case OPT_EXTRACT: - { - std::string arg = optarg; - size_t offset = arg.find(':'); - if (offset == 0 || offset == std::string::npos || offset + 1 >= arg.length()) { - std::cerr << "error: [vg haplotypes] cannot parse chain:subchain from " << arg << std::endl; - std::exit(EXIT_FAILURE); - } - this->chain_id = parse(arg.substr(0, offset)); - this->subchain_id = parse(arg.substr(offset + 1)); - } - break; - case OPT_SCORE_OUTPUT: - this->score_output = optarg; - break; - case OPT_CLASSIFY: - this->kmer_output = optarg; - break; case OPT_STATISTICS: this->ref_sample = optarg; break; @@ -540,13 +475,6 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { this->mode = mode_preprocess; } else if (!this->haplotype_input.empty() && !this->kmer_input.empty() && !this->gbz_output.empty()) { this->mode = mode_sample_haplotypes; - } else if (!this->haplotype_input.empty() && !this->vcf_input.empty()) { - this->mode = mode_map_variants; - } else if (!this->haplotype_input.empty() && !this->kmer_input.empty() && - this->chain_id < std::numeric_limits::max() && this->subchain_id < std::numeric_limits::max()) { - this->mode = mode_extract; - } else if (!this->haplotype_input.empty() && !this->kmer_input.empty() && !this->kmer_output.empty()) { - this->mode = mode_classify; } else if (!this->haplotype_input.empty() && !this->ref_sample.empty()) { this->mode = mode_statistics; } @@ -655,6 +583,33 @@ void preprocess_graph(const gbwtgraph::GBZ& gbz, Haplotypes& haplotypes, Haploty //---------------------------------------------------------------------------- +void set_reference_samples(gbwtgraph::GBZ& gbz, const HaplotypesConfig& config) { + omp_set_num_threads(config.threads); + if (config.verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Updating reference samples" << std::endl; + } + if (config.verbosity >= Haplotypes::verbosity_debug) { + std::cerr << "Reference samples:"; + for (const std::string& sample : 
config.reference_samples) { + std::cerr << " " << sample; + } + std::cerr << std::endl; + } + + double start = gbwt::readTimer(); + size_t present = gbz.set_reference_samples(config.reference_samples); + if (present < config.reference_samples.size()) { + std::cerr << "warning: [vg haplotypes] only " << present << " out of " << config.reference_samples.size() << " reference samples are present" << std::endl; + } + + if (config.verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Updated reference samples in " << seconds << " seconds" << std::endl; + } +} + +//---------------------------------------------------------------------------- + size_t threads_to_jobs(size_t threads) { size_t jobs = std::round(0.85 * threads); return std::max(jobs, size_t(1)); @@ -695,23 +650,6 @@ void sample_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, //---------------------------------------------------------------------------- -gbwt::size_type path_for_contig(const gbwtgraph::GBZ& gbz, gbwt::size_type contig_id, const std::string& contig_name) { - gbwt::size_type path_id = gbz.index.metadata.paths(); - size_t found_paths = 0; - for (size_t i = 0; i < gbz.graph.named_paths.size(); i++) { - gbwt::size_type candidate = gbz.graph.named_paths[i].id; - if (gbz.index.metadata.path(candidate).contig == contig_id) { - path_id = candidate; - found_paths++; - } - } - if (found_paths != 1) { - std::cerr << "error: [vg haplotypes] found " << found_paths << " named/reference paths for contig " << contig_name << std::endl; - std::exit(EXIT_FAILURE); - } - return path_id; -} - // Returns the GBWT sequence id for the given path in the orientation it appears in this top-level chain. // Returns `gbwt::invalid_sequence()` if the path is not found. gbwt::size_type seq_for_chain(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type path_id, size_t chain_id) { @@ -730,24 +668,6 @@ gbwt::size_type seq_for_chain(const gbwtgraph::GBZ& gbz, const Haplotypes& haplo return gbwt::invalid_sequence(); } -std::pair seq_chain_for_path(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type path_id, const std::string& contig_name) { - size_t found_chains = 0; - std::pair result(gbwt::invalid_sequence(), haplotypes.components()); - for (size_t chain_id = 0; chain_id < haplotypes.components(); chain_id++) { - gbwt::size_type seq_id = seq_for_chain(gbz, haplotypes, path_id, chain_id); - if (seq_id != gbwt::invalid_sequence()) { - result.first = seq_id; - result.second = chain_id; - found_chains++; - } - } - if (found_chains != 1) { - std::cerr << "error: [vg haplotypes] found " << found_chains << " top-level chains for contig " << contig_name << std::endl; - std::exit(EXIT_FAILURE); - } - return result; -} - struct ReferenceInterval { enum order { before, overlap, after }; @@ -864,202 +784,6 @@ std::pair, size_t> subchain_intervals(const gbwtg return { result, total_length }; } -void map_variants(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { - if (!gbz.index.metadata.hasContigNames()) { - std::cerr << "error: [vg haplotypes] cannot map variant positions without contig names in the GBWT index" << std::endl; - } - - // Read variants from the VCF file. - if (config.verbosity >= Haplotypes::verbosity_basic) { - std::cerr << "Reading VCF file " << config.vcf_input << std::endl; - } - vcflib::VariantCallFile variant_file; - variant_file.parseSamples = false; // Just in case there are many samples. 
- std::string temp_filename = config.vcf_input; - variant_file.open(temp_filename); - if (!variant_file.is_open()) { - std::cerr << "error: [vg haplotypes] cannot open VCF file " << config.vcf_input << std::endl; - std::exit(EXIT_FAILURE); - } - std::unordered_map contig_to_offset; // VCF contig name to offset in `variant positions`. - std::vector>> variant_positions; // Semiopen 0-based ranges of sequence positions. - vcflib::Variant var(variant_file); - size_t total_variants = 0; - while (variant_file.is_open() && variant_file.getNextVariant(var)) { - size_t offset; - auto iter = contig_to_offset.find(var.sequenceName); - if (iter == contig_to_offset.end()) { - offset = variant_positions.size(); - contig_to_offset[var.sequenceName] = offset; - variant_positions.push_back({}); - } else { - offset = iter->second; - } - size_t start = var.zeroBasedPosition(); - variant_positions[offset].push_back({ start, start + var.ref.length() }); - total_variants++; - } - for (auto& positions : variant_positions) { - std::sort(positions.begin(), positions.end()); - } - if (config.verbosity >= Haplotypes::verbosity_detailed) { - std::cerr << "Read " << total_variants << " variants over " << variant_positions.size() << " contigs" << std::endl; - } - - // Map VCF contig names to GBWT sequence ids for named/reference paths and top-level chain. - std::vector contig_names(contig_to_offset.size(), ""); - std::vector> offset_to_seq_chain(contig_to_offset.size(), { gbwt::invalid_sequence(), haplotypes.components() }); - for (auto iter = contig_to_offset.begin(); iter != contig_to_offset.end(); ++iter) { - std::string contig_name = config.contig_prefix + iter->first; - gbwt::size_type contig_id = gbz.index.metadata.contig(contig_name); - if (contig_id >= gbz.index.metadata.contigs()) { - std::cerr << "error: [vg haplotypes] no contig " << contig_name << " in the GBWT index" << std::endl; - std::exit(EXIT_FAILURE); - } - contig_names[iter->second] = contig_name; - gbwt::size_type path_id = path_for_contig(gbz, contig_id, contig_name); - std::pair seq_chain = seq_chain_for_path(gbz, haplotypes, path_id, contig_name); - offset_to_seq_chain[iter->second] = seq_chain; - if (config.verbosity >= Haplotypes::verbosity_debug) { - std::cerr << "VCF contig " << iter->first << ", GBWT contig " << contig_name - << ": contig id " << contig_id - << ", path id " << path_id - << ", reverse " << gbwt::Path::is_reverse(seq_chain.first) - << ", chain " << seq_chain.second << std::endl; - } - } - - // Output (contig[interval], top-level chain, subchains, subchain lengths) - for (auto iter = contig_to_offset.begin(); iter != contig_to_offset.end(); ++iter) { - std::string contig_name = config.contig_prefix + iter->first; - size_t offset = iter->second; - gbwt::size_type sequence_id = offset_to_seq_chain[offset].first; - gbwt::size_type chain_id = offset_to_seq_chain[offset].second; - std::vector ref_intervals; - size_t total_length; - std::tie(ref_intervals, total_length) = - subchain_intervals(gbz, haplotypes, sequence_id, chain_id); - for (auto interval : variant_positions[offset]) { - size_t low = 0, high = ref_intervals.size(); - bool found = false; - while (!found && low < high) { - size_t mid = low + (high - low) / 2; - switch (ref_intervals[mid].compare(interval)) { - case ReferenceInterval::before: - low = mid + 1; - break; - case ReferenceInterval::overlap: - low = mid; - while (low > 0 && ref_intervals[low - 1].compare(interval) == ReferenceInterval::overlap) { - low--; - } - high = mid + 1; - while (high < 
ref_intervals.size() && ref_intervals[high].compare(interval) == ReferenceInterval::overlap) { - high++; - } - found = true; - break; - case ReferenceInterval::after: - high = mid; - break; - } - } - std::cout << iter->first << "[" << interval.first << ".." << interval.second << "]\t" << chain_id << "\t"; - if (low >= high) { - if (low > 0) { - std::cout << ref_intervals[low - 1].to_string(); - } - std::cout << ".."; - if (low < ref_intervals.size()) { - std::cout << ref_intervals[low].to_string(); - } - } else { - for (size_t i = low; i < high; i++) { - if (i > low) { - std::cout << ","; - } - std::cout << ref_intervals[i].to_string(); - } - } - std::cout << "\t"; - for (size_t i = low; i < high; i++) { - if (i > low) { - std::cout << ","; - } - std::cout << ref_intervals[i].length(); - } - std::cout << std::endl; - } - } -} -//---------------------------------------------------------------------------- - -void extract_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { - if (config.verbosity >= Haplotypes::verbosity_basic) { - std::cerr << "Extracting haplotypes from chain " << config.chain_id << ", subchain " << config.subchain_id << std::endl; - } - - Recombinator recombinator(gbz, haplotypes, config.verbosity); - std::vector result; - try { - result = recombinator.extract_sequences( - config.kmer_input, config.chain_id, config.subchain_id, config.recombinator_parameters - ); - } catch (const std::runtime_error& e) { - std::cerr << "error: [vg haplotypes] " << e.what() << std::endl; - std::exit(EXIT_FAILURE); - } - if (config.verbosity >= Haplotypes::verbosity_detailed) { - std::cerr << "Found " << result.size() << " haplotypes" << std::endl; - } - for (auto& sequence : result) { - write_fasta_sequence(sequence.name, sequence.sequence, std::cout); - } - - if (!config.score_output.empty()) { - std::ofstream out(config.score_output, std::ios_base::binary); - if (!out) { - std::cerr << "error: [vg haplotypes] cannot open score file " << config.score_output << " for writing" << std::endl; - return; - } - for (auto& sequence : result) { - out << sequence.name; - for (size_t i = 0; i < config.recombinator_parameters.num_haplotypes; i++) { - if (i < sequence.scores.size()) { - out << "\t" << sequence.scores[i].first << "\t" << sequence.scores[i].second; - } else { - out << "\t-\t-"; - } - } - out << "\n"; - } - } -} - -//---------------------------------------------------------------------------- - -void classify_kmers(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { - if (config.verbosity >= Haplotypes::verbosity_basic) { - std::cerr << "Classifying kmers" << std::endl; - } - Recombinator recombinator(gbz, haplotypes, config.verbosity); - std::vector classifications = recombinator.classify_kmers(config.kmer_input, config.recombinator_parameters); - - if (config.verbosity >= Haplotypes::verbosity_basic) { - std::cerr << "Writing " << classifications.size() << " classifications to " << config.kmer_output << std::endl; - } - std::ofstream out(config.kmer_output, std::ios_base::binary); - if (!out) { - std::cerr << "error: [vg haplotypes] cannot open kmer classification file " << config.kmer_output << " for writing" << std::endl; - std::exit(EXIT_FAILURE); - } - // Multi-gigabyte writes do not work in every environment, but we assume that - // there are only at most a few hundred million kmers. 
- out.write(classifications.data(), classifications.size()); -} - -//---------------------------------------------------------------------------- - gbwt::size_type path_for_sample_contig( const gbwtgraph::GBZ& gbz, const std::string& sample_name, const std::string& contig_name @@ -1083,6 +807,8 @@ gbwt::size_type path_for_sample_contig( return paths.front(); } +//---------------------------------------------------------------------------- + void subchain_statistics(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { gbwt::size_type sample_id = gbz.index.metadata.sample(config.ref_sample); if (sample_id >= gbz.index.metadata.samples()) { @@ -1180,50 +906,82 @@ std::string validate_unary_path(const HandleGraph& graph, handle_t from, handle_ return ""; } -// Returns true if the path from (start, offset) reaches end without revisiting start. -bool trace_path(const gbwt::GBWT& index, gbwt::node_type start, gbwt::size_type offset, gbwt::node_type end) { +// Returns true if the path from (start, offset) reaches the end without revisiting start or leaving the subchain. +// The path may continue in subsequent fragments. +bool trace_path( + const gbwt::GBWT& index, const gbwt::FragmentMap& fragment_map, const hash_set& subchain_nodes, + gbwt::size_type sequence_id, gbwt::node_type start, gbwt::size_type offset, gbwt::node_type end +) { gbwt::edge_type pos(start, offset); while (pos.first != end) { pos = index.LF(pos); - if (pos.first == gbwt::ENDMARKER || pos.first == start) { + while (pos.first == gbwt::ENDMARKER) { + sequence_id = fragment_map.oriented_next(sequence_id); + if (sequence_id == gbwt::invalid_sequence()) { + // No more fragments in the chain. + return false; + } + pos = index.start(sequence_id); + } + if (pos.first == start) { + // This was not a minimal end-to-end visit. + return false; + } + if (subchain_nodes.find(gbwt::Node::id(pos.first)) == subchain_nodes.end()) { + // We are outside the subchain. return false; } } return true; } -// Returns the given haplotype over the given subchain. -std::string get_haplotype(const gbwtgraph::GBWTGraph& graph, Haplotypes::sequence_type sequence, - gbwt::node_type from, gbwt::node_type to, size_t k) { - std::string haplotype; +// Returns the given (possibly fragmented) haplotype over the given subchain. +std::vector get_haplotype( + const gbwtgraph::GBWTGraph& graph, const gbwt::FragmentMap& fragment_map, + Haplotypes::sequence_type sequence, gbwt::node_type from, gbwt::node_type to, size_t k +) { + std::vector haplotype; gbwt::edge_type pos; + bool multiple_fragments = (from != gbwt::ENDMARKER && to != gbwt::ENDMARKER); + haplotype.emplace_back(); // Initial node with three cases (from start, suffix of a long `from`, short `from`). 
if (from == gbwt::ENDMARKER) { pos = graph.index->start(sequence.first); gbwtgraph::view_type view = graph.get_sequence_view(gbwtgraph::GBWTGraph::node_to_handle(pos.first)); - haplotype.append(view.first, view.second); + haplotype.back().append(view.first, view.second); } else { pos = gbwt::edge_type(from, sequence.second); gbwtgraph::view_type view = graph.get_sequence_view(gbwtgraph::GBWTGraph::node_to_handle(pos.first)); if (view.second >= k) { - haplotype.append(view.first + view.second - (k - 1), k - 1); + haplotype.back().append(view.first + view.second - (k - 1), k - 1); } else { - haplotype.append(view.first, view.second); + haplotype.back().append(view.first, view.second); } } while (true) { pos = graph.index->LF(pos); if (pos.first == gbwt::ENDMARKER) { - break; + if (!multiple_fragments) { + break; + } + do { + sequence.first = fragment_map.oriented_next(sequence.first); + if (sequence.first == gbwt::invalid_sequence()) { + break; + } + pos = graph.index->start(sequence.first); + } + while (pos.first == gbwt::ENDMARKER); + haplotype.emplace_back(); } gbwtgraph::view_type view = graph.get_sequence_view(gbwtgraph::GBWTGraph::node_to_handle(pos.first)); if (pos.first == to) { - haplotype.append(view.first, std::min(view.second, k - 1)); + haplotype.back().append(view.first, std::min(view.second, k - 1)); break; } else { - haplotype.append(view.first, view.second); + haplotype.back().append(view.first, view.second); } } @@ -1232,6 +990,7 @@ std::string get_haplotype(const gbwtgraph::GBWTGraph& graph, Haplotypes::sequenc void validate_chain(const Haplotypes::TopLevelChain& chain, const gbwtgraph::GBWTGraph& graph, + const gbwt::FragmentMap& fragment_map, const gbwt::FastLocate& r_index, const HaplotypePartitioner::minimizer_index_type& minimizer_index, size_t chain_id, @@ -1292,9 +1051,10 @@ void validate_chain(const Haplotypes::TopLevelChain& chain, // Sequences: normal subchains. 
if (subchain.type == Haplotypes::Subchain::normal) { std::vector da = r_index.decompressDA(subchain.start); + hash_set nodes = extract_subchain(graph, gbwtgraph::GBWTGraph::node_to_handle(subchain.start), gbwtgraph::GBWTGraph::node_to_handle(subchain.end)); hash_set selected; for (size_t i = 0; i < da.size(); i++) { - if (trace_path(*(graph.index), subchain.start, i, subchain.end)) { + if (trace_path(*(graph.index), fragment_map, nodes, da[i], subchain.start, i, subchain.end)) { selected.insert(Haplotypes::sequence_type(da[i], i)); } } @@ -1350,12 +1110,17 @@ void validate_chain(const Haplotypes::TopLevelChain& chain, hash_map used_kmers; // (kmer used in haplotypes, number of sequences that contain it) hash_map missing_kmers; // (kmer not used in haplotypes, number of sequences that contain it) for (size_t i = 0; i < subchain.sequences.size(); i++) { - std::string haplotype = get_haplotype(graph, subchain.sequences[i], subchain.start, subchain.end, minimizer_index.k()); - auto minimizers = minimizer_index.minimizers(haplotype); + std::vector haplotype = get_haplotype( + graph, fragment_map, + subchain.sequences[i], subchain.start, subchain.end, minimizer_index.k() + ); hash_map unique_minimizers; // (kmer, used in the sequence) - for (auto& minimizer : minimizers) { - if (minimizer_index.count(minimizer) == 1) { - unique_minimizers[minimizer.key.get_key()] = false; + for (const std::string& sequence : haplotype) { + auto minimizers = minimizer_index.minimizers(sequence); + for (auto& minimizer : minimizers) { + if (minimizer_index.count(minimizer) == 1) { + unique_minimizers[minimizer.key.get_key()] = false; + } } } for (size_t j = 0, offset = i * subchain.kmers.size(); j < subchain.kmers.size(); j++, offset++) { @@ -1389,7 +1154,7 @@ void validate_chain(const Haplotypes::TopLevelChain& chain, } } if (invalid_count > 0) { - std::string message = "invalid occurrence count for "+ std::to_string(invalid_count) + " kmers"; + std::string message = "invalid occurrence count for " + std::to_string(invalid_count) + " kmers"; validate_error_subchain(chain_id, subchain_id, message); } size_t missing_informative_kmers = 0; @@ -1456,16 +1221,20 @@ void validate_haplotypes(const Haplotypes& haplotypes, if (verbosity >= HaplotypePartitioner::Verbosity::verbosity_detailed) { std::cerr << "Validating subchains, sequences, and kmers" << std::endl; } + gbwt::FragmentMap fragment_map(graph.index->metadata, false); #pragma omp parallel for schedule(dynamic, 1) for (size_t chain = 0; chain < haplotypes.components(); chain++) { - validate_chain(haplotypes.chains[chain], graph, r_index, minimizer_index, chain, verbosity); + validate_chain(haplotypes.chains[chain], graph, fragment_map, r_index, minimizer_index, chain, verbosity); } - // Kmers are globally unique. + // Kmers should be globally unique. But if there are fragmented haplotypes with 3 or more fragments + // in a subchain, the middle fragment(s) may actually overlap other subchains. Additionally, + // because the kmers we use are minimizers, they could occur elsewhere as non-minimizers. 
if (verbosity >= HaplotypePartitioner::Verbosity::verbosity_detailed) { std::cerr << "Validating kmer specificity" << std::endl; } hash_map> kmers; + size_t collisions = 0, total_kmers = 0; for (size_t chain_id = 0; chain_id < haplotypes.components(); chain_id++) { const Haplotypes::TopLevelChain& chain = haplotypes.chains[chain_id]; for (size_t subchain_id = 0; subchain_id < chain.subchains.size(); subchain_id++) { @@ -1478,15 +1247,18 @@ void validate_haplotypes(const Haplotypes& haplotypes, // A prefix subchain may overlap the preceding suffix subchain and // contain the same kmers. } else { - std::string message = subchain.to_string() + ": kmer " + std::to_string(i) + " also found in " + subchain_to_string(iter->second.first, iter->second.second, prev); - validate_error_subchain(chain_id, subchain_id, message); + collisions++; } } kmers[subchain.kmers[i]] = { chain_id, subchain_id }; } + total_kmers += subchain.kmers.size(); } } kmers.clear(); + if (collisions > 0) { + std::cerr << "warning: [vg haplotypes] found " << collisions << " kmer collisions out of " << total_kmers << std::endl; + } if (verbosity >= Haplotypes::verbosity_basic) { double seconds = gbwt::readTimer() - start; diff --git a/src/unittest/gaf_sorter.cpp b/src/unittest/gaf_sorter.cpp index 7122ba25e1..288e1aa479 100644 --- a/src/unittest/gaf_sorter.cpp +++ b/src/unittest/gaf_sorter.cpp @@ -24,11 +24,13 @@ struct GAFInfo { size_t id; std::uint32_t min_node, max_node; std::uint32_t first_node, gbwt_offset; + bool first_is_reverse; GAFInfo(size_t id, size_t nodes, bool with_gbwt_offset) : id(id), min_node(std::numeric_limits::max()), max_node(0), - first_node(std::numeric_limits::max()), gbwt_offset(std::numeric_limits::max()) { + first_node(std::numeric_limits::max()), gbwt_offset(std::numeric_limits::max()), + first_is_reverse(false) { // Name, query length, query start, query end, strand; std::string nd = std::to_string(nodes); @@ -46,6 +48,7 @@ struct GAFInfo { if (with_gbwt_offset) { this->gbwt_offset = rng() % 1000; } + this->first_is_reverse = reverse; } this->line += (reverse ? "<" : ">") + std::to_string(node); } @@ -74,7 +77,8 @@ struct GAFInfo { if (this->gbwt_offset == std::numeric_limits::max()) { return GAFSorterRecord::MISSING_KEY; } else { - return (static_cast(this->first_node) << 32) | this->gbwt_offset; + return (static_cast(this->first_node) << 33) | + (static_cast(this->first_is_reverse) << 32) | this->gbwt_offset; } } else if (type == GAFSorterRecord::key_hash) { return GAFSorterRecord::hasher(this->line); diff --git a/src/unittest/graph_algorithms.cpp b/src/unittest/graph_algorithms.cpp new file mode 100644 index 0000000000..bfdc848938 --- /dev/null +++ b/src/unittest/graph_algorithms.cpp @@ -0,0 +1,158 @@ +/** \file + * + * Unit tests for various graph algorithms. + */ + +// These are the algorithms we want to test. +#include "../algorithms/extract_subchain.hpp" + +// Other includes. +#include + +#include + +#include "catch.hpp" + +namespace vg { + +namespace unittest { + +//------------------------------------------------------------------------------ + +namespace { + +// Generates a random graph with the given number of snarls, each containing the given number of nodes. +// There is one shared node between each pair of adjacent snarls. +// Also returns the list of ids for the nodes between the snarls. +// If a snarl is bounded by nodes u and w, then every node v with u < v < w is in the snarl. 
+std::pair> random_graph(size_t snarls, size_t snarl_size, std::uint32_t seed = 0x12345678) { + std::vector labels { "A", "C", "G", "T" }; + bdsg::HashGraph graph; + std::vector shared_nodes; + auto create_node = [&](nid_t id, bool shared) { + graph.create_handle(labels[id % labels.size()], id); + if (shared) { + shared_nodes.push_back(id); + } + }; + + create_node(1, true); + std::mt19937 rng(seed); + nid_t next = 2; + for (size_t i = 0; i < snarls; i++) { + for (nid_t id = next; id < next + snarl_size; id++) { + create_node(id, false); + } + create_node(next + snarl_size, true); + + // Chain of even offsets. + nid_t prev = next - 1; + for (nid_t id = next; id < next + snarl_size; id += 2) { + graph.create_edge(graph.get_handle(prev), graph.get_handle(id)); + prev = id; + } + graph.create_edge(graph.get_handle(prev), graph.get_handle(next + snarl_size)); + + // Chain of odd offsets. + prev = next - 1; + for (nid_t id = next + 1; id < next + snarl_size; id += 2) { + graph.create_edge(graph.get_handle(prev), graph.get_handle(id)); + prev = id; + } + graph.create_edge(graph.get_handle(prev), graph.get_handle(next + snarl_size)); + + // Every other edge has a 5% chance of existing in random orientation. + for (nid_t from = next - 1; from <= next + snarl_size; from++) { + for (nid_t to = from + 1; to <= next + snarl_size; to++) { + if (rng() % 20 == 0) { + bool from_is_reverse = (from == next - 1 ? false : rng() % 2 == 0); + bool to_is_reverse = (to == next + snarl_size ? false : rng() % 2 == 0); + graph.create_edge(graph.get_handle(from, from_is_reverse), graph.get_handle(to, to_is_reverse)); + } + } + } + + next += snarl_size + 1; + } + + return std::make_pair(graph, shared_nodes); +} + +} // anonymous namespace + +//------------------------------------------------------------------------------ + +TEST_CASE("Extract subchain", "[graph_algorithms]") { + bdsg::HashGraph graph; + std::vector shared_nodes; + std::tie(graph, shared_nodes) = random_graph(10, 10); + + auto check_subchain = [&](const hash_set& nodes, nid_t min, nid_t max) { + size_t expected_size = max - min + 1; + REQUIRE(nodes.size() == expected_size); + for (nid_t id : nodes) { + REQUIRE(min <= id); + REQUIRE(id <= max); + } + }; + + SECTION("Empty subchain") { + nid_t start = 1, end = 1; + hash_set subchain = extract_subchain(graph, graph.get_handle(start), graph.get_handle(end)); + check_subchain(subchain, start, end); + } + + SECTION("Empty subchain, reverse") { + nid_t start = 1, end = 1; + handle_t from = graph.get_handle(end, true); + handle_t to = graph.get_handle(start, true); + hash_set subchain = extract_subchain(graph, from, to); + check_subchain(subchain, start, end); + } + + SECTION("Single snarl") { + for (size_t i = 0; i + 1 < shared_nodes.size(); i++) { + nid_t start = shared_nodes[i], end = shared_nodes[i + 1]; + hash_set subchain = extract_subchain(graph, graph.get_handle(start), graph.get_handle(end)); + check_subchain(subchain, start, end); + } + } + + SECTION("Single snarl, reverse") { + for (size_t i = 0; i + 1 < shared_nodes.size(); i++) { + nid_t start = shared_nodes[i], end = shared_nodes[i + 1]; + handle_t from = graph.get_handle(end, true); + handle_t to = graph.get_handle(start, true); + hash_set subchain = extract_subchain(graph, from, to); + check_subchain(subchain, start, end); + } + } + + SECTION("Multiple snarls") { + for (size_t i = 0; i < shared_nodes.size(); i++) { + for (size_t j = i + 1; j < shared_nodes.size(); j++) { + nid_t start = shared_nodes[i], end = shared_nodes[j]; + hash_set 
+                check_subchain(subchain, start, end);
+            }
+        }
+    }
+
+    SECTION("Multiple snarls, reverse") {
+        for (size_t i = 0; i < shared_nodes.size(); i++) {
+            for (size_t j = i + 1; j < shared_nodes.size(); j++) {
+                nid_t start = shared_nodes[i], end = shared_nodes[j];
+                handle_t from = graph.get_handle(end, true);
+                handle_t to = graph.get_handle(start, true);
+                hash_set<nid_t> subchain = extract_subchain(graph, from, to);
+                check_subchain(subchain, start, end);
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+
+} // namespace unittest
+
+} // namespace vg
diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t
index a5849bc47a..4260f5f596 100644
--- a/test/t/37_vg_gbwt.t
+++ b/test/t/37_vg_gbwt.t
@@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap
 PATH=../bin:$PATH # for vg
 
-plan tests 155
+plan tests 161
 
 # Build vg graphs for two chromosomes
 
@@ -330,7 +330,16 @@ is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 haplotypes"
 is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 samples"
 is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ reference paths: reference_samples set"
 
-rm -f large.gbz large.local.gbwt
+# Now the same but with a specified reference sample
+vg gbwt -Z large.gbz -l -n 16 --pass-paths --set-reference GRCh38 -o large.local.GRCh38.gbwt
+is $? 0 "Local haplotypes with a specified reference sample"
+is $(vg gbwt -c large.local.GRCh38.gbwt) 34 "local haplotypes w/ specified reference: 34 paths"
+is $(vg gbwt -C large.local.GRCh38.gbwt) 2 "local haplotypes w/ specified reference: 2 contigs"
+is $(vg gbwt -H large.local.GRCh38.gbwt) 17 "local haplotypes w/ specified reference: 17 haplotypes"
+is $(vg gbwt -S large.local.GRCh38.gbwt) 17 "local haplotypes w/ specified reference: 17 samples"
+is $(vg gbwt --tags large.local.GRCh38.gbwt | grep -c reference_samples) 1 "local haplotypes w/ specified reference: reference_samples set"
+
+rm -f large.gbz large.local.gbwt large.local.GRCh38.gbwt
 
 # Build GBWTGraph from an augmented GBWT
 
diff --git a/test/t/54_vg_haplotypes.t b/test/t/54_vg_haplotypes.t
index 888c524602..3bf66004b8 100644
--- a/test/t/54_vg_haplotypes.t
+++ b/test/t/54_vg_haplotypes.t
@@ -5,12 +5,12 @@ BASH_TAP_ROOT=../deps/bash-tap
 PATH=../bin:$PATH # for vg
 
-plan tests 21
+plan tests 27
 
 # The test graph consists of two subgraphs of the HPRC Minigraph-Cactus v1.1 graph:
 # - GRCh38#chr6:31498145-31511124 (micb)
 # - GRCh38#chr19:54816468-54830778 (kir3dl1)
-# The reads are 50x novaseq reads mapping to those regions.
+# The reads are 50x novaseq reads for HG003 mapping to those regions.
 
 # Build indexes for the full graph.
 vg gbwt -G haplotype-sampling/micb-kir3dl1.gfa --gbz-format -g full.gbz -r full.ri
@@ -51,6 +51,13 @@ is $? 0 "diploid sampling using a preset"
 cmp diploid.gbz diploid2.gbz
 is $? 0 "the outputs are identical"
 
+# Diploid sampling with a specified reference sample
+vg haplotypes -i full.hapl -k haplotype-sampling/HG003.kff --include-reference --diploid-sampling --set-reference GRCh38 -g diploid3.gbz full.gbz
+is $? 
0 "diploid sampling with a specified reference sample" +is $(vg gbwt -S -Z diploid3.gbz) 2 "1 generated + 1 reference samples" +is $(vg gbwt -C -Z diploid3.gbz) 2 "2 contigs" +is $(vg gbwt -H -Z diploid3.gbz) 3 "2 generated + 1 reference haplotypes" + # Giraffe integration, guessed output name vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ -f haplotype-sampling/HG003.fq.gz > default.gam 2> /dev/null @@ -66,9 +73,18 @@ is $? 0 "Giraffe integration with a specified output name" cmp full.HG003.gbz sampled.003HG.gbz is $? 0 "the sampled graphs are identical" +# Giraffe integration, specified reference sample +vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ + --index-basename GRCh38 -N HG003 --set-reference GRCh38 \ + -f haplotype-sampling/HG003.fq.gz > HG003_GRCh38.gam 2> /dev/null +is $? 0 "Giraffe integration with a specified reference sample" +cmp diploid3.gbz GRCh38.HG003.gbz +is $? 0 "the sampled graph is identical to a manually sampled one" + # Cleanup rm -r full.gbz full.ri full.dist full.hapl rm -f indirect.gbz direct.gbz no_ref.gbz -rm -f diploid.gbz diploid2.gbz -rm -f full.HG003.gbz full.HG003.dist full.HG003.min default.gam -rm -f sampled.003HG.gbz sampled.003HG.dist sampled.003HG.min specified.gam +rm -f diploid.gbz diploid2.gbz diploid3.gbz +rm -f full.HG003.* default.gam +rm -f sampled.003HG.* specified.gam +rm -f GRCh38.HG003.* HG003_GRCh38.gam