diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 2bafca1cc8a..4f331e3dfa5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -11,7 +11,7 @@ before_script:
   - sudo apt-get -q -y update
   # Make sure we have some curl stuff for pycurl which we need for some Python stuff
   # And the CI report upload needs uuidgen from uuid-runtime
-  - sudo apt-get -q -y install --no-upgrade docker.io python3-pip python3-virtualenv libcurl4-gnutls-dev python-dev npm nodejs node-gyp uuid-runtime libgnutls28-dev doxygen libzstd-dev
+  - sudo apt-get -q -y install --no-upgrade docker.io python3-pip python3-virtualenv libcurl4-gnutls-dev python-dev npm nodejs node-gyp uuid-runtime libgnutls28-dev doxygen libzstd-dev bcftools
   - which junit-merge || sudo npm install -g junit-merge
   # Configure Docker to use a mirror for Docker Hub and restart the daemon
   - |
diff --git a/Makefile b/Makefile
index f77ae12c82e..6c8fd7262a1 100644
--- a/Makefile
+++ b/Makefile
@@ -215,6 +215,9 @@ ifeq ($(shell uname -s),Darwin)
     # We don't actually do any static linking on Mac, so we leave this empty.
     START_STATIC =
     END_STATIC =
+
+    # We need to use special flags to let us rename libraries
+    LD_RENAMEABLE_FLAGS = -Wl,-headerpad -Wl,-headerpad_max_install_names
 else
     # We are not running on OS X
     $(info OS is Linux)
@@ -252,7 +255,8 @@ else
     # Note that END_STATIC is only safe to use in a mostly-dynamic build, and has to appear or we will try to statically link secret trailing libraries.
     END_STATIC = -Wl,-Bdynamic
-
+    # We don't need any flags because we don't need to rename libraries with install_name_tool
+    LD_RENAMEABLE_FLAGS =
 endif
 
 # Set the C++ standard we are using
@@ -656,7 +660,7 @@ ifeq ($(shell uname -s),Darwin)
 endif
 
 $(LIB_DIR)/libdeflate.a: $(LIBDEFLATE_DIR)/*.h $(LIBDEFLATE_DIR)/lib/*.h $(LIBDEFLATE_DIR)/lib/*/*.h $(LIBDEFLATE_DIR)/lib/*.c $(LIBDEFLATE_DIR)/lib/*/*.c
-	+. ./source_me.sh && cd $(LIBDEFLATE_DIR) && V=1 $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR)
+	+. ./source_me.sh && cd $(LIBDEFLATE_DIR) && V=1 LDFLAGS="$(LDFLAGS) $(LD_RENAMEABLE_FLAGS)" $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR)
 
 # We build htslib after libdeflate so it can use libdeflate.
 # We need to do some wizardry to get it to pick up the right build and target system types on modern autotools.
diff --git a/README.md b/README.md
index 55556f6f44d..6acfeb9ce29 100644
--- a/README.md
+++ b/README.md
@@ -55,27 +55,41 @@ The easiest way to get vg is to download one of our release builds for Linux. We
 
 If you don't want to or can't use a pre-built release of vg, or if you want to become a vg developer, you can build it from source instead.
 
+#### Linux: Clone VG
+
 First, obtain the repo and its submodules:
 
     git clone --recursive https://github.com/vgteam/vg.git
     cd vg
+
+#### Linux: Install Dependencies
 
 Then, install VG's dependencies. You'll need the protobuf and jansson development libraries installed, and to run the tests you will need:
-  * `jq`, `bc`, `rs`, and `parallel`
-  * `hexdump` and `column` from `bsdmainutils`
-  * [`npm` for testing documentation examples](https://github.com/anko/txm)).
+* `jq`, `bc`, `rs`, and `parallel`
+* `hexdump` and `column` from `bsdmainutils`
+* [`npm` for testing documentation examples](https://github.com/anko/txm).
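+
+If you want to check in advance which of these test-only tools are already on your `PATH`, one quick check (a sketch assuming a POSIX shell; adjust the tool list as needed) is:
+
+    for tool in jq bc rs parallel hexdump column npm; do
+        command -v "$tool" >/dev/null || echo "missing: $tool"
+    done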
+
 On Ubuntu, you should be able to do:
 
     make get-deps
+
+If you get complaints that `sudo` is not found, install it:
+
+    apt update
+    apt install sudo
+
+If you get a bunch of errors like `E: Unable to locate package build-essential`, make sure your package index files are up to date by running:
+
+    sudo apt update
 
-On other distros, you will need to perform the equivalent of:
+On other distros, or if you do not have root access, you will need to perform the equivalent of:
 
     sudo apt-get install build-essential git cmake pkg-config libncurses-dev libbz2-dev \
                          protobuf-compiler libprotoc-dev libprotobuf-dev libjansson-dev \
                          automake gettext autopoint libtool jq bsdmainutils bc rs parallel \
                          npm curl unzip redland-utils librdf-dev bison flex gawk lzma-dev \
                          liblzma-dev liblz4-dev libffi-dev libcairo-dev libboost-all-dev \
-                         libzstd-devel pybind11-dev python3-pybind11
+                         libzstd-dev pybind11-dev python3-pybind11
 
 Note that **Ubuntu 16.04** does not ship a sufficiently new Protobuf; vg requires **Protobuf 3** which will have to be manually installed.
 
@@ -85,22 +99,47 @@ Other libraries may be required. Please report any build difficulties.
 
 Note that a 64-bit OS is required. Ubuntu 20.04 should work.
 
-When you are ready, build with `. ./source_me.sh && make`, and run with `./bin/vg`.
+#### Linux: Build
+
+When you are ready, build with `. ./source_me.sh && make`. You can use `make -j16` to run 16 build threads at a time, which greatly accelerates the process. If you have more CPU cores, you can use higher numbers. Note that vg can take anywhere from 10 minutes to more than an hour to compile depending on your machine and the number of threads used.
 
 You can also produce a static binary with `make static`, assuming you have static versions of all the dependencies installed on your system.
 
+#### Linux: Run
+
+Once vg is built, the binary will be at `bin/vg` inside the vg repository directory. You can run it with:
+
+```
+./bin/vg
+```
+
+You can also add its directory to your `PATH` environment variable, so that you can invoke `vg` from any directory. To do that on Bash, use this command from the vg repository directory:
+
+```
+echo 'export PATH="${PATH}:'"$(pwd)"'/bin"' >>~/.bashrc
+```
+
+Then close your terminal and open a new one. Run `vg` to make sure it worked.
+
+If it did not work, make sure that you have a `.bash_profile` file in your home directory that will run your `.bashrc`:
+```
+if [ -f ~/.bashrc ]; then
+    source ~/.bashrc
+fi
+```
+
 ### Building on MacOS
 
-#### Clone VG
+#### Mac: Clone VG
 
 The first step is to clone the vg repository:
 
     git clone --recursive https://github.com/vgteam/vg.git
     cd vg
 
-#### Install Dependencies
+#### Mac: Install Dependencies
 
 VG depends on a number of packages being installed on the system where it is being built. Dependencies can be installed using either [MacPorts](https://www.macports.org/install.php) or [Homebrew](http://brew.sh/).
 
@@ -118,17 +157,35 @@ Homebrew provides another package management solution for OSX, and may be prefer
     # Install all the dependencies in the Brewfile
     brew bundle
 
-#### Build
+#### Mac: Build
 
 With dependencies installed, VG can now be built:
 
     . ./source_me.sh && make
+
+As with Linux, you can add `-j16` or other numbers at the end to run multiple build tasks at once, if your computer can handle them.
 
 **Note that static binaries cannot yet be built for Mac.**
 
 Our team has successfully built vg on Mac with GCC versions 4.9, 5.3, 6, 7, and 7.3, as well as Clang 9.0.
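+
+If you would rather not hard-code a thread count like `-j16`, you can derive one from the machine itself. This is a sketch assuming stock tools (`nproc` from GNU coreutils on Linux, `sysctl` on macOS):
+
+    # use one build task per available CPU core
+    make -j"$(nproc 2>/dev/null || sysctl -n hw.ncpu)"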
-#### Migrating to ARM Macs
+#### Mac: Run
+
+Once vg is built, the binary will be at `bin/vg` inside the vg repository directory. You can run it with:
+
+```
+./bin/vg
+```
+
+You can also add its directory to your `PATH` environment variable, so that you can invoke `vg` from any directory. To do that on the default `zsh` Mac shell, use this command from the vg repository directory:
+
+```
+echo 'export PATH="${PATH}:'"$(pwd)"'/bin"' >>~/.zshrc
+```
+
+Then close your terminal and open a new one. Run `vg` to make sure it worked.
+
+##### Migrate a VG installation from x86 to ARM
 
 The Mac platform is moving to ARM, with Apple's M1, M1 Pro, M1 Max, and subsequent chip designs. The vg codebase supports ARM on Mac as well as on Linux. **The normal installation instructions work on a factory-fresh ARM Mac**.
 
@@ -396,14 +453,6 @@ vg index hla.vg -x hla.xg
 vg deconstruct hla.xg -e -p "gi|568815592:29791752-29792749" > hla_variants.vcf
 ```
 
-Variants can also be inferred strictly from topology by not using `-e`, though unlike the above example, cycles are not supported. "Deconstruct" the VCF variants that were used to construct the graph. The output will be similar but identical to `small/x.vcf.gz` as `vg construct` can add edges between adjacent alts and/or do some normalization:
-
-
-```sh
-# using the same graph from the `map` example
-vg deconstruct x.xg -p x > x.vcf
-```
-
 Haplotype paths from `.gbz` or `.gbwt` index input can be considered using `-z` and `-g`, respectively.
 
 As with `vg call`, it is best to compute snarls separately and pass them in with `-r` when working with large graphs.
 
diff --git a/deps/libvgio b/deps/libvgio
index 9b0d0e11df6..def4827b903 160000
--- a/deps/libvgio
+++ b/deps/libvgio
@@ -1 +1 @@
-Subproject commit 9b0d0e11df6f9bd389ba4dba08d107953eabff8f
+Subproject commit def4827b9034d9624179c442c8568978ca33e5b8
diff --git a/ontology/vg.html b/ontology/vg.html
index 982fc7a668a..586ed09aaa5 100644
--- a/ontology/vg.html
+++ b/ontology/vg.html
@@ -688,7 +688,7 @@

rdfs:comment - "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)." + "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)." xsd:string diff --git a/ontology/vg.ttl b/ontology/vg.ttl index 056e421fa01..39ef0fc795b 100644 --- a/ontology/vg.ttl +++ b/ontology/vg.ttl @@ -31,7 +31,7 @@ . :Step rdf:type owl:Class ; - rdfs:comment "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)."^^xsd:string ; + rdfs:comment "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)."^^xsd:string ; rdfs:label "Step"^^xsd:string ; rdfs:subClassOf owl:Thing ; . diff --git a/src/algorithms/alignment_path_offsets.cpp b/src/algorithms/alignment_path_offsets.cpp index d50f9100818..f781b042377 100644 --- a/src/algorithms/alignment_path_offsets.cpp +++ b/src/algorithms/alignment_path_offsets.cpp @@ -3,94 +3,94 @@ //#define debug_mpaln_offsets namespace vg { -namespace algorithms { - -unordered_map > > -alignment_path_offsets(const PathPositionHandleGraph& graph, - const Alignment& aln, - bool just_min, - bool nearby, - size_t search_limit, - const std::function* path_filter) { - if (nearby && search_limit == 0) { - // Fill in the search limit - search_limit = aln.sequence().size(); - } - unordered_map > > offsets; - if (graph.get_path_count() == 0) return offsets; - for (auto& mapping : aln.path().mapping()) { - // How many bases does this Mapping cover over? - size_t mapping_width = mapping_from_length(mapping); - if (mapping_width == 0 && !nearby) { - // Just skip over this mapping; it touches no bases. - continue; - } - // We may have to consider both the starts and ends of mappings - vector end = {false}; - if (just_min && !nearby) { - // We want the min actually touched position along each path. It - // could come from the Mapping start or the Mapping end. - end.push_back(true); - } - // Find the position of this end of this mapping - pos_t mapping_pos = make_pos_t(mapping.position()); - // Find the positions for this end of this Mapping - auto pos_offs = algorithms::nearest_offsets_in_paths(&graph, mapping_pos, nearby ? search_limit : -1, path_filter); - for (auto look_at_end : end) { - // For the start and the end of the Mapping, as needed - for (auto& p : pos_offs) { - // For each path, splice the list of path positions for this Mapping - // onto the end of the list of positions we found in that path - auto& v = offsets[p.first]; - for (pair& y : p.second) { - v.emplace_back(y.second ? 
y.first - mapping_width : y.first, - y.second); + namespace algorithms { + + unordered_map > > + alignment_path_offsets(const PathPositionHandleGraph& graph, + const Alignment& aln, + bool just_min, + bool nearby, + size_t search_limit, + const std::function* path_filter) { + if (nearby && search_limit == 0) { + // Fill in the search limit + search_limit = aln.sequence().size(); + } + unordered_map > > offsets; + if (graph.get_path_count() == 0) return offsets; + for (auto& mapping : aln.path().mapping()) { + // How many bases does this Mapping cover over? + size_t mapping_width = mapping_from_length(mapping); + if (mapping_width == 0 && !nearby) { + // Just skip over this mapping; it touches no bases. + continue; + } + // We may have to consider both the starts and ends of mappings + vector end = {false}; + if (just_min && !nearby) { + // We want the min actually touched position along each path. It + // could come from the Mapping start or the Mapping end. + end.push_back(true); + } + // Find the position of this end of this mapping + pos_t mapping_pos = make_pos_t(mapping.position()); + // Find the positions for this end of this Mapping + auto pos_offs = algorithms::nearest_offsets_in_paths(&graph, mapping_pos, nearby ? search_limit : -1, path_filter); + for (auto look_at_end : end) { + // For the start and the end of the Mapping, as needed + for (auto& p : pos_offs) { + // For each path, splice the list of path positions for this Mapping + // onto the end of the list of positions we found in that path + auto& v = offsets[p.first]; + for (pair& y : p.second) { + v.emplace_back(y.second ? y.first - mapping_width : y.first, + y.second); + } + } } } - } - } - if (!nearby && offsets.empty()) { - // find the nearest if we couldn't find any before - return alignment_path_offsets(graph, aln, just_min, true, search_limit, path_filter); - } - if (just_min) { - // We need the minimum position for each path - for (auto& p : offsets) { - auto& v = p.second; - auto m = *min_element(v.begin(), v.end(), - [](const pair& a, - const pair& b) - { return a.first < b.first; }); - v.clear(); - v.push_back(m); - } - } - return offsets; -} - -unordered_map > > -multipath_alignment_path_offsets(const PathPositionHandleGraph& graph, - const multipath_alignment_t& mp_aln, - const std::function* path_filter) { - - using path_positions_t = unordered_map>>; - - // collect the search results for each mapping on each subpath - vector> search_results(mp_aln.subpath_size()); - for (size_t i = 0; i < mp_aln.subpath_size(); ++i) { - const subpath_t& subpath = mp_aln.subpath(i); - auto& subpath_search_results = search_results[i]; - subpath_search_results.resize(subpath.path().mapping_size()); - for (size_t j = 0; j < subpath.path().mapping_size(); ++j) { - // get the positions on paths that this mapping touches - pos_t mapping_pos = make_pos_t(subpath.path().mapping(j).position()); - subpath_search_results[j] = nearest_offsets_in_paths(&graph, mapping_pos, 0, path_filter); - // make sure that offsets are stored in increasing order - for (pair>>& search_record : subpath_search_results[j]) { - sort(search_record.second.begin(), search_record.second.end()); + if (!nearby && offsets.empty()) { + // find the nearest if we couldn't find any before + return alignment_path_offsets(graph, aln, just_min, true, search_limit, path_filter); + } + if (just_min) { + // We need the minimum position for each path + for (auto& p : offsets) { + auto& v = p.second; + auto m = *min_element(v.begin(), v.end(), + [](const pair& a, + const pair& 
b) + { return a.first < b.first; }); + v.clear(); + v.push_back(m); + } } + return offsets; + } + + unordered_map > > + multipath_alignment_path_offsets(const PathPositionHandleGraph& graph, + const multipath_alignment_t& mp_aln, + const std::function* path_filter) { + + using path_positions_t = unordered_map>>; + + // collect the search results for each mapping on each subpath + vector> search_results(mp_aln.subpath_size()); + for (size_t i = 0; i < mp_aln.subpath_size(); ++i) { + const subpath_t& subpath = mp_aln.subpath(i); + auto& subpath_search_results = search_results[i]; + subpath_search_results.resize(subpath.path().mapping_size()); + for (size_t j = 0; j < subpath.path().mapping_size(); ++j) { + // get the positions on paths that this mapping touches + pos_t mapping_pos = make_pos_t(subpath.path().mapping(j).position()); + subpath_search_results[j] = nearest_offsets_in_paths(&graph, mapping_pos, 0, path_filter); + // make sure that offsets are stored in increasing order + for (pair>>& search_record : subpath_search_results[j]) { + sort(search_record.second.begin(), search_record.second.end()); + } #ifdef debug_mpaln_offsets - cerr << "subpath " << i << ", mapping " << j << " path locations" << endl; + cerr << "subpath " << i << ", mapping " << j << " path locations" << endl; for (const auto& pps : subpath_search_results[j]) { cerr << graph.get_path_name(pps.first) << endl; for (const auto& pp : pps.second) { @@ -98,132 +98,137 @@ multipath_alignment_path_offsets(const PathPositionHandleGraph& graph, } } #endif - } - } - - path_positions_t return_val; - - // to keep track of whether we've already chosen a position on each path - // earlier in the multipath alignment in either the forward or reverse pass - vector> covered_fwd(mp_aln.subpath_size()); - vector> covered_rev(mp_aln.subpath_size()); - - // forward pass looking for positions on the forward strand of paths - for (size_t i = 0; i < mp_aln.subpath_size(); ++i) { - const auto& subpath_search_results = search_results[i]; - for (size_t j = 0; j < subpath_search_results.size(); ++j) { - for (const auto& path_pos : subpath_search_results[j]) { - if (!covered_fwd[i].count(path_pos.first)) { - // we haven't already covered this path at an earlier position on the alignment - for (const auto& path_offset : path_pos.second) { - if (!path_offset.second) { - // there's a position on the forward strand of this path - return_val[path_pos.first].emplace_back(path_offset); - - // we're now covering this path for future search results - covered_fwd[i].insert(path_pos.first); - + } + } + + path_positions_t return_val; + + // to keep track of whether we've already chosen a position on each path + // earlier in the multipath alignment in either the forward or reverse pass + vector> covered_fwd(mp_aln.subpath_size()); + vector> covered_rev(mp_aln.subpath_size()); + + // forward pass looking for positions on the forward strand of paths + for (size_t i = 0; i < mp_aln.subpath_size(); ++i) { + const auto& subpath_search_results = search_results[i]; + for (size_t j = 0; j < subpath_search_results.size(); ++j) { + for (const auto& path_pos : subpath_search_results[j]) { + if (!covered_fwd[i].count(path_pos.first)) { + // we haven't already covered this path at an earlier position on the alignment + for (const auto& path_offset : path_pos.second) { + if (!path_offset.second) { + // there's a position on the forward strand of this path + return_val[path_pos.first].emplace_back(path_offset); + + // we're now covering this path for future search results 
+ covered_fwd[i].insert(path_pos.first); + #ifdef debug_mpaln_offsets - cerr << "found fwd pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first << " " << path_offset.second << endl; + cerr << "found fwd pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first << " " << path_offset.second << endl; #endif - - break; + + break; + } + } } } } + + // the following subpaths will be covered for any path that this + // one is covered for + for (auto n : mp_aln.subpath(i).next()) { + auto& next_coverings = covered_fwd[n]; + for (auto path_handle : covered_fwd[i]) { + next_coverings.insert(path_handle); + } + } + for (const auto& c : mp_aln.subpath(i).connection()) { + auto& next_coverings = covered_fwd[c.next()]; + for (auto path_handle : covered_fwd[i]) { + next_coverings.insert(path_handle); + } + } } - } - - // the following subpaths will be covered for any path that this - // one is covered for - for (auto n : mp_aln.subpath(i).next()) { - auto& next_coverings = covered_fwd[n]; - for (auto path_handle : covered_fwd[i]) { - next_coverings.insert(path_handle); - } - } - for (const auto& c : mp_aln.subpath(i).connection()) { - auto& next_coverings = covered_fwd[c.next()]; - for (auto path_handle : covered_fwd[i]) { - next_coverings.insert(path_handle); - } - } - } - - // now do a backward pass for the reverse strand of paths - for (int64_t i = mp_aln.subpath_size() - 1; i >= 0; --i) { - // find which paths are already covered in the reverse - for (auto n : mp_aln.subpath(i).next()) { - for (auto path_handle : covered_rev[n]) { - covered_rev[i].insert(path_handle); - } - } - for (const auto& c : mp_aln.subpath(i).connection()) { - for (auto path_handle : covered_rev[c.next()]) { - covered_rev[i].insert(path_handle); - } - } - - const auto& subpath_search_results = search_results[i]; - for (int64_t j = subpath_search_results.size() - 1; j >= 0; --j) { - for (const auto& path_pos : subpath_search_results[j]) { - if (!covered_rev[i].count(path_pos.first)) { - // we haven't already covered this path at an earlier position on the alignment - for (const auto& path_offset : path_pos.second) { - if (path_offset.second) { - // there's a position on the reverse strand of this path - auto mapping_len = mapping_from_length(mp_aln.subpath(i).path().mapping(j)); - return_val[path_pos.first].emplace_back(path_offset.first - mapping_len, - path_offset.second); - + + // now do a backward pass for the reverse strand of paths + for (int64_t i = mp_aln.subpath_size() - 1; i >= 0; --i) { + // find which paths are already covered in the reverse + for (auto n : mp_aln.subpath(i).next()) { + for (auto path_handle : covered_rev[n]) { + covered_rev[i].insert(path_handle); + } + } + for (const auto& c : mp_aln.subpath(i).connection()) { + for (auto path_handle : covered_rev[c.next()]) { + covered_rev[i].insert(path_handle); + } + } + + const auto& subpath_search_results = search_results[i]; + for (int64_t j = subpath_search_results.size() - 1; j >= 0; --j) { + for (const auto& path_pos : subpath_search_results[j]) { + if (!covered_rev[i].count(path_pos.first)) { + // we haven't already covered this path at an earlier position on the alignment + for (const auto& path_offset : path_pos.second) { + if (path_offset.second) { + // there's a position on the reverse strand of this path + auto mapping_len = mapping_from_length(mp_aln.subpath(i).path().mapping(j)); + 
return_val[path_pos.first].emplace_back(path_offset.first - mapping_len, + path_offset.second); + #ifdef debug_mpaln_offsets - cerr << "found rev pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first - mapping_len << " " << path_offset.second << endl; + cerr << "found rev pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first - mapping_len << " " << path_offset.second << endl; #endif - // we're now covering this path for future search results - covered_rev[i].insert(path_pos.first); - - break; + // we're now covering this path for future search results + covered_rev[i].insert(path_pos.first); + + break; + } + } } } } } + + return return_val; } - } - - return return_val; -} - -void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function* path_filter) { - annotate_with_path_positions(graph, aln, true, search_limit, path_filter); -} - -void annotate_with_node_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function* path_filter) { - annotate_with_path_positions(graph, aln, false, search_limit, path_filter); -} - -void annotate_with_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, bool just_min, size_t search_limit, const std::function* path_filter) { - if (!aln.refpos_size()) { - // Get requested path positions - unordered_map > > positions = alignment_path_offsets(graph, aln, just_min, false, search_limit, path_filter); - // emit them in order of the path handle - vector ordered; - for (auto& path : positions) { ordered.push_back(path.first); } - std::sort(ordered.begin(), ordered.end(), [](const path_handle_t& a, const path_handle_t& b) { return as_integer(a) < as_integer(b); }); - for (auto& path : ordered) { - for (auto& p : positions[path]) { - // Add each determined refpos - Position* refpos = aln.add_refpos(); - refpos->set_name(graph.get_path_name(path)); - refpos->set_offset(p.first); - refpos->set_is_reverse(p.second); + + void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function* path_filter) { + annotate_with_path_positions(graph, aln, true, search_limit, path_filter); + } + + void annotate_with_node_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function* path_filter) { + annotate_with_path_positions(graph, aln, false, search_limit, path_filter); + } + + void annotate_with_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, bool just_min, size_t search_limit, const std::function* path_filter) { + if (!aln.refpos_size()) { + // Get requested path positions + unordered_map > > positions = alignment_path_offsets(graph, aln, just_min, false, search_limit, path_filter); + // emit them in order of the path handle + vector ordered; + for (auto& path : positions) { ordered.push_back(path.first); } + std::sort(ordered.begin(), ordered.end(), [](const path_handle_t& a, const path_handle_t& b) { return as_integer(a) < as_integer(b); }); + for (auto& path : ordered) { + for (auto& p : positions[path]) { + // Add each determined refpos + + Position* refpos = aln.add_refpos(); + subrange_t subrange; + string path_name = graph.get_path_name(path); + path_name = Paths::strip_subrange(path_name, &subrange); + int64_t offset = subrange == 
PathMetadata::NO_SUBRANGE ? 0 : subrange.first; + refpos->set_name(path_name); + refpos->set_offset(offset + p.first); + refpos->set_is_reverse(p.second); + } + } } } - } -} -void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, vector& alns, size_t search_limit, const std::function* path_filter) { - for (auto& aln : alns) annotate_with_initial_path_positions(graph, aln, search_limit, path_filter); -} + void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, vector& alns, size_t search_limit, const std::function* path_filter) { + for (auto& aln : alns) annotate_with_initial_path_positions(graph, aln, search_limit, path_filter); + } -} -} + } +} \ No newline at end of file diff --git a/src/algorithms/k_widest_paths.cpp b/src/algorithms/k_widest_paths.cpp index 3576b71ae02..5bcfd0267b8 100644 --- a/src/algorithms/k_widest_paths.cpp +++ b/src/algorithms/k_widest_paths.cpp @@ -246,7 +246,7 @@ vector>> yens_k_widest_paths(const HandleGraph* g, forgotten_nodes.insert(g->flip(prev_path[j])); } - // find our path from the the spur_node to the sink + // find our path from the spur_node to the sink pair> spur_path_v = widest_dijkstra(g, spur_node, sink, node_weight_callback, edge_weight_callback, [&](handle_t h) {return forgotten_nodes.count(h);}, [&](edge_t e) {return forgotten_edges.count(e);}, diff --git a/src/cactus.cpp b/src/cactus.cpp index 7d5a6bc8ac3..a66c8e90e22 100644 --- a/src/cactus.cpp +++ b/src/cactus.cpp @@ -148,7 +148,7 @@ void getReachableBridges(stCactusEdgeEnd *edgeEnd1, stList *bridgeEnds) { } /** - * Finds an arbitrary pair of telomeres in a Cactus graph, which are are either + * Finds an arbitrary pair of telomeres in a Cactus graph, which are either * a pair of bridge edge ends or a pair of chain edge ends, oriented such that * they form a pair of boundaries. * diff --git a/src/clip.cpp b/src/clip.cpp index 0d2c5efd78d..8b078619fc9 100644 --- a/src/clip.cpp +++ b/src/clip.cpp @@ -135,7 +135,7 @@ void visit_contained_snarls(const PathPositionHandleGraph* graph, const vector>& com } cerr << strm.str(); #endif - // the the original component + // the original component components[component_idx] = move(new_components[0]); // add the remaining to the end for (size_t i = 1; i < new_components.size(); i++) { diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 118d9df45a5..aa052db955a 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1,6 +1,7 @@ #include "deconstructor.hpp" #include "traversal_finder.hpp" #include +#include "traversal_clusters.hpp" //#define debug @@ -8,8 +9,7 @@ using namespace std; namespace vg { -Deconstructor::Deconstructor() : VCFOutputCaller(""), - exhaustive_jaccard_warning(false){ +Deconstructor::Deconstructor() : VCFOutputCaller("") { } Deconstructor::~Deconstructor(){ } @@ -23,13 +23,12 @@ Deconstructor::~Deconstructor(){ * ought to become in the VCF. If a traversal is flagged off, it gets a -1. 
*/ vector Deconstructor::get_alleles(vcflib::Variant& v, - const pair, - vector>>& path_travs, + const vector& travs, + const vector>& trav_steps, int ref_path_idx, - const vector& use_trav, + const vector>& trav_clusters, char prev_char, bool use_start) const { - auto& travs = path_travs.first; assert(ref_path_idx >=0 && ref_path_idx < travs.size()); // map strings to allele numbers (and their traversal) @@ -42,14 +41,18 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, vector trav_to_allele(travs.size()); // compute the allele as a string - auto trav_to_string = [&](const SnarlTraversal& trav) { + auto trav_to_string = [&](const Traversal& trav) { string allele; - // we skip the snarl endpoints - for (int j = 1; j < trav.visit_size() - 1; ++j) { - const string& node_sequence = graph->get_sequence(graph->get_handle(trav.visit(j).node_id())); - allele += trav.visit(j).backward() ? reverse_complement(node_sequence) : node_sequence; + // hack to support star alleles + if (trav.size() == 0) { + allele = "*"; + } else { + // we skip the snarl endpoints + for (int j = 1; j < trav.size() - 1; ++j) { + allele += toUppercase(graph->get_sequence(trav[j])); + } } - return toUppercase(allele); + return allele; }; // set the reference allele @@ -59,10 +62,11 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, bool substitution = true; // set the other alleles (they can end up as 0 alleles too if their strings match the reference) - for (int i = 0; i < travs.size(); ++i) { - if (i != ref_path_idx) { - if (use_trav[i]) { - string allele = trav_to_string(travs[i]); + // note that we have one (unique) allele per cluster, so we take advantage of that here + for (const vector& cluster : trav_clusters) { + string allele = trav_to_string(travs[cluster.front()]); + for (const int& i : cluster) { + if (i != ref_path_idx) { auto ai_it = allele_idx.find(allele); if (ai_it == allele_idx.end()) { // make a new allele for this string @@ -133,15 +137,15 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, if (untangle_allele_traversals) { // set up for reference position context mapping across allele traversals - path_handle_t ref_path = graph->get_path_handle_of_step(path_travs.second.at(ref_path_idx).first); + path_handle_t ref_path = graph->get_path_handle_of_step(trav_steps.at(ref_path_idx).first); unordered_map>>> ref_dup_nodes; unordered_map ref_simple_pos; { auto& trav = travs.at(ref_path_idx); - for (size_t i = 0; i < trav.visit_size(); ++i) { - size_t j = !reversed ? i : trav.visit_size() - 1 - i; - const Visit& visit = trav.visit(j); - nid_t node_id = visit.node_id(); + for (size_t i = 0; i < trav.size(); ++i) { + size_t j = !reversed ? 
i : trav.size() - 1 - i; + const handle_t& handle = trav[j]; + nid_t node_id = graph->get_id(handle); if (ref_simple_pos.find(node_id) != ref_simple_pos.end()) continue; if (ref_dup_nodes.find(node_id) != ref_dup_nodes.end()) continue; handle_t h = graph->get_handle(node_id); @@ -188,8 +192,8 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, for (size_t i = 0; i < allele_idx_unfolded.size(); i++) { int allele_no = i; int allele_trav_no = allele_idx_unfolded[i]; - auto start_step = path_travs.second.at(allele_trav_no).first; - auto end_step = path_travs.second.at(allele_trav_no).second; + auto start_step = trav_steps.at(allele_trav_no).first; + auto end_step = trav_steps.at(allele_trav_no).second; auto start_pos = graph->get_position_of_step(start_step); auto end_pos = graph->get_position_of_step(end_step); bool flip_path = start_pos > end_pos; @@ -251,7 +255,7 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, for (auto& c : ref_contexts) { auto& ref_context = c.second; auto& ref_pos = c.first; - double j = context_jaccard(ref_context, path_context); + double j = jaccard_coefficient(ref_context, path_context); if (j > best_jaccard) { best_jaccard = j; best_pos = ref_pos; @@ -273,7 +277,7 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, int allele_no = ai_pair.second.first; int allele_trav_no = ai_pair.second.second; // update the traversal info - add_allele_path_to_info(v, allele_no, travs[allele_trav_no], reversed, !substitution); + add_allele_path_to_info(graph, v, allele_no, travs[allele_trav_no], reversed, !substitution); } } @@ -289,13 +293,18 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, } void Deconstructor::get_genotypes(vcflib::Variant& v, const vector& names, - const vector& trav_to_allele) const { + const vector& trav_to_allele, + const vector>& trav_to_cluster_info) const { assert(names.size() == trav_to_allele.size()); // set up our variant fields v.format.push_back("GT"); - if (show_path_info && path_to_sample_phase && path_restricted) { + if (show_path_info && path_to_sample_phase) { v.format.push_back("PI"); } + if (this->cluster_threshold < 1.0) { + v.format.push_back("TS"); + v.format.push_back("TL"); + } // get a list of traversals for every vcf sample // (this will be 1:1 unless we're using the path_to_sample name map) @@ -345,6 +354,18 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector& name } genotype += (chosen_travs[i] != -1 && (!conflict || keep_conflicted_genotypes)) ? 
std::to_string(trav_to_allele[chosen_travs[i]]) : "."; + if (this->cluster_threshold < 1.0) { + if (*genotype.rbegin() == '.') { + v.samples[sample_name]["TS"].push_back("."); + v.samples[sample_name]["TL"].push_back("."); + } else { + ostringstream ss; + ss.precision(3); + ss << trav_to_cluster_info[chosen_travs[i]].first; + v.samples[sample_name]["TS"].push_back(ss.str()); + v.samples[sample_name]["TL"].push_back(std::to_string(trav_to_cluster_info[chosen_travs[i]].second)); + } + } } v.samples[sample_name]["GT"] = {genotype}; if (show_path_info && path_to_sample_phase) { @@ -364,7 +385,7 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector& name } } v.samples[sample_name]["GT"] = {blank_gt}; - if (show_path_info && path_to_sample_phase && path_restricted) { + if (show_path_info && path_to_sample_phase) { v.samples[sample_name]["PI"] = {blank_gt}; } } @@ -422,9 +443,9 @@ pair, bool> Deconstructor::choose_traversals(const string& sample_na std::any_of(gbwt_phases.begin(), gbwt_phases.end(), [](int i) { return i >= 0; }); //|| path_to_sample_phase; bool phasing_conflict = false; - int sample_ploidy = ploidy; + int sample_ploidy = 1; int min_phase = 1; - int max_phase = ploidy; + int max_phase = 1; if (has_phasing || path_to_sample_phase) { if (has_phasing) { // override ploidy with information about all phases found in input @@ -490,38 +511,6 @@ pair, bool> Deconstructor::choose_traversals(const string& sample_na return make_pair(most_frequent_travs, conflict); } - -// todo refactor if we need to reuse elsewhere in vg -// implemented inline for development -// assumes sorted input -double Deconstructor::context_jaccard( - const vector& target, - const vector& query) const { - size_t node_isec = 0; - std::set_intersection(target.begin(), target.end(), - query.begin(), query.end(), - count_back_inserter(node_isec)); - size_t node_union = 0; - std::set_union(target.begin(), target.end(), - query.begin(), query.end(), - count_back_inserter(node_union)); - return (double)node_isec / (double)node_union; -} - -double Deconstructor::context_jaccard( - const dac_vector<>& target, - const vector& query) const { - size_t node_isec = 0; - std::set_intersection(target.begin(), target.end(), - query.begin(), query.end(), - count_back_inserter(node_isec)); - size_t node_union = 0; - std::set_union(target.begin(), target.end(), - query.begin(), query.end(), - count_back_inserter(node_union)); - return (double)node_isec / (double)node_union; -} - vector Deconstructor::get_context( step_handle_t start_step, step_handle_t end_step) const { @@ -564,69 +553,40 @@ vector Deconstructor::get_context( return context; } -vector Deconstructor::get_context( - const pair, - vector>>& path_travs, - const int& trav_idx) const { - step_handle_t start_step = path_travs.second[trav_idx].first; - step_handle_t end_step = path_travs.second[trav_idx].second; - return get_context(start_step, end_step); -} -bool Deconstructor::deconstruct_site(const Snarl* snarl) const { - - auto contents = snarl_manager->shallow_contents(snarl, *graph, false); - if (contents.first.empty()) { - // Nothing but the boundary nodes in this snarl +void Deconstructor::get_traversals(const handle_t& snarl_start, const handle_t& snarl_end, + vector& out_travs, + vector& out_trav_path_names, + vector>& out_trav_steps) const { + // empty snarl check + vector next_handles; + graph->follow_edges(snarl_start, false, [&](handle_t handle) { + next_handles.push_back(handle); + }); + if (next_handles.size() == 1 && next_handles.back() == 
snarl_end) {
 #ifdef debug
 #pragma omp critical (cerr)
-        cerr << "Skipping empty site " << pb2json(*snarl) << endl;
-#endif
-        return false;
+        cerr << "Skipping empty site " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl;
+#endif
+        return;
+    }
+
 #ifdef debug
 #pragma omp critical (cerr)
-    cerr << "Computing traversals of site " << pb2json(*snarl) << endl;
+    cerr << "Computing traversals of site " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl;
 #endif
 
     // find every traversal that runs through a path in the graph
-    pair<vector<SnarlTraversal>, vector<pair<step_handle_t, step_handle_t> > > path_travs;
-    path_travs = path_trav_finder->find_path_traversals(*snarl);
-    vector<string> path_trav_names;
-    for (const pair<step_handle_t, step_handle_t>& trav_ends : path_travs.second) {
-        path_trav_names.push_back(graph->get_path_name(graph->get_path_handle_of_step(trav_ends.first)));
-    }
-
-    // pick out the traversal corresponding to a reference path, breaking ties consistently
-    string ref_trav_name;
-    for (int i = 0; i < path_travs.first.size(); ++i) {
-        const string& path_trav_name = path_trav_names.at(i);
-#ifdef debug
-#pragma omp critical (cerr)
-        {
-            cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << path_travs.first[i].visit_size()
-                 << ", start=" << graph->get_position_of_step(path_travs.second[i].first)
-                 << ", end=" << graph->get_position_of_step(path_travs.second[i].second) << endl
-                 << " trav=" << pb2json(path_travs.first[i]) << endl;
-        }
-#endif
-        if (ref_paths.count(path_trav_name) &&
-            (ref_trav_name.empty() || path_trav_name < ref_trav_name)) {
-            ref_trav_name = path_trav_name;
-#ifdef debug
-#pragma omp critical (cerr)
-            cerr << "Setting ref_trav_name " << ref_trav_name << endl;
-#endif
-        }
+    std::tie(out_travs, out_trav_steps) = path_trav_finder->find_path_traversals(snarl_start, snarl_end);
+    for (const pair<step_handle_t, step_handle_t>& trav_steps : out_trav_steps) {
+        out_trav_path_names.push_back(graph->get_path_name(graph->get_path_handle_of_step(trav_steps.first)));
     }
 
     // add in the gbwt traversals
     // after this, all traversals are treated the same, with metadata embedded in their names
-    int64_t first_gbwt_trav_idx = path_trav_names.size();
-    vector<gbwt::size_type> gbwt_path_ids;
     if (gbwt_trav_finder.get() != nullptr) {
         const gbwt::GBWT& gbwt_index = gbwt_trav_finder->get_gbwt();
-        pair<vector<SnarlTraversal>, vector<gbwt::size_type>> thread_travs = gbwt_trav_finder->find_path_traversals(*snarl);
+        pair<vector<Traversal>, vector<gbwt::size_type>> thread_travs = gbwt_trav_finder->find_path_traversals(snarl_start, snarl_end);
         for (int i = 0; i < thread_travs.first.size(); ++i) {
             // We need to get a bunch of metadata about the path, but the GBWT
             // we have might not even have structured path names stored.
@@ -635,7 +595,6 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const {
                 continue;
             }
-            gbwt_path_ids.push_back(path_id);
             PathSense sense = gbwtgraph::get_path_sense(gbwt_index, path_id, gbwt_reference_samples);
 
             if (sense == PathSense::HAPLOTYPE) {
@@ -648,26 +607,151 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const {
                 gbwtgraph::get_path_haplotype(gbwt_index, path_id, sense),
                 gbwtgraph::get_path_phase_block(gbwt_index, path_id, sense),
                 gbwtgraph::get_path_subrange(gbwt_index, path_id, sense));
-            path_trav_names.push_back(path_name);
-            path_travs.first.push_back(thread_travs.first[i]);
-            // dummy handles so we can use the same code as the named path traversals above
-            path_travs.second.push_back(make_pair(step_handle_t(), step_handle_t()));
+            out_trav_path_names.push_back(path_name);
+            out_travs.push_back(std::move(thread_travs.first[i]));
+        }
+    }
+}
+
+unordered_map<string, vector<int>> Deconstructor::add_star_traversals(vector<Traversal>& travs,
+                                                                      vector<string>& names,
+                                                                      vector<vector<int>>& trav_clusters,
+                                                                      vector<pair<double, int64_t>>& trav_cluster_info,
+                                                                      const unordered_map<string, vector<int>>& parent_haplotypes) const {
+    // todo: refactor this into general genotyping code
+    unordered_map<string, vector<int>> sample_to_haps;
+
+    // find out what's in the traversals
+    assert(names.size() == travs.size());
+    for (int64_t i = 0; i < names.size(); ++i) {
+        string sample_name = PathMetadata::parse_sample_name(names[i]);
+        // for backward compatibility
+        if (sample_name.empty()) {
+            sample_name = names[i];
+        }
+        auto phase = PathMetadata::parse_haplotype(names[i]);
+        if (!sample_name.empty() && phase == PathMetadata::NO_HAPLOTYPE) {
+            // This probably won't fit in an int. Use 0 instead.
+            phase = 0;
+        }
+        sample_to_haps[sample_name].push_back(phase);
+    }
+
+    // find everything that's in parent_haplotypes but not the traversals,
+    // and add in dummy star-alleles for them
+    for (const auto& parent_sample_haps : parent_haplotypes) {
+        string parent_sample_name = PathMetadata::parse_sample_name(parent_sample_haps.first);
+        if (parent_sample_name.empty()) {
+            parent_sample_name = parent_sample_haps.first;
+        }
+        if (!this->sample_names.count(parent_sample_name)) {
+            // don't bother for purely reference samples -- we don't need to force an allele for them.
+            continue;
+        }
+        for (int parent_hap : parent_sample_haps.second) {
+            bool found = false;
+            if (sample_to_haps.count(parent_sample_haps.first)) {
+                // note: this is a brute-force search, but the number of haplotypes is usually tiny.
+                for (int hap : sample_to_haps[parent_sample_haps.first]) {
+                    if (parent_hap == hap) {
+                        found = true;
+                        break;
+                    }
+                }
+            }
+            if (!found) {
+                travs.push_back(Traversal());
+                names.push_back(PathMetadata::create_path_name(PathSense::REFERENCE,
+                                                               parent_sample_haps.first,
+                                                               "star",
+                                                               parent_hap,
+                                                               PathMetadata::NO_PHASE_BLOCK,
+                                                               PathMetadata::NO_SUBRANGE));
+                sample_to_haps[parent_sample_haps.first].push_back(parent_hap);
+                trav_clusters.push_back({(int)travs.size() - 1});
+                trav_cluster_info.push_back(make_pair(0, 0));
             }
         }
     }
+    return sample_to_haps;
+}
+
+
+bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end,
+                                     const NestingInfo* in_nesting_info,
+                                     vector<NestingInfo>* out_nesting_infos) const {
+
+
+    vector<Traversal> travs;
+    vector<string> trav_path_names;
+    // note that this vector (unlike the above two) is for embedded paths only (not GBWT)
+    vector<pair<step_handle_t, step_handle_t>> trav_steps;
+
+    // compute all the traversals from embedded paths and gbwt
+    this->get_traversals(snarl_start, snarl_end, travs, trav_path_names, trav_steps);
+    int64_t trav_count = travs.size();
+    int64_t trav_step_count = trav_steps.size();
+
+    if (travs.empty()) {
+        return false;
+    }
+
+    // pick out the traversal corresponding to an embedded reference path, breaking ties consistently
+    string ref_trav_name;
+    string parent_ref_trav_name;
+    if (in_nesting_info != nullptr && in_nesting_info->has_ref) {
+        parent_ref_trav_name = graph->get_path_name(graph->get_path_handle_of_step(in_nesting_info->parent_path_interval.first));
+#ifdef debug
+#pragma omp critical (cerr)
+        cerr << "Using nesting information to set reference to " << parent_ref_trav_name << endl;
+#endif
+        // remember it for the vcf header
+        this->off_ref_paths[omp_get_thread_num()].insert(graph->get_path_handle_of_step(in_nesting_info->parent_path_interval.first));
+    }
+    for (int i = 0; i < travs.size(); ++i) {
+        const string& path_trav_name = trav_path_names[i];
+#ifdef debug
+#pragma omp critical (cerr)
+        {
+            cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << travs[i].size();
+            if (i < trav_steps.size()) {
+                cerr << ", start=" << graph->get_position_of_step(trav_steps[i].first)
+                     << ", end=" << graph->get_position_of_step(trav_steps[i].second) << endl;
+            }
+            cerr << " trav=" << traversal_to_string(graph, travs[i]) << endl;
+        }
+#endif
+        bool ref_path_check;
+        if (!parent_ref_trav_name.empty()) {
+            // the reference was specified by the parent
+            ref_path_check = path_trav_name == parent_ref_trav_name;
+        } else {
+            // the reference comes from the global options
+            ref_path_check = ref_paths.count(path_trav_name);
+        }
+        if (ref_path_check &&
+            (ref_trav_name.empty() || path_trav_name < ref_trav_name)) {
+            ref_trav_name = path_trav_name;
+#ifdef debug
+#pragma omp critical (cerr)
+            cerr << "Setting ref_trav_name " << ref_trav_name << (in_nesting_info ? " using nesting info" : "") << endl;
" using nesting info" : "") << endl; +#endif + } + } + // remember all the reference traversals (there can be more than one only in the case of a // cycle in the reference path // in case of cycles, we need our allele traversals to be associated to the correct reference position // this is done with the path jaccard metric over all overlapping reference paths the given path_jaccard_window size - vector ref_travs; // hacky subpath support -- gets added to variant on output vector ref_offsets; if (!ref_trav_name.empty()) { - for (int i = 0; i < path_travs.first.size(); ++i) { - const string& path_trav_name = path_trav_names.at(i); + for (int i = 0; i < trav_steps.size(); ++i) { + const string& path_trav_name = trav_path_names.at(i); subrange_t subrange ; Paths::strip_subrange(path_trav_name, &subrange); int64_t sub_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; @@ -676,7 +760,7 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { ref_offsets.push_back(sub_offset); #ifdef debug #pragma omp critical (cerr) - cerr << "Adding ref_tav idx=" << i << " offset=" << sub_offset << " because " << path_trav_name << " == " << ref_trav_name << endl; + cerr << "Adding ref_trav idx=" << i << " offset=" << sub_offset << " because " << path_trav_name << " == " << ref_trav_name << endl; #endif } } @@ -687,240 +771,339 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { if (ref_travs.empty()) { #ifdef debug #pragma omp critical (cerr) - cerr << "Skipping site because no reference traversal was found " << pb2json(*snarl) << endl; + cerr << "Skipping site because no reference traversal was found " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl; #endif return false; } - - bool exhaustive = !path_restricted && gbwt_trav_finder.get() == nullptr; - if (exhaustive) { - // add in the exhaustive traversals - vector additional_travs; - - // exhaustive traversal can't do all snarls - if (snarl->type() != ULTRABUBBLE) { - return false; - } - if (!check_max_nodes(snarl)) { -#pragma omp critical (cerr) - cerr << "Warning: Skipping site because it is too complex for exhaustive traversal enumeration: " << pb2json(*snarl) << endl << " Consider using -e to traverse embedded paths" << endl; - return false; - } - additional_travs = explicit_exhaustive_traversals(snarl); - - // happens when there was a nested non-ultrabubble snarl - if (additional_travs.empty()) { - return false; - } - path_travs.first.insert(path_travs.first.end(), additional_travs.begin(), additional_travs.end()); - for (int i = 0; i < additional_travs.size(); ++i) { - // dummy names so we can use the same code as the named path traversals above - path_trav_names.push_back(" >>" + std::to_string(i)); - // dummy handles so we can use the same code as the named path traversals above - path_travs.second.push_back(make_pair(step_handle_t(), step_handle_t())); - } - - } // there's not alt path through the snarl, so we can't make an interesting variant - if (path_travs.first.size() < 2) { + if (travs.size() < 2) { #ifdef debug #pragma omp critical (cerr) - cerr << "Skipping site because to alt traversal was found " << pb2json(*snarl) << endl; + cerr << "Skipping site because to alt traversal was found " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl; #endif return false; } + if (ref_travs.size() > 1 && this->nested_decomposition) { +#ifdef debug +#pragma omp critical (cerr) + cerr << "Multiple ref traversals not yet supported with nested decomposition: removing all but first" 
+#endif
+        size_t min_start_pos = numeric_limits<size_t>::max();
+        int64_t first_ref_trav;
+        for (int64_t i = 0; i < ref_travs.size(); ++i) {
+            auto& ref_trav_idx = ref_travs[i];
+            step_handle_t start_step = trav_steps[ref_trav_idx].first;
+            step_handle_t end_step = trav_steps[ref_trav_idx].second;
+            size_t ref_trav_pos = min(graph->get_position_of_step(start_step), graph->get_position_of_step(end_step));
+            if (ref_trav_pos < min_start_pos) {
+                min_start_pos = ref_trav_pos;
+                first_ref_trav = i;
+            }
+        }
+        ref_travs = {ref_travs[first_ref_trav]};
+    }
+
     // XXX CHECKME this assumes there is only one reference path here, and that multiple traversals are due to cycles
 
     // we collect windows around the reference traversals
     // to compare with equivalent windows from the alternate allele paths
     // we will associate these 1:1 with reference traversals
 
-    // remember that path_travs := pair<vector<SnarlTraversal>, vector<pair<step_handle_t, step_handle_t> > > path_travs;
+    // remember that path_travs := pair<vector<Traversal>, vector<pair<step_handle_t, step_handle_t> > > path_travs;
 
     // map from each path_trav index to the ref_trav index it best maps to
     vector<int> path_trav_to_ref_trav;
 
-    if (ref_travs.size() > 1 && this->path_jaccard_window && exhaustive && !exhaustive_jaccard_warning) {
-#pragma omp critical (cerr)
-        cerr << "warning [vg deconstruct]: Conext Jaccard logic for multiple references disabled with exhaustive traversals. Use -e, -g or GBZ input to switch to path-based traversals only (recommended)." << endl;
-        exhaustive_jaccard_warning = true;
-    }
-    if (ref_travs.size() > 1 && this->path_jaccard_window && !exhaustive) {
-        path_trav_to_ref_trav.resize(path_travs.first.size());
+    if (ref_travs.size() > 1 && this->path_jaccard_window) {
+        path_trav_to_ref_trav.resize(trav_steps.size());
 #ifdef debug
 #pragma omp critical (cerr)
         cerr << "Multiple ref traversals!" << endl;
 #endif
-        {
-            vector<vector<nid_t>> ref_contexts(ref_travs.size());
 #pragma omp parallel for schedule(dynamic,1)
-            for (size_t i = 0; i < ref_travs.size(); ++i) {
-                auto& trav_id = ref_travs[i];
-                ref_contexts[i] = get_context(path_travs, trav_id);
-            }
-            // now for each traversal, we compute and equivalent context and match it to a ref context
-            // using a jaccard metric over node ids
+        vector<vector<nid_t>> ref_contexts(ref_travs.size());
 #pragma omp parallel for schedule(dynamic,1)
-            for (size_t i = 0; i < path_travs.first.size(); ++i) {
-                vector<nid_t> context = get_context(path_travs, i);
-                // map jaccard metric to the index of the ref_trav
-                vector<pair<double, uint64_t>> ref_mappings;
-                for (uint64_t j = 0; j < ref_travs.size(); ++j) {
-                    ref_mappings.push_back(make_pair(
-                                               context_jaccard(
-                                                   ref_contexts[j],
-                                                   context),
-                                               ref_travs[j]));
-                }
-                std::sort(ref_mappings.begin(), ref_mappings.end());
-                // the best is the last, which has the highest jaccard
-                path_trav_to_ref_trav[i] = ref_mappings.back().second;
+        for (size_t i = 0; i < ref_travs.size(); ++i) {
+            ref_contexts[i] = get_context(trav_steps[ref_travs[i]].first, trav_steps[ref_travs[i]].second);
+        }
+
+        // now for each traversal, we compute an equivalent context and match it to a ref context
+        // using a jaccard metric over node ids
 #pragma omp parallel for schedule(dynamic,1)
+        for (size_t i = 0; i < trav_steps.size(); ++i) {
+            vector<nid_t> context = get_context(trav_steps[i].first, trav_steps[i].second);
+            // map jaccard metric to the index of the ref_trav
+            vector<pair<double, uint64_t>> ref_mappings;
+            for (uint64_t j = 0; j < ref_travs.size(); ++j) {
+                ref_mappings.push_back(make_pair(
+                                           jaccard_coefficient(
+                                               ref_contexts[j],
+                                               context),
+                                           ref_travs[j]));
            }
+            std::stable_sort(ref_mappings.begin(), ref_mappings.end());
+            // the best is the last, which has the highest jaccard
+
path_trav_to_ref_trav[i] = ref_mappings.back().second; } } // we write a variant for every reference traversal // (optionally) selecting the subset of path traversals that are 1:1 -//#pragma omp parallel for for (size_t i = 0; i < ref_travs.size(); ++i) { -//#pragma omp task firstprivate(i) - { - auto& ref_trav_idx = ref_travs[i]; - auto& ref_trav_offset = ref_offsets[i]; - - const SnarlTraversal& ref_trav = path_travs.first[ref_trav_idx]; - - vcflib::Variant v; - v.quality = 60; - - // in VCF we usually just want the contig - string contig_name = PathMetadata::parse_locus_name(ref_trav_name); - if (contig_name == PathMetadata::NO_LOCUS_NAME) { + // we zap these to their original size, as the nesting logic can add + // dummy traversals and these are reference-specific (and so need to be cleaned each iteration here) + travs.resize(trav_count); + trav_path_names.resize(trav_count); + trav_steps.resize(trav_step_count); + auto& ref_trav_idx = ref_travs[i]; + auto& ref_trav_offset = ref_offsets[i]; + + const Traversal& ref_trav = travs[ref_trav_idx]; + + vcflib::Variant v; + v.quality = 60; + + // in VCF we usually just want the contig + string contig_name = PathMetadata::parse_locus_name(ref_trav_name); + if (contig_name == PathMetadata::NO_LOCUS_NAME) { + contig_name = ref_trav_name; + } else if (long_ref_contig) { + // the sample name isn't unique enough, so put a full ugly name in the vcf + if (PathMetadata::parse_sense(ref_trav_name) == PathSense::GENERIC) { contig_name = ref_trav_name; - } else if (long_ref_contig) { - // the sample name isn't unique enough, so put a full ugly name in the vcf - if (PathMetadata::parse_sense(ref_trav_name) == PathSense::GENERIC) { - contig_name = ref_trav_name; - } else { - contig_name = PathMetadata::create_path_name(PathSense::REFERENCE, - PathMetadata::parse_sample_name(ref_trav_name), - contig_name, - PathMetadata::parse_haplotype(ref_trav_name), - PathMetadata::NO_PHASE_BLOCK, - PathMetadata::NO_SUBRANGE); - } + } else { + contig_name = PathMetadata::create_path_name(PathSense::REFERENCE, + PathMetadata::parse_sample_name(ref_trav_name), + contig_name, + PathMetadata::parse_haplotype(ref_trav_name), + PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); } + } - // write variant's sequenceName (VCF contig) - v.sequenceName = contig_name; - - // Map our snarl endpoints to oriented positions in the embedded path in the graph - handle_t first_path_handle; - size_t first_path_pos; - bool use_start; - assert(ref_trav_idx < first_gbwt_trav_idx); - step_handle_t start_step = path_travs.second[ref_trav_idx].first; - step_handle_t end_step = path_travs.second[ref_trav_idx].second; - handle_t start_handle = graph->get_handle_of_step(start_step); - handle_t end_handle = graph->get_handle_of_step(end_step); - size_t start_pos = graph->get_position_of_step(start_step); - size_t end_pos = graph->get_position_of_step(end_step); - use_start = start_pos < end_pos; - first_path_handle = use_start ? start_handle : end_handle; - first_path_pos = use_start ? 
start_pos : end_pos; + // write variant's sequenceName (VCF contig) + v.sequenceName = contig_name; + + // Map our snarl endpoints to oriented positions in the embedded path in the graph + handle_t first_path_handle; + size_t first_path_pos; + bool use_start; + step_handle_t start_step = trav_steps[ref_trav_idx].first; + step_handle_t end_step = trav_steps[ref_trav_idx].second; + handle_t start_handle = graph->get_handle_of_step(start_step); + handle_t end_handle = graph->get_handle_of_step(end_step); + size_t start_pos = graph->get_position_of_step(start_step); + size_t end_pos = graph->get_position_of_step(end_step); + use_start = start_pos < end_pos; + first_path_handle = use_start ? start_handle : end_handle; + first_path_pos = use_start ? start_pos : end_pos; - // Get the first visit of our snarl traversal - const Visit& first_trav_visit = use_start ? ref_trav.visit(0) : ref_trav.visit(ref_trav.visit_size() - 1); - - char prev_char; - if ((use_start && first_trav_visit.backward() == graph->get_is_reverse(first_path_handle)) || - (!use_start && first_trav_visit.backward() != graph->get_is_reverse(first_path_handle))) { - // Our path and traversal have consistent orientation. leave off the end of the start node going forward - first_path_pos += graph->get_length(first_path_handle); - prev_char = ::toupper(graph->get_sequence(first_path_handle)[graph->get_length(first_path_handle) - 1]); - } else { - // They are flipped: leave off the beginning of the start node going backward - prev_char = reverse_complement(::toupper(graph->get_sequence(first_path_handle)[0])); - } + // Get the first visit of our snarl traversal + const handle_t& first_trav_handle = use_start ? ref_trav.front() : ref_trav.back(); + + char prev_char; + if ((use_start && graph->get_is_reverse(first_trav_handle) == graph->get_is_reverse(first_path_handle)) || + (!use_start && graph->get_is_reverse(first_trav_handle) != graph->get_is_reverse(first_path_handle))) { + // Our path and traversal have consistent orientation. 
leave off the end of the start node going forward + first_path_pos += graph->get_length(first_path_handle); + prev_char = ::toupper(graph->get_sequence(first_path_handle)[graph->get_length(first_path_handle) - 1]); + } else { + // They are flipped: leave off the beginning of the start node going backward + prev_char = reverse_complement(::toupper(graph->get_sequence(first_path_handle)[0])); + } - // shift from 0-based to 1-based for VCF - first_path_pos += 1; + // shift from 0-based to 1-based for VCF + first_path_pos += 1; - v.position = first_path_pos + ref_trav_offset; + v.position = first_path_pos + ref_trav_offset; - v.id = print_snarl(*snarl); + v.id = print_snarl(graph, snarl_start, snarl_end); - // Convert the snarl traversals to strings and add them to the variant - vector use_trav(path_travs.first.size()); - if (path_trav_to_ref_trav.size()) { - for (uint64_t i = 0; i < use_trav.size(); ++i) { - use_trav[i] = (ref_trav_idx == path_trav_to_ref_trav[i]); + // Convert the snarl traversals to strings and add them to the variant + vector use_trav(travs.size()); + if (path_trav_to_ref_trav.size()) { + for (uint64_t i = 0; i < use_trav.size(); ++i) { + use_trav[i] = (ref_trav_idx == path_trav_to_ref_trav[i]); + } + } else { + for (uint64_t i = 0; i < use_trav.size(); ++i) { + use_trav[i] = true; + } + } + + if (std::none_of(use_trav.begin(), use_trav.end(), [](bool b) {return b;})) { + // no alts were jaccard-assigned to this reference, so abort before an assertion gets tripped + continue; + } + + // Sort the traversals for clustering + vector sorted_travs = get_traversal_order(graph, travs, trav_path_names, ref_travs, ref_trav_idx, use_trav); + + // jaccard clustering (using handles for now) on traversals + vector> trav_cluster_info; + vector child_snarl_to_trav; + vector> trav_clusters = cluster_traversals(graph, travs, sorted_travs, + (in_nesting_info ? in_nesting_info->child_snarls : + vector>()), + cluster_threshold, + trav_cluster_info, + child_snarl_to_trav); + +#ifdef debug + cerr << "cluster priority"; + for (const auto& t: sorted_travs) { + cerr << " " << t; + } + cerr << endl; + for (const auto& tc : trav_clusters) { + cerr << "traversal cluster: { "; + for (const auto& t: tc) { + cerr << t << "(" << trav_cluster_info[t].first << "," << trav_cluster_info[t].second << ") "; + } + cerr << " }" << endl; + } +#endif + + unordered_map> sample_to_haps; + if (in_nesting_info != nullptr) { + // if the reference traversal is also an alt traversal, we pop out an extra copy + // todo: this is a hack to add off-reference support while keeping the current + // logic where the reference traversal is always distinct from the alts. this step + // could be avoided, but it would come at the cost of some detailed refactoring of the + // allele getting code...
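// The cluster_traversals() call above merges traversals whose handle sets
// are similar enough under cluster_threshold, which the header below
// describes as a handle jaccard coefficient. A minimal sketch of such a
// measure over sorted, oriented node IDs (illustrative only -- jaccard_ids
// is a hypothetical helper, not the vg API):
//
//     #include <algorithm>
//     #include <iterator>
//     #include <vector>
//
//     double jaccard_ids(std::vector<uint64_t> a, std::vector<uint64_t> b) {
//         std::sort(a.begin(), a.end());
//         std::sort(b.begin(), b.end());
//         std::vector<uint64_t> isec, uni;
//         std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(isec));
//         std::set_union(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(uni));
//         return uni.empty() ? 1.0 : double(isec.size()) / double(uni.size());
//     }
//
// With the default cluster_threshold of 1.0, only identical traversals get
// merged, which preserves deconstruct's historical behavior.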
+ string ref_sample_name = PathMetadata::parse_sample_name(trav_path_names[ref_trav_idx]); + if (this->sample_names.count(ref_sample_name)) { + int alt_trav_copy = travs.size(); + travs.push_back(travs[ref_trav_idx]); + trav_path_names.push_back(trav_path_names[ref_trav_idx]); + trav_cluster_info.push_back(make_pair(0, 0)); + if (trav_steps.size() == travs.size()) { + trav_steps.push_back(trav_steps[ref_trav_idx]); } - } else { - for (uint64_t i = 0; i < use_trav.size(); ++i) { - use_trav[i] = true; + bool found_cluster = false; + for (vector& cluster : trav_clusters) { + if (cluster[0] == ref_trav_idx) { + found_cluster = true; + cluster.push_back(alt_trav_copy); + break; + } } + assert(found_cluster == true); + } + + // add in the star alleles -- these are alleles that were genotyped in the parent but not + // the current site, and are treated as *'s in VCF. + if (this->star_allele) { + sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info, + in_nesting_info->sample_to_haplotypes); } + + } - vector trav_to_allele = get_alleles(v, path_travs, - ref_trav_idx, - use_trav, - prev_char, use_start); + vector trav_to_allele = get_alleles(v, travs, trav_steps, + ref_trav_idx, + trav_clusters, + prev_char, use_start); - // Fill in the genotypes - if (path_restricted || gbwt_trav_finder.get()) { - get_genotypes(v, path_trav_names, trav_to_allele); - } + +#ifdef debug + assert(trav_to_allele.size() == travs.size()); + cerr << "trav_to_allele ="; + for (const auto& tta : trav_to_allele) { + cerr << " " << tta; + } + cerr << endl; +#endif + + // Fill in the genotypes + get_genotypes(v, trav_path_names, trav_to_allele, trav_cluster_info); + + // Fill in some nesting-specific (site-level) tags + NestingInfo ref_info; // since in_nesting_info is const, we put top-level stuff here + if (this->nested_decomposition) { + if (in_nesting_info != nullptr && in_nesting_info->has_ref == true) { + // if we're a child, just take what's passed in + ref_info.parent_allele = in_nesting_info->parent_allele; + ref_info.parent_len = in_nesting_info->parent_len; + ref_info.parent_ref_len = in_nesting_info->parent_ref_len; + ref_info.lv0_ref_name = in_nesting_info->lv0_ref_name; + ref_info.lv0_ref_start = in_nesting_info->lv0_ref_start; + ref_info.lv0_ref_len = in_nesting_info->lv0_ref_len; + ref_info.lv0_alt_len = in_nesting_info->lv0_alt_len; + } else { + // if we're a root, compute values from the present site + // todo: should they just be left undefined?
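+ // (the lv0_* fields carry the level-0, i.e. top-level, reference context:
+ // a root site fills them in from itself just below, while a child site
+ // inherits them unchanged, so every nested record can point back to the
+ // same top-level interval)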
+ ref_info.parent_allele = 0; + ref_info.parent_len = v.alleles[0].length(); + ref_info.parent_ref_len = v.alleles[0].length(); + ref_info.lv0_ref_name = v.sequenceName; + ref_info.lv0_ref_start = v.position; + ref_info.lv0_ref_len = v.alleles[0].length(); + ref_info.lv0_alt_len = v.alleles[ref_info.parent_allele].length(); + } + v.info["PA"].push_back(std::to_string(ref_info.parent_allele)); + v.info["PL"].push_back(std::to_string(ref_info.parent_len)); + v.info["PR"].push_back(std::to_string(ref_info.parent_ref_len)); + v.info["RC"].push_back(ref_info.lv0_ref_name); + v.info["RS"].push_back(std::to_string(ref_info.lv0_ref_start)); + v.info["RD"].push_back(std::to_string(ref_info.lv0_ref_len)); + v.info["RL"].push_back(std::to_string(ref_info.lv0_alt_len)); + } - // we only bother printing out sites with at least 1 non-reference allele - if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) { - if (path_restricted || gbwt_trav_finder.get()) { - // run vcffixup to add some basic INFO like AC - vcf_fixup(v); + if (i == 0 && out_nesting_infos != nullptr) { + // we pass some information down to the children + // todo: do/can we consider all the different reference intervals? + // right now, the info passed is hopefully coarse-grained enough not to matter? + assert(in_nesting_info != nullptr && + in_nesting_info->child_snarls.size() == out_nesting_infos->size()); + + for (int64_t j = 0; j < out_nesting_infos->size(); ++j) { + out_nesting_infos->at(j).child_snarls.clear(); + out_nesting_infos->at(j).has_ref = false; + if (child_snarl_to_trav[j] >= 0) { + if (child_snarl_to_trav[j] < trav_steps.size()) { + NestingInfo& child_info = out_nesting_infos->at(j); + child_info.has_ref = true; + child_info.parent_path_interval = trav_steps[child_snarl_to_trav[j]]; + child_info.sample_to_haplotypes = sample_to_haps; + child_info.parent_allele = trav_to_allele[child_snarl_to_trav[j]] >= 0 ? + trav_to_allele[child_snarl_to_trav[j]] : 0; + child_info.parent_len = v.alleles[child_info.parent_allele].length(); + child_info.parent_ref_len = v.alleles[0].length(); + child_info.lv0_ref_name = ref_info.lv0_ref_name; + child_info.lv0_ref_start = ref_info.lv0_ref_start; + child_info.lv0_ref_len = ref_info.lv0_ref_len; + if (in_nesting_info == nullptr || in_nesting_info->has_ref == false) { + // we're the parent of root, so we want to set this here + child_info.lv0_alt_len = child_info.parent_len; + } else { + child_info.lv0_alt_len = ref_info.lv0_alt_len; + } + } } - add_variant(v); + } + } + + // we only bother printing out sites with at least 1 non-reference allele + if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) { + // run vcffixup to add some basic INFO like AC + vcf_fixup(v); + bool added = add_variant(v); + if (!added) { + stringstream ss; + ss << v; + cerr << "Warning [vg deconstruct]: Skipping variant at " << v.sequenceName << ":" << v.position + << " with ID=" << v.id << " because its line length of " << ss.str().length() << " exceeds vg's limit of " + << VCFOutputCaller::max_vcf_line_length << endl; + return false; } } } -//#pragma omp taskwait return true; } -/** - * Convenience wrapper function for deconstruction of multiple paths.
- */ -void Deconstructor::deconstruct(vector ref_paths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager, - bool path_restricted_traversals, - int ploidy, - bool include_nested, - int context_jaccard_window, - bool untangle_traversals, - bool keep_conflicted, - bool strict_conflicts, - bool long_ref_contig, - gbwt::GBWT* gbwt) { - - this->graph = graph; - this->snarl_manager = snarl_manager; - this->path_restricted = path_restricted_traversals; - this->ploidy = ploidy; - this->ref_paths = set(ref_paths.begin(), ref_paths.end()); - this->include_nested = include_nested; - this->path_jaccard_window = context_jaccard_window; - this->untangle_allele_traversals = untangle_traversals; - this->keep_conflicted_genotypes = keep_conflicted; - this->strict_conflict_checking = strict_conflicts; - if (gbwt) { - this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt); - } - - // the need to use nesting is due to a problem with omp tasks and shared state - // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 1) - omp_set_nested(1); - omp_set_max_active_levels(3); - +string Deconstructor::get_vcf_header() { // Keep track of the non-reference paths in the graph. They'll be our sample names ref_samples.clear(); set ref_haplotypes; @@ -929,7 +1112,7 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand ref_haplotypes.insert(PathMetadata::parse_haplotype(ref_path_name)); } if (!long_ref_contig) { - long_ref_contig = ref_samples.size() > 1 || ref_haplotypes.size() > 1; + long_ref_contig = ref_samples.size() > 1 || ref_haplotypes.size() > 1 || nested_decomposition; } this->long_ref_contig = long_ref_contig; sample_names.clear(); @@ -984,6 +1167,13 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand } } + if (sample_to_haps.empty()) { + cerr << "Error [vg deconstruct]: No paths found for alt alleles in the graph. Note that " + << "exhaustive path-free traversal finding is no longer supported, and vg deconstruct " + << "now only works on embedded paths and GBWT threads." 
<< endl; + exit(1); + } + // find some stats about the haplotypes for each sample gbwt_sample_to_phase_range.clear(); sample_ploidys.clear(); @@ -995,10 +1185,8 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand // print the VCF header stringstream stream; stream << "##fileformat=VCFv4.2" << endl; - if (path_restricted || gbwt) { - stream << "##FORMAT=" << endl; - } - if (show_path_info && path_to_sample_phase && path_restricted) { + stream << "##FORMAT=" << endl; + if (show_path_info && path_to_sample_phase) { stream << "##FORMAT=" << endl; } if (path_to_sample_phase || gbwt) { @@ -1008,208 +1196,252 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand } stream << "\">" << endl; } - if (path_restricted || gbwt) { - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; + if (path_to_sample_phase && cluster_threshold < 1) { + stream << "##FORMAT=" + << endl; + stream << "##FORMAT=" + << endl; + } + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; if (include_nested) { stream << "##INFO=" << endl; stream << "##INFO=" << endl; } + if (this->nested_decomposition) { + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + } if (untangle_allele_traversals) { stream << "##INFO=|<][id]_[start|.]_[end|.], with '.' indicating non-reference nodes.\">" << endl; } else { stream << "##INFO=" << endl; } - set gbwt_ref_paths; + + stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; + for (auto& sample_name : sample_names) { + stream << "\t" << sample_name; + } + stream << endl; + return stream.str(); +} + +string Deconstructor::add_contigs_to_vcf_header(const string& vcf_header) const { + + vector header_lines = split_delims(vcf_header, "\n"); + + stringstream patched_header; + for (int64_t i = 0; i < header_lines.size() - 1; ++i) { + patched_header << header_lines[i] << "\n"; + } + + set all_ref_paths = this->ref_paths; + + // add in the off-ref paths that nested deconstruction may have found + for (const unordered_set& off_ref_path_set : this->off_ref_paths) { + for (const path_handle_t& off_ref_path : off_ref_path_set) { + all_ref_paths.insert(graph->get_path_name(off_ref_path)); + } + } + map ref_path_to_length; - for(auto& refpath : ref_paths) { - if (graph->has_path(refpath)) { - int64_t path_len = 0; - path_handle_t path_handle = graph->get_path_handle(refpath); - for (handle_t handle : graph->scan_path(path_handle)) { - path_len += graph->get_length(handle); + for(auto& refpath : all_ref_paths) { + assert(graph->has_path(refpath)); + int64_t path_len = 0; + path_handle_t path_handle = graph->get_path_handle(refpath); + for (handle_t handle : graph->scan_path(path_handle)) { + path_len += graph->get_length(handle); + } + string locus_name = graph->get_locus_name(path_handle); + if (locus_name == PathMetadata::NO_LOCUS_NAME) { + locus_name = refpath; + } else if (long_ref_contig) { + // the sample name isn't unique enough, so put a full ugly name in the vcf + if (graph->get_sense(path_handle) == PathSense::GENERIC) { + locus_name = graph->get_path_name(path_handle); + } else { + locus_name = PathMetadata::create_path_name(PathSense::REFERENCE, + graph->get_sample_name(path_handle), + locus_name, + graph->get_haplotype(path_handle), + 
PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); } - string locus_name = graph->get_locus_name(path_handle); - if (locus_name == PathMetadata::NO_LOCUS_NAME) { - locus_name = refpath; - } else if (long_ref_contig) { - // the sample name isn't unique enough, so put a full ugly name in the vcf - if (graph->get_sense(path_handle) == PathSense::GENERIC) { - locus_name = graph->get_path_name(path_handle); - } else { - locus_name = PathMetadata::create_path_name(PathSense::REFERENCE, - graph->get_sample_name(path_handle), - locus_name, - graph->get_haplotype(path_handle), - PathMetadata::NO_PHASE_BLOCK, - PathMetadata::NO_SUBRANGE); - } - } + } - subrange_t subrange = graph->get_subrange(path_handle); - int64_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; - ref_path_to_length[locus_name] = std::max(ref_path_to_length[locus_name], path_len + offset); - } else { - gbwt_ref_paths.insert(refpath); - } + subrange_t subrange = graph->get_subrange(path_handle); + int64_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; + ref_path_to_length[locus_name] = std::max(ref_path_to_length[locus_name], path_len + offset); } for (auto& ref_path_len : ref_path_to_length) { - stream << "##contig=" << endl; + patched_header << "##contig=" << endl; } - if (!gbwt_ref_paths.empty()) { - unordered_map> gbwt_name_to_ids; - for (size_t i = 0; i < gbwt->metadata.paths(); i++) { - // Collect all the GBWT path IDs for each sample and contig. - gbwt_name_to_ids[compose_short_path_name(*gbwt, i)].push_back(i); + + assert(header_lines.back().substr(0, 6) == "#CHROM"); + patched_header << header_lines.back(); + return patched_header.str(); +} + +void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { + + vector snarls; + vector queue; + + // read all our snarls into a list + snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { + queue.push_back(snarl); + }); + if (include_nested) { + while (!queue.empty()) { + const Snarl* snarl = queue.back(); + queue.pop_back(); + snarls.push_back(snarl); + const vector& children = snarl_manager->children_of(snarl); + queue.insert(queue.end(), children.begin(), children.end()); } - for (const string& refpath : gbwt_ref_paths) { - // For each sample and contig name that is a GBWT ref path - vector& thread_ids = gbwt_name_to_ids.at(refpath); - size_t path_len = 0; - for (gbwt::size_type thread_id : thread_ids) { - // For each actual path in the GBWT for that sample-and-contig, - // we need to see how long it extends the space of the sample - // and contig. - - // TODO: These are probably all guaranteed to be haplotype sense? - PathSense sense = gbwtgraph::get_path_sense(*gbwt, thread_id, gbwt_reference_samples); - subrange_t subrange = gbwtgraph::get_path_subrange(*gbwt, thread_id, sense); - - // TODO: when importing GFAs we might cram the start of a walk - // into the GBWT count field. But we don't ever guarantee that - // we've done that so it might not be visible as a subrange - // here. Fix that somehow??? - size_t offset = subrange == PathMetadata::NO_SUBRANGE ? 
0 : subrange.first; - size_t len = path_to_length(extract_gbwt_path(*graph, *gbwt, thread_id)); - path_len = std::max(path_len, offset + len); + } else { + swap(snarls, queue); + } + + // process the whole shebang in parallel +#pragma omp parallel for schedule(dynamic,1) + for (size_t i = 0; i < snarls.size(); i++) { + deconstruct_site(graph->get_handle(snarls[i]->start().node_id(), snarls[i]->start().backward()), + graph->get_handle(snarls[i]->end().node_id(), snarls[i]->end().backward())); + } +} + +void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) { + // logic copied from vg call (graph_caller.cpp) + + size_t thread_count = get_thread_count(); + this->off_ref_paths.clear(); + this->off_ref_paths.resize(get_thread_count()); + // Used to recurse on children of parents that can't be called + vector>> snarl_queue(thread_count); + + // Run the deconstructor on a snarl, and queue up the children if it fails + auto process_snarl = [&](const Snarl* snarl, NestingInfo nesting_info) { + if (!snarl_manager->is_trivial(snarl, *graph)) { + const vector& children = snarl_manager->children_of(snarl); + assert(nesting_info.child_snarls.empty()); + for (const Snarl* child : children) { + nesting_info.child_snarls.push_back(make_pair(graph->get_handle(child->start().node_id(), child->start().backward()), + graph->get_handle(child->end().node_id(), child->end().backward()))); + } - stream << "##contig=" << endl; + vector out_nesting_infos(children.size()); + bool was_deconstructed = deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), + graph->get_handle(snarl->end().node_id(), snarl->end().backward()), + include_nested ? &nesting_info : nullptr, + include_nested ? &out_nesting_infos : nullptr); + if (include_nested || !was_deconstructed) { + vector>& thread_queue = snarl_queue[omp_get_thread_num()]; + for (int64_t i = 0; i < children.size(); ++i) { + thread_queue.push_back(make_pair(children[i], out_nesting_infos[i])); + } + + } } + }; + + // Start with the top level snarls + // (note, can't do for_each_top_level_snarl_parallel() because interface won't take nesting info) + vector> top_level_snarls; + snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { + NestingInfo nesting_info; + nesting_info.has_ref = false; + top_level_snarls.push_back(make_pair(snarl, nesting_info)); + }); +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < top_level_snarls.size(); ++i) { + process_snarl(top_level_snarls[i].first, top_level_snarls[i].second); } - - stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; - if (path_restricted || gbwt) { - for (auto& sample_name : sample_names) { - stream << "\t" << sample_name; + + // Then recurse on any children the snarl caller failed to handle + while (!std::all_of(snarl_queue.begin(), snarl_queue.end(), + [](const vector>& snarl_vec) {return snarl_vec.empty();})) { + vector> cur_queue; + for (vector>& thread_queue : snarl_queue) { + cur_queue.reserve(cur_queue.size() + thread_queue.size()); + std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); + thread_queue.clear(); + } +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < cur_queue.size(); ++i) { + process_snarl(cur_queue[i].first, cur_queue[i].second); + } + } - stream << endl; - - string hstr = stream.str(); - assert(output_vcf.openForOutput(hstr)); - cout << output_vcf.header << endl; +} - // create the traversal finder - map reads_by_name; - path_trav_finder = unique_ptr(new 
PathTraversalFinder(*graph, - *snarl_manager)); - - if (!path_restricted && !gbwt) { - trav_finder = unique_ptr(new ExhaustiveTraversalFinder(*graph, - *snarl_manager, - true)); +/** + * Convenience wrapper function for deconstruction of multiple paths. + */ +void Deconstructor::deconstruct(vector ref_paths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager, + bool include_nested, + int context_jaccard_window, + bool untangle_traversals, + bool keep_conflicted, + bool strict_conflicts, + bool long_ref_contig, + double cluster_threshold, + gbwt::GBWT* gbwt, + bool nested_decomposition, + bool star_allele) { + this->graph = graph; + this->ref_paths = set(ref_paths.begin(), ref_paths.end()); + this->include_nested = include_nested || nested_decomposition; + this->path_jaccard_window = context_jaccard_window; + this->untangle_allele_traversals = untangle_traversals; + this->keep_conflicted_genotypes = keep_conflicted; + this->strict_conflict_checking = strict_conflicts; + this->long_ref_contig = long_ref_contig; + if (gbwt) { + this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt); } + this->cluster_threshold = cluster_threshold; + this->gbwt = gbwt; + this->nested_decomposition = nested_decomposition; + this->star_allele = star_allele; + + // the need to use nesting is due to a problem with omp tasks and shared state + // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 1) + omp_set_nested(1); + omp_set_max_active_levels(3); + // create the traversal finder + map reads_by_name; + path_trav_finder = unique_ptr(new PathTraversalFinder(*graph)); + if (gbwt != nullptr) { gbwt_trav_finder = unique_ptr(new GBWTTraversalFinder(*graph, *gbwt)); } - vector snarls_todo; - // Do the top-level snarls in parallel - snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { - vector todo(1, snarl); - vector next; - while (!todo.empty()) { - for (auto next_snarl : todo) { - // if we can't make a variant from the snarl due to not finding - // paths through it, we try again on the children - // note: we may want to push the parallelism down a bit -#pragma omp critical (snarls_todo) - snarls_todo.push_back(next_snarl); - if (include_nested) { - // n.b. 
we no longer attempt to deconstruct the site to determine if we nest - const vector& children = snarl_manager->children_of(next_snarl); - next.insert(next.end(), children.begin(), children.end()); - } - } - swap(todo, next); - next.clear(); - } - }); + string hstr = this->get_vcf_header(); + assert(output_vcf.openForOutput(hstr)); -//#pragma omp parallel -//#pragma omp single - { -#pragma omp parallel for schedule(dynamic,1) - for (size_t i = 0; i < snarls_todo.size(); i++) { -//#pragma omp task firstprivate(i) - { - auto& snarl = snarls_todo[i]; - deconstruct_site(snarl); - } - } + if (nested_decomposition) { + deconstruct_graph_top_down(snarl_manager); + } else { + deconstruct_graph(snarl_manager); } -//#pragma omp taskwait + + string patched_header = this->add_contigs_to_vcf_header(output_vcf.header); + cout << patched_header << endl; // write variants in sorted order write_variants(cout, snarl_manager); } -bool Deconstructor::check_max_nodes(const Snarl* snarl) const { - unordered_set nodeset = snarl_manager->deep_contents(snarl, *graph, false).first; - int node_count = 0; - for (auto node_id : nodeset) { - handle_t node = graph->get_handle(node_id); - if (graph->get_degree(node, true) > 1 || graph->get_degree(node, false) > 1) { - ++node_count; - if (node_count > max_nodes_for_exhaustive) { - return false; - } - } - } - return true; -}; - -vector Deconstructor::explicit_exhaustive_traversals(const Snarl* snarl) const { - vector out_travs; - bool ultra_all_the_way_down = true; - function extend_trav = - [&](const SnarlTraversal& trav, const Snarl& nested_snarl) { - // exhaustive traversal finder is limited. if we find something - // that's not an ultrabubble, not much we can do - if (nested_snarl.type() != ULTRABUBBLE) { - ultra_all_the_way_down = false; - return; - } - vector nested_travs = trav_finder->find_traversals(nested_snarl); - for (auto& nested_trav : nested_travs) { - SnarlTraversal extended_trav = trav; - bool is_explicit = true; - for (int i = 0; i < nested_trav.visit_size(); ++i) { - if (nested_trav.visit(i).node_id() != 0) { - Visit* visit = extended_trav.add_visit(); - *visit = nested_trav.visit(i); - } else { - extend_trav(extended_trav, nested_trav.visit(i).snarl()); - is_explicit = false; - } - } - if (is_explicit) { - out_travs.push_back(extended_trav); - } - } - }; - SnarlTraversal trav; - extend_trav(trav, *snarl); - if (!ultra_all_the_way_down) { - out_travs.clear(); - } - return out_travs; -} } diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index a8c66e369bb..ee537495a8e 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -38,33 +38,94 @@ class Deconstructor : public VCFOutputCaller { // deconstruct the entire graph to cout. // Not even a little bit thread safe. 
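// A minimal sketch of driving the updated entry point declared below,
// assuming a graph, snarl manager, and (optional) GBWT are already loaded;
// the argument values shown are only illustrative:
//
//     Deconstructor decon;
//     decon.deconstruct({"chr1"}, graph, snarl_manager,
//                       false,    // include_nested
//                       10000,    // context_jaccard_window
//                       false,    // untangle_traversals
//                       false,    // keep_conflicted
//                       false,    // strict_conflicts
//                       false,    // long_ref_contig
//                       1.0,      // cluster_threshold
//                       nullptr,  // gbwt
//                       false,    // nested_decomposition
//                       false);   // star_allele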
- void deconstruct(vector refpaths, const PathPositionHandleGraph* grpah, SnarlManager* snarl_manager, - bool path_restricted_traversals, - int ploidy, + void deconstruct(vector refpaths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager, bool include_nested, int context_jaccard_window, bool untangle_traversals, bool keep_conflicted, bool strict_conflicts, bool long_ref_contig, - gbwt::GBWT* gbwt = nullptr); + double cluster_threshold = 1.0, + gbwt::GBWT* gbwt = nullptr, + bool nested_decomposition = false, + bool star_allele = false); private: + // initialize the vcf and get the header + string get_vcf_header(); + + // the header needs to be initialized *before* construction for vcflib + // but we don't know all the non-ref contigs (in nested mode) until *after* + // construction. end result: this hacky function to patch them in before printing + string add_contigs_to_vcf_header(const string& vcf_header) const; + + // deconstruct all snarls in parallel (ie nesting relationship ignored) + void deconstruct_graph(SnarlManager* snarl_manager); + + // deconstruct all top-level snarls in parallel + // nested snarls are processed after their parents in the same thread + // (same logic as vg call) + void deconstruct_graph_top_down(SnarlManager* snarl_manager); + + // some information we pass from parent to child site when + // doing nested deconstruction + struct NestingInfo { + bool has_ref; + vector> child_snarls; + PathInterval parent_path_interval; + unordered_map> sample_to_haplotypes; + int parent_allele; + int64_t parent_len; + int64_t parent_ref_len; + string lv0_ref_name; + int64_t lv0_ref_start; + int64_t lv0_ref_len; + int64_t lv0_alt_len; + }; + // write a vcf record for the given site. returns true if a record was written // (need to have a path going through the site) - bool deconstruct_site(const Snarl* site) const; + // the nesting_info structs are optional and used to pass reference information through nested sites... + // the output nesting_info vector receives an entry for each child snarl + bool deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end, + const NestingInfo* in_nesting_info = nullptr, + vector* out_nesting_infos = nullptr) const; + + // get the traversals for a given site + // this returns a combination of embedded path traversals and gbwt traversals + // the embedded paths come first, and only they get trav_steps. + // so you can use trav_steps.size() to find the index of the first gbwt traversal... + void get_traversals(const handle_t& snarl_start, const handle_t& snarl_end, + vector& out_travs, + vector& out_trav_path_names, + vector>& out_trav_steps) const; + + // this is a hack to add in * alleles -- these are haplotypes that we genotyped in the + // parent but aren't represented in any of the traversals found in the current + // site. *-alleles are represented as empty traversals. + // todo: conflicts arising from alt-cycles can lead to conflicting + // results -- need to overhaul code to pass more detailed traversal information + // from parent to child to have a chance at consistently resolving + // star traversals are appended onto travs and trav_names + // this function returns a map containing both parent and child haplotypes + unordered_map> add_star_traversals(vector& travs, + vector& trav_names, + vector>& trav_clusters, + vector>& trav_cluster_info, + const unordered_map>& parent_haplotypes) const; // convert traversals to strings. 
returns mapping of traversal (offset in travs) to allele vector get_alleles(vcflib::Variant& v, - const pair, - vector>>& path_travs, + const vector& travs, + const vector>& trav_steps, int ref_path_idx, - const vector& use_trav, + const vector>& trav_clusters, char prev_char, bool use_start) const; // write traversal path names as genotypes - void get_genotypes(vcflib::Variant& v, const vector& names, const vector& trav_to_allele) const; + void get_genotypes(vcflib::Variant& v, const vector& names, const vector& trav_to_allele, + const vector>& trav_to_cluster_info) const; // given a set of traversals associated with a particular sample, select a set of size for the VCF // the highest-frequency ALT traversal is chosen @@ -74,49 +135,19 @@ class Deconstructor : public VCFOutputCaller { const vector& trav_to_name, const vector& gbwt_phases) const; - // check to see if a snarl is too big to exhaustively traverse - bool check_max_nodes(const Snarl* snarl) const; - - // get traversals from the exhaustive finder. if they have nested visits, fill them in (exhaustively) - // with node visits - vector explicit_exhaustive_traversals(const Snarl* snarl) const; - - // gets a sorted node id context for a given path - vector get_context( - const pair, - vector>>& path_travs, - const int& trav_idx) const; - // the underlying context-getter vector get_context( step_handle_t start_step, step_handle_t end_step) const; - - // compares node contexts - double context_jaccard(const vector& target, - const vector& query) const; - - // specialization for enc_vectors - double context_jaccard( - const dac_vector<>& target, - const vector& query) const; - // toggle between exhaustive and path restricted traversal finder - bool path_restricted = false; - - // the max ploidy we expect. - int ploidy; - // the graph const PathPositionHandleGraph* graph; - // the snarl manager - SnarlManager* snarl_manager; + // the gbwt + gbwt::GBWT* gbwt; // the traversal finders. we always use a path traversal finder to get the reference path unique_ptr path_trav_finder; - // we optionally use another (exhaustive for now) traversal finder if we don't want to rely on paths - unique_ptr trav_finder; // we can also use a gbwt for traversals unique_ptr gbwt_trav_finder; // When using the gbwt we need some precomputed information to ask about stored paths. @@ -128,6 +159,10 @@ class Deconstructor : public VCFOutputCaller { // the ref paths set ref_paths; + // the off-ref paths that may be found during nested deconstruction + // (buffered by thread) + mutable vector> off_ref_paths; + // keep track of reference samples set ref_samples; @@ -143,9 +178,6 @@ class Deconstructor : public VCFOutputCaller { // the sample ploidys given in the phases in our path names unordered_map sample_ploidys; - // upper limit of degree-2+ nodes for exhaustive traversal - int max_nodes_for_exhaustive = 100; - // target window size for determining the correct reference position for allele traversals with path jaccard int path_jaccard_window = 10000; @@ -161,25 +193,22 @@ class Deconstructor : public VCFOutputCaller { // should we keep conflicted genotypes or not bool keep_conflicted_genotypes = false; - // warn about context jaccard not working with exhaustive traversals - mutable atomic exhaustive_jaccard_warning; -}; + // used to merge together similar traversals (to keep allele counts down) + // currently implemented as handle jaccard coefficient. 
So 1 means only + // merge if identical (which is what deconstruct has always done) + double cluster_threshold = 1.0; -// helpel for measuring set intersectiond and union size -template -class count_back_inserter { - size_t &count; -public: - typedef void value_type; - typedef void difference_type; - typedef void pointer; - typedef void reference; - typedef std::output_iterator_tag iterator_category; - count_back_inserter(size_t &count) : count(count) {}; - void operator=(const T &){ ++count; } - count_back_inserter &operator *(){ return *this; } - count_back_inserter &operator++(){ return *this; } + // activate the new nested decomposition mode, which is like the old include_nested + // (which lives in vcfoutputcaller) but with more of an effort to link + // the parent and child snarls, as well as better support for nested insertions + bool nested_decomposition = false; + + // use *-alleles to represent spanning alleles that do not cross site but do go around it + // ex: a big containing deletion + // only works with nested_decomposition + bool star_allele = false; }; + } #endif diff --git a/src/gbwt_extender.cpp b/src/gbwt_extender.cpp index 9a46b21e25a..ee0eed7dd9f 100644 --- a/src/gbwt_extender.cpp +++ b/src/gbwt_extender.cpp @@ -2206,7 +2206,14 @@ WFAAlignment WFAExtender::connect(std::string sequence, pos_t from, pos_t to) co } WFAAlignment WFAExtender::suffix(const std::string& sequence, pos_t from) const { - return this->connect(sequence, from, pos_t(0, false, 0)); + WFAAlignment result = this->connect(sequence, from, pos_t(0, false, 0)); + + if (!result.edits.empty() && result.length == sequence.length() && (result.edits.back().first == WFAAlignment::match || result.edits.back().first == WFAAlignment::mismatch)) { + // The alignment used all of the sequence and has a match/mismatch at the appropriate end + result.score += this->aligner->full_length_bonus; + } + + return result; } WFAAlignment WFAExtender::prefix(const std::string& sequence, pos_t to) const { @@ -2219,6 +2226,10 @@ WFAAlignment WFAExtender::prefix(const std::string& sequence, pos_t to) const { WFAAlignment result = this->connect(reverse_complement(sequence), to, pos_t(0, false, 0)); result.flip(*(this->graph), sequence); + if (!result.edits.empty() && result.length == sequence.length() && (result.edits.front().first == WFAAlignment::match || result.edits.front().first == WFAAlignment::mismatch)) { + result.score += this->aligner->full_length_bonus; + } + return result; } diff --git a/src/gbwt_extender.hpp b/src/gbwt_extender.hpp index 362b8571a5e..1ea8d48710d 100644 --- a/src/gbwt_extender.hpp +++ b/src/gbwt_extender.hpp @@ -426,9 +426,11 @@ class WFAExtender { * entire sequence with an acceptable score, returns the highest-scoring * partial alignment, which may be empty. * + * Applies the full-length bonus if the result ends with a match or mismatch. + * TODO: Use the full-length bonus to determine the optimal alignment. + * * NOTE: This creates a suffix of the full alignment by aligning a * prefix of the sequence. - * TODO: Should we use full-length bonuses? */ WFAAlignment suffix(const std::string& sequence, pos_t from) const; @@ -438,9 +440,11 @@ class WFAExtender { * sequence with an acceptable score, returns the highest-scoring partial * alignment, which may be empty. * + * Applies the full-length bonus if the result begins with a match or mismatch. + * TODO: Use the full-length bonus to determine the optimal alignment. 
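+ * As with suffix(), the bonus is only applied when the aligned length
+ * spans the entire query sequence.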
+ * * NOTE: This creates a prefix of the full alignment by aligning a suffix * of the sequence. - * TODO: Should we use full-length bonuses? */ WFAAlignment prefix(const std::string& sequence, pos_t to) const; diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 5dd250282c5..e3ce68321f2 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -238,15 +238,20 @@ string VCFOutputCaller::vcf_header(const PathHandleGraph& graph, const vector VCFOutputCaller::max_vcf_line_length) { + return false; + } + int ret = zstdutil::CompressString(ss.str(), dest); + assert(ret == 0); // the Variant object is too big to keep in memory when there are many genotypes, so we // store it in a zstd-compressed string output_variants[omp_get_thread_num()].push_back(make_pair(make_pair(var.sequenceName, var.position), dest)); + return true; } void VCFOutputCaller::write_variants(ostream& out_stream, const SnarlManager* snarl_manager) { @@ -265,7 +270,8 @@ void VCFOutputCaller::write_variants(ostream& out_stream, const SnarlManager* sn }); for (auto v : all_variants) { string dest; - zstdutil::DecompressString(v.second, dest); + int ret = zstdutil::DecompressString(v.second, dest); + assert(ret == 0); out_stream << dest << endl; } } @@ -366,6 +372,17 @@ void VCFOutputCaller::set_nested(bool nested) { include_nested = nested; } +void VCFOutputCaller::add_allele_path_to_info(const HandleGraph* graph, vcflib::Variant& v, int allele, const Traversal& trav, + bool reversed, bool one_based) const { + SnarlTraversal proto_trav; + for (const handle_t& handle : trav) { + Visit* visit = proto_trav.add_visit(); + visit->set_node_id(graph->get_id(handle)); + visit->set_backward(graph->get_is_reverse(handle)); + } + this->add_allele_path_to_info(v, allele, proto_trav, reversed, one_based); +} + void VCFOutputCaller::add_allele_path_to_info(vcflib::Variant& v, int allele, const SnarlTraversal& trav, bool reversed, bool one_based) const { auto& trav_info = v.info["AT"]; @@ -408,6 +425,10 @@ void VCFOutputCaller::add_allele_path_to_info(vcflib::Variant& v, int allele, co } prev_visit = &visit; } + if (trav_info[allele].empty()) { + // note: * alleles get empty traversals + trav_info[allele] = "."; + } } string VCFOutputCaller::trav_string(const HandleGraph& graph, const SnarlTraversal& trav) const { @@ -423,7 +444,7 @@ string VCFOutputCaller::trav_string(const HandleGraph& graph, const SnarlTravers return seq; } -void VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller, +bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller, const Snarl& snarl, const vector& called_traversals, const vector& genotype, int ref_trav_idx, const unique_ptr& call_info, const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy, @@ -582,8 +603,17 @@ void VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa #endif if (genotype_snarls || !out_variant.alt.empty()) { - add_variant(out_variant); + bool added = add_variant(out_variant); + if (!added) { + stringstream ss; + ss << out_variant; + cerr << "Warning [vg call]: Skipping variant at " << out_variant.sequenceName << ":" << out_variant.position + << " with ID=" << out_variant.id << " because its line length of " << ss.str().length() << " exceeds vg's limit of " + << VCFOutputCaller::max_vcf_line_length << endl; + } + return added; } + return true; } tuple VCFOutputCaller::get_ref_interval( @@ -726,6 +756,18 @@ void 
VCFOutputCaller::flatten_common_allele_ends(vcflib::Variant& variant, bool } } +string VCFOutputCaller::print_snarl(const HandleGraph* graph, const handle_t& snarl_start, + const handle_t& snarl_end, bool in_brackets) const { + Snarl snarl; + Visit* start = snarl.mutable_start(); + start->set_node_id(graph->get_id(snarl_start)); + start->set_backward(graph->get_is_reverse(snarl_start)); + Visit* end = snarl.mutable_end(); + end->set_node_id(graph->get_id(snarl_end)); + end->set_backward(graph->get_is_reverse(snarl_end)); + return this->print_snarl(snarl, in_brackets); +} + string VCFOutputCaller::print_snarl(const Snarl& snarl, bool in_brackets) const { // todo, should we canonicalize here by putting lexicographic lowest node first? nid_t start_node_id = snarl.start().node_id(); @@ -986,7 +1028,8 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager for (auto& thread_buf : output_variants) { for (auto& output_variant_record : thread_buf) { string output_variant_string; - zstdutil::DecompressString(output_variant_record.second, output_variant_string); + int ret = zstdutil::DecompressString(output_variant_record.second, output_variant_string); + assert(ret == 0); vector toks = split_delims(output_variant_string, "\t", 4); names_in_vcf.insert(toks[2]); } @@ -1019,7 +1062,8 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager auto& thread_buf = output_variants[i]; for (auto& output_variant_record : thread_buf) { string output_variant_string; - zstdutil::DecompressString(output_variant_record.second, output_variant_string); + int ret = zstdutil::DecompressString(output_variant_record.second, output_variant_string); + assert(ret == 0); //string& output_variant_string = output_variant_record.second; vector toks = split_delims(output_variant_string, "\t", 9); const string& name = toks[2]; @@ -1043,7 +1087,8 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager } } output_variant_record.second.clear(); - zstdutil::CompressString(output_variant_string, output_variant_record.second); + ret = zstdutil::CompressString(output_variant_string, output_variant_record.second); + assert(ret == 0); } } } @@ -1431,10 +1476,9 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); // emit our vcf variant - emit_variant(graph, snarl_caller, snarl, called_traversals, genotype, 0, call_info, path_name, ref_offsets.find(path_name)->second, false, + was_called = emit_variant(graph, snarl_caller, snarl, called_traversals, genotype, 0, call_info, path_name, ref_offsets.find(path_name)->second, false, ploidy); - was_called = true; } } if (!is_vg) { @@ -1844,15 +1888,16 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { assert(trav_genotype.empty() || trav_genotype.size() == ploidy); + bool added = true; if (!gaf_output) { - emit_variant(graph, snarl_caller, snarl, travs, trav_genotype, ref_trav_idx, trav_call_info, ref_path_name, - ref_offsets[ref_path_name], genotype_snarls, ploidy); + added = emit_variant(graph, snarl_caller, snarl, travs, trav_genotype, ref_trav_idx, trav_call_info, ref_path_name, + ref_offsets[ref_path_name], genotype_snarls, ploidy); } else { pair pos_info = get_ref_position(graph, snarl, ref_path_name, ref_offsets[ref_path_name]); emit_gaf_variant(graph, print_snarl(snarl), travs, trav_genotype, ref_trav_idx, pos_info.first, pos_info.second, &support_finder); } - ret_val = trav_genotype.size() == ploidy; + ret_val = 
trav_genotype.size() == ploidy && added; } return ret_val; @@ -2235,10 +2280,13 @@ bool NestedFlowCaller::emit_snarl_recursive(const Snarl& managed_snarl, int ploi return flatten_alt_allele(allele_string, std::min(allele_ploidy-1, genotype_allele), allele_ploidy, call_table); } }; - + if (!gaf_output) { - emit_variant(graph, snarl_caller, managed_snarl, record.travs, genotype.first, record.ref_trav_idx, genotype.second, record.ref_path_name, - ref_offsets[record.ref_path_name], genotype_snarls, ploidy, trav_to_flat_string); + bool added = emit_variant(graph, snarl_caller, managed_snarl, record.travs, genotype.first, record.ref_trav_idx, genotype.second, record.ref_path_name, + ref_offsets[record.ref_path_name], genotype_snarls, ploidy, trav_to_flat_string); + if (!added) { + return false; + } } else { // todo: // emit_gaf_variant(graph, snarl, travs, trav_genotype); } @@ -2302,7 +2350,7 @@ string NestedFlowCaller::flatten_alt_allele(const string& nested_allele, int all // todo: passing in a single ploidy is simplistic, would need to derive from the calls when // recursing // in practice, the results will be nearly the same but still needs fixing - // we try to get the the allele from the genotype if possible, but fallback on the fallback_allele + // we try to get the allele from the genotype if possible, but fallback on the fallback_allele int trav_allele = fallback_allele >= 0 ? fallback_allele : record.genotype_by_ploidy[ploidy-1].first[allele]; const SnarlTraversal& traversal = record.travs[trav_allele]; string nested_snarl_allele = trav_string(graph, traversal); diff --git a/src/graph_caller.hpp index 7bdcfe9d880..526dcbef573 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -85,7 +85,8 @@ class VCFOutputCaller { const vector& contig_length_overrides) const; /// Add a variant to our buffer - void add_variant(vcflib::Variant& var) const; + /// Returns false if the variant line length exceeds VCFOutputCaller::max_vcf_line_length + bool add_variant(vcflib::Variant& var) const; /// Sort then write variants in the buffer /// snarl_manager needed if include_nested is true @@ -103,13 +104,18 @@ class VCFOutputCaller { protected: /// add a traversal to the VCF info field in the format of a GFA W-line or GAF path + void add_allele_path_to_info(const HandleGraph* graph, vcflib::Variant& v, int allele, + const Traversal& trav, bool reversed, bool one_based) const; + /// legacy version of above void add_allele_path_to_info(vcflib::Variant& v, int allele, const SnarlTraversal& trav, bool reversed, bool one_based) const; + /// convert a traversal into an allele string string trav_string(const HandleGraph& graph, const SnarlTraversal& trav) const; /// print a vcf variant - void emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller, + /// return value is taken from add_variant (see above) + bool emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller, const Snarl& snarl, const vector& called_traversals, const vector& genotype, int ref_trav_idx, const unique_ptr& call_info, const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy, @@ -131,6 +137,8 @@ class VCFOutputCaller { /// print a snarl in a consistent form like >3435<12222 /// if in_brackets set to true, do (>3435<12222) instead (this is only used for nested caller) + string print_snarl(const HandleGraph* graph, const handle_t& snarl_start, const handle_t& snarl_end, bool in_brackets = false) const; + /// legacy version of above string 
print_snarl(const Snarl& snarl, bool in_brackets = false) const; /// do the opposite of above @@ -160,6 +168,9 @@ class VCFOutputCaller { // need to write LV/PS info tags bool include_nested; + + // prevent giant variants + static const int64_t max_vcf_line_length = 2000000000; }; /** diff --git a/src/index_registry.cpp index 83a9c1b4a58..8f48b15ddc6 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -780,7 +780,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { } } - // we'll partition sequences that have the same samples (chunking the the VCFs + // we'll partition sequences that have the same samples (chunking the VCFs // ultimately requires that we do this) map, vector> sample_set_contigs; for (int i = 0; i < vcf_samples.size(); ++i) { diff --git a/src/io/register_libvg_io.cpp index 94c0a643a8c..4d8fb51f603 100644 --- a/src/io/register_libvg_io.cpp +++ b/src/io/register_libvg_io.cpp @@ -20,6 +20,7 @@ #include "register_loader_saver_packed_graph.hpp" #include "register_loader_saver_hash_graph.hpp" #include "register_loader_saver_gfa.hpp" +#include "register_loader_params_json.hpp" #include "register_libvg_io.hpp" @@ -46,6 +47,7 @@ bool register_libvg_io() { register_loader_saver_xg(); register_loader_saver_packed_graph(); register_loader_saver_hash_graph(); + register_loader_params_json(); return true; } diff --git a/src/io/register_loader_params_json.cpp new file mode 100644 index 00000000000..bc82deb0235 --- /dev/null +++ b/src/io/register_loader_params_json.cpp @@ -0,0 +1,29 @@ +/** + * \file register_loader_params_json.cpp + * Defines IO for embedded parameters. + */ + +#include +#include "register_loader_params_json.hpp" + +#include + + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_params_json() { + Registry::register_loader("PARAMS_JSON", wrap_bare_loader([](std::istream& stream) -> void* { + // Read the whole stream with an iterator. See . + return new std::string(std::istreambuf_iterator(stream), {}); + })); +} + +} + +} + diff --git a/src/io/register_loader_params_json.hpp new file mode 100644 index 00000000000..9455c73b36e --- /dev/null +++ b/src/io/register_loader_params_json.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_PARAMS_JSON_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_PARAMS_JSON_HPP_INCLUDED + +/** + * \file register_loader_params_json.hpp + * Defines IO for embedded parameters. 
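+ *
+ * The loader registered under the "PARAMS_JSON" tag simply reads the whole
+ * stream into a std::string; interpreting the JSON is left to the caller.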
+ */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_params_json(); + +} + +} + +#endif diff --git a/src/mcmc_caller.cpp index 956123d6f3c..de526ce3955 100644 --- a/src/mcmc_caller.cpp +++ b/src/mcmc_caller.cpp @@ -96,7 +96,7 @@ namespace vg { return false; } - PathTraversalFinder trav_finder(*path_position_handle_graph, snarl_manager, ref_paths); + PathTraversalFinder trav_finder(*path_position_handle_graph, ref_paths); auto trav_results = trav_finder.find_path_traversals(snarl); vector ref_path = trav_results.first; diff --git a/src/minimizer_mapper.cpp index 4cc4224368e..9def875f42e 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -1364,6 +1364,14 @@ pair, vector> MinimizerMapper::map_paired(Alignment // Map single-ended and bail std::array, 2> mapped_pair = {map(aln1), map(aln2)}; + // We have no way to know which mappings ought to go with each other, so pad out + // the shorter list with copies of the first item, so we can report all mappings + // we got. + for (size_t index = 0; index < 2; index++) { + while (mapped_pair[index].size() < mapped_pair[1 - index].size()) { + mapped_pair[index].push_back(mapped_pair[index].front()); + } + } pair_all(mapped_pair); return {std::move(mapped_pair[0]), std::move(mapped_pair[1])}; } diff --git a/src/multipath_alignment.cpp index 3f5c0d4d51f..d91ae71a03a 100644 --- a/src/multipath_alignment.cpp +++ b/src/multipath_alignment.cpp @@ -77,7 +77,7 @@ namespace vg { return *this; } - multipath_alignment_t::~multipath_alignment_t() { + multipath_alignment_t::~multipath_alignment_t() noexcept { while (!_annotation.empty()) { clear_annotation(_annotation.begin()->first); } diff --git a/src/multipath_alignment_graph.cpp index a9cf989d726..6af414160c6 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -58,7 +58,7 @@ namespace vg { } MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, - const vector, Path>>& path_chunks, + const vector, path_t>>& path_chunks, const Alignment& alignment, const function(id_t)>& project, const unordered_multimap>& injection_trans, bool realign_Ns, bool preserve_tail_anchors, vector* path_node_provenance) { @@ -75,7 +75,7 @@ namespace vg { } MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, - const vector, Path>>& path_chunks, + const vector, path_t>>& path_chunks, const Alignment& alignment, const function(id_t)>& project, bool realign_Ns, bool preserve_tail_anchors, vector* path_node_provenance) : MultipathAlignmentGraph(graph, path_chunks, alignment, project, @@ -86,7 +86,7 @@ namespace vg { } MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, - const vector, Path>>& path_chunks, + const vector, path_t>>& path_chunks, const Alignment& alignment, const unordered_map>& projection_trans, bool realign_Ns, bool preserve_tail_anchors, vector* path_node_provenance) : MultipathAlignmentGraph(graph, path_chunks, alignment, create_projector(projection_trans), @@ -172,8 +172,9 @@ namespace vg { } // shim the aligned path into the path chunks constructor to make a node for it - vector, Path>> path_holder; - path_holder.emplace_back(make_pair(alignment.sequence().begin(), alignment.sequence().end()), alignment.path()); + vector, path_t>> path_holder; + path_holder.emplace_back(make_pair(alignment.sequence().begin(), alignment.sequence().end()), path_t()); + 
from_proto_path(alignment.path(), path_holder.front().second); create_path_chunk_nodes(graph, path_holder, alignment, project, injection_trans); // cut the snarls out of the aligned path so we can realign through them @@ -205,7 +206,7 @@ namespace vg { // Nothing to do } - void MultipathAlignmentGraph::create_path_chunk_nodes(const HandleGraph& graph, const vector, Path>>& path_chunks, + void MultipathAlignmentGraph::create_path_chunk_nodes(const HandleGraph& graph, const vector, path_t>>& path_chunks, const Alignment& alignment, const function(id_t)>& project, const unordered_multimap>& injection_trans, vector* path_node_provenance) { @@ -216,7 +217,7 @@ namespace vg { cerr << "performing DFS to walk out path " << pb2json(path_chunk.second) << endl; #endif - const Path& path = path_chunk.second; + const auto& path = path_chunk.second; auto range = injection_trans.equal_range(path.mapping(0).position().node_id()); for (auto iter = range.first; iter != range.second; iter++) { @@ -248,7 +249,7 @@ namespace vg { #endif pair projected_trav = project(graph.get_id(trav)); - const Position& pos = path.mapping(stack.size() - 1).position(); + const auto& pos = path.mapping(stack.size() - 1).position(); if (projected_trav.first == pos.node_id() && projected_trav.second == (projected_trav.second != graph.get_is_reverse(trav))) { @@ -291,8 +292,8 @@ namespace vg { // move over the path for (size_t i = 0; i < path.mapping_size(); i++) { - const Mapping& mapping = path.mapping(i); - const Position& position = mapping.position(); + const auto& mapping = path.mapping(i); + const auto& position = mapping.position(); auto& stack_record = stack[i]; handle_t& trav = stack_record.second[stack_record.first - 1]; @@ -307,7 +308,7 @@ namespace vg { new_position->set_offset(position.offset()); for (int64_t j = 0; j < mapping.edit_size(); j++) { - from_proto_edit(mapping.edit(j), *new_mapping->add_edit()); + *new_mapping->add_edit() = mapping.edit(j); } } diff --git a/src/multipath_alignment_graph.hpp b/src/multipath_alignment_graph.hpp index 90b3de547a7..752c9f56cd0 100644 --- a/src/multipath_alignment_graph.hpp +++ b/src/multipath_alignment_graph.hpp @@ -85,19 +85,19 @@ namespace vg { /// Construct a graph of the reachability between aligned chunks in a linearized /// path graph. Produces a graph with reachability edges. 
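/// (The path chunks now carry path_t rather than protobuf Path; an existing
/// protobuf Path can be converted with the from_proto_path() helper used in
/// the .cpp above, e.g.: path_t p; from_proto_path(aln.path(), p);)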
- MultipathAlignmentGraph(const HandleGraph& graph, const vector, Path>>& path_chunks, + MultipathAlignmentGraph(const HandleGraph& graph, const vector, path_t>>& path_chunks, const Alignment& alignment, const function(id_t)>& project, const unordered_multimap>& injection_trans, bool realign_Ns = true, bool preserve_tail_anchors = false, vector* path_node_provenance = nullptr); /// Same as the previous constructor, but construct injection_trans implicitly and temporarily - MultipathAlignmentGraph(const HandleGraph& graph, const vector, Path>>& path_chunks, + MultipathAlignmentGraph(const HandleGraph& graph, const vector, path_t>>& path_chunks, const Alignment& alignment, const unordered_map>& projection_trans, bool realign_Ns = true, bool preserve_tail_anchors = false, vector* path_node_provenance = nullptr); /// Same as the previous constructor, but construct injection_trans implicitly and temporarily /// and using a lambda for a projector - MultipathAlignmentGraph(const HandleGraph& graph, const vector, Path>>& path_chunks, + MultipathAlignmentGraph(const HandleGraph& graph, const vector, path_t>>& path_chunks, const Alignment& alignment, const function(id_t)>& project, bool realign_Ns = true, bool preserve_tail_anchors = false, vector* path_node_provenance = nullptr); @@ -261,7 +261,7 @@ namespace vg { int64_t* removed_end_from_length = nullptr); /// Add the path chunks as nodes to the connectivity graph - void create_path_chunk_nodes(const HandleGraph& graph, const vector, Path>>& path_chunks, + void create_path_chunk_nodes(const HandleGraph& graph, const vector, path_t>>& path_chunks, const Alignment& alignment, const function(id_t)>& project, const unordered_multimap>& injection_trans, vector* path_node_provenance = nullptr); diff --git a/src/path.hpp b/src/path.hpp index b50208186b0..e1039393a4b 100644 --- a/src/path.hpp +++ b/src/path.hpp @@ -1,4 +1,4 @@ - #ifndef VG_PATH_HPP_INCLUDED +#ifndef VG_PATH_HPP_INCLUDED #define VG_PATH_HPP_INCLUDED #include @@ -379,12 +379,12 @@ Alignment alignment_from_path(const HandleGraph& graph, const Path& path); */ class edit_t { public: - edit_t() = default; - edit_t(const edit_t&) = default; - edit_t(edit_t&&) = default; + edit_t() noexcept : _from_length(0), _to_length(0), _sequence() {} + edit_t(const edit_t& other) = default; + edit_t(edit_t&& other) = default; ~edit_t() = default; - edit_t& operator=(const edit_t&) = default; - edit_t& operator=(edit_t&&) = default; + edit_t& operator=(const edit_t& other) = default; + edit_t& operator=(edit_t&& other) = default; inline int32_t from_length() const; inline void set_from_length(int32_t l); inline int32_t to_length() const; @@ -404,11 +404,11 @@ class edit_t { class path_mapping_t { public: path_mapping_t() = default; - path_mapping_t(const path_mapping_t&) = default; - path_mapping_t(path_mapping_t&&) = default; + path_mapping_t(const path_mapping_t& other) = default; + path_mapping_t(path_mapping_t&& other) = default; ~path_mapping_t() = default; - path_mapping_t& operator=(const path_mapping_t&) = default; - path_mapping_t& operator=(path_mapping_t&&) = default; + path_mapping_t& operator=(const path_mapping_t& other) = default; + path_mapping_t& operator=(path_mapping_t&& other) = default; inline const position_t& position() const; inline position_t* mutable_position(); inline const vector& edit() const; @@ -427,11 +427,11 @@ class path_mapping_t { class path_t { public: path_t() = default; - path_t(const path_t&) = default; - path_t(path_t&&) = default; + path_t(const path_t& other) = 
default; + path_t(path_t&& other) = default; ~path_t() = default; - path_t& operator=(const path_t&) = default; - path_t& operator=(path_t&&) = default; + path_t& operator=(const path_t& other) = default; + path_t& operator=(path_t&& other) = default; inline const vector& mapping() const; inline const path_mapping_t& mapping(size_t i) const; inline vector* mutable_mapping(); diff --git a/src/position.hpp b/src/position.hpp index 69c2154f103..4719b42c325 100644 --- a/src/position.hpp +++ b/src/position.hpp @@ -22,12 +22,12 @@ using namespace std; // to refactor -- can always eliminate later class position_t { public: - position_t() = default; - position_t(const position_t&) = default; - position_t(position_t&&) = default; + position_t() : _node_id(0), _offset(0), _is_reverse(false) {} + position_t(const position_t& other) = default; + position_t(position_t&& other) = default; ~position_t() = default; - position_t& operator=(const position_t&) = default; - position_t& operator=(position_t&&) = default; + position_t& operator=(const position_t& other) = default; + position_t& operator=(position_t&& other) = default; inline int64_t node_id() const; inline void set_node_id(int64_t i); inline int64_t offset() const; diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 7c0fab58daa..07167217a45 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -43,6 +43,8 @@ class ReadFilter{ /// TODO: This should be a trie but I don't have one handy. /// Must be sorted for vaguely efficient search. vector name_prefixes; + /// Read name must not have anything in it besides the prefix + bool exact_name = false; /// Read must not have a refpos set with a contig name containing a match to any of these vector excluded_refpos_contigs; /// Read must contain at least one of these strings as a subsequence @@ -192,7 +194,8 @@ class ReadFilter{ double get_score(const Read& read) const; /** - * Does the read name have one of the indicated prefixes? + * Does the read name have one of the indicated prefixes? If exact_name is + * set, only finds complete matches of a "prefix" to the whole read name. */ bool matches_name(const Read& read) const; @@ -712,7 +715,8 @@ bool ReadFilter::matches_name(const Read& aln) const { right_match++; } - if (left_match == name_prefixes[left_bound].size() || right_match == name_prefixes[right_bound].size()) { + if ((left_match == name_prefixes[left_bound].size() && (!exact_name || left_match == aln.name().size())) || + (right_match == name_prefixes[right_bound].size() && (!exact_name || right_match == aln.name().size()))) { // We found a match already found = true; } else { @@ -729,7 +733,7 @@ bool ReadFilter::matches_name(const Read& aln) const { center_match++; } - if (center_match == name_prefixes[center].size()) { + if (center_match == name_prefixes[center].size() && (!exact_name || center_match == aln.name().size())) { // We found a hit! 
found = true; break; diff --git a/src/splicing.cpp b/src/splicing.cpp index 2cb7f94ffd6..c4e1727ccfe 100644 --- a/src/splicing.cpp +++ b/src/splicing.cpp @@ -1633,7 +1633,7 @@ multipath_alignment_t&& fuse_spliced_alignments(const Alignment& alignment, } } else if (!have_right_linker) { - // the splice segment on the right side is empty, make connection the the left side + // the splice segment on the right side is empty, make the connection to the left side auto connection = left_mp_aln.mutable_subpath(right_subpaths_begin - 1)->add_connection(); connection->set_next(left_mp_aln.subpath_size()); connection->set_score(splice_score); diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index c79a5afb90c..22f6e02651e 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -715,11 +715,9 @@ int main_chunk(int argc, char** argv) { unique_ptr subgraph; map trace_thread_frequencies; if (!component_ids.empty()) { - cout << "here?" << endl; subgraph = vg::io::new_output_graph(output_format); chunker.extract_component(component_ids[i], *subgraph, false); output_regions[i] = region; - cout << "nope" << endl; } else if (id_range == false) { subgraph = vg::io::new_output_graph(output_format); diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index 975336f6aa4..0d381732533 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -42,17 +42,18 @@ void help_deconstruct(char** argv){ << " -P, --path-prefix NAME All paths [excluding GBWT threads / non-reference GBZ paths] beginning with NAME used as reference (multiple allowed)." << endl << " Other non-ref paths not considered as samples. " << endl << " -r, --snarls FILE Snarls file (from vg snarls) to avoid recomputing." << endl - << " -g, --gbwt FILE only consider alt traversals that correspond to GBWT threads FILE (not needed for GBZ graph input)." << endl + << " -g, --gbwt FILE consider alt traversals that correspond to GBWT haplotypes in FILE (not needed for GBZ graph input)." << endl << " -T, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names and AT fields in output" << endl << " -O, --gbz-translation Use the ID translation from the input gbz to apply snarl names to snarl names and AT fields in output" << endl - << " -e, --path-traversals Only consider traversals that correspond to paths in the graph." << endl << " -a, --all-snarls Process all snarls, including nested snarls (by default only top-level snarls reported)." << endl - << " -d, --ploidy N Expected ploidy. If more traversals found, they will be flagged as conflicts (default: 2)" << endl << " -c, --context-jaccard N Set context mapping size used to disambiguate alleles at sites with multiple reference traversals (default: 10000)." << endl << " -u, --untangle-travs Use context mapping to determine the reference-relative positions of each step in allele traversals (AP INFO field)." << endl << " -K, --keep-conflicted Retain conflicted genotypes in output." << endl << " -S, --strict-conflicts Drop genotypes when we have more than one haplotype for any given phase (set by default when using GBWT input)." << endl << " -C, --contig-only-ref Only use the CONTIG name (and not SAMPLE#CONTIG#HAPLOTYPE etc) for the reference if possible (ie there is only one reference sample)."
<< endl + << " -L, --cluster F Cluster traversals whose (handle) Jaccard coefficient is >= F together (default: 1.0) [experimental]" << endl + << " -n, --nested Write a nested VCF, including special tags. [experimental]" << endl + << " -R, --star-allele Use *-alleles to denote alleles that span but do not cross the site. Only works with -n" << endl << " -t, --threads N Use N threads" << endl << " -v, --verbose Print some status messages" << endl << endl; @@ -73,14 +74,15 @@ int main_deconstruct(int argc, char** argv){ bool gbz_translation = false; bool path_restricted_traversals = false; bool show_progress = false; - int ploidy = 2; - bool set_ploidy = false; bool all_snarls = false; bool keep_conflicted = false; bool strict_conflicts = false; int context_jaccard_window = 10000; bool untangle_traversals = false; bool contig_only_ref = false; + double cluster_threshold = 1.0; + bool nested = false; + bool star_allele = false; int c; optind = 2; // force optind past command positional argument @@ -96,20 +98,22 @@ int main_deconstruct(int argc, char** argv){ {"translation", required_argument, 0, 'T'}, {"gbz-translation", no_argument, 0, 'O'}, {"path-traversals", no_argument, 0, 'e'}, - {"ploidy", required_argument, 0, 'd'}, {"context-jaccard", required_argument, 0, 'c'}, {"untangle-travs", no_argument, 0, 'u'}, {"all-snarls", no_argument, 0, 'a'}, {"keep-conflicted", no_argument, 0, 'K'}, {"strict-conflicts", no_argument, 0, 'S'}, - {"contig-only-ref", no_argument, 0, 'C'}, + {"contig-only-ref", no_argument, 0, 'C'}, + {"cluster", required_argument, 0, 'L'}, + {"nested", no_argument, 0, 'n'}, + {"star-allele", no_argument, 0, 'R'}, {"threads", required_argument, 0, 't'}, {"verbose", no_argument, 0, 'v'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uat:v", + c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uaL:nRt:v", long_options, &option_index); // Detect the end of the options.
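A note on the option plumbing above, for readers new to the getopt_long pattern: each long option in the table maps to the same character constant used in the short-option string, and a trailing ':' in that string marks an option that consumes an argument (hence the "L:" inside "...uaL:nRt:v"). Below is a minimal, hypothetical stand-alone sketch of just the new -L/--cluster parsing and its [0,1] clamp; strtod stands in for vg's parse<double>, and none of this is vg's actual code.

    #include <getopt.h>
    #include <algorithm>
    #include <cstdlib>
    #include <iostream>

    int main(int argc, char** argv) {
        double cluster_threshold = 1.0;  // default mirrors vg deconstruct's -L default
        static struct option long_options[] = {
            {"cluster", required_argument, 0, 'L'},
            {0, 0, 0, 0}
        };
        int c, option_index = 0;
        // "L:" declares that -L consumes an argument, delivered via optarg
        while ((c = getopt_long(argc, argv, "L:", long_options, &option_index)) != -1) {
            switch (c) {
            case 'L':
                // clamp to [0.0, 1.0], as in the -L handler above
                cluster_threshold = std::max(0.0, std::min(1.0, strtod(optarg, nullptr)));
                break;
            default:
                return 1;
            }
        }
        std::cout << "cluster threshold: " << cluster_threshold << std::endl;
        return 0;
    }

Run with `--cluster 1.5`, this prints `cluster threshold: 1` because of the clamp.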
@@ -140,12 +144,11 @@ int main_deconstruct(int argc, char** argv){ gbz_translation = true; break; case 'e': - path_restricted_traversals = true; + cerr << "Warning [vg deconstruct]: -e is deprecated as it is now on by default" << endl; break; case 'd': - ploidy = parse(optarg); - set_ploidy = true; + cerr << "Warning [vg deconstruct]: -d is deprecated - ploidy now inferred from haplotypes in path names" << endl; break; case 'c': context_jaccard_window = parse(optarg); break; @@ -163,7 +167,16 @@ int main_deconstruct(int argc, char** argv){ break; case 'C': contig_only_ref = true; - break; + break; + case 'L': + cluster_threshold = max(0.0, min(1.0, parse(optarg))); + break; + case 'n': + nested = true; + break; + case 'R': + star_allele = true; + break; case 't': omp_set_num_threads(parse(optarg)); break; @@ -181,8 +194,16 @@ int main_deconstruct(int argc, char** argv){ } + if (nested == true && contig_only_ref == true) { + cerr << "Error [vg deconstruct]: -C cannot be used with -n" << endl; + return 1; + } + if (star_allele == true && nested == false) { + cerr << "Error [vg deconstruct]: -R can only be used with -n" << endl; + return 1; + } + // Read the graph - unique_ptr path_handle_graph_up; unique_ptr gbz_graph; gbwt::GBWT* gbwt_index = nullptr; @@ -206,16 +227,6 @@ int main_deconstruct(int argc, char** argv){ cerr << "Error [vg deconstruct]: -O can only be used when input graph is in GBZ format" << endl; } - if (set_ploidy && !path_restricted_traversals && gbwt_file_name.empty() && !gbz_graph) { - cerr << "Error [vg deconstruct]: -d can only be used with -e or -g or GBZ input" << endl; - return 1; - } - - if ((!gbwt_file_name.empty() || gbz_graph) && path_restricted_traversals && !gbz_graph) { - cerr << "Error [vg deconstruct]: -e cannot be used with -g or GBZ input" << endl; - return 1; - } - if (!gbwt_file_name.empty() || gbz_graph) { // context jaccard depends on having steps for each alt traversal, which is // not something we have on hand when getting traversals from the GBWT/GBZ @@ -269,13 +280,22 @@ int main_deconstruct(int argc, char** argv){ } if (refpaths.empty() && refpath_prefixes.empty()) { + bool found_hap = false; // No paths specified: use them all graph->for_each_path_handle([&](path_handle_t path_handle) { - const string& name = graph->get_path_name(path_handle); - if (!Paths::is_alt(name) && PathMetadata::parse_sense(name) != PathSense::HAPLOTYPE) { - refpaths.push_back(name); - } - }); + const string& name = graph->get_path_name(path_handle); + if (!Paths::is_alt(name) && PathMetadata::parse_sense(name) != PathSense::HAPLOTYPE) { + refpaths.push_back(name); + } else { + found_hap = true; + } + }); + + if (!found_hap && gbwt_index == nullptr) { + cerr << "error [vg deconstruct]: All graph paths selected as references (leaving no alts). 
Please use -P/-p " + << "to narrow down the reference to a subset of paths, or GBZ/GBWT input that contains haplotype paths" << endl; + return 1; + } } // Read the translation @@ -351,15 +371,18 @@ int main_deconstruct(int argc, char** argv){ cerr << "Deconstructing top-level snarls" << endl; } dd.set_translation(translation.get()); - dd.set_nested(all_snarls); - dd.deconstruct(refpaths, graph, snarl_manager.get(), path_restricted_traversals, ploidy, + dd.set_nested(all_snarls || nested); + dd.deconstruct(refpaths, graph, snarl_manager.get(), all_snarls, context_jaccard_window, untangle_traversals, keep_conflicted, strict_conflicts, !contig_only_ref, - gbwt_index); + cluster_threshold, + gbwt_index, + nested, + star_allele); return 0; } diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp index af1c5678cbb..f9054aabd2c 100644 --- a/src/subcommand/filter_main.cpp +++ b/src/subcommand/filter_main.cpp @@ -31,6 +31,7 @@ void help_filter(char** argv) { << " -M, --input-mp-alns input is multipath alignments (GAMP) rather than GAM" << endl << " -n, --name-prefix NAME keep only reads with this prefix in their names [default='']" << endl << " -N, --name-prefixes FILE keep reads with names with one of many prefixes, one per nonempty line" << endl + << " -e, --exact-name match read names exactly instead of by prefix" << endl << " -a, --subsequence NAME keep reads that contain this subsequence" << endl << " -A, --subsequences FILE keep reads that contain one of these subsequences, one per nonempty line" << endl << " -p, --proper-pairs keep reads that are annotated as being properly paired" << endl @@ -73,6 +74,7 @@ int main_filter(int argc, char** argv) { bool input_gam = true; vector name_prefixes; + bool exact_name = false; vector excluded_refpos_contigs; unordered_set excluded_features; vector subsequences; @@ -124,6 +126,7 @@ int main_filter(int argc, char** argv) { {"input-mp-alns", no_argument, 0, 'M'}, {"name-prefix", required_argument, 0, 'n'}, {"name-prefixes", required_argument, 0, 'N'}, + {"exact-name", no_argument, 0, 'e'}, {"subsequence", required_argument, 0, 'a'}, {"subsequences", required_argument, 0, 'A'}, {"proper-pairs", no_argument, 0, 'p'}, @@ -157,7 +160,7 @@ int main_filter(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVT:q:E:D:C:d:iIb:B:cUt:", + c = getopt_long (argc, argv, "Mn:N:ea:A:pPX:F:s:r:Od:fauo:m:Sx:vVT:q:E:D:C:d:iIb:B:cUt:", long_options, &option_index); /* Detect the end of the options. 
*/ @@ -185,6 +188,9 @@ int main_filter(int argc, char** argv) { } }); break; + case 'e': + exact_name = true; + break; case 'a': subsequences.push_back(optarg); break; @@ -370,6 +376,7 @@ int main_filter(int argc, char** argv) { // template lambda to set parameters auto set_params = [&](auto& filter) { filter.name_prefixes = name_prefixes; + filter.exact_name = exact_name; filter.subsequences = subsequences; filter.excluded_refpos_contigs = excluded_refpos_contigs; filter.excluded_features = excluded_features; diff --git a/src/subcommand/inject_main.cpp b/src/subcommand/inject_main.cpp index a9ed8cbcb8f..99b1c8fe234 100644 --- a/src/subcommand/inject_main.cpp +++ b/src/subcommand/inject_main.cpp @@ -14,6 +14,7 @@ #include "../alignment.hpp" #include "../vg.hpp" #include "../xg.hpp" +#include "../hts_alignment_emitter.hpp" #include #include #include @@ -23,10 +24,11 @@ using namespace vg; using namespace vg::subcommand; void help_inject(char** argv) { - cerr << "usage: " << argv[0] << " inject [options] input.[bam|sam|cram] >output.gam" << endl + cerr << "usage: " << argv[0] << " inject -x graph.xg [options] input.[bam|sam|cram] >output.gam" << endl << endl << "options:" << endl << " -x, --xg-name FILE use this graph or xg index (required, non-XG formats also accepted)" << endl + << " -o, --output-format NAME output the alignments in NAME format (gam / gaf / json) [gam]" << endl << " -t, --threads N number of threads to use" << endl; } @@ -37,6 +39,8 @@ int main_inject(int argc, char** argv) { } string xg_name; + string output_format = "GAM"; + std::set output_formats = { "GAM", "GAF", "JSON" }; int threads = get_thread_count(); int c; @@ -46,12 +50,13 @@ int main_inject(int argc, char** argv) { { {"help", no_argument, 0, 'h'}, {"xg-name", required_argument, 0, 'x'}, + {"output-format", required_argument, 0, 'o'}, {"threads", required_argument, 0, 't'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hx:t:", + c = getopt_long (argc, argv, "hx:o:t:", long_options, &option_index); // Detect the end of the options. 
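Looping back to the --exact-name flag wired through filter_main.cpp and readfilter.hpp above: with exact_name set, a stored "prefix" only counts as a hit when the matched length also equals the whole read name. A simplified sketch of that predicate follows; it uses a linear scan where vg's matches_name binary-searches a sorted prefix vector, so treat it as illustrative only.

    #include <string>
    #include <vector>

    // Returns true if name matches one of the prefixes; with exact_name set,
    // the whole name must equal the "prefix" rather than just start with it.
    bool matches_name(const std::string& name, const std::vector<std::string>& name_prefixes,
                      bool exact_name) {
        for (const std::string& prefix : name_prefixes) {
            // count how many leading characters agree
            size_t match = 0;
            while (match < prefix.size() && match < name.size() && prefix[match] == name[match]) {
                ++match;
            }
            if (match == prefix.size() && (!exact_name || match == name.size())) {
                return true;
            }
        }
        return false;
    }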
@@ -63,6 +68,19 @@ int main_inject(int argc, char** argv) { case 'x': xg_name = optarg; break; + + case 'o': + { + output_format = optarg; + for (char& c : output_format) { + c = std::toupper(c); + } + if (output_formats.find(output_format) == output_formats.end()) { + std::cerr << "error: [vg inject] Invalid output format: " << optarg << std::endl; + std::exit(1); + } + } + break; case 't': threads = parse(optarg); @@ -90,14 +108,14 @@ int main_inject(int argc, char** argv) { } unique_ptr path_handle_graph = vg::io::VPKG::load_one(xg_name); bdsg::PathPositionOverlayHelper overlay_helper; - PathPositionHandleGraph* xgidx = overlay_helper.apply(path_handle_graph.get()); + PathPositionHandleGraph* xgidx = overlay_helper.apply(path_handle_graph.get()); - vg::io::ProtobufEmitter buf(cout); - function lambda = [&buf](Alignment& aln) { -#pragma omp critical (buf) - { - buf.write(std::move(aln)); - } + // We don't do HTS output formats but we do need an empty paths collection to make an alignment emitter + vector> paths; + unique_ptr alignment_emitter = get_alignment_emitter("-", output_format, paths, threads, xgidx); + + function lambda = [&](Alignment& aln) { + alignment_emitter->emit_mapped_single({std::move(aln)}); }; if (threads > 1) { hts_for_each_parallel(file_name, lambda, xgidx); diff --git a/src/subcommand/snarls_main.cpp b/src/subcommand/snarls_main.cpp index 8587e93d2d2..20772eb328b 100644 --- a/src/subcommand/snarls_main.cpp +++ b/src/subcommand/snarls_main.cpp @@ -299,7 +299,7 @@ int main_snarl(int argc, char** argv) { if (path_traversals) { // Limit traversals to embedded paths - trav_finder.reset(new PathTraversalFinder(*graph, snarl_manager)); + trav_finder.reset(new PathTraversalFinder(*graph)); } else if (vcf_filename.empty()) { // This finder works with any backing graph trav_finder.reset(new ExhaustiveTraversalFinder(*graph, snarl_manager)); diff --git a/src/subpath_overlay.cpp b/src/subpath_overlay.cpp new file mode 100644 index 00000000000..126c6f97275 --- /dev/null +++ b/src/subpath_overlay.cpp @@ -0,0 +1,105 @@ +#include +#include "subpath_overlay.hpp" + +#include + +namespace vg { + +using namespace std; +using namespace handlegraph; + +SubpathOverlay::SubpathOverlay(const PathHandleGraph* backing, + const step_handle_t& begin, const step_handle_t& end) : backing_graph(backing) { + + for (auto step = begin; step != end; step = backing->get_next_step(step)) { + subpath_handles.emplace_back(backing->get_handle_of_step(step)); + } +} + +bool SubpathOverlay::has_node(nid_t node_id) const { + return node_id >= 1 && node_id <= subpath_handles.size(); +} + +handle_t SubpathOverlay::get_handle(const nid_t& node_id, bool is_reverse) const { + return handlegraph::number_bool_packing::pack(node_id, is_reverse); +} + +nid_t SubpathOverlay::get_id(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_number(handle); +} + +bool SubpathOverlay::get_is_reverse(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_bit(handle); +} + +handle_t SubpathOverlay::flip(const handle_t& handle) const { + return handlegraph::number_bool_packing::toggle_bit(handle); +} + +size_t SubpathOverlay::get_length(const handle_t& handle) const { + return backing_graph->get_length(subpath_handles[get_id(handle) - 1]); +} + +std::string SubpathOverlay::get_sequence(const handle_t& handle) const { + return backing_graph->get_sequence(get_underlying_handle(handle)); +} + +size_t SubpathOverlay::get_node_count() const { + return subpath_handles.size(); +} + +nid_t SubpathOverlay::min_node_id() const { + return 1; +} + +nid_t SubpathOverlay::max_node_id() const { + return subpath_handles.size(); +} + +bool SubpathOverlay::follow_edges_impl(const handle_t& handle, bool go_left, + const std::function& iteratee) const { + if (go_left != get_is_reverse(handle)) { + if (get_id(handle) == 1) { + return true; + } + else { + return iteratee(get_handle(get_id(handle) - 1, get_is_reverse(handle))); + } + } + else { + if (get_id(handle) == subpath_handles.size()) { + return true; + } + else { + return iteratee(get_handle(get_id(handle) + 1, get_is_reverse(handle))); + } + } +} + +bool SubpathOverlay::for_each_handle_impl(const std::function& iteratee, bool parallel) const { + + if (!parallel) { + bool keep_going = true; + for (size_t i = 1; keep_going && i <= subpath_handles.size(); ++i) { + keep_going = iteratee(get_handle(i, false)); + } + return keep_going; + } else { + std::atomic keep_going(true); +#pragma omp parallel for + for (size_t i = 1; i <= subpath_handles.size(); ++i) { + keep_going = keep_going && iteratee(get_handle(i, false)); + } + return keep_going; + } +} + +handle_t SubpathOverlay::get_underlying_handle(const handle_t& handle) const { + handle_t underlying = subpath_handles[get_id(handle) - 1]; + if (get_is_reverse(handle)) { + underlying = backing_graph->flip(underlying); + } + return underlying; +} + +} diff --git a/src/subpath_overlay.hpp b/src/subpath_overlay.hpp new file mode 100644 index 00000000000..91444e14a43 --- /dev/null +++ b/src/subpath_overlay.hpp @@ -0,0 +1,107 @@ +#ifndef VG_SUBPATH_OVERLAY_HPP_INCLUDED +#define VG_SUBPATH_OVERLAY_HPP_INCLUDED + +/** + * \file subpath_overlay.hpp + * + * Provides SubpathOverlay, a HandleGraph implementation that presents a + * subinterval of an embedded path as its own linear graph. + * + */ + + +#include "handle.hpp" + +#include + + +namespace vg { + +using namespace handlegraph; + +/** + * Overlay that presents a linear graph corresponding to a subinterval of a path + */ +class SubpathOverlay : public ExpandingOverlayGraph { + +public: + /** + * Construct new SubpathOverlay between two steps. The 'end' step is past-the-last. + */ + SubpathOverlay(const PathHandleGraph* backing, + const step_handle_t& begin, const step_handle_t& end); + SubpathOverlay() = default; + + ~SubpathOverlay() = default; + + //////////////////////////////////////////////////////////////////////////// + // Handle-based interface + //////////////////////////////////////////////////////////////////////////// + + /// Method to check if a node exists by ID + bool has_node(nid_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + handle_t get_handle(const nid_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + nid_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + std::string get_sequence(const handle_t& handle) const; + + /// Return the number of nodes in the graph + size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable.
Return value is unspecified if the graph is empty. + nid_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + nid_t max_node_id() const; + + /////////////////////////////////// + /// ExpandingOverlayGraph interface + /////////////////////////////////// + + /** + * Returns the handle in the underlying graph that corresponds to a handle in the + * overlay + */ + handle_t get_underlying_handle(const handle_t& handle) const; + +protected: + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + bool follow_edges_impl(const handle_t& handle, bool go_left, const std::function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. Returns true if we finished and false if we + /// stopped early. + bool for_each_handle_impl(const std::function& iteratee, bool parallel = false) const; + + /// the backing graph + const HandleGraph* backing_graph = nullptr; + + std::vector subpath_handles; +}; + +} + +#endif diff --git a/src/surjector.cpp b/src/surjector.cpp index 83f008ce335..b3068efea1a 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -12,6 +12,8 @@ #include "memoizing_graph.hpp" #include "multipath_alignment_graph.hpp" #include "reverse_graph.hpp" +#include "subpath_overlay.hpp" +#include "identity_overlay.hpp" #include "algorithms/extract_connecting_graph.hpp" #include "algorithms/prune_to_connecting_graph.hpp" @@ -239,7 +241,7 @@ using namespace std; } cerr << endl; cerr << "\tpath interval " << graph->get_position_of_step(surjection_record.second.second[i].first) << " - " << graph->get_position_of_step(surjection_record.second.second[i].second) << endl; - cerr << "\t" << pb2json(anchor.second) << endl; + cerr << "\t" << debug_string(anchor.second) << endl; } if (connections.count(surjection_record.first)) { cerr << "\tconnections" << endl; @@ -1687,19 +1689,18 @@ using namespace std; final_position.offset() + mapping_from_length(*final_mapping) == first_position.offset()) { for (const auto& edit : first_mapping.edit()) { - *final_mapping->add_edit() = edit; + from_proto_edit(edit, *final_mapping->add_edit()); } ++k; } // copy over the rest of the mappings for (; k < n; ++k) { auto mapping = path_chunks[i].second.add_mapping(); - *mapping = aln.path().mapping(k); - mapping->set_rank(path_chunks[i].second.mapping_size()); + from_proto_mapping(aln.path().mapping(k), *mapping); } #ifdef debug_constrictions - cerr << "extended left path " << i << " to " << pb2json(path_chunks[i].second) << endl; + cerr << "extended left path " << i << " to " << debug_string(path_chunks[i].second) << endl; #endif ++left; } @@ -1735,11 +1736,10 @@ using namespace std; #endif // copy the repair alignment - Path concat_path; + path_t concat_path; for (size_t k = n; k < aln.path().mapping_size(); ++k) { auto mapping = concat_path.add_mapping(); - *mapping = aln.path().mapping(k); - mapping->set_rank(concat_path.mapping_size()); + from_proto_mapping(aln.path().mapping(k), *mapping); } // check if we need to merge the first and last mappings 
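The hunk above extends a path chunk by appending edits onto its final mapping when the repair alignment starts exactly where the chunk ends. The sketch below isolates that abutting-mapping merge using hypothetical, simplified stand-ins for vg's path_t/path_mapping_t/edit_t (the real types live in src/path.hpp and carry more state), so read it as the shape of the technique rather than the actual implementation.

    #include <cstdint>
    #include <vector>

    // Hypothetical, simplified stand-ins for vg's internal path types.
    struct edit_t { int32_t from_length = 0; int32_t to_length = 0; };
    struct position_t { int64_t node_id = 0; int64_t offset = 0; bool is_reverse = false; };
    struct mapping_t { position_t position; std::vector<edit_t> edits; };

    // Total graph bases consumed by a mapping.
    int64_t mapping_from_length(const mapping_t& m) {
        int64_t len = 0;
        for (const edit_t& e : m.edits) len += e.from_length;
        return len;
    }

    // If next starts exactly where prev ends on the same node and strand, append
    // its edits to prev and report the merge; otherwise leave both untouched.
    bool merge_if_abutting(mapping_t& prev, const mapping_t& next) {
        const position_t& a = prev.position;
        const position_t& b = next.position;
        if (a.node_id == b.node_id && a.is_reverse == b.is_reverse &&
            a.offset + mapping_from_length(prev) == b.offset) {
            prev.edits.insert(prev.edits.end(), next.edits.begin(), next.edits.end());
            return true;
        }
        return false;
    }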
@@ -1761,14 +1761,13 @@ using namespace std; for (; k < path_chunks[i].second.mapping_size(); ++k) { auto mapping = concat_path.add_mapping(); *mapping = path_chunks[i].second.mapping(k); - mapping->set_rank(concat_path.mapping_size()); } // replace the original path - path_chunks[i].second = concat_path; + path_chunks[i].second = std::move(concat_path); #ifdef debug_constrictions - cerr << "extended right path " << i << " to " << pb2json(path_chunks[i].second) << endl; + cerr << "extended right path " << i << " to " << debug_string(path_chunks[i].second) << endl; #endif ++right; } @@ -1878,7 +1877,7 @@ using namespace std; // so we record an overlap overlaps.emplace_back(j, k); #ifdef debug_spliced_surject - cerr << "path chunk " << i << " overlaps " << j << " by " << to_walk << " on read, marking an overlap to split before mapping " << k << " at " << pb2json(path_chunks[j].second.mapping(k)) << endl; + cerr << "path chunk " << i << " overlaps " << j << " by " << to_walk << " on read, marking an overlap to split before mapping " << k << " at " << debug_string(path_chunks[j].second.mapping(k)) << endl; #endif } } @@ -1932,7 +1931,6 @@ using namespace std; for (size_t l = begin_idx; l < end_idx; ++l) { auto mapping = path_chunk.second.add_mapping(); *mapping = path_chunks[i].second.mapping(l); - mapping->set_rank(l - begin_idx + 1); } // identify the read interval path_chunk.first.first = read_begin; @@ -1952,7 +1950,7 @@ using namespace std; #ifdef debug_spliced_surject cerr << "next split for chunk " << i << " as " << split_path_chunks.size() - 1 << ", consisting of " << endl; cerr << "\t" << string(path_chunk.first.first, path_chunk.first.second) << endl; - cerr << "\t" << pb2json(path_chunk.second) << endl; + cerr << "\t" << debug_string(path_chunk.second) << endl; cerr << "\t" << graph->get_position_of_step(ref_chunk.first) << " : " << graph->get_position_of_step(ref_chunk.second) << endl; #endif } @@ -1963,7 +1961,7 @@ using namespace std; #ifdef debug_spliced_surject cerr << "no splits on chunk " << i << ", add as " << split_path_chunks.size() << endl; cerr << "\t" << string(path_chunks[i].first.first, path_chunks[i].first.second) << endl; - cerr << "\t" << pb2json(path_chunks[i].second) << endl; + cerr << "\t" << debug_string(path_chunks[i].second) << endl; cerr << "\t" << graph->get_position_of_step(ref_chunks[i].first) << " : " << graph->get_position_of_step(ref_chunks[i].second) << endl; #endif split_path_chunks.emplace_back(move(path_chunks[i])); @@ -2187,12 +2185,12 @@ using namespace std; surjected.set_mapping_quality(src_mapping_quality); auto surj_subpath = surjected.add_subpath(); - from_proto_path(path_chunks.front().second, *surj_subpath->mutable_path()); + *surj_subpath->mutable_path() = move(path_chunks.front().second); Alignment aln; aln.set_sequence(src_sequence); aln.set_quality(src_quality); - *aln.mutable_path() = move(path_chunks.front().second); + to_proto_path(surj_subpath->path(), *aln.mutable_path()); surj_subpath->set_score(get_aligner(!src_quality.empty())->score_contiguous_alignment(aln)); surjected.add_start(0); @@ -2927,7 +2925,7 @@ using namespace std; cerr << "using overlap chunks on path " << graph->get_path_name(path_handle) << " strand " << rev_strand << ", performing realigning surjection" << endl; cerr << "chunks:" << endl; for (size_t i = 0; i < path_chunks.size(); ++i) { - cerr << "\t" << string(path_chunks[i].first.first, path_chunks[i].first.second) << ", " << pb2json(path_chunks[i].second) << endl; + cerr << "\t" << 
string(path_chunks[i].first.first, path_chunks[i].first.second) << ", " << debug_string(path_chunks[i].second) << endl; } #endif @@ -2968,7 +2966,7 @@ using namespace std; // just copy it over surjected.set_sequence(source.sequence()); surjected.set_quality(source.quality()); - *surjected.mutable_path() = path_chunks.front().second; + to_proto_path(path_chunks.front().second, *surjected.mutable_path()); surjected.set_score(get_aligner(!source.quality().empty())->score_contiguous_alignment(surjected)); } @@ -2984,22 +2982,23 @@ using namespace std; assert(ref_path_interval.first <= ref_path_interval.second); // get the path graph corresponding to this interval - bdsg::HashGraph path_graph; - unordered_map> node_trans = extract_linearized_path_graph(path_position_graph, &path_graph, path_handle, - ref_path_interval.first, ref_path_interval.second); + step_handle_t begin = graph->get_step_at_position(path_handle, ref_path_interval.first); + step_handle_t end = graph->get_step_at_position(path_handle, ref_path_interval.second); + if (graph->get_position_of_step(end) <= ref_path_interval.second && end != graph->path_end(path_handle)) { + // we actually want part of this step too, so we use the next one as the end iterator + end = graph->get_next_step(end); + } + SubpathOverlay path_graph(path_position_graph, begin, end); // choose an orientation for the path graph ReverseGraph rev_comp_path_graph(&path_graph, true); - HandleGraph* aln_graph; + IdentityOverlay identity_path_graph(&path_graph); + ExpandingOverlayGraph* aln_graph; if (rev_strand) { - // we align to the reverse strand of the path graph, and the translation chages accordingly aln_graph = &rev_comp_path_graph; - for (pair>& translation : node_trans) { - translation.second.second = !translation.second.second; - } } else { - aln_graph = &path_graph; + aln_graph = &identity_path_graph; } #ifdef debug_anchored_surject @@ -3016,15 +3015,21 @@ using namespace std; << subgraph_bases << " bp strand split subgraph for read " << source.name() << "; suppressing further warnings." << endl; } - return move(make_null_alignment(source)); + surjected = move(make_null_alignment(source)); + return surjected; } + std::function(id_t)> projection_trans = [&](id_t node_id) { + handle_t handle = path_graph.get_underlying_handle(aln_graph->get_underlying_handle(aln_graph->get_handle(node_id))); + return pair(path_position_graph->get_id(handle), path_position_graph->get_is_reverse(handle)); + }; + // compute the connectivity between the path chunks // TODO: i'm not sure if we actually need to preserve all indel anchors in either case, but i don't // want to change too much about the anchoring logic at once while i'm switching from blanket preservation // to a more targeted method bool preserve_tail_indel_anchors = (sinks_are_anchors || sources_are_anchors); - MultipathAlignmentGraph mp_aln_graph(*aln_graph, path_chunks, source, node_trans, !preserve_N_alignments, + MultipathAlignmentGraph mp_aln_graph(*aln_graph, path_chunks, source, projection_trans, !preserve_N_alignments, preserve_tail_indel_anchors); #ifdef debug_anchored_surject @@ -3034,7 +3039,8 @@ using namespace std; // we don't overlap this reference path at all or we filtered out all of the path chunks, so just make a sentinel if (mp_aln_graph.empty()) { - return move(make_null_alignment(source)); + surjected = move(make_null_alignment(source)); + return surjected; } // TODO: is this necessary in a linear graph? 
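Conceptually, the projection_trans lambda above replaces the old extract_linearized_path_graph translation table: instead of storing a map from copied node IDs to (node, orientation) pairs, each query walks a handle down through the overlay stack (identity or reverse overlay, then SubpathOverlay) to the backing graph. Here is a schematic sketch of that composition against a hypothetical minimal overlay interface; the real code goes through libhandlegraph's ExpandingOverlayGraph instead.

    #include <cstdint>
    #include <functional>
    #include <utility>

    using nid_t = int64_t;
    struct handle_t { nid_t id; bool is_reverse; };

    // Hypothetical minimal overlay interface: each layer can map one of its own
    // handles to the corresponding handle in the layer beneath it.
    struct Overlay {
        std::function<handle_t(handle_t)> get_underlying_handle;
    };

    // Compose two overlay layers into a single id -> (backing id, orientation)
    // projection, mirroring the structure of projection_trans above.
    std::function<std::pair<nid_t, bool>(nid_t)>
    make_projection(Overlay top, Overlay bottom) {
        // capture by value so the projection stays valid after the overlays
        // used to build it go out of scope
        return [top, bottom](nid_t node_id) {
            handle_t h{node_id, false};          // start in the top layer, forward
            h = top.get_underlying_handle(h);    // e.g. strand-split/identity layer
            h = bottom.get_underlying_handle(h); // e.g. SubpathOverlay -> backing graph
            return std::pair<nid_t, bool>(h.id, h.is_reverse);
        };
    }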
@@ -3072,7 +3078,7 @@ using namespace std; unordered_map left_align_strand; left_align_strand.reserve(aln_graph->get_node_count()); aln_graph->for_each_handle([&](const handle_t& handle) { - left_align_strand[handle] = node_trans.at(aln_graph->get_id(handle)).second; + left_align_strand[handle] = projection_trans(aln_graph->get_id(handle)).second; }); // align the intervening segments and store the result in a multipath alignment @@ -3103,7 +3109,7 @@ using namespace std; for (size_t i = 0; i < mp_aln.subpath_size(); i++) { // translate back into the original ID space - translate_oriented_node_ids(*mp_aln.mutable_subpath(i)->mutable_path(), node_trans); + translate_oriented_node_ids(*mp_aln.mutable_subpath(i)->mutable_path(), projection_trans); } // identify the source subpaths (necessary for subpath-global optimal alignment algorithm) @@ -3137,22 +3143,24 @@ using namespace std; #ifdef debug_anchored_surject cerr << "looking for path range on " << (rev_strand ? "reverse" : "forward") << " strand, for " << surj_path.mapping_size() << " mappings" << endl; #endif - step_handle_t step = rev_strand ? graph->get_step_at_position(path_handle, ref_path_interval.second) - : graph->get_step_at_position(path_handle, ref_path_interval.first); - step_handle_t end = rev_strand ? graph->get_previous_step(graph->get_step_at_position(path_handle, ref_path_interval.first)) - : graph->get_next_step(graph->get_step_at_position(path_handle, ref_path_interval.second)); + + step_handle_t step = graph->get_step_at_position(path_handle, ref_path_interval.first); + step_handle_t end = graph->get_next_step(graph->get_step_at_position(path_handle, ref_path_interval.second)); // walk the identified interval - for (; step != end; step = rev_strand ? graph->get_previous_step(step) : graph->get_next_step(step)) { - const auto& pos = surj_path.mapping(mappings_matched).position(); + for (; step != end; step = graph->get_next_step(step)) { + size_t idx = rev_strand ? surj_path.mapping_size() - mappings_matched - 1 : mappings_matched; + const auto& pos = surj_path.mapping(idx).position(); handle_t handle = graph->get_handle_of_step(step); if (graph->get_id(handle) == pos.node_id() && ((graph->get_is_reverse(handle) != pos.is_reverse()) == rev_strand)) { // we found the next position we were expecting to - if (mappings_matched == 0) { + if (mappings_matched == 0 || rev_strand) { path_range_out.first = step; } - path_range_out.second = step; + if (mappings_matched == 0 || !rev_strand) { + path_range_out.second = step; + } ++mappings_matched; #ifdef debug_anchored_surject cerr << "\tmatch at node " << graph->get_id(handle) << " " << graph->get_is_reverse(handle) << " at position " << graph->get_position_of_step(step) << endl; @@ -3172,7 +3180,7 @@ using namespace std; mappings_matched = 0; // and go back to where we started on the path // TODO: this is potentially quadratic, there are faster algorithms - step = path_range_out.first; + step = rev_strand ? path_range_out.second : path_range_out.first; } } } @@ -3183,7 +3191,7 @@ using namespace std; mappings_matched = 0; // and go back to where we started on the path // TODO: this is potentially quadratic, there are faster algorithms - step = path_range_out.first; + step = rev_strand ? 
path_range_out.second : path_range_out.first; } #ifdef debug_anchored_surject cerr << "\tmismatch at node " << graph->get_id(handle) << " " << graph->get_is_reverse(handle) << " at position " << graph->get_position_of_step(step) << endl; @@ -3205,6 +3213,10 @@ using namespace std; cerr << "Surjected read dump: " << pb2json(surjected) << endl; exit(1); } +#ifdef debug_anchored_surject + // TODO: dump out path_range_out + cerr << "identified path range between steps on " << graph->get_id(graph->get_handle_of_step(path_range_out.first)) << " " << graph->get_position_of_step(path_range_out.first) << ", " << graph->get_id(graph->get_handle_of_step(path_range_out.second)) << " " << graph->get_position_of_step(path_range_out.second) << '\n'; +#endif } else { // sentinel to indicate that surjection is unmapped @@ -3369,10 +3381,10 @@ using namespace std; prev_edit->set_sequence(prev_edit->sequence() + next_edit.sequence()); } else { - to_proto_edit(next_edit, *prev_mapping->add_edit()); + *prev_mapping->add_edit() = next_edit; } for (size_t k = 1; k < next_mapping.edit_size(); ++k) { - to_proto_edit(next_mapping.edit(k), *prev_mapping->add_edit()); + *prev_mapping->add_edit() = next_mapping.edit(k); } merged_mapping = true; @@ -3380,7 +3392,7 @@ using namespace std; } if (!merged_mapping) { // make a new mapping - to_proto_mapping(next_mapping, *path_chunk.add_mapping()); + *path_chunk.add_mapping() = next_mapping; } } @@ -3429,7 +3441,7 @@ using namespace std; // remember that we've already emitted all the mappings currently on the stack added_new_mappings = false; #ifdef debug_multipath_surject - cerr << "converted stack into path " << pb2json(path_chunk) << endl; + cerr << "converted stack into path " << debug_string(path_chunk) << endl; cerr << "read interval is " << (chunk.first.first - source.sequence().begin()) << ":" << (chunk.first.second - source.sequence().begin()) << " " << string(chunk.first.first, chunk.first.second) << endl; #endif } @@ -3557,7 +3569,9 @@ using namespace std; // for each path that we're extending, the previous step and the strand we were at on it // mapped to the index of that path chunk in the path's vector - unordered_map, size_t> extending_steps; + // note: we also keep the path_handle_t instead of extracting it from the step because this operation + // is slow on GBZ + unordered_map> fwd_extending_steps, rev_extending_steps; int64_t through_to_length = 0; for (size_t i = 0; i < path.mapping_size(); i++) { @@ -3572,7 +3586,7 @@ using namespace std; cerr << "looking for paths on mapping " << i << " at position " << make_pos_t(pos) << endl; #endif - unordered_map, size_t> next_extending_steps; + unordered_map> next_fwd_extending_steps, next_rev_extending_steps; for (const step_handle_t& step : graph->steps_of_handle(handle)) { @@ -3597,86 +3611,114 @@ using namespace std; // the path does, then the read runs along the path in reverse. bool path_strand = graph->get_is_reverse(handle) != graph->get_is_reverse(graph->get_handle_of_step(step)); - step_handle_t prev_step = path_strand ? graph->get_next_step(step) : graph->get_previous_step(step); - #ifdef debug_anchored_surject - cerr << "path strand is " << (path_strand ? "rev" : "fwd") << ", prev step is "; - if (prev_step == graph->path_end(path_handle)) { - cerr << " path end"; - } - else if (prev_step == graph->path_front_end(path_handle)) { - cerr << " path front end"; - } - else { - cerr << graph->get_id(graph->get_handle_of_step(prev_step)) << (graph->get_is_reverse(graph->get_handle_of_step(prev_step)) ? 
"-" : "+"); - } - cerr << endl; - cerr << "possible extensions from: " << endl; - for (const auto& record : extending_steps) { - cerr << "\t" << "chunk " << record.second << " at " << graph->get_id(graph->get_handle_of_step(record.first.first)) << (graph->get_is_reverse(graph->get_handle_of_step(record.first.first)) ? "-" : "+") << " on " << graph->get_path_name(graph->get_path_handle_of_step(record.first.first)) << " " << (record.first.second ? "rev" : "fwd") << endl; - } + cerr << "path strand is rev? " << path_strand << endl; #endif - auto& path_chunks = to_return[make_pair(path_handle, path_strand)]; + auto& next_extending_steps = path_strand ? next_rev_extending_steps : next_fwd_extending_steps; - if (extending_steps.count(make_pair(prev_step, path_strand))) { - // we are extending from the previous step, so we continue with the extension + next_extending_steps[step] = make_pair(numeric_limits::max(), path_handle); + } + + // TODO: forward and reverse code is repetitive + + // extend forward strand steps + for (const auto& extending_step : fwd_extending_steps) { + auto next_step = graph->get_next_step(extending_step.first); + auto it = next_fwd_extending_steps.find(next_step); + if (it != next_fwd_extending_steps.end()) { + it->second.first = extending_step.second.first; + auto& path_chunks = to_return[make_pair(it->second.second, false)]; + auto& aln_chunk = path_chunks.first[extending_step.second.first]; + auto& ref_chunk = path_chunks.second[extending_step.second.first]; - size_t chunk_idx = extending_steps[make_pair(prev_step, path_strand)]; - auto& aln_chunk = path_chunks.first[chunk_idx]; - auto& ref_chunk = path_chunks.second[chunk_idx]; + // extend the range of the path on the reference + ref_chunk.second = it->first; + // move the end of the sequence out + aln_chunk.first.second = source.sequence().begin() + through_to_length; + // add this mapping + from_proto_mapping(path.mapping(i), *aln_chunk.second.add_mapping()); #ifdef debug_anchored_surject - cerr << "comes after chunk " << chunk_idx << endl; + cerr << "step on " << graph->get_id(graph->get_handle_of_step(it->first)) << ", pos " << graph->get_position_of_step(it->first) << " comes after forward chunk " << it->second << endl; #endif + } + } + // initialize new chunks for steps that did not extend + for (pair>& extended_step : next_fwd_extending_steps) { + if (extended_step.second.first == numeric_limits::max()) { - // extend the range of the path on the reference - ref_chunk.second = step; + auto& path_chunks = to_return[make_pair(extended_step.second.second, false)]; + + extended_step.second.first = path_chunks.first.size(); + + path_chunks.first.emplace_back(); + path_chunks.second.emplace_back(); + auto& aln_chunk = path_chunks.first.back(); + auto& ref_chunk = path_chunks.second.back(); + // init the ref interval with the interval along the embedded path + ref_chunk.first = extended_step.first; + ref_chunk.second = extended_step.first; + // init the new chunk with the sequence interval + aln_chunk.first.first = source.sequence().begin() + before_to_length; + aln_chunk.first.second = source.sequence().begin() + through_to_length; + // and with the first mapping + from_proto_mapping(path.mapping(i), *aln_chunk.second.add_mapping()); + +#ifdef debug_anchored_surject + cerr << "step on " << graph->get_id(graph->get_handle_of_step(extended_step.first)) << ", pos " << graph->get_position_of_step(extended_step.first) << " has no preceeding forward chunk, so start new chunk " << path_chunks.first.size() - 1 << endl; 
+#endif + } + } + for (pair>& extended_step : next_rev_extending_steps) { + + auto& path_chunks = to_return[make_pair(extended_step.second.second, true)]; + + auto prev_step = graph->get_next_step(extended_step.first); + auto it = rev_extending_steps.find(prev_step); + if (it != rev_extending_steps.end()) { + extended_step.second.first = it->second.first; + auto& aln_chunk = path_chunks.first[extended_step.second.first]; + auto& ref_chunk = path_chunks.second[extended_step.second.first]; + // extend the range of the path on the reference + ref_chunk.second = extended_step.first; // move the end of the sequence out aln_chunk.first.second = source.sequence().begin() + through_to_length; - Mapping* mapping = aln_chunk.second.add_mapping(); // add this mapping - *mapping = path.mapping(i); - mapping->set_rank(aln_chunk.second.mapping(aln_chunk.second.mapping_size() - 2).rank() + 1); + from_proto_mapping(path.mapping(i), *aln_chunk.second.add_mapping()); - // in the next iteration, this step should point into the chunk it just extended - next_extending_steps[make_pair(step, path_strand)] = extending_steps[make_pair(prev_step, path_strand)]; +#ifdef debug_anchored_surject + cerr << "step on " << graph->get_id(graph->get_handle_of_step(extended_step.first)) << ", pos " << graph->get_position_of_step(extended_step.first) << " comes after reverse chunk " << it->second << endl; +#endif } else { + extended_step.second.first = path_chunks.first.size(); - // this step does not extend a previous step, so we start a new chunk path_chunks.first.emplace_back(); path_chunks.second.emplace_back(); auto& aln_chunk = path_chunks.first.back(); auto& ref_chunk = path_chunks.second.back(); - // init the ref interval with the interval along the embedded path - ref_chunk.first = step; - ref_chunk.second = step; - + ref_chunk.first = extended_step.first; + ref_chunk.second = extended_step.first; // init the new chunk with the sequence interval aln_chunk.first.first = source.sequence().begin() + before_to_length; aln_chunk.first.second = source.sequence().begin() + through_to_length; - // and with the first mapping - Mapping* mapping = aln_chunk.second.add_mapping(); - *mapping = path.mapping(i); - mapping->set_rank(1); - - // keep track of where this chunk is in the vector and which step it came from - // for the next iteration - next_extending_steps[make_pair(step, path_strand)] = path_chunks.first.size() - 1; + from_proto_mapping(path.mapping(i), *aln_chunk.second.add_mapping()); #ifdef debug_anchored_surject - cerr << "no preceeding chunk so start new chunk " << path_chunks.first.size() - 1 << endl; + cerr << "step on " << graph->get_id(graph->get_handle_of_step(extended_step.first)) << ", pos " << graph->get_position_of_step(extended_step.first) << " has no preceeding reverse chunk, so start new chunk " << path_chunks.first.size() - 1 << endl; #endif + } } // we've finished extending the steps from the previous mapping, so we replace them // with the steps we found in this iteration that we want to extend on the next one - extending_steps = next_extending_steps; + fwd_extending_steps = std::move(next_fwd_extending_steps); + rev_extending_steps = std::move(next_rev_extending_steps); } return to_return; @@ -3715,7 +3757,7 @@ using namespace std; #ifdef debug_filter_paths cerr << "original order for chunks" << endl; for (size_t i = 0; i < path_chunks.size(); ++i) { - cerr << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << pb2json(path_chunks[i].second) << endl; + cerr << i << 
": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << debug_string(path_chunks[i].second) << endl; } cerr << "connections" << endl; for (size_t i = 0; i < outward_connections.size(); ++i) { @@ -3760,7 +3802,7 @@ using namespace std; #ifdef debug_filter_paths cerr << "sort order for chunks" << endl; for (auto i : order) { - cerr << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << pb2json(path_chunks[i].second) << endl; + cerr << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << debug_string(path_chunks[i].second) << endl; } #endif @@ -3776,7 +3818,7 @@ using namespace std; auto& chunk_here = path_chunks[order[i]]; #ifdef debug_filter_paths cerr << "looking for overlapping chunks for " << order[i] << endl; - cerr << string(chunk_here.first.first, chunk_here.first.second) << " " << pb2json(chunk_here.second) << endl; + cerr << string(chunk_here.first.first, chunk_here.first.second) << " " << debug_string(chunk_here.second) << endl; #endif // remove items from the heap if they are outside the window of this read interval @@ -3809,7 +3851,7 @@ using namespace std; auto remaining = chunk_here.first.first - chunk_over.first.first; #ifdef debug_filter_paths cerr << "overlap candidate " << j << endl; - cerr << string(chunk_over.first.first, chunk_over.first.second) << " " << pb2json(chunk_over.second) << endl; + cerr << string(chunk_over.first.first, chunk_over.first.second) << " " << debug_string(chunk_over.second) << endl; cerr << "at relative read offset " << remaining << endl; #endif @@ -3986,7 +4028,7 @@ using namespace std; for (size_t i = 0; i < path_chunks.size(); ++i) { if (redundant[i]) { #ifdef debug_filter_paths - cerr << "filtering path chunk " << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << pb2json(path_chunks[i].second) << endl; + cerr << "filtering path chunk " << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << debug_string(path_chunks[i].second) << endl; #endif ++removed_before[i]; } @@ -4084,7 +4126,7 @@ using namespace std; size_t right_overhang = no_right_expansion ? 
0 : (get_aligner()->longest_detectable_gap(source, path_chunk.first.second) + (source.sequence().end() - path_chunk.first.second)); - const Position& first_pos = path_chunk.second.mapping(0).position(); + const auto& first_pos = path_chunk.second.mapping(0).position(); if (rev_strand) { size_t path_offset = (graph->get_position_of_step(ref_chunk.first) + graph->get_length(graph->get_handle_of_step(ref_chunk.first)) @@ -4103,8 +4145,8 @@ using namespace std; } } - const Mapping& final_mapping = path_chunk.second.mapping(path_chunk.second.mapping_size() - 1); - const Position& final_pos = final_mapping.position(); + const auto& final_mapping = path_chunk.second.mapping(path_chunk.second.mapping_size() - 1); + const auto& final_pos = final_mapping.position(); if (rev_strand) { size_t path_offset = (graph->get_position_of_step(ref_chunk.second) + graph->get_length(graph->get_handle_of_step(ref_chunk.second)) @@ -4128,47 +4170,6 @@ using namespace std; return interval; } - - unordered_map> - Surjector::extract_linearized_path_graph(const PathPositionHandleGraph* graph, MutableHandleGraph* into, - path_handle_t path_handle, size_t first, size_t last) const { - - // TODO: we need better semantics than an unsigned interval for surjecting to circular paths - -#ifdef debug_anchored_surject - cerr << "extracting path graph for position interval " << first << ":" << last << " in path of length " << graph->get_path_length(path_handle) << endl; -#endif - - unordered_map> node_trans; - - step_handle_t begin = graph->get_step_at_position(path_handle, first); - step_handle_t end = graph->get_step_at_position(path_handle, last); - - if (graph->get_position_of_step(end) <= last && end != graph->path_end(path_handle)) { - // we actually want part of this step too, so we use the next one as the end iterator - end = graph->get_next_step(end); - } - - handle_t prev_node; - for (step_handle_t step = begin; step != end; step = graph->get_next_step(step)) { - // copy the node with the local orientation now forward - handle_t copying = graph->get_handle_of_step(step); - handle_t node_here = into->create_handle(graph->get_sequence(copying)); - - if (step != begin) { - // add an edge from the previous node - into->create_edge(prev_node, node_here); - } - - // record the translation - node_trans[into->get_id(node_here)] = pair(graph->get_id(copying), - graph->get_is_reverse(copying)); - - prev_node = node_here; - } - - return node_trans; - } void Surjector::set_path_position(const PathPositionHandleGraph* graph, const pos_t& init_surj_pos, const pos_t& final_surj_pos, const step_handle_t& range_begin, const step_handle_t& range_end, diff --git a/src/surjector.hpp b/src/surjector.hpp index c1536abb03b..e3c3acec96a 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -14,6 +14,7 @@ #include "aligner.hpp" #include "handle.hpp" +#include "path.hpp" #include #include "multipath_alignment.hpp" @@ -97,7 +98,7 @@ using namespace std; bool preserve_deletions = false) const; /// a local type that represents a read interval matched to a portion of the alignment path - using path_chunk_t = pair, Path>; + using path_chunk_t = pair, path_t>; /// the minimum length deletion that the spliced algorithm will interpret as a splice event int64_t min_splice_length = 20; @@ -194,11 +195,6 @@ using namespace std; const vector>& ref_chunks, bool no_left_expansion, bool no_right_expansion) const; - /// make a linear graph that corresponds to a path interval, possibly duplicating nodes in case of cycles - unordered_map> - 
extract_linearized_path_graph(const PathPositionHandleGraph* graph, MutableHandleGraph* into, - path_handle_t path_handle, size_t first, size_t last) const; - /// use the graph position bounds and the path range bounds to assign a path position to a surjected read void set_path_position(const PathPositionHandleGraph* graph, const pos_t& init_surj_pos, const pos_t& final_surj_pos, diff --git a/src/transcriptome.hpp b/src/transcriptome.hpp index 90d3ba13bad..42214df12cb 100644 --- a/src/transcriptome.hpp +++ b/src/transcriptome.hpp @@ -281,7 +281,7 @@ class Transcriptome { /// Parse gtf/gff3 attribute value. string parse_attribute_value(const string & attribute, const string & name) const; - /// Returns the the mean node length of the graph + /// Returns the mean node length of the graph float mean_node_length() const; /// Adds the exon coordinates to a transcript. diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp new file mode 100644 index 00000000000..b18348e16c0 --- /dev/null +++ b/src/traversal_clusters.cpp @@ -0,0 +1,287 @@ +#include "traversal_clusters.hpp" + +//#define debug + +namespace vg { + + +// specialized version of jaccard_coefficient() that weights jaccard by node lengths. +double weighted_jaccard_coefficient(const PathHandleGraph* graph, + const multiset& target, + const multiset& query) { + vector intersection_handles; + std::set_intersection(target.begin(), target.end(), + query.begin(), query.end(), + std::back_inserter(intersection_handles)); + vector union_handles; + std::set_union(target.begin(), target.end(), + query.begin(), query.end(), + std::back_inserter(union_handles)); + + int64_t isec_size = 0; + for (const handle_t& handle : intersection_handles) { + isec_size += graph->get_length(handle); + } + int64_t union_size = 0; + for (const handle_t& handle : union_handles) { + union_size += graph->get_length(handle); + } + return (double)isec_size / (double)union_size; +} + + +vector get_traversal_order(const PathHandleGraph* graph, + const vector& traversals, + const vector& trav_path_names, + const vector& ref_travs, + int64_t ref_trav_idx, + const vector& use_traversal) { + set ref_set(ref_travs.begin(), ref_travs.end()); + + // we want to avoid cyclic references whenever possible, so we downweight in ranking + vector trav_to_occurrences(traversals.size(), 1); + unordered_map sample_to_occurrence; + for (int i = 0; i < traversals.size(); ++i) { + if (use_traversal[i] && !trav_path_names[i].empty()) { + sample_to_occurrence[PathMetadata::parse_sample_name(trav_path_names[i])] += 1; + } + } + for (int i = 0; i < traversals.size(); ++i) { + if (use_traversal[i] && !trav_path_names[i].empty()) { + trav_to_occurrences[i] = sample_to_occurrence[PathMetadata::parse_sample_name(trav_path_names[i])]; + } + } + + function trav_less = [&](int i, int j) { + // give reference set preference priority + bool iref = ref_set.count(i); + bool jref = ref_set.count(j); + if (iref != jref) { + return iref; + } + // fall back to occurrences then name comparison + if (trav_to_occurrences[i] < trav_to_occurrences[j] || + (trav_to_occurrences[i] == trav_to_occurrences[j] && !trav_path_names[i].empty() && (trav_path_names[j].empty() || trav_path_names[i] < trav_path_names[j]))) { + return true; + } + return false; + }; + + vector sorted_travs; + sorted_travs.reserve(traversals.size()); + for (int64_t i = 0; i < traversals.size(); ++i) { + if (use_traversal[i] && (i == ref_trav_idx || !ref_set.count(i))) { + sorted_travs.push_back(i); + } + } +#ifdef debug + cerr << "names:"; + for (auto x : 
trav_path_names) cerr << " " << x; + cerr << endl << "ref set:"; + for (auto x : ref_set) cerr << " " << x; + cerr << endl << "before sort:"; + for (auto x : sorted_travs) cerr << " " << x; +#endif + std::sort(sorted_travs.begin(), sorted_travs.end(), trav_less); +#ifdef debug + cerr << endl << "after sort:"; + for (auto x : sorted_travs) cerr << " " << x; + cerr << endl; +#endif + assert(ref_travs.empty() || sorted_travs.empty() || ref_set.count(sorted_travs.front())); + return sorted_travs; +} + +vector> cluster_traversals(const PathHandleGraph* graph, + const vector& traversals, + const vector& traversal_order, + const vector>& child_snarls, + double min_jaccard, + vector>& out_info, + vector& out_child_snarl_to_trav) { + + assert(traversal_order.size() <= traversals.size()); + + // the values are indexes in the input traversals vector + // the "reference" traversal of each cluster (to which distance is computed) + // is always its first element + vector> clusters; + + out_info.resize(traversals.size(), make_pair(-1., 0)); + out_child_snarl_to_trav.resize(child_snarls.size(), -1); + + // keep track of which traversals cover which child snarls + vector> trav_to_child_snarls = assign_child_snarls_to_traversals(graph, traversals, child_snarls); + // keep track of which child snarls are covered + unordered_set uncovered_child_snarls; + for (const vector& child_snarls : trav_to_child_snarls) { + for (int i : child_snarls) { + uncovered_child_snarls.insert(i); + } + } + + // need the clusters as sorted lists. we'll forget the endpoints while we're at + // it since they're always shared. note we work with multisets since we want to + // count differences between, say, cycle copy numbers. + vector> sorted_traversals(traversals.size()); + for (const int& i : traversal_order) { + const auto& trav = traversals[i]; + assert(trav.size() >=2); + // prune snarl nodes as they're always the same + // todo: does jaccard properly handle empty sets? + int64_t first = trav.size() == 2 ? 0 : 1; + int64_t last = trav.size() == 2 ? trav.size() : trav.size() - 1; + multiset sorted_trav; + for (int64_t i = first; i < last; ++i) { + sorted_trav.insert(trav[i]); + } + sorted_traversals[i] = sorted_trav; + } + + for (const int& i : traversal_order) { + const auto& trav = sorted_traversals[i]; + double max_jaccard = 0; + int64_t max_cluster_idx = -1; + for (int64_t j = 0; j < clusters.size(); ++j) { + const auto& cluster_trav = sorted_traversals[clusters[j][0]]; + double jac = weighted_jaccard_coefficient(graph, trav, cluster_trav); + if (jac > max_jaccard) { + max_jaccard = jac; + max_cluster_idx = j; + if (jac == 1) { + break; + } + } + } + if (max_cluster_idx >= 0 && max_jaccard >= min_jaccard) { + // we've found a suitably similar cluster, add it do that + clusters[max_cluster_idx].push_back(i); + out_info[i] = make_pair(max_jaccard, 0); + } else { + // there's no cluster close enough, need to start a new one + clusters.push_back({i}); + out_info[i] = make_pair(1.0, 0); + // check off all the child snarls it covers + for (const int& child_snarl_idx : trav_to_child_snarls[i]) { + if (uncovered_child_snarls.count(child_snarl_idx)) { + uncovered_child_snarls.erase(child_snarl_idx); + out_child_snarl_to_trav[child_snarl_idx] = i; + } + } + } + } + + // break up clusters until all child snarls are covered + // todo: can/should we factor child coverage into objective of original clustering?? 
+    // todo: is it simpler to relax things so that a traversal can be used as a reference
+    // in a nested snarl, even if it doesn't have an exact allele (ie is a cluster reference)
+    // in the parent? -- for now we keep things simple -- all reference alleles are explicitly represented in the vcf
+    vector<vector<int>> new_clusters;
+    for (int64_t i = clusters.size() - 1; i >= 0 && !uncovered_child_snarls.empty(); --i) {
+        for (int64_t j = clusters[i].size() - 1; j > 0 && !uncovered_child_snarls.empty(); --j) {
+            const vector<int>& trav_childs = trav_to_child_snarls[clusters[i][j]];
+            bool uncovered = false;
+            for (int k : trav_childs) {
+                if (uncovered_child_snarls.count(k)) {
+                    uncovered = true;
+                    uncovered_child_snarls.erase(k);
+                    out_child_snarl_to_trav[k] = clusters[i][j];
+                }
+            }
+            if (uncovered) {
+                new_clusters.push_back({clusters[i][j]});
+                clusters[i].erase(clusters[i].begin() + j);
+            }
+        }
+    }
+    assert(uncovered_child_snarls.empty() || traversal_order.size() < traversals.size());
+
+    // fill in the size deltas
+    for (vector<int>& cluster : clusters) {
+        // only non-zero for clusters with more than one traversal
+        if (cluster.size() > 1) {
+            int64_t cluster_ref_length = -1;
+            for (int64_t i = 1; i < cluster.size(); ++i) {
+                if (out_info[cluster[i]].first < 1) {
+                    // get the cluster reference length on-demand
+                    if (cluster_ref_length == -1) {
+                        cluster_ref_length = 0;
+                        for (const handle_t& handle : traversals[cluster[0]]) {
+                            cluster_ref_length += graph->get_length(handle);
+                        }
+                    }
+                    // compute the length of the non-ref traversal
+                    int64_t length = 0;
+                    for (const handle_t& handle : traversals[cluster[i]]) {
+                        length += graph->get_length(handle);
+                    }
+                    // store the delta
+                    out_info[cluster[i]].second = length - cluster_ref_length;
+                }
+
+            }
+        }
+    }
+
+    return clusters;
+}
+
+vector<vector<int>> assign_child_snarls_to_traversals(const PathHandleGraph* graph,
+                                                      const vector<Traversal>& traversals,
+                                                      const vector<pair<handle_t, handle_t>>& child_snarls) {
+
+    // index the child snarls
+    unordered_map<handle_t, int64_t> handle_to_child;
+    for (int64_t i = 0; i < child_snarls.size(); ++i) {
+        handle_to_child[child_snarls[i].first] = i;
+        handle_to_child[child_snarls[i].second] = i;
+    }
+
+    // use the index to find which snarls are fully contained in a given traversal
+    // this is a linear scan of the traversal, with above index checked (twice) for each handle
+    function<vector<int>(const Traversal&, bool)> get_contained_snarls = [&] (const Traversal& trav, bool fully_contained) {
+        map<int64_t, int64_t> fw_count;
+        map<int64_t, int64_t> rv_count;
+        for (const handle_t& handle : trav) {
+            if (handle_to_child.count(handle)) {
+                fw_count[handle_to_child[handle]] += 1;
+            }
+            handle_t rhandle = graph->flip(handle);
+            if (handle_to_child.count(rhandle)) {
+                rv_count[handle_to_child[rhandle]] += 1;
+            }
+        }
+        vector<int> contained_snarls;
+        for (const auto& cs_count : fw_count) {
+            assert(cs_count.second == 1 || cs_count.second >= 2);
+            if (cs_count.second >= 2 || (!fully_contained && cs_count.second == 1)) {
+                contained_snarls.push_back(cs_count.first);
+            }
+        }
+        for (const auto& cs_count : rv_count) {
+            assert(cs_count.second == 1 || cs_count.second >= 2);
+            if (cs_count.second >= 2 || (!fully_contained && cs_count.second == 1)) {
+                contained_snarls.push_back(cs_count.first);
+            }
+        }
+        return contained_snarls;
+    };
+
+    // fill in the output map
+    vector<vector<int>> trav_to_child_snarls(traversals.size());
+
+    if (!child_snarls.empty()) {
+        for (int64_t i = 0; i < traversals.size(); ++i) {
+            trav_to_child_snarls[i] = get_contained_snarls(traversals[i], true);
+        }
+    }
+
+    return trav_to_child_snarls;
+}
+
+
+}
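The clustering above keys on a length-weighted Jaccard similarity over *multisets* of node handles; using multisets rather than sets is what lets two traversals that differ only in cycle copy number score below 1.0 and land in separate clusters. Below is a minimal, self-contained sketch of the same idea, not vg code: integer ids and the `node_length` table stand in for `handle_t` and `graph->get_length`.

```cpp
#include <algorithm>
#include <iostream>
#include <iterator>
#include <map>
#include <set>
#include <vector>

// toy stand-in for graph->get_length(handle)
static std::map<int, int> node_length = {{1, 1}, {2, 4}, {3, 1}};

double toy_weighted_jaccard(const std::multiset<int>& a, const std::multiset<int>& b) {
    // multiset iteration is sorted, so set_intersection/set_union apply directly
    std::vector<int> isec, uni;
    std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(isec));
    std::set_union(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(uni));
    double isec_len = 0, uni_len = 0;
    for (int id : isec) isec_len += node_length[id];
    for (int id : uni) uni_len += node_length[id];
    return isec_len / uni_len;
}

int main() {
    std::multiset<int> one_pass = {1, 2, 3};
    std::multiset<int> two_pass = {1, 2, 2, 3};  // extra cycle copy of node 2
    // prints 0.6: the second copy of node 2 is in the union but not the intersection,
    // so the cycle is penalized in proportion to the node's length
    std::cout << toy_weighted_jaccard(one_pass, two_pass) << std::endl;
}
```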
diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp
new file mode 100644
index 00000000000..a4e7710d4b5
--- /dev/null
+++ b/src/traversal_clusters.hpp
@@ -0,0 +1,93 @@
+#include "handle.hpp"
+#include "traversal_finder.hpp"
+
+#pragma once
+
+
+/** \file
+* Utilities for finding and clustering similar snarl traversals
+*/
+namespace vg {
+using namespace std;
+
+// output iterator that counts the elements written to it instead of storing them
+template <typename T>
+class count_back_inserter {
+    size_t &count;
+public:
+    typedef void value_type;
+    typedef void difference_type;
+    typedef void pointer;
+    typedef void reference;
+    typedef std::output_iterator_tag iterator_category;
+    count_back_inserter(size_t &count) : count(count) {};
+    void operator=(const T &){ ++count; }
+    count_back_inserter &operator*(){ return *this; }
+    count_back_inserter &operator++(){ return *this; }
+};
+
+// compute the jaccard coefficient (|intersection| / |union|) of two sets
+// target and query should be sorted vectors (or something equivalent)
+template <typename T, typename U>
+inline double jaccard_coefficient(const T& target, const U& query) {
+    size_t isec_size = 0;
+    std::set_intersection(target.begin(), target.end(),
+                          query.begin(), query.end(),
+                          count_back_inserter<handle_t>(isec_size));
+    size_t union_size = 0;
+    std::set_union(target.begin(), target.end(),
+                   query.begin(), query.end(),
+                   count_back_inserter<handle_t>(union_size));
+    return (double)isec_size / (double)union_size;
+}
+
+// specialized version that weights jaccard by node lengths.
+double weighted_jaccard_coefficient(const PathHandleGraph* graph, const multiset<handle_t>& target, const multiset<handle_t>& query);
+
+// the information needed from the parent traversal in order to
+// genotype a child traversal
+struct ParentGenotypeInfo {
+    Traversal ref_traversal;
+    pair<step_handle_t, step_handle_t> ref_steps;
+    unordered_map<string, int64_t> sample_to_ploidy; // to check/enforce consistency
+};
+
+
+/// sort the traversals, putting the reference first then using names
+/// traversals masked out by use_traversal will be filtered out entirely
+/// (so the output vector may be smaller than the input...)
+vector<int> get_traversal_order(const PathHandleGraph* graph,
+                                const vector<Traversal>& traversals,
+                                const vector<string>& trav_path_names,
+                                const vector<int>& ref_travs,
+                                int64_t ref_trav_idx,
+                                const vector<bool>& use_traversal);
+
+
+/// cluster the traversals. The algorithm is:
+/// - visit traversals in the provided order
+/// - if a traversal's jaccard similarity to a cluster's reference traversal is >= min_jaccard, add it to that cluster
+/// - else start a new cluster, with the given traversal as its reference
+/// note that traversal_order can specify a subset of traversals
+/// out_info are similarity/length-delta pairs comparing the traversal to its cluster reference
+/// (if the traversal wasn't in traversal_order, it'll get -1)
+/// if child snarls are given, then we make sure that every child snarl is contained in a reference
+/// traversal (this guarantees the snarl nesting structure is exactly represented in the vcf alleles!)
+/// if child snarls are given, then we also fill in the mapping of each child snarl to its
+/// "reference" traversal -- ie the first traversal in a cluster that contains both its endpoints
+vector<vector<int>> cluster_traversals(const PathHandleGraph* graph,
+                                       const vector<Traversal>& traversals,
+                                       const vector<int>& traversal_order,
+                                       const vector<pair<handle_t, handle_t>>& child_snarls,
+                                       double min_jaccard,
+                                       vector<pair<double, int64_t>>& out_info,
+                                       vector<int>& out_child_snarl_to_trav);
+
+/// assign a list of child snarls to traversals that fully contain them
+/// the output is a list (for each traversal) of each child snarl that's contained in it
+/// so output[i] = {x,y,z} means that child_snarls[x],[y],[z] are in traversals[i]
+vector<vector<int>> assign_child_snarls_to_traversals(const PathHandleGraph* graph,
+                                                      const vector<Traversal>& traversals,
+                                                      const vector<pair<handle_t, handle_t>>& child_snarls);
+
+}
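The `count_back_inserter` in this header is a small output-iterator trick: `std::set_intersection` and `std::set_union` normally copy their results somewhere, but writes through this iterator just bump a counter, so `jaccard_coefficient` never materializes the intersection or union. A standalone illustration of the pattern, with plain `int` elements rather than vg's `handle_t`, and a postfix `operator++` added for portability across standard-library implementations:

```cpp
#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>

// output iterator that discards writes and counts them instead
template <typename T>
class counting_inserter {
    size_t& count;
public:
    using value_type = void;
    using difference_type = void;
    using pointer = void;
    using reference = void;
    using iterator_category = std::output_iterator_tag;
    explicit counting_inserter(size_t& count) : count(count) {}
    void operator=(const T&) { ++count; }          // "writing" an element just counts it
    counting_inserter& operator*() { return *this; }
    counting_inserter& operator++() { return *this; }
    counting_inserter operator++(int) { return *this; }
};

int main() {
    std::vector<int> a = {1, 2, 3, 5};  // inputs must be sorted
    std::vector<int> b = {2, 3, 4, 5};
    size_t isec = 0, uni = 0;
    std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), counting_inserter<int>(isec));
    std::set_union(a.begin(), a.end(), b.begin(), b.end(), counting_inserter<int>(uni));
    std::cout << "jaccard = " << (double)isec / (double)uni << std::endl;  // 3/5 = 0.6
}
```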
"<" : ">"; + return ss + std::to_string(graph->get_id(handle)); + }; + return handle_to_string(start_handle) + handle_to_string(end_handle); +} + PathBasedTraversalFinder::PathBasedTraversalFinder(const PathHandleGraph& g, SnarlManager& sm) : graph(g), snarlmanager(sm){ } @@ -815,9 +845,9 @@ vector ReadRestrictedTraversalFinder::find_traversals(const Snar return to_return; } -PathTraversalFinder::PathTraversalFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager, +PathTraversalFinder::PathTraversalFinder(const PathHandleGraph& graph, const vector& path_names) : - graph(graph), snarl_manager(snarl_manager) { + graph(graph) { for (const string& path_name : path_names) { assert(graph.has_path(path_name)); paths.insert(graph.get_path_handle(path_name)); @@ -828,15 +858,38 @@ vector PathTraversalFinder::find_traversals(const Snarl& site) { return find_path_traversals(site).first; } -pair, vector > > PathTraversalFinder::find_path_traversals(const Snarl& site) { +vector PathTraversalFinder::find_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { + return find_path_traversals(snarl_start, snarl_end).first; +} - handle_t start_handle = graph.get_handle(site.start().node_id(), site.start().backward()); - handle_t end_handle = graph.get_handle(site.end().node_id(), site.end().backward()); - - vector start_steps = graph.steps_of_handle(start_handle); - vector end_steps = graph.steps_of_handle(end_handle); +pair, vector > > find_path_traversals(const Snarl& site); + +pair, vector> PathTraversalFinder::find_path_traversals(const Snarl& site) { - pair, unordered_set > snarl_contents = snarl_manager.deep_contents(&site, graph, true); + pair, vector> path_travs = this->find_path_traversals( + graph.get_handle(site.start().node_id(), site.start().backward()), + graph.get_handle(site.end().node_id(), site.end().backward())); + + vector travs(path_travs.first.size()); + + for (int64_t i = 0; i < path_travs.first.size(); ++i) { + SnarlTraversal& proto_trav = travs[i]; + const Traversal& handle_trav = path_travs.first[i]; + for (int64_t j = 0; j < handle_trav.size(); ++j) { + Visit* visit = proto_trav.add_visit(); + visit->set_node_id(graph.get_id(handle_trav[j])); + visit->set_backward(graph.get_is_reverse(handle_trav[j])); + } + } + return make_pair(travs, path_travs.second); +} + +pair, vector> PathTraversalFinder::find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { + + vector start_steps = graph.steps_of_handle(snarl_start); + // try to make this stable across different systems + std::sort(start_steps.begin(), start_steps.end()); + vector end_steps = graph.steps_of_handle(snarl_end); // use this to skip paths that don't reach the end node unordered_set end_path_handles; @@ -845,40 +898,48 @@ pair, vector > > PathT } #ifdef debug - cerr << "Finding traversals of " << pb2json(site) << " using PathTraversalFinder" << endl + cerr << "Finding traversals of " << (graph->get_is_reverse(snarl_start) ? "<" : ">") << graph->get_id(snarl_start) + << (graph->get_is_reverse(snarl_end) ? 
"<" : ">") << graph->get_id(snarl_edn) << " using PathTraversalFinder" << endl << " - there are " << start_steps.size() << " start_steps, " << end_steps.size() << " end_steps" << " and " << end_path_handles.size() << " end_path_handles" << endl; #endif - vector out_travs; + vector out_travs; vector > out_steps; + // collect edges that lead out the snarl to make sure we don't search in the wrong direction + unordered_set block_edges; + graph.follow_edges(snarl_start, true, [&](handle_t prev) { + block_edges.insert(graph.edge_handle(prev, snarl_start)); + }); + graph.follow_edges(snarl_end, false, [&](handle_t next) { + block_edges.insert(graph.edge_handle(snarl_end, next)); + }); + for (const step_handle_t& start_step : start_steps) { path_handle_t start_path_handle = graph.get_path_handle_of_step(start_step); // only crawl paths that have a chance of reaching the end if ((paths.empty() || paths.count(start_path_handle)) && end_path_handles.count(start_path_handle)) { - handle_t end_check = end_handle; + handle_t end_check = snarl_end; #ifdef debug cerr << " - considering path " << graph.get_path_name(start_path_handle) << endl; #endif // try to make a traversal by walking forward - SnarlTraversal trav; + Traversal trav; bool can_continue = true; step_handle_t step = start_step; while (can_continue) { handle_t handle = graph.get_handle_of_step(step); - Visit* start_visit = trav.add_visit(); - start_visit->set_node_id(graph.get_id(handle)); - start_visit->set_backward(graph.get_is_reverse(handle)); + trav.push_back(handle); can_continue = false; - if (graph.has_next_step(step) && handle != end_handle) { + if (graph.has_next_step(step) && handle != snarl_end) { step_handle_t next_step = graph.get_next_step(step); handle_t next_handle = graph.get_handle_of_step(next_step); - if (snarl_contents.first.count(graph.get_id(next_handle)) && - snarl_contents.second.count(graph.edge_handle(handle, next_handle))) { + // todo: we only need this check once + if (!block_edges.count(graph.edge_handle(handle, next_handle))) { step = next_step; can_continue = true; } @@ -890,25 +951,21 @@ pair, vector > > PathT cerr << " - failed to find forward traversal of path " << graph.get_path_name(start_path_handle) << endl; #endif // try to make a traversal by walking backward - end_check = graph.flip(end_handle); + end_check = graph.flip(snarl_end); - trav.Clear(); + trav.clear(); can_continue = true; step = start_step; while (can_continue) { handle_t handle = graph.flip(graph.get_handle_of_step(step)); - - Visit* start_visit = trav.add_visit(); - start_visit->set_node_id(graph.get_id(handle)); - start_visit->set_backward(graph.get_is_reverse(handle)); + trav.push_back(handle); can_continue = false; - if (graph.has_previous_step(step) && handle != end_handle) { + if (graph.has_previous_step(step) && handle != snarl_end) { step_handle_t prev_step = graph.get_previous_step(step); handle_t prev_handle = graph.flip(graph.get_handle_of_step(prev_step)); - - if (snarl_contents.first.count(graph.get_id(prev_handle)) && - snarl_contents.second.count(graph.edge_handle(handle, prev_handle))) { + // todo: we only need this check once + if (!block_edges.count(graph.edge_handle(prev_handle, handle))) { step = prev_step; can_continue = true; } @@ -2556,7 +2613,7 @@ VCFTraversalFinder::VCFTraversalFinder(const PathHandleGraph& graph, SnarlManage snarl_manager(snarl_manager), skip_alt(skip_alt), max_traversal_cutoff(max_traversal_cutoff), - path_finder(graph, snarl_manager, ref_path_names) { + path_finder(graph, ref_path_names) { 
@@ -2556,7 +2613,7 @@ VCFTraversalFinder::VCFTraversalFinder(const PathHandleGraph& graph, SnarlManage
     snarl_manager(snarl_manager),
     skip_alt(skip_alt),
     max_traversal_cutoff(max_traversal_cutoff),
-    path_finder(graph, snarl_manager, ref_path_names) {
+    path_finder(graph, ref_path_names) {
 
     create_variant_index(vcf, ref_fasta, ins_fasta);
 }
 
@@ -3380,15 +3437,38 @@ GBWTTraversalFinder::~GBWTTraversalFinder() {
 
 pair<vector<SnarlTraversal>, vector<vector<gbwt::size_type>>>
 GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths) {
+
+    pair<vector<Traversal>, vector<vector<gbwt::size_type>>> gbwt_travs = this->find_gbwt_traversals(
+        graph.get_handle(site.start().node_id(), site.start().backward()),
+        graph.get_handle(site.end().node_id(), site.end().backward()),
+        return_paths);
+
+    vector<SnarlTraversal> travs(gbwt_travs.first.size());
+
+    for (int64_t i = 0; i < gbwt_travs.first.size(); ++i) {
+        SnarlTraversal& proto_trav = travs[i];
+        const Traversal& handle_trav = gbwt_travs.first[i];
+        for (int64_t j = 0; j < handle_trav.size(); ++j) {
+            Visit* visit = proto_trav.add_visit();
+            visit->set_node_id(graph.get_id(handle_trav[j]));
+            visit->set_backward(graph.get_is_reverse(handle_trav[j]));
+        }
+    }
+
+    return make_pair(travs, gbwt_travs.second);
+}
+
+pair<vector<Traversal>, vector<vector<gbwt::size_type>>>
+GBWTTraversalFinder::find_gbwt_traversals(const handle_t& snarl_start, const handle_t& snarl_end, bool return_paths) {
     
     // follow all gbwt threads from start to end
     vector<pair<vector<gbwt::node_type>, gbwt::SearchState> > forward_traversals = list_haplotypes(
         graph,
         gbwt,
-        graph.get_handle(site.start().node_id(), site.start().backward()),
+        snarl_start,
         [&] (const vector<gbwt::node_type>& new_thread) {
-            return gbwt::Node::id(new_thread.back()) == site.end().node_id() &&
-                gbwt::Node::is_reverse(new_thread.back()) == site.end().backward();
+            return gbwt::Node::id(new_thread.back()) == graph.get_id(snarl_end) &&
+                gbwt::Node::is_reverse(new_thread.back()) == graph.get_is_reverse(snarl_end);
         });
 
     // follow all gbwt threads from end to start
@@ -3397,15 +3477,15 @@ GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths)
         backward_traversals = list_haplotypes(
             graph,
             gbwt,
-            graph.get_handle(site.end().node_id(), !site.end().backward()),
+            graph.flip(snarl_end),
             [&] (const vector<gbwt::node_type>& new_thread) {
-                return gbwt::Node::id(new_thread.back()) == site.start().node_id() &&
-                gbwt::Node::is_reverse(new_thread.back()) == !site.start().backward();
+                return gbwt::Node::id(new_thread.back()) == graph.get_id(snarl_start) &&
+                gbwt::Node::is_reverse(new_thread.back()) == !graph.get_is_reverse(snarl_start);
             });
     }
 
     // store them all as snarltraversals
-    vector<SnarlTraversal> traversals;
+    vector<Traversal> traversals;
     vector<vector<gbwt::size_type>> gbwt_paths;
     traversals.reserve(forward_traversals.size() + backward_traversals.size());
 
@@ -3413,8 +3493,7 @@ GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths)
     for (int i = 0; i < forward_traversals.size(); ++i) {
         traversals.emplace_back();
         for (auto j = forward_traversals[i].first.begin(); j != forward_traversals[i].first.end(); ++j) {
-            Visit* visit = traversals.back().add_visit();
-            *visit = to_visit(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j));
+            traversals.back().push_back(graph.get_handle(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j)));
        }
         if (return_paths) {
             gbwt_paths.push_back(gbwt.locate(forward_traversals[i].second));
@@ -3458,8 +3537,7 @@ GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths)
             // insert if not duplicate of existing forward traversal
             traversals.emplace_back();
             for (auto j = backward_traversals[i].first.begin(); j != backward_traversals[i].first.end(); ++j) {
-                Visit* visit = traversals.back().add_visit();
-                *visit = to_visit(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j));
+                traversals.back().push_back(graph.get_handle(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j)));
             }
             if (return_paths) {
                 gbwt_paths.push_back(gbwt.locate(backward_traversals[i].second));
@@ -3474,6 +3552,10 @@ vector<SnarlTraversal> GBWTTraversalFinder::find_traversals(const Snarl& site) {
     return find_gbwt_traversals(site, false).first;
 }
 
+vector<Traversal> GBWTTraversalFinder::find_traversals(const handle_t& snarl_start, const handle_t& snarl_end) {
+    return find_gbwt_traversals(snarl_start, snarl_end, false).first;
+}
+
 pair<vector<SnarlTraversal>, vector<gbwt::size_type>> GBWTTraversalFinder::find_path_traversals(const Snarl& site) {
     // get the unique traversals
     pair<vector<SnarlTraversal>, vector<vector<gbwt::size_type>>> gbwt_traversals = find_gbwt_traversals(site, true);
@@ -3492,6 +3574,26 @@ pair<vector<SnarlTraversal>, vector<gbwt::size_type>> GBWTTraversalFinder::find_
     return path_traversals;
 }
 
+pair<vector<Traversal>, vector<gbwt::size_type>> GBWTTraversalFinder::find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end) {
+    // get the unique traversals
+    pair<vector<Traversal>, vector<vector<gbwt::size_type>>> gbwt_traversals = this->find_gbwt_traversals(snarl_start, snarl_end, true);
+
+    // expand them out to one per path (this is to be consistent with PathTraversalFinder as used in deconstruct)
+    pair<vector<Traversal>, vector<gbwt::size_type>> path_traversals;
+    for (size_t i = 0; i < gbwt_traversals.first.size(); ++i) {
+        Traversal& trav = gbwt_traversals.first[i];
+        vector<gbwt::size_type>& paths = gbwt_traversals.second[i];
+        for (size_t j = 0; j < paths.size(); ++j) {
+            path_traversals.first.push_back(trav);
+            path_traversals.second.push_back(paths[j]);
+        }
+    }
+
+    return path_traversals;
+}
+
+
 }
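Taken together, these changes give the traversal finders a second, Protobuf-free entry point keyed on the snarl's boundary handles instead of a `Snarl` message. A sketch of how a caller might drive it, hypothetical usage rather than code from this diff, with the node ids (`>1` and `>6` bounding the snarl) as assumptions:

```cpp
#include <iostream>
#include "traversal_finder.hpp"

// Hypothetical caller of the new handle-based interface (illustration only).
// Assumes `graph` is any PathHandleGraph whose snarl of interest runs >1 .. >6.
void print_snarl_traversals(const vg::PathHandleGraph& graph) {
    // note: no SnarlManager is needed any more
    vg::PathTraversalFinder trav_finder(graph);
    vg::handle_t snarl_start = graph.get_handle(1, false);
    vg::handle_t snarl_end = graph.get_handle(6, false);
    // Traversal is just vector<handle_t>, so no Protobuf objects are built
    auto travs = trav_finder.find_path_traversals(snarl_start, snarl_end);
    for (const vg::Traversal& trav : travs.first) {
        // prints e.g. ">1>2>4>5>6", truncated with "..." past max_steps (10 by default)
        std::cout << vg::traversal_to_string(&graph, trav) << std::endl;
    }
}
```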
diff --git a/src/traversal_finder.hpp b/src/traversal_finder.hpp
index 47ad31e80ae..e39499cba03 100644
--- a/src/traversal_finder.hpp
+++ b/src/traversal_finder.hpp
@@ -36,6 +36,13 @@ using namespace std;
 
 class AugmentedGraph;
 
+// some Protobuf replacements
+using Traversal = vector<handle_t>;
+using PathInterval = pair<step_handle_t, step_handle_t>;
+string traversal_to_string(const PathHandleGraph* graph, const Traversal& traversal, int64_t max_steps = 10);
+// replaces pb2json(snarl)
+string graph_interval_to_string(const HandleGraph* graph, const handle_t& start_handle, const handle_t& end_handle);
+
 /**
  * Represents a strategy for finding traversals of (nested) sites. Polymorphic
  * base class/interface.
@@ -44,7 +51,12 @@ class TraversalFinder {
 public:
     virtual ~TraversalFinder() = default;
 
-    virtual vector<SnarlTraversal> find_traversals(const Snarl& site) = 0;
+    virtual vector<SnarlTraversal> find_traversals(const Snarl& site) = 0;
+
+    // new, Protobuf-free interface. The hope is to eventually deprecate the old one;
+    // for now the new interface is only supported in a few places
+    virtual vector<Traversal> find_traversals(const handle_t& snarl_start, const handle_t& snarl_end) {
+        assert(false); return {};
+    }
 };
 
 class ExhaustiveTraversalFinder : public TraversalFinder {
@@ -201,25 +213,25 @@ class PathTraversalFinder : public TraversalFinder {
     // our graph with indexed path positions
     const PathHandleGraph& graph;
 
-    SnarlManager& snarl_manager;
-
     // restrict to these paths
     unordered_set<path_handle_t> paths;
 
 public:
     // if path_names not empty, only those paths will be considered
-    PathTraversalFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager,
+    PathTraversalFinder(const PathHandleGraph& graph,
                         const vector<string>& path_names = {});
 
     /**
      * Return all traversals through the site that are sub-paths of embedded paths in the graph
     */
     virtual vector<SnarlTraversal> find_traversals(const Snarl& site);
+    virtual vector<Traversal> find_traversals(const handle_t& snarl_start, const handle_t& snarl_end);
 
     /**
     * Like above, but also return the path steps for the traversal endpoints
     */
-    virtual pair<vector<SnarlTraversal>, vector<pair<step_handle_t, step_handle_t> > > find_path_traversals(const Snarl& site);
+    virtual pair<vector<SnarlTraversal>, vector<PathInterval>> find_path_traversals(const Snarl& site);
+    virtual pair<vector<Traversal>, vector<PathInterval>> find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end);
 
 };
 
@@ -636,18 +648,22 @@ class GBWTTraversalFinder : public TraversalFinder {
 
     /* Return a traversal for every gbwt thread through the snarl
     */
     virtual vector<SnarlTraversal> find_traversals(const Snarl& site);
+    virtual vector<Traversal> find_traversals(const handle_t& snarl_start, const handle_t& snarl_end);
 
     /** Return the traversals, paired with their path identifiers in the gbwt. The traversals are
     * unique, but there can be more than one path along each one (hence the vector)
     */
     virtual pair<vector<SnarlTraversal>, vector<vector<gbwt::size_type>>>
     find_gbwt_traversals(const Snarl& site, bool return_paths = true);
+    virtual pair<vector<Traversal>, vector<vector<gbwt::size_type>>>
+    find_gbwt_traversals(const handle_t& snarl_start, const handle_t& snarl_end, bool return_paths = true);
 
     /** Return traversals paired with path identifiers from the GBWT.
The traversals are *not* unique * (which is consistent with PathTraversalFinder) * To get the sample name from the path identifier id, use gbwtgraph::get_path_sample_name(); */ virtual pair, vector> find_path_traversals(const Snarl& site); + virtual pair, vector> find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end); const gbwt::GBWT& get_gbwt() { return gbwt; } diff --git a/src/unittest/gbwt_extender.cpp b/src/unittest/gbwt_extender.cpp index 6d7151c7177..ca7869ac552 100644 --- a/src/unittest/gbwt_extender.cpp +++ b/src/unittest/gbwt_extender.cpp @@ -115,6 +115,13 @@ std::vector> normalize_seeds(std::vector node_count; - for (int i = 0; i < num_tests; ++i) { + int split_edge_trav_count = 0; + int graph_edge_trav_count = 0; + + split.for_each_handle([&](const handle_t& h) { + REQUIRE(split.get_sequence(h) == graph.get_sequence(split.get_underlying_handle(h))); - bdsg::HashGraph graph; - random_graph(seq_size, var_len, var_count, &graph); - - StrandSplitGraph split(&graph); - REQUIRE(handlealgs::is_single_stranded(&split)); + split.follow_edges(h, false, [&](const handle_t& n) { + REQUIRE(graph.has_edge(split.get_underlying_handle(h), + split.get_underlying_handle(n))); + split_edge_trav_count++; + }); + split.follow_edges(h, true, [&](const handle_t& p) { + REQUIRE(graph.has_edge(split.get_underlying_handle(p), + split.get_underlying_handle(h))); + split_edge_trav_count++; + }); - REQUIRE(split.get_node_count() == 2 * graph.get_node_count()); + node_count[split.get_underlying_handle(h)]++; + }); + + graph.for_each_handle([&](const handle_t& h) { + REQUIRE(node_count[h] == 1); + REQUIRE(node_count[graph.flip(h)] == 1); + graph.follow_edges(h, false, [&](const handle_t& n) { + graph_edge_trav_count++; + }); + graph.follow_edges(h, true, [&](const handle_t& p) { + graph_edge_trav_count++; + }); + }); + + REQUIRE(split_edge_trav_count == 2 * graph_edge_trav_count); + } +} + +TEST_CASE("SubpathOverlay works as expected", "[overlay][handle]") { + + + HashGraph graph; + auto h1 = graph.create_handle("A"); + auto h2 = graph.create_handle("C"); + auto h3 = graph.create_handle("G"); + auto h4 = graph.create_handle("T"); + + graph.create_edge(h1, h2); + graph.create_edge(h1, h3); + graph.create_edge(h2, h4); + graph.create_edge(h3, h4); + + auto p = graph.create_path_handle("p"); + + vector steps; + steps.push_back(graph.append_step(p, h1)); + steps.push_back(graph.append_step(p, h2)); + steps.push_back(graph.append_step(p, h4)); + + for (size_t i = 0; i < steps.size(); ++i) { + for (size_t j = i; j < steps.size(); ++j) { - unordered_map node_count; + SubpathOverlay subpath(&graph, steps[i], graph.get_next_step(steps[j])); - int split_edge_trav_count = 0; - int graph_edge_trav_count = 0; + REQUIRE(handlealgs::is_single_stranded(&subpath)); - split.for_each_handle([&](const handle_t& h) { - REQUIRE(split.get_sequence(h) == graph.get_sequence(split.get_underlying_handle(h))); - - split.follow_edges(h, false, [&](const handle_t& n) { - REQUIRE(graph.has_edge(split.get_underlying_handle(h), - split.get_underlying_handle(n))); - split_edge_trav_count++; - }); - split.follow_edges(h, true, [&](const handle_t& p) { - REQUIRE(graph.has_edge(split.get_underlying_handle(p), - split.get_underlying_handle(h))); - split_edge_trav_count++; - }); - - node_count[split.get_underlying_handle(h)]++; + REQUIRE(subpath.get_node_count() == j - i + 1); + size_t manual_node_count = 0; + subpath.for_each_handle([&](const handle_t& h) { + ++manual_node_count; }); + REQUIRE(manual_node_count == 
subpath.get_node_count()); - graph.for_each_handle([&](const handle_t& h) { - REQUIRE(node_count[h] == 1); - REQUIRE(node_count[graph.flip(h)] == 1); - graph.follow_edges(h, false, [&](const handle_t& n) { - graph_edge_trav_count++; - }); - graph.follow_edges(h, true, [&](const handle_t& p) { - graph_edge_trav_count++; - }); - }); + auto nodes = handlealgs::lazier_topological_order(&subpath); - REQUIRE(split_edge_trav_count == 2 * graph_edge_trav_count); + for (size_t k = 0; k < nodes.size(); ++k) { + auto s = graph.get_handle_of_step(steps[i + k]); + REQUIRE(subpath.get_underlying_handle(nodes[k]) == s); + REQUIRE(subpath.get_sequence(nodes[k]) == graph.get_sequence(s)); + + REQUIRE(subpath.get_degree(nodes[k], true) == (k == 0 ? 0 : 1)); + REQUIRE(subpath.get_degree(nodes[k], false) == (k + 1 == nodes.size() ? 0 : 1)); + } } } } + + +} } diff --git a/src/unittest/snarls.cpp b/src/unittest/snarls.cpp index 77d06a9dd73..670de4ad76b 100644 --- a/src/unittest/snarls.cpp +++ b/src/unittest/snarls.cpp @@ -3933,7 +3933,7 @@ namespace vg { CactusSnarlFinder snarl_finder(graph); SnarlManager snarl_manager = snarl_finder.find_snarls(); - PathTraversalFinder trav_finder(graph, snarl_manager); + PathTraversalFinder trav_finder(graph); Snarl snarl; snarl.mutable_start()->set_node_id(2); @@ -3972,7 +3972,7 @@ namespace vg { CactusSnarlFinder snarl_finder(graph); SnarlManager snarl_manager = snarl_finder.find_snarls(); - PathTraversalFinder trav_finder(graph, snarl_manager); + PathTraversalFinder trav_finder(graph); Snarl snarl; snarl.mutable_start()->set_node_id(7); @@ -4013,7 +4013,7 @@ namespace vg { CactusSnarlFinder snarl_finder(graph); SnarlManager snarl_manager = snarl_finder.find_snarls(); - PathTraversalFinder trav_finder(graph, snarl_manager); + PathTraversalFinder trav_finder(graph); Snarl snarl; snarl.mutable_start()->set_node_id(8); @@ -4053,7 +4053,7 @@ namespace vg { CactusSnarlFinder snarl_finder(graph); SnarlManager snarl_manager = snarl_finder.find_snarls(); - PathTraversalFinder trav_finder(graph, snarl_manager); + PathTraversalFinder trav_finder(graph); Snarl snarl; snarl.mutable_start()->set_node_id(11); diff --git a/src/variant_recall.cpp b/src/variant_recall.cpp index 340f86c394a..e85821a3d14 100644 --- a/src/variant_recall.cpp +++ b/src/variant_recall.cpp @@ -82,7 +82,7 @@ void genotype_svs(VG* graph, Deconstructor decon; CactusSnarlFinder finder(xg_index); SnarlManager snarl_manager = finder.find_snarls(); - decon.deconstruct({refpath}, &xg_index, &snarl_manager, false, 1, false, 10000, false, false, false, false); + decon.deconstruct({refpath}, &xg_index, &snarl_manager, false, 10000, false, false, false, false); } direct_ins.clear(); diff --git a/test/nesting/nested_snp_in_del.gfa b/test/nesting/nested_snp_in_del.gfa new file mode 100644 index 00000000000..c122d6827be --- /dev/null +++ b/test/nesting/nested_snp_in_del.gfa @@ -0,0 +1,17 @@ +H VN:Z:1.0 +S 1 C +S 2 A +S 3 T +S 4 A +S 5 G +S 6 T +P x 1+,2+,3+,5+,6+ * +P a#1#y0 1+,2+,4+,5+,6+ * +P a#2#y1 1+,6+ * +L 1 + 2 + 0M +L 1 + 6 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 5 + 0M +L 4 + 5 + 0M +L 5 + 6 + 0M diff --git a/test/nesting/nested_snp_in_ins.gfa b/test/nesting/nested_snp_in_ins.gfa new file mode 100644 index 00000000000..0ed215bde22 --- /dev/null +++ b/test/nesting/nested_snp_in_ins.gfa @@ -0,0 +1,17 @@ +H VN:Z:1.0 +S 1 C +S 2 A +S 3 T +S 4 A +S 5 G +S 6 T +P a#2#y1 1+,2+,3+,5+,6+ * +P a#1#y0 1+,2+,4+,5+,6+ * +P x 1+,6+ * +L 1 + 2 + 0M +L 1 + 6 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 5 + 0M +L 4 + 5 + 0M +L 5 + 6 + 0M 
diff --git a/test/nesting/nested_snp_in_ins2.gfa b/test/nesting/nested_snp_in_ins2.gfa
new file mode 100644
index 00000000000..0416c076c38
--- /dev/null
+++ b/test/nesting/nested_snp_in_ins2.gfa
@@ -0,0 +1,19 @@
+H VN:Z:1.0
+S 1 C
+S 2 A
+S 3 T
+S 4 A
+S 5 G
+S 6 T
+P a#2#y1 1+,2+,3+,5+,6+ *
+P a#1#y0 1+,2+,4+,5+,6+ *
+P x 1+,6+ *
+P b#1#y0 1+,2+,4+,5+,6+ *
+P b#2#y2 1+,6+ *
+L 1 + 2 + 0M
+L 1 + 6 + 0M
+L 2 + 3 + 0M
+L 2 + 4 + 0M
+L 3 + 5 + 0M
+L 4 + 5 + 0M
+L 5 + 6 + 0M
diff --git a/test/nesting/nested_snp_in_ins_cycle.gfa b/test/nesting/nested_snp_in_ins_cycle.gfa
new file mode 100644
index 00000000000..7574da51fdc
--- /dev/null
+++ b/test/nesting/nested_snp_in_ins_cycle.gfa
@@ -0,0 +1,21 @@
+H VN:Z:1.0
+S 1 C
+S 2 A
+S 3 T
+S 4 A
+S 5 G
+S 6 T
+P A#2#y1 1+,2+,3+,5+,6+ *
+P A#1#y0 1+,2+,4+,5+,2+,4+,5+,6+ *
+P a#2#y1 1+,2+,3+,5+,6+ *
+P a#1#y0 1+,2+,4+,5+,6+ *
+P x 1+,6+ *
+L 1 + 2 + 0M
+L 1 + 6 + 0M
+L 2 + 3 + 0M
+L 2 + 4 + 0M
+L 3 + 5 + 0M
+L 4 + 5 + 0M
+L 5 + 6 + 0M
+L 5 + 2 + 0M
+
diff --git a/test/nesting/nested_snp_in_nested_ins.gfa b/test/nesting/nested_snp_in_nested_ins.gfa
new file mode 100644
index 00000000000..98a73015e8e
--- /dev/null
+++ b/test/nesting/nested_snp_in_nested_ins.gfa
@@ -0,0 +1,28 @@
+H VN:Z:1.0
+S 1 C
+S 2 A
+S 3 T
+S 31 T
+S 32 G
+S 33 C
+S 34 C
+S 35 T
+S 4 A
+S 5 G
+S 6 T
+P a#2#y1 1+,2+,3+,31+,32+,34+,35+,5+,6+ *
+P a#1#y0 1+,2+,3+,31+,33+,34+,35+,5+,6+ *
+P x 1+,6+ *
+L 1 + 2 + 0M
+L 1 + 6 + 0M
+L 2 + 3 + 0M
+L 2 + 4 + 0M
+L 3 + 31 + 0M
+L 31 + 32 + 0M
+L 31 + 33 + 0M
+L 32 + 34 + 0M
+L 33 + 34 + 0M
+L 34 + 35 + 0M
+L 35 + 5 + 0M
+L 4 + 5 + 0M
+L 5 + 6 + 0M
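The `P` lines in these nesting fixtures use vg's sample#haplotype#contig path naming (PanSN style): `a#1#y0` is haplotype 1 of sample `a` on contig `y0`, while the bare `x` is the reference path the tests deconstruct against. The deconstruct tests below group traversals by the sample field of such names. A rough sketch of the convention, using a hypothetical helper for illustration rather than vg's actual `PathMetadata` API:

```cpp
#include <iostream>
#include <sstream>
#include <string>

// hypothetical stand-in for vg's parsing of PanSN-style path names
struct PathName {
    std::string sample;     // e.g. "a"
    std::string haplotype;  // e.g. "1"
    std::string contig;     // e.g. "y0"
};

PathName parse_pansn(const std::string& name) {
    PathName out;
    std::istringstream ss(name);
    std::getline(ss, out.sample, '#');
    std::getline(ss, out.haplotype, '#');
    std::getline(ss, out.contig, '#');
    return out;
}

int main() {
    PathName p = parse_pansn("a#1#y0");
    std::cout << p.sample << " " << p.haplotype << " " << p.contig << std::endl;  // a 1 y0
}
```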
0 "gbz deconstruction gives same output as gbwt deconstruction" rm -f x.vg x.xg x.gbwt x.decon.vcf.gz x.decon.vcf.gz.tbi x.decon.vcf x.gbz.decon.vcf x.giraffe.gbz x.min x.dist small.s1.h1.fa small.s1.h2.fa decon.s1.h1.fa decon.s1.h2.fa +# todo you could argue merging shouldn't happen here because there's no child snarl +# this check should come into play with nesting support +vg construct -r small/x.fa -v small/x.vcf.gz | vg view - > small_cluster.gfa +printf "L\t1\t+\t9\t+\t0M\n" >> small_cluster.gfa +printf "P\ty\t1+,2+,4+,6+,7+,9+\t*\n" >> small_cluster.gfa +printf "P\tz\t1+,9+\t*\n" >> small_cluster.gfa +vg deconstruct small_cluster.gfa -p x > small_cluster_0.vcf +vg deconstruct small_cluster.gfa -p x -L 0.3 > small_cluster_3.vcf +is "$(tail -1 small_cluster_0.vcf | awk '{print $5}')" "GATTTGA,G" "cluster-free deconstruction finds all alt alleles" +is "$(tail -1 small_cluster_3.vcf | awk '{print $5}')" "G" "clustered deconstruction finds fewer alt alleles" +is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered deconstruction finds correct allele info" + +rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf + +vg deconstruct nesting/nested_snp_in_del.gfa -p x -nR > nested_snp_in_del.vcf +grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv +printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv +printf "T\tA,*\t1|2\n" >> nested_snp_in_del_truth.tsv +diff nested_snp_in_del.tsv nested_snp_in_del_truth.tsv +is "$?" 0 "nested deconstruction gets correct star-allele for snp inside deletion" + +is $(grep PA=0 nested_snp_in_del.vcf | wc -l) 2 "PA=0 correctly set at both sites of neseted deletion" + +rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv + +vg deconstruct nesting/nested_snp_in_del.gfa -p x -n > nested_snp_in_del.vcf +grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv +printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv +printf "T\tA\t1|.\n" >> nested_snp_in_del_truth.tsv +diff nested_snp_in_del.tsv nested_snp_in_del_truth.tsv +is "$?" 0 "nested deconstruction gets correct allele for snp inside deletion (without -R)" + +rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv + +vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n > nested_snp_in_ins.vcf +grep -v ^# nested_snp_in_ins.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_ins.tsv +printf "A\tT\t0|1\n" > nested_snp_in_ins_truth.tsv +printf "C\tCAAG,CATG\t1|2\n" >> nested_snp_in_ins_truth.tsv +diff nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv +is "$?" 0 "nested deconstruction gets correct allele for snp inside insert" + +is $(grep LV=0 nested_snp_in_ins.vcf | head -1 | grep PA=0 | wc -l) 1 "PA=0 set for base allele of nested insertion" +is $(grep LV=1 nested_snp_in_ins.vcf | tail -1 | grep PA=1 | wc -l) 1 "PA=1 set for nested allele of nested insertion" + +grep ^##contig nested_snp_in_ins.vcf > nested_snp_in_ins_contigs.tsv +printf "##contig=\n" > nested_snp_in_ins_contigs_truth.tsv +printf "##contig=\n" >> nested_snp_in_ins_contigs_truth.tsv +diff nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv +is "$?" 
0 "nested deconstruction gets correct contigs in vcf header for snp inside insert" + +rm -f nested_snp_in_ins.tsv nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv + +vg deconstruct nesting/nested_snp_in_ins2.gfa -p x -nR | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10 "\t" $11}' > nested_snp_in_ins2.tsv +printf "A\tT,*\t0|1\t0|2\n" > nested_snp_in_ins2._truth.tsv +printf "C\tCAAG,CATG\t1|2\t1|0\n" >> nested_snp_in_ins2._truth.tsv +diff nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv +is "$?" 0 "nested deconstruction gets correct star allele for snp ins2.ide ins2.ert" + +rm -f nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv + +# todo: the integrated snarl finder doesnt anchor to the reference +# probably not an issue on most real graphs from vg construct / minigraph cacuts +# but seems like something that needs reviewing +vg snarls nesting/nested_snp_in_nested_ins.gfa -A cactus > nested_snp_in_nested_ins.snarls +vg deconstruct nesting/nested_snp_in_nested_ins.gfa -r nested_snp_in_nested_ins.snarls -P x -n > nested_snp_in_nested_ins.vcf +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=0 | awk '{print $8}') "AC=1,1;AF=0.5,0.5;AN=2;AT=>1>6,>1>2>3>31>33>34>35>5>6,>1>2>3>31>32>34>35>5>6;NS=1;PA=0;PL=1;PR=1;RC=x;RD=1;RL=1;RS=1;LV=0" "INFO tags correct for level-0 site of double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=1 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>2>3>31>33>34>35>5,>2>3>31>32>34>35>5;NS=1;PA=1;PL=8;PR=1;RC=x;RD=1;RL=8;RS=1;LV=1;PS=>1>6" "INFO tags correct for level-1 site of double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=2 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>31>33>34,>31>32>34;NS=1;PA=0;PL=5;PR=5;RC=x;RD=1;RL=8;RS=1;LV=2;PS=>2>5" "INFO tags correct for level-2 site of double-nested SNP" + +rm -f nested_snp_in_nested_ins.snarls nested_snp_in_nested_ins.vcf + +vg deconstruct nesting/nested_snp_in_ins_cycle.gfa -P x -n > nested_snp_in_ins_cycle.vcf +printf "A#1#y0\t3\t>2>5\tA\tT\tAC=2;AF=0.5;AN=4;AT=>2>4>5,>2>3>5;NS=2;PA=1;PL=7;PR=1;RC=x;RD=1;RL=7;RS=1;LV=1;PS=>1>6\t0|1\t0|1\n" > nested_snp_in_ins_cycle_truth.tsv +printf "x\t1\t>1>6\tC\tCAAGAAG,CATG,CAAG\tAC=1,2,1;AF=0.25,0.5,0.25;AN=4;AT=>1>6,>1>2>4>5>2>4>5>6,>1>2>3>5>6,>1>2>4>5>6;NS=2;PA=0;PL=1;PR=1;RC=x;RD=1;RL=1;RS=1;LV=0\t1|2\t3|2\n" >> nested_snp_in_ins_cycle_truth.tsv +grep -v ^# nested_snp_in_ins_cycle.vcf | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $8 "\t" $10 "\t" $11}' > nested_snp_in_ins_cycle.tsv +diff nested_snp_in_ins_cycle.tsv nested_snp_in_ins_cycle_truth.tsv +is "$?" 
0 "nested deconstruction handles cycle" + +rm -f nested_snp_in_ins_cycle.vcf nested_snp_in_ins_cycle_truth.tsv nested_snp_in_ins_cycle_truth.tsv + + diff --git a/test/t/39_vg_inject.t b/test/t/39_vg_inject.t index 541c2e9cbf6..f1aa362cee1 100644 --- a/test/t/39_vg_inject.t +++ b/test/t/39_vg_inject.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 12 +plan tests 13 vg construct -r small/x.fa > j.vg vg index -x j.xg j.vg @@ -52,5 +52,7 @@ cat <(samtools view -H small/x.bam) <(printf "name\t4\t*\t0\t0\t*\t*\t0\t0\tACGT is "$(vg inject -x x.xg unmapped.sam | vg view -aj - | grep "path" | wc -l)" 0 "vg inject does not make an alignment for an umapped read" is "$(echo $?)" 0 "vg inject does not crash on an unmapped read" +is $(vg inject -x x.xg small/x.bam -o GAF | wc -l) \ + 1000 "vg inject supports GAF output" rm j.vg j.xg x.vg x.gcsa x.gcsa.lcp x.xg unmapped.sam diff --git a/test/t/50_vg_giraffe.t b/test/t/50_vg_giraffe.t index 639f17c58d8..66e1ac75f9e 100644 --- a/test/t/50_vg_giraffe.t +++ b/test/t/50_vg_giraffe.t @@ -212,7 +212,7 @@ vg index -j 1mb1kgp.dist 1mb1kgp.vg vg autoindex -p 1mb1kgp -w giraffe -P "VG w/ Variant Paths:1mb1kgp.vg" -P "Giraffe Distance Index:1mb1kgp.dist" -r 1mb1kgp/z.fa -v 1mb1kgp/z.vcf.gz vg giraffe -Z 1mb1kgp.giraffe.gbz -f reads/1mb1kgp_longread.fq >longread.gam -U 300 --progress --track-provenance --align-from-chains # This is an 8001 bp read with 1 insert and 1 substitution -is "$(vg view -aj longread.gam | jq -r '.score')" "7989" "A long read can be correctly aligned" +is "$(vg view -aj longread.gam | jq -r '.score')" "7999" "A long read can be correctly aligned" is "$(vg view -aj longread.gam | jq -c '.path.mapping[].edit[] | select(.sequence)' | wc -l)" "2" "A long read has the correct edits found" is "$(vg view -aj longread.gam | jq -c '. | select(.annotation["filter_3_cluster-coverage_cluster_passed_size_total"] <= 300)' | wc -l)" "1" "Long read minimizer set is correctly restricted" diff --git a/vgci/vgci.sh b/vgci/vgci.sh index 192d6d19839..54b52f09911 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -340,12 +340,14 @@ then # Dependencies for running tests. Need numpy, scipy and sklearn for # running toil-vg mapeval, and dateutils and reqests for - # ./mins_since_last_build.py. Make sure to get the giant buildable modules - # as binaries only to avoid wasting CI time building some slightly nicer - # version Pip prefers. - pip3 install pytest timeout_decorator requests dateutils + # ./mins_since_last_build.py. Need exactly the right version of requests + # for the Python Docker API to work (see + # ). + pip3 install pytest timeout_decorator 'requests==2.31.0' dateutils # TODO: To upgrade to Numpy 1.24+, we need to remove usages of `numpy.int` # AKA `np.int` from toil-vg. See + # Make sure to get the giant buildable modules as binaries only to avoid + # wasting CI time building some slightly nicer version Pip prefers. pip3 install 'numpy==1.23.5' scipy scikit-learn --only-binary :all: # Install Toil