diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 2bafca1cc8a..4f331e3dfa5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -11,7 +11,7 @@ before_script:
- sudo apt-get -q -y update
# Make sure we have some curl stuff for pycurl which we need for some Python stuff
# And the CI report upload needs uuidgen from uuid-runtime
- - sudo apt-get -q -y install --no-upgrade docker.io python3-pip python3-virtualenv libcurl4-gnutls-dev python-dev npm nodejs node-gyp uuid-runtime libgnutls28-dev doxygen libzstd-dev
+ - sudo apt-get -q -y install --no-upgrade docker.io python3-pip python3-virtualenv libcurl4-gnutls-dev python-dev npm nodejs node-gyp uuid-runtime libgnutls28-dev doxygen libzstd-dev bcftools
- which junit-merge || sudo npm install -g junit-merge
# Configure Docker to use a mirror for Docker Hub and restart the daemon
- |
diff --git a/Makefile b/Makefile
index f77ae12c82e..6c8fd7262a1 100644
--- a/Makefile
+++ b/Makefile
@@ -215,6 +215,9 @@ ifeq ($(shell uname -s),Darwin)
# We don't actually do any static linking on Mac, so we leave this empty.
START_STATIC =
END_STATIC =
+
+ # We need to use special flags to let us rename libraries
+ LD_RENAMEABLE_FLAGS = -Wl,-headerpad -Wl,-headerpad_max_install_names
else
# We are not running on OS X
$(info OS is Linux)
@@ -252,7 +255,8 @@ else
# Note that END_STATIC is only safe to use in a mostly-dynamic build, and has to appear or we will try to statically link secret trailing libraries.
END_STATIC = -Wl,-Bdynamic
-
+ # We don't need any flags because we don't need to rename libraries with install_name_tool
+ LD_RENAMEABLE_FLAGS =
endif
# Set the C++ standard we are using
@@ -656,7 +660,7 @@ ifeq ($(shell uname -s),Darwin)
endif
$(LIB_DIR)/libdeflate.a: $(LIBDEFLATE_DIR)/*.h $(LIBDEFLATE_DIR)/lib/*.h $(LIBDEFLATE_DIR)/lib/*/*.h $(LIBDEFLATE_DIR)/lib/*.c $(LIBDEFLATE_DIR)/lib/*/*.c
- +. ./source_me.sh && cd $(LIBDEFLATE_DIR) && V=1 $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR)
+ +. ./source_me.sh && cd $(LIBDEFLATE_DIR) && V=1 LDFLAGS="$(LDFLAGS) $(LD_RENAMEABLE_FLAGS)" $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR)
# We build htslib after libdeflate so it can use libdeflate.
# We need to do some wizardry to get it to pick up the right build and target system types on modern autotools.
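For background: on Mac, `-Wl,-headerpad_max_install_names` reserves space in each Mach-O header so that library install names can be rewritten after linking with `install_name_tool`. A minimal sketch of the kind of rewrite this enables (the paths here are illustrative, not what the build actually produces):

```sh
# inspect the install names a freshly built binary references
otool -L bin/vg
# rewrite one of them in place; this relies on the header padding reserved at link time
install_name_tool -change ./libdeflate.dylib @rpath/libdeflate.dylib bin/vg
```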
diff --git a/README.md b/README.md
index 55556f6f44d..6acfeb9ce29 100644
--- a/README.md
+++ b/README.md
@@ -55,27 +55,41 @@ The easiest way to get vg is to download one of our release builds for Linux. We
If you don't want to or can't use a pre-built release of vg, or if you want to become a vg developer, you can build it from source instead.
+#### Linux: Clone VG
+
First, obtain the repo and its submodules:
git clone --recursive https://github.com/vgteam/vg.git
cd vg
+
+#### Linux: Install Dependencies
Then, install VG's dependencies. You'll need the protobuf and jansson development libraries installed, and to run the tests you will need:
- * `jq`, `bc`, `rs`, and `parallel`
- * `hexdump` and `column` from `bsdmainutils`
- * [`npm` for testing documentation examples](https://github.com/anko/txm)).
+* `jq`, `bc`, `rs`, and `parallel`
+* `hexdump` and `column` from `bsdmainutils`
+* [`npm` for testing documentation examples](https://github.com/anko/txm).
+
On Ubuntu, you should be able to do:
make get-deps
+
+If you get complaints that `sudo` is not found, install it:
+
+ apt update
+ apt install sudo
+
+If you get a bunch of errors like `E: Unable to locate package build-essential`, make sure your package index files are up to date by running:
+
+ sudo apt update
-On other distros, you will need to perform the equivalent of:
+On other distros, or if you do not have root access, you will need to perform the equivalent of:
sudo apt-get install build-essential git cmake pkg-config libncurses-dev libbz2-dev \
protobuf-compiler libprotoc-dev libprotobuf-dev libjansson-dev \
automake gettext autopoint libtool jq bsdmainutils bc rs parallel \
npm curl unzip redland-utils librdf-dev bison flex gawk lzma-dev \
liblzma-dev liblz4-dev libffi-dev libcairo-dev libboost-all-dev \
- libzstd-devel pybind11-dev python3-pybind11
+ libzstd-dev pybind11-dev python3-pybind11
Note that **Ubuntu 16.04** does not ship a sufficiently new Protobuf; vg requires **Protobuf 3** which will have to be manually installed.
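
One way to check which Protobuf you have installed:

```sh
protoc --version   # vg needs this to report 3.x or newer
```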
@@ -85,22 +99,47 @@ Other libraries may be required. Please report any build difficulties.
Note that a 64-bit OS is required. Ubuntu 20.04 should work.
-When you are ready, build with `. ./source_me.sh && make`, and run with `./bin/vg`.
+#### Linux: Build
+
+When you are ready, build with `. ./source_me.sh && make`. You can use `make -j16` to run 16 build threads at a time, which greatly accelerates the process. If you have more CPU cores, you can use higher numbers.
Note that vg can take anywhere from 10 minutes to more than an hour to compile depending on your machine and the number of threads used.
You can also produce a static binary with `make static`, assuming you have static versions of all the dependencies installed on your system.
+#### Linux: Run
+
+Once vg is built, the binary will be at `bin/vg` inside the vg repository directory. You can run it with:
+
+```
+./bin/vg
+```
+
+You can also add its directory to your `PATH` environment variable, so that you can invoke `vg` from any directory. To do that on Bash, use this command from the vg repository directory:
+
+```
+echo 'export PATH="${PATH}:'"$(pwd)"'/bin"' >>~/.bashrc
+```
+
+Then close your terminal and open a new one. Run `vg` to make sure it worked.
+
+If it did not work, make sure that you have a `.bash_profile` file in your home directory that will run your `.bashrc`:
+```
+if [ -f ~/.bashrc ]; then
+ source ~/.bashrc
+fi
+```
+
### Building on MacOS
-#### Clone VG
+#### Mac: Clone VG
The first step is to clone the vg repository:
git clone --recursive https://github.com/vgteam/vg.git
cd vg
-#### Install Dependencies
+#### Mac: Install Dependencies
VG depends on a number of packages being installed on the system where it is being built. Dependencies can be installed using either [MacPorts](https://www.macports.org/install.php) or [Homebrew](http://brew.sh/).
@@ -118,17 +157,35 @@ Homebrew provides another package management solution for OSX, and may be prefer
# Install all the dependencies in the Brewfile
brew bundle
-#### Build
+#### Mac: Build
With dependencies installed, VG can now be built:
. ./source_me.sh && make
+
+As with Linux, you can add `-j16` or other numbers at the end to run multiple build tasks at once, if your computer can handle them.
**Note that static binaries cannot yet be built for Mac.**
Our team has successfully built vg on Mac with GCC versions 4.9, 5.3, 6, 7, and 7.3, as well as Clang 9.0.
-#### Migrating to ARM Macs
+#### Mac: Run
+
+Once vg is built, the binary will be at `bin/vg` inside the vg repository directory. You can run it with:
+
+```
+./bin/vg
+```
+
+You can also add its directory to your `PATH` environment variable, so that you can invoke `vg` from any directory. To do that on the default `zsh` Mac shell, use this command from the vg repository directory:
+
+```
+echo 'export PATH="${PATH}:'"$(pwd)"'/bin"' >>~/.zshrc
+```
+
+Then close your terminal and open a new one. Run `vg` to make sure it worked.
+
+##### Migrate a VG installation from x86 to ARM
The Mac platform is moving to ARM, with Apple's M1, M1 Pro, M1 Max, and subsequent chip designs. The vg codebase supports ARM on Mac as well as on Linux. **The normal installation instructions work on a factory-fresh ARM Mac**.
@@ -396,14 +453,6 @@ vg index hla.vg -x hla.xg
vg deconstruct hla.xg -e -p "gi|568815592:29791752-29792749" > hla_variants.vcf
```
-Variants can also be inferred strictly from topology by not using `-e`, though unlike the above example, cycles are not supported. "Deconstruct" the VCF variants that were used to construct the graph. The output will be similar but identical to `small/x.vcf.gz` as `vg construct` can add edges between adjacent alts and/or do some normalization:
-
-
-```sh
-# using the same graph from the `map` example
-vg deconstruct x.xg -p x > x.vcf
-```
-
 Haplotype paths from `.gbz` or `.gbwt` indexes can be considered using `-z` and `-g`, respectively.
As with `vg call`, it is best to compute snarls separately and pass them in with `-r` when working with large graphs.
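
A minimal sketch of that workflow on a large graph (file names are illustrative, and `-z` assumes GBZ input):

```sh
# compute the snarls once, up front
vg snarls graph.gbz > graph.snarls
# deconstruct using the precomputed snarls and the GBZ haplotype paths
vg deconstruct graph.gbz -z -r graph.snarls > variants.vcf
```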
diff --git a/deps/libvgio b/deps/libvgio
index 9b0d0e11df6..def4827b903 160000
--- a/deps/libvgio
+++ b/deps/libvgio
@@ -1 +1 @@
-Subproject commit 9b0d0e11df6f9bd389ba4dba08d107953eabff8f
+Subproject commit def4827b9034d9624179c442c8568978ca33e5b8
diff --git a/ontology/vg.html b/ontology/vg.html
index 982fc7a668a..586ed09aaa5 100644
--- a/ontology/vg.html
+++ b/ontology/vg.html
@@ -688,7 +688,7 @@
rdfs:comment |
- "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)."
+ "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)."
xsd:string
|
diff --git a/ontology/vg.ttl b/ontology/vg.ttl
index 056e421fa01..39ef0fc795b 100644
--- a/ontology/vg.ttl
+++ b/ontology/vg.ttl
@@ -31,7 +31,7 @@
.
:Step
rdf:type owl:Class ;
- rdfs:comment "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)."^^xsd:string ;
+ rdfs:comment "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)."^^xsd:string ;
rdfs:label "Step"^^xsd:string ;
rdfs:subClassOf owl:Thing ;
.
diff --git a/src/algorithms/alignment_path_offsets.cpp b/src/algorithms/alignment_path_offsets.cpp
index d50f9100818..f781b042377 100644
--- a/src/algorithms/alignment_path_offsets.cpp
+++ b/src/algorithms/alignment_path_offsets.cpp
@@ -3,94 +3,94 @@
//#define debug_mpaln_offsets
namespace vg {
-namespace algorithms {
-
-unordered_map<path_handle_t, vector<pair<size_t, bool> > >
-alignment_path_offsets(const PathPositionHandleGraph& graph,
-                       const Alignment& aln,
-                       bool just_min,
-                       bool nearby,
-                       size_t search_limit,
-                       const std::function<bool(const path_handle_t&)>* path_filter) {
-    if (nearby && search_limit == 0) {
-        // Fill in the search limit
-        search_limit = aln.sequence().size();
-    }
-    unordered_map<path_handle_t, vector<pair<size_t, bool> > > offsets;
- if (graph.get_path_count() == 0) return offsets;
- for (auto& mapping : aln.path().mapping()) {
- // How many bases does this Mapping cover over?
- size_t mapping_width = mapping_from_length(mapping);
- if (mapping_width == 0 && !nearby) {
- // Just skip over this mapping; it touches no bases.
- continue;
- }
- // We may have to consider both the starts and ends of mappings
-        vector<bool> end = {false};
- if (just_min && !nearby) {
- // We want the min actually touched position along each path. It
- // could come from the Mapping start or the Mapping end.
- end.push_back(true);
- }
- // Find the position of this end of this mapping
- pos_t mapping_pos = make_pos_t(mapping.position());
- // Find the positions for this end of this Mapping
- auto pos_offs = algorithms::nearest_offsets_in_paths(&graph, mapping_pos, nearby ? search_limit : -1, path_filter);
- for (auto look_at_end : end) {
- // For the start and the end of the Mapping, as needed
- for (auto& p : pos_offs) {
- // For each path, splice the list of path positions for this Mapping
- // onto the end of the list of positions we found in that path
- auto& v = offsets[p.first];
-                for (pair<size_t, bool>& y : p.second) {
- v.emplace_back(y.second ? y.first - mapping_width : y.first,
- y.second);
+ namespace algorithms {
+
+    unordered_map<path_handle_t, vector<pair<size_t, bool> > >
+    alignment_path_offsets(const PathPositionHandleGraph& graph,
+                           const Alignment& aln,
+                           bool just_min,
+                           bool nearby,
+                           size_t search_limit,
+                           const std::function<bool(const path_handle_t&)>* path_filter) {
+        if (nearby && search_limit == 0) {
+            // Fill in the search limit
+            search_limit = aln.sequence().size();
+        }
+        unordered_map<path_handle_t, vector<pair<size_t, bool> > > offsets;
+ if (graph.get_path_count() == 0) return offsets;
+ for (auto& mapping : aln.path().mapping()) {
+ // How many bases does this Mapping cover over?
+ size_t mapping_width = mapping_from_length(mapping);
+ if (mapping_width == 0 && !nearby) {
+ // Just skip over this mapping; it touches no bases.
+ continue;
+ }
+ // We may have to consider both the starts and ends of mappings
+            vector<bool> end = {false};
+ if (just_min && !nearby) {
+ // We want the min actually touched position along each path. It
+ // could come from the Mapping start or the Mapping end.
+ end.push_back(true);
+ }
+ // Find the position of this end of this mapping
+ pos_t mapping_pos = make_pos_t(mapping.position());
+ // Find the positions for this end of this Mapping
+ auto pos_offs = algorithms::nearest_offsets_in_paths(&graph, mapping_pos, nearby ? search_limit : -1, path_filter);
+ for (auto look_at_end : end) {
+ // For the start and the end of the Mapping, as needed
+ for (auto& p : pos_offs) {
+ // For each path, splice the list of path positions for this Mapping
+ // onto the end of the list of positions we found in that path
+ auto& v = offsets[p.first];
+                    for (pair<size_t, bool>& y : p.second) {
+ v.emplace_back(y.second ? y.first - mapping_width : y.first,
+ y.second);
+ }
+ }
}
}
- }
- }
- if (!nearby && offsets.empty()) {
- // find the nearest if we couldn't find any before
- return alignment_path_offsets(graph, aln, just_min, true, search_limit, path_filter);
- }
- if (just_min) {
- // We need the minimum position for each path
- for (auto& p : offsets) {
- auto& v = p.second;
- auto m = *min_element(v.begin(), v.end(),
-                             [](const pair<size_t, bool>& a,
-                                const pair<size_t, bool>& b)
- { return a.first < b.first; });
- v.clear();
- v.push_back(m);
- }
- }
- return offsets;
-}
-
-unordered_map<path_handle_t, vector<pair<size_t, bool> > >
-multipath_alignment_path_offsets(const PathPositionHandleGraph& graph,
-                                 const multipath_alignment_t& mp_aln,
-                                 const std::function<bool(const path_handle_t&)>* path_filter) {
-
-    using path_positions_t = unordered_map<path_handle_t, vector<pair<size_t, bool>>>;
-
-    // collect the search results for each mapping on each subpath
-    vector<vector<path_positions_t>> search_results(mp_aln.subpath_size());
- for (size_t i = 0; i < mp_aln.subpath_size(); ++i) {
- const subpath_t& subpath = mp_aln.subpath(i);
- auto& subpath_search_results = search_results[i];
- subpath_search_results.resize(subpath.path().mapping_size());
- for (size_t j = 0; j < subpath.path().mapping_size(); ++j) {
- // get the positions on paths that this mapping touches
- pos_t mapping_pos = make_pos_t(subpath.path().mapping(j).position());
- subpath_search_results[j] = nearest_offsets_in_paths(&graph, mapping_pos, 0, path_filter);
- // make sure that offsets are stored in increasing order
-            for (pair<const path_handle_t, vector<pair<size_t, bool>>>& search_record : subpath_search_results[j]) {
- sort(search_record.second.begin(), search_record.second.end());
+ if (!nearby && offsets.empty()) {
+ // find the nearest if we couldn't find any before
+ return alignment_path_offsets(graph, aln, just_min, true, search_limit, path_filter);
+ }
+ if (just_min) {
+ // We need the minimum position for each path
+ for (auto& p : offsets) {
+ auto& v = p.second;
+ auto m = *min_element(v.begin(), v.end(),
+                                 [](const pair<size_t, bool>& a,
+                                    const pair<size_t, bool>& b)
+ { return a.first < b.first; });
+ v.clear();
+ v.push_back(m);
+ }
}
+ return offsets;
+ }
+
+    unordered_map<path_handle_t, vector<pair<size_t, bool> > >
+    multipath_alignment_path_offsets(const PathPositionHandleGraph& graph,
+                                     const multipath_alignment_t& mp_aln,
+                                     const std::function<bool(const path_handle_t&)>* path_filter) {
+
+        using path_positions_t = unordered_map<path_handle_t, vector<pair<size_t, bool>>>;
+
+        // collect the search results for each mapping on each subpath
+        vector<vector<path_positions_t>> search_results(mp_aln.subpath_size());
+ for (size_t i = 0; i < mp_aln.subpath_size(); ++i) {
+ const subpath_t& subpath = mp_aln.subpath(i);
+ auto& subpath_search_results = search_results[i];
+ subpath_search_results.resize(subpath.path().mapping_size());
+ for (size_t j = 0; j < subpath.path().mapping_size(); ++j) {
+ // get the positions on paths that this mapping touches
+ pos_t mapping_pos = make_pos_t(subpath.path().mapping(j).position());
+ subpath_search_results[j] = nearest_offsets_in_paths(&graph, mapping_pos, 0, path_filter);
+ // make sure that offsets are stored in increasing order
+                for (pair<const path_handle_t, vector<pair<size_t, bool>>>& search_record : subpath_search_results[j]) {
+ sort(search_record.second.begin(), search_record.second.end());
+ }
#ifdef debug_mpaln_offsets
- cerr << "subpath " << i << ", mapping " << j << " path locations" << endl;
+ cerr << "subpath " << i << ", mapping " << j << " path locations" << endl;
for (const auto& pps : subpath_search_results[j]) {
cerr << graph.get_path_name(pps.first) << endl;
for (const auto& pp : pps.second) {
@@ -98,132 +98,137 @@ multipath_alignment_path_offsets(const PathPositionHandleGraph& graph,
}
}
#endif
- }
- }
-
- path_positions_t return_val;
-
- // to keep track of whether we've already chosen a position on each path
- // earlier in the multipath alignment in either the forward or reverse pass
-    vector<unordered_set<path_handle_t>> covered_fwd(mp_aln.subpath_size());
-    vector<unordered_set<path_handle_t>> covered_rev(mp_aln.subpath_size());
-
- // forward pass looking for positions on the forward strand of paths
- for (size_t i = 0; i < mp_aln.subpath_size(); ++i) {
- const auto& subpath_search_results = search_results[i];
- for (size_t j = 0; j < subpath_search_results.size(); ++j) {
- for (const auto& path_pos : subpath_search_results[j]) {
- if (!covered_fwd[i].count(path_pos.first)) {
- // we haven't already covered this path at an earlier position on the alignment
- for (const auto& path_offset : path_pos.second) {
- if (!path_offset.second) {
- // there's a position on the forward strand of this path
- return_val[path_pos.first].emplace_back(path_offset);
-
- // we're now covering this path for future search results
- covered_fwd[i].insert(path_pos.first);
-
+ }
+ }
+
+ path_positions_t return_val;
+
+ // to keep track of whether we've already chosen a position on each path
+ // earlier in the multipath alignment in either the forward or reverse pass
+        vector<unordered_set<path_handle_t>> covered_fwd(mp_aln.subpath_size());
+        vector<unordered_set<path_handle_t>> covered_rev(mp_aln.subpath_size());
+
+ // forward pass looking for positions on the forward strand of paths
+ for (size_t i = 0; i < mp_aln.subpath_size(); ++i) {
+ const auto& subpath_search_results = search_results[i];
+ for (size_t j = 0; j < subpath_search_results.size(); ++j) {
+ for (const auto& path_pos : subpath_search_results[j]) {
+ if (!covered_fwd[i].count(path_pos.first)) {
+ // we haven't already covered this path at an earlier position on the alignment
+ for (const auto& path_offset : path_pos.second) {
+ if (!path_offset.second) {
+ // there's a position on the forward strand of this path
+ return_val[path_pos.first].emplace_back(path_offset);
+
+ // we're now covering this path for future search results
+ covered_fwd[i].insert(path_pos.first);
+
#ifdef debug_mpaln_offsets
- cerr << "found fwd pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first << " " << path_offset.second << endl;
+ cerr << "found fwd pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first << " " << path_offset.second << endl;
#endif
-
- break;
+
+ break;
+ }
+ }
}
}
}
+
+ // the following subpaths will be covered for any path that this
+ // one is covered for
+ for (auto n : mp_aln.subpath(i).next()) {
+ auto& next_coverings = covered_fwd[n];
+ for (auto path_handle : covered_fwd[i]) {
+ next_coverings.insert(path_handle);
+ }
+ }
+ for (const auto& c : mp_aln.subpath(i).connection()) {
+ auto& next_coverings = covered_fwd[c.next()];
+ for (auto path_handle : covered_fwd[i]) {
+ next_coverings.insert(path_handle);
+ }
+ }
}
- }
-
- // the following subpaths will be covered for any path that this
- // one is covered for
- for (auto n : mp_aln.subpath(i).next()) {
- auto& next_coverings = covered_fwd[n];
- for (auto path_handle : covered_fwd[i]) {
- next_coverings.insert(path_handle);
- }
- }
- for (const auto& c : mp_aln.subpath(i).connection()) {
- auto& next_coverings = covered_fwd[c.next()];
- for (auto path_handle : covered_fwd[i]) {
- next_coverings.insert(path_handle);
- }
- }
- }
-
- // now do a backward pass for the reverse strand of paths
- for (int64_t i = mp_aln.subpath_size() - 1; i >= 0; --i) {
- // find which paths are already covered in the reverse
- for (auto n : mp_aln.subpath(i).next()) {
- for (auto path_handle : covered_rev[n]) {
- covered_rev[i].insert(path_handle);
- }
- }
- for (const auto& c : mp_aln.subpath(i).connection()) {
- for (auto path_handle : covered_rev[c.next()]) {
- covered_rev[i].insert(path_handle);
- }
- }
-
- const auto& subpath_search_results = search_results[i];
- for (int64_t j = subpath_search_results.size() - 1; j >= 0; --j) {
- for (const auto& path_pos : subpath_search_results[j]) {
- if (!covered_rev[i].count(path_pos.first)) {
- // we haven't already covered this path at an earlier position on the alignment
- for (const auto& path_offset : path_pos.second) {
- if (path_offset.second) {
- // there's a position on the reverse strand of this path
- auto mapping_len = mapping_from_length(mp_aln.subpath(i).path().mapping(j));
- return_val[path_pos.first].emplace_back(path_offset.first - mapping_len,
- path_offset.second);
-
+
+ // now do a backward pass for the reverse strand of paths
+ for (int64_t i = mp_aln.subpath_size() - 1; i >= 0; --i) {
+ // find which paths are already covered in the reverse
+ for (auto n : mp_aln.subpath(i).next()) {
+ for (auto path_handle : covered_rev[n]) {
+ covered_rev[i].insert(path_handle);
+ }
+ }
+ for (const auto& c : mp_aln.subpath(i).connection()) {
+ for (auto path_handle : covered_rev[c.next()]) {
+ covered_rev[i].insert(path_handle);
+ }
+ }
+
+ const auto& subpath_search_results = search_results[i];
+ for (int64_t j = subpath_search_results.size() - 1; j >= 0; --j) {
+ for (const auto& path_pos : subpath_search_results[j]) {
+ if (!covered_rev[i].count(path_pos.first)) {
+ // we haven't already covered this path at an earlier position on the alignment
+ for (const auto& path_offset : path_pos.second) {
+ if (path_offset.second) {
+ // there's a position on the reverse strand of this path
+ auto mapping_len = mapping_from_length(mp_aln.subpath(i).path().mapping(j));
+ return_val[path_pos.first].emplace_back(path_offset.first - mapping_len,
+ path_offset.second);
+
#ifdef debug_mpaln_offsets
- cerr << "found rev pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first - mapping_len << " " << path_offset.second << endl;
+ cerr << "found rev pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first - mapping_len << " " << path_offset.second << endl;
#endif
- // we're now covering this path for future search results
- covered_rev[i].insert(path_pos.first);
-
- break;
+ // we're now covering this path for future search results
+ covered_rev[i].insert(path_pos.first);
+
+ break;
+ }
+ }
}
}
}
}
+
+ return return_val;
}
- }
-
- return return_val;
-}
-
-void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function<bool(const path_handle_t&)>* path_filter) {
-    annotate_with_path_positions(graph, aln, true, search_limit, path_filter);
-}
-
-void annotate_with_node_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function<bool(const path_handle_t&)>* path_filter) {
-    annotate_with_path_positions(graph, aln, false, search_limit, path_filter);
-}
-
-void annotate_with_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, bool just_min, size_t search_limit, const std::function<bool(const path_handle_t&)>* path_filter) {
-    if (!aln.refpos_size()) {
-        // Get requested path positions
-        unordered_map<path_handle_t, vector<pair<size_t, bool> > > positions = alignment_path_offsets(graph, aln, just_min, false, search_limit, path_filter);
-        // emit them in order of the path handle
-        vector<path_handle_t> ordered;
- for (auto& path : positions) { ordered.push_back(path.first); }
- std::sort(ordered.begin(), ordered.end(), [](const path_handle_t& a, const path_handle_t& b) { return as_integer(a) < as_integer(b); });
- for (auto& path : ordered) {
- for (auto& p : positions[path]) {
- // Add each determined refpos
- Position* refpos = aln.add_refpos();
- refpos->set_name(graph.get_path_name(path));
- refpos->set_offset(p.first);
- refpos->set_is_reverse(p.second);
+
+    void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function<bool(const path_handle_t&)>* path_filter) {
+        annotate_with_path_positions(graph, aln, true, search_limit, path_filter);
+    }
+
+    void annotate_with_node_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function<bool(const path_handle_t&)>* path_filter) {
+        annotate_with_path_positions(graph, aln, false, search_limit, path_filter);
+    }
+
+    void annotate_with_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, bool just_min, size_t search_limit, const std::function<bool(const path_handle_t&)>* path_filter) {
+        if (!aln.refpos_size()) {
+            // Get requested path positions
+            unordered_map<path_handle_t, vector<pair<size_t, bool> > > positions = alignment_path_offsets(graph, aln, just_min, false, search_limit, path_filter);
+            // emit them in order of the path handle
+            vector<path_handle_t> ordered;
+ for (auto& path : positions) { ordered.push_back(path.first); }
+ std::sort(ordered.begin(), ordered.end(), [](const path_handle_t& a, const path_handle_t& b) { return as_integer(a) < as_integer(b); });
+ for (auto& path : ordered) {
+ for (auto& p : positions[path]) {
+ // Add each determined refpos
+
+ Position* refpos = aln.add_refpos();
+ subrange_t subrange;
+ string path_name = graph.get_path_name(path);
+ path_name = Paths::strip_subrange(path_name, &subrange);
+ int64_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first;
+ refpos->set_name(path_name);
+ refpos->set_offset(offset + p.first);
+ refpos->set_is_reverse(p.second);
+ }
+ }
}
}
- }
-}
-void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, vector<Alignment>& alns, size_t search_limit, const std::function<bool(const path_handle_t&)>* path_filter) {
-    for (auto& aln : alns) annotate_with_initial_path_positions(graph, aln, search_limit, path_filter);
-}
+    void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, vector<Alignment>& alns, size_t search_limit, const std::function<bool(const path_handle_t&)>* path_filter) {
+        for (auto& aln : alns) annotate_with_initial_path_positions(graph, aln, search_limit, path_filter);
+    }
-}
-}
+ }
+}
\ No newline at end of file
diff --git a/src/algorithms/k_widest_paths.cpp b/src/algorithms/k_widest_paths.cpp
index 3576b71ae02..5bcfd0267b8 100644
--- a/src/algorithms/k_widest_paths.cpp
+++ b/src/algorithms/k_widest_paths.cpp
@@ -246,7 +246,7 @@ vector<pair<double, vector<handle_t>>> yens_k_widest_paths(const HandleGraph* g,
forgotten_nodes.insert(g->flip(prev_path[j]));
}
- // find our path from the the spur_node to the sink
+ // find our path from the spur_node to the sink
         pair<double, vector<handle_t>> spur_path_v = widest_dijkstra(g, spur_node, sink, node_weight_callback, edge_weight_callback,
[&](handle_t h) {return forgotten_nodes.count(h);},
[&](edge_t e) {return forgotten_edges.count(e);},
diff --git a/src/cactus.cpp b/src/cactus.cpp
index 7d5a6bc8ac3..a66c8e90e22 100644
--- a/src/cactus.cpp
+++ b/src/cactus.cpp
@@ -148,7 +148,7 @@ void getReachableBridges(stCactusEdgeEnd *edgeEnd1, stList *bridgeEnds) {
}
/**
- * Finds an arbitrary pair of telomeres in a Cactus graph, which are are either
+ * Finds an arbitrary pair of telomeres in a Cactus graph, which are either
* a pair of bridge edge ends or a pair of chain edge ends, oriented such that
* they form a pair of boundaries.
*
diff --git a/src/clip.cpp b/src/clip.cpp
index 0d2c5efd78d..8b078619fc9 100644
--- a/src/clip.cpp
+++ b/src/clip.cpp
@@ -135,7 +135,7 @@ void visit_contained_snarls(const PathPositionHandleGraph* graph, const vector>& com
}
cerr << strm.str();
#endif
- // the the original component
+ // the original component
components[component_idx] = move(new_components[0]);
// add the remaining to the end
for (size_t i = 1; i < new_components.size(); i++) {
diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp
index 118d9df45a5..aa052db955a 100644
--- a/src/deconstructor.cpp
+++ b/src/deconstructor.cpp
@@ -1,6 +1,7 @@
#include "deconstructor.hpp"
#include "traversal_finder.hpp"
 #include <gbwtgraph/gbwtgraph.h>
+#include "traversal_clusters.hpp"
//#define debug
@@ -8,8 +9,7 @@ using namespace std;
namespace vg {
-Deconstructor::Deconstructor() : VCFOutputCaller(""),
- exhaustive_jaccard_warning(false){
+Deconstructor::Deconstructor() : VCFOutputCaller("") {
}
Deconstructor::~Deconstructor(){
}
@@ -23,13 +23,12 @@ Deconstructor::~Deconstructor(){
* ought to become in the VCF. If a traversal is flagged off, it gets a -1.
*/
 vector<int> Deconstructor::get_alleles(vcflib::Variant& v,
-                                       const pair<vector<SnarlTraversal>,
-                                                  vector<pair<step_handle_t, step_handle_t>>>& path_travs,
+                                       const vector<Traversal>& travs,
+                                       const vector<pair<step_handle_t, step_handle_t>>& trav_steps,
                                        int ref_path_idx,
-                                       const vector<bool>& use_trav,
+                                       const vector<vector<int>>& trav_clusters,
                                        char prev_char, bool use_start) const {
- auto& travs = path_travs.first;
assert(ref_path_idx >=0 && ref_path_idx < travs.size());
// map strings to allele numbers (and their traversal)
@@ -42,14 +41,18 @@ vector<int> Deconstructor::get_alleles(vcflib::Variant& v,
     vector<int> trav_to_allele(travs.size());
// compute the allele as a string
- auto trav_to_string = [&](const SnarlTraversal& trav) {
+ auto trav_to_string = [&](const Traversal& trav) {
string allele;
- // we skip the snarl endpoints
- for (int j = 1; j < trav.visit_size() - 1; ++j) {
- const string& node_sequence = graph->get_sequence(graph->get_handle(trav.visit(j).node_id()));
- allele += trav.visit(j).backward() ? reverse_complement(node_sequence) : node_sequence;
+ // hack to support star alleles
+ if (trav.size() == 0) {
+ allele = "*";
+ } else {
+ // we skip the snarl endpoints
+ for (int j = 1; j < trav.size() - 1; ++j) {
+ allele += toUppercase(graph->get_sequence(trav[j]));
+ }
}
- return toUppercase(allele);
+ return allele;
};
// set the reference allele
@@ -59,10 +62,11 @@ vector<int> Deconstructor::get_alleles(vcflib::Variant& v,
bool substitution = true;
// set the other alleles (they can end up as 0 alleles too if their strings match the reference)
- for (int i = 0; i < travs.size(); ++i) {
- if (i != ref_path_idx) {
- if (use_trav[i]) {
- string allele = trav_to_string(travs[i]);
+ // note that we have one (unique) allele per cluster, so we take advantage of that here
+    for (const vector<int>& cluster : trav_clusters) {
+ string allele = trav_to_string(travs[cluster.front()]);
+ for (const int& i : cluster) {
+ if (i != ref_path_idx) {
auto ai_it = allele_idx.find(allele);
if (ai_it == allele_idx.end()) {
// make a new allele for this string
@@ -133,15 +137,15 @@ vector<int> Deconstructor::get_alleles(vcflib::Variant& v,
if (untangle_allele_traversals) {
// set up for reference position context mapping across allele traversals
- path_handle_t ref_path = graph->get_path_handle_of_step(path_travs.second.at(ref_path_idx).first);
+ path_handle_t ref_path = graph->get_path_handle_of_step(trav_steps.at(ref_path_idx).first);
unordered_map>>> ref_dup_nodes;
unordered_map ref_simple_pos;
{
auto& trav = travs.at(ref_path_idx);
- for (size_t i = 0; i < trav.visit_size(); ++i) {
- size_t j = !reversed ? i : trav.visit_size() - 1 - i;
- const Visit& visit = trav.visit(j);
- nid_t node_id = visit.node_id();
+ for (size_t i = 0; i < trav.size(); ++i) {
+ size_t j = !reversed ? i : trav.size() - 1 - i;
+ const handle_t& handle = trav[j];
+ nid_t node_id = graph->get_id(handle);
if (ref_simple_pos.find(node_id) != ref_simple_pos.end()) continue;
if (ref_dup_nodes.find(node_id) != ref_dup_nodes.end()) continue;
handle_t h = graph->get_handle(node_id);
@@ -188,8 +192,8 @@ vector<int> Deconstructor::get_alleles(vcflib::Variant& v,
for (size_t i = 0; i < allele_idx_unfolded.size(); i++) {
int allele_no = i;
int allele_trav_no = allele_idx_unfolded[i];
- auto start_step = path_travs.second.at(allele_trav_no).first;
- auto end_step = path_travs.second.at(allele_trav_no).second;
+ auto start_step = trav_steps.at(allele_trav_no).first;
+ auto end_step = trav_steps.at(allele_trav_no).second;
auto start_pos = graph->get_position_of_step(start_step);
auto end_pos = graph->get_position_of_step(end_step);
bool flip_path = start_pos > end_pos;
@@ -251,7 +255,7 @@ vector<int> Deconstructor::get_alleles(vcflib::Variant& v,
for (auto& c : ref_contexts) {
auto& ref_context = c.second;
auto& ref_pos = c.first;
- double j = context_jaccard(ref_context, path_context);
+ double j = jaccard_coefficient(ref_context, path_context);
if (j > best_jaccard) {
best_jaccard = j;
best_pos = ref_pos;
@@ -273,7 +277,7 @@ vector<int> Deconstructor::get_alleles(vcflib::Variant& v,
int allele_no = ai_pair.second.first;
int allele_trav_no = ai_pair.second.second;
// update the traversal info
- add_allele_path_to_info(v, allele_no, travs[allele_trav_no], reversed, !substitution);
+ add_allele_path_to_info(graph, v, allele_no, travs[allele_trav_no], reversed, !substitution);
}
}
@@ -289,13 +293,18 @@ vector<int> Deconstructor::get_alleles(vcflib::Variant& v,
}
 void Deconstructor::get_genotypes(vcflib::Variant& v, const vector<string>& names,
-                                  const vector<int>& trav_to_allele) const {
+                                  const vector<int>& trav_to_allele,
+                                  const vector<pair<double, int64_t>>& trav_to_cluster_info) const {
assert(names.size() == trav_to_allele.size());
// set up our variant fields
v.format.push_back("GT");
- if (show_path_info && path_to_sample_phase && path_restricted) {
+ if (show_path_info && path_to_sample_phase) {
v.format.push_back("PI");
}
+ if (this->cluster_threshold < 1.0) {
+ v.format.push_back("TS");
+ v.format.push_back("TL");
+ }
// get a list of traversals for every vcf sample
// (this will be 1:1 unless we're using the path_to_sample name map)
@@ -345,6 +354,18 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector<string>& name
}
genotype += (chosen_travs[i] != -1 && (!conflict || keep_conflicted_genotypes))
? std::to_string(trav_to_allele[chosen_travs[i]]) : ".";
+ if (this->cluster_threshold < 1.0) {
+ if (*genotype.rbegin() == '.') {
+ v.samples[sample_name]["TS"].push_back(".");
+ v.samples[sample_name]["TL"].push_back(".");
+ } else {
+ ostringstream ss;
+ ss.precision(3);
+ ss << trav_to_cluster_info[chosen_travs[i]].first;
+ v.samples[sample_name]["TS"].push_back(ss.str());
+ v.samples[sample_name]["TL"].push_back(std::to_string(trav_to_cluster_info[chosen_travs[i]].second));
+ }
+ }
}
v.samples[sample_name]["GT"] = {genotype};
if (show_path_info && path_to_sample_phase) {
@@ -364,7 +385,7 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector<string>& name
}
}
v.samples[sample_name]["GT"] = {blank_gt};
- if (show_path_info && path_to_sample_phase && path_restricted) {
+ if (show_path_info && path_to_sample_phase) {
v.samples[sample_name]["PI"] = {blank_gt};
}
}
@@ -422,9 +443,9 @@ pair<vector<int>, bool> Deconstructor::choose_traversals(const string& sample_na
std::any_of(gbwt_phases.begin(), gbwt_phases.end(), [](int i) { return i >= 0; });
//|| path_to_sample_phase;
bool phasing_conflict = false;
- int sample_ploidy = ploidy;
+ int sample_ploidy = 1;
int min_phase = 1;
- int max_phase = ploidy;
+ int max_phase = 1;
if (has_phasing || path_to_sample_phase) {
if (has_phasing) {
// override ploidy with information about all phases found in input
@@ -490,38 +511,6 @@ pair<vector<int>, bool> Deconstructor::choose_traversals(const string& sample_na
return make_pair(most_frequent_travs, conflict);
}
-
-// todo refactor if we need to reuse elsewhere in vg
-// implemented inline for development
-// assumes sorted input
-double Deconstructor::context_jaccard(
-    const vector<nid_t>& target,
-    const vector<nid_t>& query) const {
-    size_t node_isec = 0;
-    std::set_intersection(target.begin(), target.end(),
-                          query.begin(), query.end(),
-                          count_back_inserter<nid_t>(node_isec));
-    size_t node_union = 0;
-    std::set_union(target.begin(), target.end(),
-                   query.begin(), query.end(),
-                   count_back_inserter<nid_t>(node_union));
-    return (double)node_isec / (double)node_union;
-}
-
-double Deconstructor::context_jaccard(
-    const dac_vector<>& target,
-    const vector<nid_t>& query) const {
-    size_t node_isec = 0;
-    std::set_intersection(target.begin(), target.end(),
-                          query.begin(), query.end(),
-                          count_back_inserter<nid_t>(node_isec));
-    size_t node_union = 0;
-    std::set_union(target.begin(), target.end(),
-                   query.begin(), query.end(),
-                   count_back_inserter<nid_t>(node_union));
-    return (double)node_isec / (double)node_union;
-}
-
 vector<nid_t> Deconstructor::get_context(
     step_handle_t start_step,
     step_handle_t end_step) const {
@@ -564,69 +553,40 @@ vector<nid_t> Deconstructor::get_context(
return context;
}
-vector<nid_t> Deconstructor::get_context(
-    const pair<vector<SnarlTraversal>,
-               vector<pair<step_handle_t, step_handle_t>>>& path_travs,
-    const int& trav_idx) const {
-    step_handle_t start_step = path_travs.second[trav_idx].first;
-    step_handle_t end_step = path_travs.second[trav_idx].second;
-    return get_context(start_step, end_step);
-}
-bool Deconstructor::deconstruct_site(const Snarl* snarl) const {
-
- auto contents = snarl_manager->shallow_contents(snarl, *graph, false);
- if (contents.first.empty()) {
- // Nothing but the boundary nodes in this snarl
+void Deconstructor::get_traversals(const handle_t& snarl_start, const handle_t& snarl_end,
+                                   vector<Traversal>& out_travs,
+                                   vector<string>& out_trav_path_names,
+                                   vector<pair<step_handle_t, step_handle_t>>& out_trav_steps) const {
+    // empty snarl check
+    vector<handle_t> next_handles;
+    graph->follow_edges(snarl_start, false, [&](handle_t handle) {
+        next_handles.push_back(handle);
+    });
+    if (next_handles.size() == 1 && next_handles.back() == snarl_end) {
#ifdef debug
#pragma omp critical (cerr)
- cerr << "Skipping empty site " << pb2json(*snarl) << endl;
-#endif
- return false;
+ cerr << "Skipping empty site " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl;
+#endif
+ return;
}
+
#ifdef debug
 #pragma omp critical (cerr)
- cerr << "Computing traversals of site " << pb2json(*snarl) << endl;
+ cerr << "Computing traversals of site " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl;
#endif
// find every traversal that runs through a path in the graph
-    pair<vector<SnarlTraversal>, vector<pair<step_handle_t, step_handle_t> > > path_travs;
-    path_travs = path_trav_finder->find_path_traversals(*snarl);
-    vector<string> path_trav_names;
-    for (const pair<step_handle_t, step_handle_t>& trav_ends : path_travs.second) {
- path_trav_names.push_back(graph->get_path_name(graph->get_path_handle_of_step(trav_ends.first)));
- }
-
- // pick out the traversal corresponding to a reference path, breaking ties consistently
- string ref_trav_name;
- for (int i = 0; i < path_travs.first.size(); ++i) {
- const string& path_trav_name = path_trav_names.at(i);
-#ifdef debug
-#pragma omp critical (cerr)
- {
- cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << path_travs.first[i].visit_size()
- << ", start=" << graph->get_position_of_step(path_travs.second[i].first)
- << ", end=" << graph->get_position_of_step(path_travs.second[i].second) << endl
- << " trav=" << pb2json(path_travs.first[i]) << endl;
- }
-#endif
- if (ref_paths.count(path_trav_name) &&
- (ref_trav_name.empty() || path_trav_name < ref_trav_name)) {
- ref_trav_name = path_trav_name;
-#ifdef debug
-#pragma omp critical (cerr)
- cerr << "Setting ref_trav_name " << ref_trav_name << endl;
-#endif
- }
+ std::tie(out_travs, out_trav_steps) = path_trav_finder->find_path_traversals(snarl_start, snarl_end);
+    for (const pair<step_handle_t, step_handle_t>& trav_steps : out_trav_steps) {
+ out_trav_path_names.push_back(graph->get_path_name(graph->get_path_handle_of_step(trav_steps.first)));
}
// add in the gbwt traversals
// after this, all traversals are treated the same, with metadata embedded in their names
- int64_t first_gbwt_trav_idx = path_trav_names.size();
-    vector<gbwt::size_type> gbwt_path_ids;
if (gbwt_trav_finder.get() != nullptr) {
const gbwt::GBWT& gbwt_index = gbwt_trav_finder->get_gbwt();
-        pair<vector<SnarlTraversal>, vector<gbwt::size_type>> thread_travs = gbwt_trav_finder->find_path_traversals(*snarl);
+        pair<vector<Traversal>, vector<gbwt::size_type>> thread_travs = gbwt_trav_finder->find_path_traversals(snarl_start, snarl_end);
for (int i = 0; i < thread_travs.first.size(); ++i) {
// We need to get a bunch of metadata about the path, but the GBWT
// we have might not even have structured path names stored.
@@ -635,7 +595,6 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const {
continue;
}
- gbwt_path_ids.push_back(path_id);
PathSense sense = gbwtgraph::get_path_sense(gbwt_index, path_id, gbwt_reference_samples);
if (sense == PathSense::HAPLOTYPE) {
@@ -648,26 +607,151 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const {
gbwtgraph::get_path_haplotype(gbwt_index, path_id, sense),
gbwtgraph::get_path_phase_block(gbwt_index, path_id, sense),
gbwtgraph::get_path_subrange(gbwt_index, path_id, sense));
- path_trav_names.push_back(path_name);
- path_travs.first.push_back(thread_travs.first[i]);
- // dummy handles so we can use the same code as the named path traversals above
- path_travs.second.push_back(make_pair(step_handle_t(), step_handle_t()));
+ out_trav_path_names.push_back(path_name);
+ out_travs.push_back(std::move(thread_travs.first[i]));
+ }
+ }
+ }
+}
+
+unordered_map<string, vector<int>> Deconstructor::add_star_traversals(vector<Traversal>& travs,
+                                                                      vector<string>& names,
+                                                                      vector<vector<int>>& trav_clusters,
+                                                                      vector<pair<double, int64_t>>& trav_cluster_info,
+                                                                      const unordered_map<string, vector<int>>& parent_haplotypes) const {
+    // todo: refactor this into general genotyping code
+    unordered_map<string, vector<int>> sample_to_haps;
+
+ // find out what's in the traversals
+ assert(names.size() == travs.size());
+ for (int64_t i = 0; i < names.size(); ++i) {
+ string sample_name = PathMetadata::parse_sample_name(names[i]);
+ // for backward compatibility
+ if (sample_name.empty()) {
+ sample_name = names[i];
+ }
+ auto phase = PathMetadata::parse_haplotype(names[i]);
+ if (!sample_name.empty() && phase == PathMetadata::NO_HAPLOTYPE) {
+            // This probably won't fit in an int. Use 0 instead.
+ phase = 0;
+ }
+ sample_to_haps[sample_name].push_back(phase);
+ }
+
+    // find everything that's in parent_haplotypes but not the traversals,
+    // and add in dummy star-alleles for them
+ for (const auto& parent_sample_haps : parent_haplotypes) {
+ string parent_sample_name = PathMetadata::parse_sample_name(parent_sample_haps.first);
+ if (parent_sample_name.empty()) {
+ parent_sample_name = parent_sample_haps.first;
+ }
+ if (!this->sample_names.count(parent_sample_name)) {
+            // don't bother for purely reference samples -- we don't need to force an allele for them.
+ continue;
+ }
+ for (int parent_hap : parent_sample_haps.second) {
+ bool found = false;
+ if (sample_to_haps.count(parent_sample_haps.first)) {
+ // note: this is brute-force search, but number of haplotypes usually tiny.
+ for (int hap : sample_to_haps[parent_sample_haps.first]) {
+ if (parent_hap == hap) {
+ found = true;
+ break;
+ }
+ }
+ }
+ if (!found) {
+ travs.push_back(Traversal());
+ names.push_back(PathMetadata::create_path_name(PathSense::REFERENCE,
+ parent_sample_haps.first,
+ "star",
+ parent_hap,
+ PathMetadata::NO_PHASE_BLOCK,
+ PathMetadata::NO_SUBRANGE));
+ sample_to_haps[parent_sample_haps.first].push_back(parent_hap);
+ trav_clusters.push_back({(int)travs.size() - 1});
+ trav_cluster_info.push_back(make_pair(0, 0));
}
}
}
+ return sample_to_haps;
+}
+
+
+bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end,
+ const NestingInfo* in_nesting_info,
+                                     vector<NestingInfo>* out_nesting_infos) const {
+
+
+    vector<Traversal> travs;
+    vector<string> trav_path_names;
+    // note that this vector (unlike the above two) is for embedded paths only (not GBWT)
+    vector<pair<step_handle_t, step_handle_t>> trav_steps;
+
+ // compute all the traversals from embedded paths and gbwt
+ this->get_traversals(snarl_start, snarl_end, travs, trav_path_names, trav_steps);
+ int64_t trav_count = travs.size();
+ int64_t trav_step_count = trav_steps.size();
+
+ if (travs.empty()) {
+ return false;
+ }
+
+ // pick out the traversal corresponding to an embedded reference path, breaking ties consistently
+ string ref_trav_name;
+ string parent_ref_trav_name;
+ if (in_nesting_info != nullptr && in_nesting_info->has_ref) {
+ parent_ref_trav_name = graph->get_path_name(graph->get_path_handle_of_step(in_nesting_info->parent_path_interval.first));
+#ifdef debug
+#pragma omp critical (cerr)
+ cerr << "Using nesting information to set reference to " << parent_ref_trav_name << endl;
+#endif
+ // remember it for the vcf header
+ this->off_ref_paths[omp_get_thread_num()].insert(graph->get_path_handle_of_step(in_nesting_info->parent_path_interval.first));
+ }
+ for (int i = 0; i < travs.size(); ++i) {
+ const string& path_trav_name = trav_path_names[i];
+#ifdef debug
+#pragma omp critical (cerr)
+ {
+ cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << travs[i].size();
+ if (i < trav_steps.size()) {
+ cerr << ", start=" << graph->get_position_of_step(trav_steps[i].first)
+ << ", end=" << graph->get_position_of_step(trav_steps[i].second) << endl;
+ }
+ cerr << " trav=" << traversal_to_string(graph, travs[i]) << endl;
+ }
+#endif
+ bool ref_path_check;
+ if (!parent_ref_trav_name.empty()) {
+ // the reference was specified by the parent
+ ref_path_check = path_trav_name == parent_ref_trav_name;
+ } else {
+ // the reference comes from the global options
+ ref_path_check = ref_paths.count(path_trav_name);
+ }
+ if (ref_path_check &&
+ (ref_trav_name.empty() || path_trav_name < ref_trav_name)) {
+ ref_trav_name = path_trav_name;
+#ifdef debug
+#pragma omp critical (cerr)
+ cerr << "Setting ref_trav_name " << ref_trav_name << (in_nesting_info ? " using nesting info" : "") << endl;
+#endif
+ }
+ }
+
// remember all the reference traversals (there can be more than one only in the case of a
 // cycle in the reference path)
// in case of cycles, we need our allele traversals to be associated to the correct reference position
 // this is done with the path jaccard metric over all overlapping reference paths, using the given path_jaccard_window size
-
     vector<int> ref_travs;
     // hacky subpath support -- gets added to variant on output
     vector<int64_t> ref_offsets;
if (!ref_trav_name.empty()) {
- for (int i = 0; i < path_travs.first.size(); ++i) {
- const string& path_trav_name = path_trav_names.at(i);
+ for (int i = 0; i < trav_steps.size(); ++i) {
+ const string& path_trav_name = trav_path_names.at(i);
         subrange_t subrange;
Paths::strip_subrange(path_trav_name, &subrange);
int64_t sub_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first;
@@ -676,7 +760,7 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const {
ref_offsets.push_back(sub_offset);
#ifdef debug
#pragma omp critical (cerr)
- cerr << "Adding ref_tav idx=" << i << " offset=" << sub_offset << " because " << path_trav_name << " == " << ref_trav_name << endl;
+ cerr << "Adding ref_trav idx=" << i << " offset=" << sub_offset << " because " << path_trav_name << " == " << ref_trav_name << endl;
#endif
}
}
@@ -687,240 +771,339 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const {
if (ref_travs.empty()) {
#ifdef debug
#pragma omp critical (cerr)
- cerr << "Skipping site because no reference traversal was found " << pb2json(*snarl) << endl;
+ cerr << "Skipping site because no reference traversal was found " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl;
#endif
return false;
}
-
- bool exhaustive = !path_restricted && gbwt_trav_finder.get() == nullptr;
- if (exhaustive) {
- // add in the exhaustive traversals
-        vector<SnarlTraversal> additional_travs;
-
- // exhaustive traversal can't do all snarls
- if (snarl->type() != ULTRABUBBLE) {
- return false;
- }
- if (!check_max_nodes(snarl)) {
-#pragma omp critical (cerr)
- cerr << "Warning: Skipping site because it is too complex for exhaustive traversal enumeration: " << pb2json(*snarl) << endl << " Consider using -e to traverse embedded paths" << endl;
- return false;
- }
- additional_travs = explicit_exhaustive_traversals(snarl);
-
- // happens when there was a nested non-ultrabubble snarl
- if (additional_travs.empty()) {
- return false;
- }
- path_travs.first.insert(path_travs.first.end(), additional_travs.begin(), additional_travs.end());
- for (int i = 0; i < additional_travs.size(); ++i) {
- // dummy names so we can use the same code as the named path traversals above
- path_trav_names.push_back(" >>" + std::to_string(i));
- // dummy handles so we can use the same code as the named path traversals above
- path_travs.second.push_back(make_pair(step_handle_t(), step_handle_t()));
- }
-
- }
     // there's no alt path through the snarl, so we can't make an interesting variant
- if (path_travs.first.size() < 2) {
+ if (travs.size() < 2) {
#ifdef debug
#pragma omp critical (cerr)
- cerr << "Skipping site because to alt traversal was found " << pb2json(*snarl) << endl;
+        cerr << "Skipping site because no alt traversal was found " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl;
#endif
return false;
}
+ if (ref_travs.size() > 1 && this->nested_decomposition) {
+#ifdef debug
+#pragma omp critical (cerr)
+ cerr << "Multiple ref traversals not yet supported with nested decomposition: removing all but first" << endl;
+#endif
+        size_t min_start_pos = numeric_limits<size_t>::max();
+ int64_t first_ref_trav;
+ for (int64_t i = 0; i < ref_travs.size(); ++i) {
+ auto& ref_trav_idx = ref_travs[i];
+ step_handle_t start_step = trav_steps[ref_trav_idx].first;
+ step_handle_t end_step = trav_steps[ref_trav_idx].second;
+ size_t ref_trav_pos = min(graph->get_position_of_step(start_step), graph->get_position_of_step(end_step));
+ if (ref_trav_pos < min_start_pos) {
+ min_start_pos = ref_trav_pos;
+ first_ref_trav = i;
+ }
+ }
+ ref_travs = {ref_travs[first_ref_trav]};
+ }
+
// XXX CHECKME this assumes there is only one reference path here, and that multiple traversals are due to cycles
// we collect windows around the reference traversals
// to compare with equivalent windows from the alternate allele paths
// we will associate these 1:1 with reference traversals
-    // remember that path_travs := pair<vector<SnarlTraversal>, vector<pair<step_handle_t, step_handle_t> > > path_travs;
+    // remember that path_travs := pair<vector<Traversal>, vector<pair<step_handle_t, step_handle_t> > > path_travs;
// map from each path_trav index to the ref_trav index it best maps to
     vector<int> path_trav_to_ref_trav;
- if (ref_travs.size() > 1 && this->path_jaccard_window && exhaustive && !exhaustive_jaccard_warning) {
-#pragma omp critical (cerr)
- cerr << "warning [vg deconstruct]: Conext Jaccard logic for multiple references disabled with exhaustive traversals. Use -e, -g or GBZ input to switch to path-based traversals only (recommended)." << endl;
- exhaustive_jaccard_warning = true;
- }
- if (ref_travs.size() > 1 && this->path_jaccard_window && !exhaustive) {
- path_trav_to_ref_trav.resize(path_travs.first.size());
+ if (ref_travs.size() > 1 && this->path_jaccard_window) {
+ path_trav_to_ref_trav.resize(trav_steps.size());
#ifdef debug
#pragma omp critical (cerr)
cerr << "Multiple ref traversals!" << endl;
#endif
- {
-            vector<vector<nid_t>> ref_contexts(ref_travs.size());
+        vector<vector<nid_t>> ref_contexts(ref_travs.size());
#pragma omp parallel for schedule(dynamic,1)
- for (size_t i = 0; i < ref_travs.size(); ++i) {
- auto& trav_id = ref_travs[i];
- ref_contexts[i] = get_context(path_travs, trav_id);
- }
- // now for each traversal, we compute and equivalent context and match it to a ref context
- // using a jaccard metric over node ids
+ for (size_t i = 0; i < ref_travs.size(); ++i) {
+ ref_contexts[i] = get_context(trav_steps[ref_travs[i]].first, trav_steps[ref_travs[i]].second);
+ }
+
+        // now for each traversal, we compute an equivalent context and match it to a ref context
+ // using a jaccard metric over node ids
#pragma omp parallel for schedule(dynamic,1)
- for (size_t i = 0; i < path_travs.first.size(); ++i) {
-                vector<nid_t> context = get_context(path_travs, i);
-                // map jaccard metric to the index of the ref_trav
-                vector<pair<double, int>> ref_mappings;
- for (uint64_t j = 0; j < ref_travs.size(); ++j) {
- ref_mappings.push_back(make_pair(
- context_jaccard(
- ref_contexts[j],
- context),
- ref_travs[j]));
- }
- std::sort(ref_mappings.begin(), ref_mappings.end());
- // the best is the last, which has the highest jaccard
- path_trav_to_ref_trav[i] = ref_mappings.back().second;
+ for (size_t i = 0; i < trav_steps.size(); ++i) {
+            vector<nid_t> context = get_context(trav_steps[i].first, trav_steps[i].second);
+            // map jaccard metric to the index of the ref_trav
+            vector<pair<double, int>> ref_mappings;
+ for (uint64_t j = 0; j < ref_travs.size(); ++j) {
+ ref_mappings.push_back(make_pair(
+ jaccard_coefficient(
+ ref_contexts[j],
+ context),
+ ref_travs[j]));
}
+ std::stable_sort(ref_mappings.begin(), ref_mappings.end());
+ // the best is the last, which has the highest jaccard
+ path_trav_to_ref_trav[i] = ref_mappings.back().second;
}
}
// we write a variant for every reference traversal
// (optionally) selecting the subset of path traversals that are 1:1
-//#pragma omp parallel for
for (size_t i = 0; i < ref_travs.size(); ++i) {
-//#pragma omp task firstprivate(i)
- {
- auto& ref_trav_idx = ref_travs[i];
- auto& ref_trav_offset = ref_offsets[i];
-
- const SnarlTraversal& ref_trav = path_travs.first[ref_trav_idx];
-
- vcflib::Variant v;
- v.quality = 60;
-
- // in VCF we usually just want the contig
- string contig_name = PathMetadata::parse_locus_name(ref_trav_name);
- if (contig_name == PathMetadata::NO_LOCUS_NAME) {
+ // we zap these to their original size, as the nesting logic can add
+ // dummy traversals and these are reference-specific (and so need to be cleaned each iteration here)
+ travs.resize(trav_count);
+ trav_path_names.resize(trav_count);
+ trav_steps.resize(trav_step_count);
+ auto& ref_trav_idx = ref_travs[i];
+ auto& ref_trav_offset = ref_offsets[i];
+
+ const Traversal& ref_trav = travs[ref_trav_idx];
+
+ vcflib::Variant v;
+ v.quality = 60;
+
+ // in VCF we usually just want the contig
+ string contig_name = PathMetadata::parse_locus_name(ref_trav_name);
+ if (contig_name == PathMetadata::NO_LOCUS_NAME) {
+ contig_name = ref_trav_name;
+ } else if (long_ref_contig) {
+ // the sample name isn't unique enough, so put a full ugly name in the vcf
+ if (PathMetadata::parse_sense(ref_trav_name) == PathSense::GENERIC) {
contig_name = ref_trav_name;
- } else if (long_ref_contig) {
- // the sample name isn't unique enough, so put a full ugly name in the vcf
- if (PathMetadata::parse_sense(ref_trav_name) == PathSense::GENERIC) {
- contig_name = ref_trav_name;
- } else {
- contig_name = PathMetadata::create_path_name(PathSense::REFERENCE,
- PathMetadata::parse_sample_name(ref_trav_name),
- contig_name,
- PathMetadata::parse_haplotype(ref_trav_name),
- PathMetadata::NO_PHASE_BLOCK,
- PathMetadata::NO_SUBRANGE);
- }
+ } else {
+ contig_name = PathMetadata::create_path_name(PathSense::REFERENCE,
+ PathMetadata::parse_sample_name(ref_trav_name),
+ contig_name,
+ PathMetadata::parse_haplotype(ref_trav_name),
+ PathMetadata::NO_PHASE_BLOCK,
+ PathMetadata::NO_SUBRANGE);
}
+ }
- // write variant's sequenceName (VCF contig)
- v.sequenceName = contig_name;
-
- // Map our snarl endpoints to oriented positions in the embedded path in the graph
- handle_t first_path_handle;
- size_t first_path_pos;
- bool use_start;
- assert(ref_trav_idx < first_gbwt_trav_idx);
- step_handle_t start_step = path_travs.second[ref_trav_idx].first;
- step_handle_t end_step = path_travs.second[ref_trav_idx].second;
- handle_t start_handle = graph->get_handle_of_step(start_step);
- handle_t end_handle = graph->get_handle_of_step(end_step);
- size_t start_pos = graph->get_position_of_step(start_step);
- size_t end_pos = graph->get_position_of_step(end_step);
- use_start = start_pos < end_pos;
- first_path_handle = use_start ? start_handle : end_handle;
- first_path_pos = use_start ? start_pos : end_pos;
+ // write variant's sequenceName (VCF contig)
+ v.sequenceName = contig_name;
+
+ // Map our snarl endpoints to oriented positions in the embedded path in the graph
+ handle_t first_path_handle;
+ size_t first_path_pos;
+ bool use_start;
+ step_handle_t start_step = trav_steps[ref_trav_idx].first;
+ step_handle_t end_step = trav_steps[ref_trav_idx].second;
+ handle_t start_handle = graph->get_handle_of_step(start_step);
+ handle_t end_handle = graph->get_handle_of_step(end_step);
+ size_t start_pos = graph->get_position_of_step(start_step);
+ size_t end_pos = graph->get_position_of_step(end_step);
+ use_start = start_pos < end_pos;
+ first_path_handle = use_start ? start_handle : end_handle;
+ first_path_pos = use_start ? start_pos : end_pos;
- // Get the first visit of our snarl traversal
- const Visit& first_trav_visit = use_start ? ref_trav.visit(0) : ref_trav.visit(ref_trav.visit_size() - 1);
-
- char prev_char;
- if ((use_start && first_trav_visit.backward() == graph->get_is_reverse(first_path_handle)) ||
- (!use_start && first_trav_visit.backward() != graph->get_is_reverse(first_path_handle))) {
- // Our path and traversal have consistent orientation. leave off the end of the start node going forward
- first_path_pos += graph->get_length(first_path_handle);
- prev_char = ::toupper(graph->get_sequence(first_path_handle)[graph->get_length(first_path_handle) - 1]);
- } else {
- // They are flipped: leave off the beginning of the start node going backward
- prev_char = reverse_complement(::toupper(graph->get_sequence(first_path_handle)[0]));
- }
+ // Get the first visit of our snarl traversal
+ const handle_t& first_trav_handle = use_start ? ref_trav.front() : ref_trav.back();
+
+ char prev_char;
+ if ((use_start && graph->get_is_reverse(first_trav_handle) == graph->get_is_reverse(first_path_handle)) ||
+ (!use_start && graph->get_is_reverse(first_trav_handle) != graph->get_is_reverse(first_path_handle))) {
+ // Our path and traversal have consistent orientation. leave off the end of the start node going forward
+ first_path_pos += graph->get_length(first_path_handle);
+ prev_char = ::toupper(graph->get_sequence(first_path_handle)[graph->get_length(first_path_handle) - 1]);
+ } else {
+ // They are flipped: leave off the beginning of the start node going backward
+ prev_char = reverse_complement(::toupper(graph->get_sequence(first_path_handle)[0]));
+ }
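+    // Worked example (node sequence hypothetical): if the anchoring node's
+    // forward sequence is "ACGG", the consistent-orientation branch above takes
+    // prev_char = 'G' (its last base) and advances first_path_pos past the node;
+    // the flipped branch takes prev_char = reverse_complement('A') = 'T' and
+    // leaves first_path_pos at the node's start.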
- // shift from 0-based to 1-based for VCF
- first_path_pos += 1;
+ // shift from 0-based to 1-based for VCF
+ first_path_pos += 1;
- v.position = first_path_pos + ref_trav_offset;
+ v.position = first_path_pos + ref_trav_offset;
- v.id = print_snarl(*snarl);
+ v.id = print_snarl(graph, snarl_start, snarl_end);
- // Convert the snarl traversals to strings and add them to the variant
-    vector<bool> use_trav(path_travs.first.size());
- if (path_trav_to_ref_trav.size()) {
- for (uint64_t i = 0; i < use_trav.size(); ++i) {
- use_trav[i] = (ref_trav_idx == path_trav_to_ref_trav[i]);
+ // Convert the snarl traversals to strings and add them to the variant
+    vector<bool> use_trav(travs.size());
+ if (path_trav_to_ref_trav.size()) {
+ for (uint64_t i = 0; i < use_trav.size(); ++i) {
+ use_trav[i] = (ref_trav_idx == path_trav_to_ref_trav[i]);
+ }
+ } else {
+ for (uint64_t i = 0; i < use_trav.size(); ++i) {
+ use_trav[i] = true;
+ }
+ }
+
+ if (std::none_of(use_trav.begin(), use_trav.end(), [](bool b) {return b;})) {
+ // no alts were jaccard-assigned to this reference, so abort before an assertion gets tripped
+ continue;
+ }
+
+ // Sort the traversals for clustering
+    vector<int> sorted_travs = get_traversal_order(graph, travs, trav_path_names, ref_travs, ref_trav_idx, use_trav);
+
+ // jaccard clustering (using handles for now) on traversals
+    vector<pair<double, int64_t>> trav_cluster_info;
+    vector<int> child_snarl_to_trav;
+    vector<vector<int>> trav_clusters = cluster_traversals(graph, travs, sorted_travs,
+                                                           (in_nesting_info ? in_nesting_info->child_snarls :
+                                                            vector<pair<handle_t, handle_t>>()),
+ cluster_threshold,
+ trav_cluster_info,
+ child_snarl_to_trav);
+
+#ifdef debug
+ cerr << "cluster priority";
+ for (const auto& t: sorted_travs) {
+ cerr << " " << t;
+ }
+ cerr << endl;
+ for (const auto& tc : trav_clusters) {
+ cerr << "traversal cluster: { ";
+ for (const auto& t: tc) {
+ cerr << t << "(" << trav_cluster_info[t].first << "," << trav_cluster_info[t].second << ") ";
+ }
+ cerr << " }" << endl;
+ }
+#endif
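+    // Minimal sketch of the similarity measure assumed by the clustering above
+    // (hypothetical helper, not part of this change): traversals merge when the
+    // jaccard coefficient of their handle sets reaches cluster_threshold.
+    //
+    //   double handle_jaccard(const unordered_set<handle_t>& a,
+    //                         const unordered_set<handle_t>& b) {
+    //       size_t isect = 0;
+    //       for (const handle_t& h : a) {
+    //           isect += b.count(h);  // count shared handles
+    //       }
+    //       // jaccard = |intersection| / |union|
+    //       return (double)isect / (double)(a.size() + b.size() - isect);
+    //   }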
+
+    unordered_map<string, vector<int>> sample_to_haps;
+ if (in_nesting_info != nullptr) {
+ // if the reference traversal is also an alt traversal, we pop out an extra copy
+        // todo: this is a hack to add off-reference support while keeping the current
+ // logic where the reference traversal is always distinct from the alts. this step
+ // could be avoided, but it would come at the cost of some detailed refactoring of the
+ // allele getting code...
+ string ref_sample_name = PathMetadata::parse_sample_name(trav_path_names[ref_trav_idx]);
+ if (this->sample_names.count(ref_sample_name)) {
+ int alt_trav_copy = travs.size();
+ travs.push_back(travs[ref_trav_idx]);
+ trav_path_names.push_back(trav_path_names[ref_trav_idx]);
+ trav_cluster_info.push_back(make_pair(0, 0));
+ if (trav_steps.size() == travs.size()) {
+ trav_steps.push_back(trav_steps[ref_trav_idx]);
}
- } else {
- for (uint64_t i = 0; i < use_trav.size(); ++i) {
- use_trav[i] = true;
+ bool found_cluster = false;
+            for (vector<int>& cluster : trav_clusters) {
+                if (cluster[0] == ref_trav_idx) {
+                    found_cluster = true;
+ cluster.push_back(alt_trav_copy);
+ break;
+ }
}
+ assert(found_cluster == true);
+ }
+
+        // add in the star alleles -- these are alleles that were genotyped in the parent but not
+        // the current site, and are treated as *'s in VCF.
+ if (this->star_allele) {
+ sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info,
+ in_nesting_info->sample_to_haplotypes);
}
+
+ }
-    vector<int> trav_to_allele = get_alleles(v, path_travs,
- ref_trav_idx,
- use_trav,
- prev_char, use_start);
+    vector<int> trav_to_allele = get_alleles(v, travs, trav_steps,
+ ref_trav_idx,
+ trav_clusters,
+ prev_char, use_start);
- // Fill in the genotypes
- if (path_restricted || gbwt_trav_finder.get()) {
- get_genotypes(v, path_trav_names, trav_to_allele);
- }
+
+#ifdef debug
+ assert(trav_to_allele.size() == travs.size());
+ cerr << "trav_to_allele =";
+ for (const auto& tta : trav_to_allele) {
+ cerr << " " << tta;
+ }
+ cerr << endl;
+#endif
+
+ // Fill in the genotypes
+ get_genotypes(v, trav_path_names, trav_to_allele, trav_cluster_info);
+
+ // Fill in some nesting-specific (site-level) tags
+ NestingInfo ref_info; // since in_nesting_info is const, we put top-level stuff here
+ if (this->nested_decomposition) {
+ if (in_nesting_info != nullptr && in_nesting_info->has_ref == true) {
+ // if we're a child, just take what's passed in
+ ref_info.parent_allele = in_nesting_info->parent_allele;
+ ref_info.parent_len = in_nesting_info->parent_len;
+ ref_info.parent_ref_len = in_nesting_info->parent_ref_len;
+ ref_info.lv0_ref_name = in_nesting_info->lv0_ref_name;
+ ref_info.lv0_ref_start = in_nesting_info->lv0_ref_start;
+ ref_info.lv0_ref_len = in_nesting_info->lv0_ref_len;
+ ref_info.lv0_alt_len = in_nesting_info->lv0_alt_len;
+ } else {
+            // if we're a root, compute values from the present site
+ // todo: should they just be left undefined?
+ ref_info.parent_allele = 0;
+ ref_info.parent_len = v.alleles[0].length();
+ ref_info.parent_ref_len = v.alleles[0].length();
+ ref_info.lv0_ref_name = v.sequenceName;
+ ref_info.lv0_ref_start = v.position;
+ ref_info.lv0_ref_len = v.alleles[0].length();
+ ref_info.lv0_alt_len = v.alleles[ref_info.parent_allele].length();
+ }
+ v.info["PA"].push_back(std::to_string(ref_info.parent_allele));
+ v.info["PL"].push_back(std::to_string(ref_info.parent_len));
+ v.info["PR"].push_back(std::to_string(ref_info.parent_ref_len));
+ v.info["RC"].push_back(ref_info.lv0_ref_name);
+ v.info["RS"].push_back(std::to_string(ref_info.lv0_ref_start));
+ v.info["RD"].push_back(std::to_string(ref_info.lv0_ref_len));
+ v.info["RL"].push_back(std::to_string(ref_info.lv0_alt_len));
+ }
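+    // Illustrative output (values hypothetical): the pushes above yield INFO
+    // tags like PA=1;PL=58;PR=30;RC=chr1;RS=10344;RD=30;RL=58 on a child record.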
- // we only bother printing out sites with at least 1 non-reference allele
- if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) {
- if (path_restricted || gbwt_trav_finder.get()) {
- // run vcffixup to add some basic INFO like AC
- vcf_fixup(v);
+ if (i == 0 && out_nesting_infos != nullptr) {
+ // we pass some information down to the children
+        // todo: do/can we consider all the different reference intervals?
+ // right now, the info passed is hopefully coarse-grained enough not to matter?
+ assert(in_nesting_info != nullptr &&
+ in_nesting_info->child_snarls.size() == out_nesting_infos->size());
+
+ for (int64_t j = 0; j < out_nesting_infos->size(); ++j) {
+ out_nesting_infos->at(j).child_snarls.clear();
+ out_nesting_infos->at(j).has_ref = false;
+ if (child_snarl_to_trav[j] >= 0) {
+ if (child_snarl_to_trav[j] < trav_steps.size()) {
+ NestingInfo& child_info = out_nesting_infos->at(j);
+ child_info.has_ref = true;
+ child_info.parent_path_interval = trav_steps[child_snarl_to_trav[j]];
+ child_info.sample_to_haplotypes = sample_to_haps;
+ child_info.parent_allele = trav_to_allele[child_snarl_to_trav[j]] >= 0 ?
+ trav_to_allele[child_snarl_to_trav[j]] : 0;
+ child_info.parent_len = v.alleles[child_info.parent_allele].length();
+ child_info.parent_ref_len = v.alleles[0].length();
+ child_info.lv0_ref_name = ref_info.lv0_ref_name;
+ child_info.lv0_ref_start = ref_info.lv0_ref_start;
+ child_info.lv0_ref_len = ref_info.lv0_ref_len;
+ if (in_nesting_info == nullptr || in_nesting_info->has_ref == false) {
+ // we're the parent of root, so we want to set this here
+ child_info.lv0_alt_len = child_info.parent_len;
+ } else {
+ child_info.lv0_alt_len = ref_info.lv0_alt_len;
+ }
+ }
}
- add_variant(v);
+ }
+ }
+
+ // we only bother printing out sites with at least 1 non-reference allele
+ if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) {
+ // run vcffixup to add some basic INFO like AC
+ vcf_fixup(v);
+ bool added = add_variant(v);
+ if (!added) {
+ stringstream ss;
+ ss << v;
+ cerr << "Warning [vg deconstruct]: Skipping variant at " << v.sequenceName << ":" << v.position
+ << " with ID=" << v.id << " because its line length of " << ss.str().length() << " exceeds vg's limit of "
+ << VCFOutputCaller::max_vcf_line_length << endl;
+ return false;
}
}
}
-//#pragma omp taskwait
return true;
}
-/**
- * Convenience wrapper function for deconstruction of multiple paths.
- */
-void Deconstructor::deconstruct(vector<string> ref_paths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager,
- bool path_restricted_traversals,
- int ploidy,
- bool include_nested,
- int context_jaccard_window,
- bool untangle_traversals,
- bool keep_conflicted,
- bool strict_conflicts,
- bool long_ref_contig,
- gbwt::GBWT* gbwt) {
-
- this->graph = graph;
- this->snarl_manager = snarl_manager;
- this->path_restricted = path_restricted_traversals;
- this->ploidy = ploidy;
-    this->ref_paths = set<string>(ref_paths.begin(), ref_paths.end());
- this->include_nested = include_nested;
- this->path_jaccard_window = context_jaccard_window;
- this->untangle_allele_traversals = untangle_traversals;
- this->keep_conflicted_genotypes = keep_conflicted;
- this->strict_conflict_checking = strict_conflicts;
- if (gbwt) {
- this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt);
- }
-
- // the need to use nesting is due to a problem with omp tasks and shared state
- // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 1)
- omp_set_nested(1);
- omp_set_max_active_levels(3);
-
+string Deconstructor::get_vcf_header() {
// Keep track of the non-reference paths in the graph. They'll be our sample names
ref_samples.clear();
     set<size_t> ref_haplotypes;
@@ -929,7 +1112,7 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand
ref_haplotypes.insert(PathMetadata::parse_haplotype(ref_path_name));
}
if (!long_ref_contig) {
- long_ref_contig = ref_samples.size() > 1 || ref_haplotypes.size() > 1;
+ long_ref_contig = ref_samples.size() > 1 || ref_haplotypes.size() > 1 || nested_decomposition;
}
this->long_ref_contig = long_ref_contig;
sample_names.clear();
@@ -984,6 +1167,13 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand
}
}
+ if (sample_to_haps.empty()) {
+ cerr << "Error [vg deconstruct]: No paths found for alt alleles in the graph. Note that "
+ << "exhaustive path-free traversal finding is no longer supported, and vg deconstruct "
+ << "now only works on embedded paths and GBWT threads." << endl;
+ exit(1);
+ }
+
// find some stats about the haplotypes for each sample
gbwt_sample_to_phase_range.clear();
sample_ploidys.clear();
@@ -995,10 +1185,8 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand
// print the VCF header
stringstream stream;
stream << "##fileformat=VCFv4.2" << endl;
- if (path_restricted || gbwt) {
- stream << "##FORMAT=" << endl;
- }
- if (show_path_info && path_to_sample_phase && path_restricted) {
+ stream << "##FORMAT=" << endl;
+ if (show_path_info && path_to_sample_phase) {
stream << "##FORMAT=" << endl;
}
if (path_to_sample_phase || gbwt) {
@@ -1008,208 +1196,252 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand
}
stream << "\">" << endl;
}
- if (path_restricted || gbwt) {
- stream << "##INFO=" << endl;
- stream << "##INFO=" << endl;
- stream << "##INFO=" << endl;
- stream << "##INFO=" << endl;
+ if (path_to_sample_phase && cluster_threshold < 1) {
+ stream << "##FORMAT="
+ << endl;
+ stream << "##FORMAT="
+ << endl;
+
}
+ stream << "##INFO=" << endl;
+ stream << "##INFO=" << endl;
+ stream << "##INFO=" << endl;
+ stream << "##INFO=" << endl;
if (include_nested) {
stream << "##INFO=" << endl;
stream << "##INFO=" << endl;
}
+ if (this->nested_decomposition) {
+ stream << "##INFO=" << endl;
+ stream << "##INFO=" << endl;
+ stream << "##INFO=" << endl;
+ stream << "##INFO=" << endl;
+ stream << "##INFO=" << endl;
+ stream << "##INFO=" << endl;
+ stream << "##INFO=" << endl;
+ }
if (untangle_allele_traversals) {
stream << "##INFO=|<][id]_[start|.]_[end|.], with '.' indicating non-reference nodes.\">" << endl;
} else {
stream << "##INFO=" << endl;
}
-    set<string> gbwt_ref_paths;
+
+ stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";
+ for (auto& sample_name : sample_names) {
+ stream << "\t" << sample_name;
+ }
+ stream << endl;
+ return stream.str();
+}
+
+string Deconstructor::add_contigs_to_vcf_header(const string& vcf_header) const {
+
+    vector<string> header_lines = split_delims(vcf_header, "\n");
+
+ stringstream patched_header;
+ for (int64_t i = 0; i < header_lines.size() - 1; ++i) {
+ patched_header << header_lines[i] << "\n";
+ }
+
+    set<string> all_ref_paths = this->ref_paths;
+
+ // add in the off-ref paths that nested deconstruction may have found
+    for (const unordered_set<path_handle_t>& off_ref_path_set : this->off_ref_paths) {
+ for (const path_handle_t& off_ref_path : off_ref_path_set) {
+ all_ref_paths.insert(graph->get_path_name(off_ref_path));
+ }
+ }
+
     map<string, int64_t> ref_path_to_length;
- for(auto& refpath : ref_paths) {
- if (graph->has_path(refpath)) {
- int64_t path_len = 0;
- path_handle_t path_handle = graph->get_path_handle(refpath);
- for (handle_t handle : graph->scan_path(path_handle)) {
- path_len += graph->get_length(handle);
+ for(auto& refpath : all_ref_paths) {
+ assert(graph->has_path(refpath));
+ int64_t path_len = 0;
+ path_handle_t path_handle = graph->get_path_handle(refpath);
+ for (handle_t handle : graph->scan_path(path_handle)) {
+ path_len += graph->get_length(handle);
+ }
+ string locus_name = graph->get_locus_name(path_handle);
+ if (locus_name == PathMetadata::NO_LOCUS_NAME) {
+ locus_name = refpath;
+ } else if (long_ref_contig) {
+ // the sample name isn't unique enough, so put a full ugly name in the vcf
+ if (graph->get_sense(path_handle) == PathSense::GENERIC) {
+ locus_name = graph->get_path_name(path_handle);
+ } else {
+ locus_name = PathMetadata::create_path_name(PathSense::REFERENCE,
+ graph->get_sample_name(path_handle),
+ locus_name,
+ graph->get_haplotype(path_handle),
+ PathMetadata::NO_PHASE_BLOCK,
+ PathMetadata::NO_SUBRANGE);
}
- string locus_name = graph->get_locus_name(path_handle);
- if (locus_name == PathMetadata::NO_LOCUS_NAME) {
- locus_name = refpath;
- } else if (long_ref_contig) {
- // the sample name isn't unique enough, so put a full ugly name in the vcf
- if (graph->get_sense(path_handle) == PathSense::GENERIC) {
- locus_name = graph->get_path_name(path_handle);
- } else {
- locus_name = PathMetadata::create_path_name(PathSense::REFERENCE,
- graph->get_sample_name(path_handle),
- locus_name,
- graph->get_haplotype(path_handle),
- PathMetadata::NO_PHASE_BLOCK,
- PathMetadata::NO_SUBRANGE);
- }
- }
+ }
- subrange_t subrange = graph->get_subrange(path_handle);
- int64_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first;
- ref_path_to_length[locus_name] = std::max(ref_path_to_length[locus_name], path_len + offset);
- } else {
- gbwt_ref_paths.insert(refpath);
- }
+ subrange_t subrange = graph->get_subrange(path_handle);
+ int64_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first;
+ ref_path_to_length[locus_name] = std::max(ref_path_to_length[locus_name], path_len + offset);
}
for (auto& ref_path_len : ref_path_to_length) {
- stream << "##contig=" << endl;
+ patched_header << "##contig=" << endl;
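+        // Illustrative output (contig name and length hypothetical):
+        //   ##contig=<ID=chr1,length=248387328>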
}
- if (!gbwt_ref_paths.empty()) {
-        unordered_map<string, vector<gbwt::size_type>> gbwt_name_to_ids;
- for (size_t i = 0; i < gbwt->metadata.paths(); i++) {
- // Collect all the GBWT path IDs for each sample and contig.
- gbwt_name_to_ids[compose_short_path_name(*gbwt, i)].push_back(i);
+
+ assert(header_lines.back().substr(0, 6) == "#CHROM");
+ patched_header << header_lines.back();
+ return patched_header.str();
+}
+
+void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) {
+
+    vector<const Snarl*> snarls;
+    vector<const Snarl*> queue;
+
+ // read all our snarls into a list
+ snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) {
+ queue.push_back(snarl);
+ });
+ if (include_nested) {
+ while (!queue.empty()) {
+ const Snarl* snarl = queue.back();
+ queue.pop_back();
+ snarls.push_back(snarl);
+            const vector<const Snarl*>& children = snarl_manager->children_of(snarl);
+ queue.insert(queue.end(), children.begin(), children.end());
}
- for (const string& refpath : gbwt_ref_paths) {
- // For each sample and contig name that is a GBWT ref path
-        vector<gbwt::size_type>& thread_ids = gbwt_name_to_ids.at(refpath);
- size_t path_len = 0;
- for (gbwt::size_type thread_id : thread_ids) {
- // For each actual path in the GBWT for that sample-and-contig,
- // we need to see how long it extends the space of the sample
- // and contig.
-
- // TODO: These are probably all guaranteed to be haplotype sense?
- PathSense sense = gbwtgraph::get_path_sense(*gbwt, thread_id, gbwt_reference_samples);
- subrange_t subrange = gbwtgraph::get_path_subrange(*gbwt, thread_id, sense);
-
- // TODO: when importing GFAs we might cram the start of a walk
- // into the GBWT count field. But we don't ever guarantee that
- // we've done that so it might not be visible as a subrange
- // here. Fix that somehow???
- size_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first;
- size_t len = path_to_length(extract_gbwt_path(*graph, *gbwt, thread_id));
- path_len = std::max(path_len, offset + len);
+ } else {
+ swap(snarls, queue);
+ }
+
+ // process the whole shebang in parallel
+#pragma omp parallel for schedule(dynamic,1)
+ for (size_t i = 0; i < snarls.size(); i++) {
+ deconstruct_site(graph->get_handle(snarls[i]->start().node_id(), snarls[i]->start().backward()),
+ graph->get_handle(snarls[i]->end().node_id(), snarls[i]->end().backward()));
+ }
+}
+
+void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) {
+ // logic copied from vg call (graph_caller.cpp)
+
+ size_t thread_count = get_thread_count();
+ this->off_ref_paths.clear();
+ this->off_ref_paths.resize(get_thread_count());
+ // Used to recurse on children of parents that can't be called
+    vector<vector<pair<const Snarl*, NestingInfo>>> snarl_queue(thread_count);
+
+ // Run the deconstructor on a snarl, and queue up the children if it fails
+ auto process_snarl = [&](const Snarl* snarl, NestingInfo nesting_info) {
+ if (!snarl_manager->is_trivial(snarl, *graph)) {
+            const vector<const Snarl*>& children = snarl_manager->children_of(snarl);
+ assert(nesting_info.child_snarls.empty());
+ for (const Snarl* child : children) {
+ nesting_info.child_snarls.push_back(make_pair(graph->get_handle(child->start().node_id(), child->start().backward()),
+ graph->get_handle(child->end().node_id(), child->end().backward())));
+
}
- stream << "##contig=" << endl;
+            vector<NestingInfo> out_nesting_infos(children.size());
+ bool was_deconstructed = deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()),
+ graph->get_handle(snarl->end().node_id(), snarl->end().backward()),
+ include_nested ? &nesting_info : nullptr,
+ include_nested ? &out_nesting_infos : nullptr);
+ if (include_nested || !was_deconstructed) {
+                vector<pair<const Snarl*, NestingInfo>>& thread_queue = snarl_queue[omp_get_thread_num()];
+ for (int64_t i = 0; i < children.size(); ++i) {
+ thread_queue.push_back(make_pair(children[i], out_nesting_infos[i]));
+ }
+
+ }
}
+ };
+
+ // Start with the top level snarls
+    // (note, can't do for_each_top_level_snarl_parallel() because the interface won't take nesting info)
+    vector<pair<const Snarl*, NestingInfo>> top_level_snarls;
+ snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) {
+ NestingInfo nesting_info;
+ nesting_info.has_ref = false;
+ top_level_snarls.push_back(make_pair(snarl, nesting_info));
+ });
+#pragma omp parallel for schedule(dynamic, 1)
+ for (int i = 0; i < top_level_snarls.size(); ++i) {
+ process_snarl(top_level_snarls[i].first, top_level_snarls[i].second);
}
-
- stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";
- if (path_restricted || gbwt) {
- for (auto& sample_name : sample_names) {
- stream << "\t" << sample_name;
+
+ // Then recurse on any children the snarl caller failed to handle
+ while (!std::all_of(snarl_queue.begin(), snarl_queue.end(),
+                        [](const vector<pair<const Snarl*, NestingInfo>>& snarl_vec) {return snarl_vec.empty();})) {
+        vector<pair<const Snarl*, NestingInfo>> cur_queue;
+        for (vector<pair<const Snarl*, NestingInfo>>& thread_queue : snarl_queue) {
+ cur_queue.reserve(cur_queue.size() + thread_queue.size());
+ std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue));
+ thread_queue.clear();
+ }
+#pragma omp parallel for schedule(dynamic, 1)
+ for (int i = 0; i < cur_queue.size(); ++i) {
+ process_snarl(cur_queue[i].first, cur_queue[i].second);
}
}
- stream << endl;
-
- string hstr = stream.str();
- assert(output_vcf.openForOutput(hstr));
- cout << output_vcf.header << endl;
+}
- // create the traversal finder
-    map<string, const Alignment*> reads_by_name;
-    path_trav_finder = unique_ptr<PathTraversalFinder>(new PathTraversalFinder(*graph,
- *snarl_manager));
-
- if (!path_restricted && !gbwt) {
-        trav_finder = unique_ptr<TraversalFinder>(new ExhaustiveTraversalFinder(*graph,
- *snarl_manager,
- true));
+/**
+ * Convenience wrapper function for deconstruction of multiple paths.
+ */
+void Deconstructor::deconstruct(vector<string> ref_paths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager,
+ bool include_nested,
+ int context_jaccard_window,
+ bool untangle_traversals,
+ bool keep_conflicted,
+ bool strict_conflicts,
+ bool long_ref_contig,
+ double cluster_threshold,
+ gbwt::GBWT* gbwt,
+ bool nested_decomposition,
+ bool star_allele) {
+ this->graph = graph;
+    this->ref_paths = set<string>(ref_paths.begin(), ref_paths.end());
+ this->include_nested = include_nested || nested_decomposition;
+ this->path_jaccard_window = context_jaccard_window;
+ this->untangle_allele_traversals = untangle_traversals;
+ this->keep_conflicted_genotypes = keep_conflicted;
+ this->strict_conflict_checking = strict_conflicts;
+ this->long_ref_contig = long_ref_contig;
+ if (gbwt) {
+ this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt);
}
+ this->cluster_threshold = cluster_threshold;
+ this->gbwt = gbwt;
+ this->nested_decomposition = nested_decomposition;
+ this->star_allele = star_allele;
+
+ // the need to use nesting is due to a problem with omp tasks and shared state
+ // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 1)
+ omp_set_nested(1);
+ omp_set_max_active_levels(3);
+ // create the traversal finder
+    map<string, const Alignment*> reads_by_name;
+    path_trav_finder = unique_ptr<PathTraversalFinder>(new PathTraversalFinder(*graph));
+
if (gbwt != nullptr) {
         gbwt_trav_finder = unique_ptr<GBWTTraversalFinder>(new GBWTTraversalFinder(*graph, *gbwt));
}
-    vector<const Snarl*> snarls_todo;
- // Do the top-level snarls in parallel
- snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) {
-        vector<const Snarl*> todo(1, snarl);
-        vector<const Snarl*> next;
- while (!todo.empty()) {
- for (auto next_snarl : todo) {
- // if we can't make a variant from the snarl due to not finding
- // paths through it, we try again on the children
- // note: we may want to push the parallelism down a bit
-#pragma omp critical (snarls_todo)
- snarls_todo.push_back(next_snarl);
- if (include_nested) {
- // n.b. we no longer attempt to deconstruct the site to determine if we nest
-                const vector<const Snarl*>& children = snarl_manager->children_of(next_snarl);
- next.insert(next.end(), children.begin(), children.end());
- }
- }
- swap(todo, next);
- next.clear();
- }
- });
+ string hstr = this->get_vcf_header();
+ assert(output_vcf.openForOutput(hstr));
-//#pragma omp parallel
-//#pragma omp single
- {
-#pragma omp parallel for schedule(dynamic,1)
- for (size_t i = 0; i < snarls_todo.size(); i++) {
-//#pragma omp task firstprivate(i)
- {
- auto& snarl = snarls_todo[i];
- deconstruct_site(snarl);
- }
- }
+ if (nested_decomposition) {
+ deconstruct_graph_top_down(snarl_manager);
+ } else {
+ deconstruct_graph(snarl_manager);
}
-//#pragma omp taskwait
+
+ string patched_header = this->add_contigs_to_vcf_header(output_vcf.header);
+ cout << patched_header << endl;
// write variants in sorted order
write_variants(cout, snarl_manager);
}
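+// Illustrative call of the new signature (argument values hypothetical):
+//
+//   Deconstructor dec;
+//   dec.deconstruct(ref_paths, graph, snarl_manager,
+//                   /*include_nested=*/false,
+//                   /*context_jaccard_window=*/10000,
+//                   /*untangle_traversals=*/false,
+//                   /*keep_conflicted=*/false,
+//                   /*strict_conflicts=*/false,
+//                   /*long_ref_contig=*/false,
+//                   /*cluster_threshold=*/1.0,
+//                   gbwt,
+//                   /*nested_decomposition=*/true,
+//                   /*star_allele=*/true);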
-bool Deconstructor::check_max_nodes(const Snarl* snarl) const {
-    unordered_set<id_t> nodeset = snarl_manager->deep_contents(snarl, *graph, false).first;
- int node_count = 0;
- for (auto node_id : nodeset) {
- handle_t node = graph->get_handle(node_id);
- if (graph->get_degree(node, true) > 1 || graph->get_degree(node, false) > 1) {
- ++node_count;
- if (node_count > max_nodes_for_exhaustive) {
- return false;
- }
- }
- }
- return true;
-};
-
-vector<SnarlTraversal> Deconstructor::explicit_exhaustive_traversals(const Snarl* snarl) const {
-    vector<SnarlTraversal> out_travs;
- bool ultra_all_the_way_down = true;
-    function<void(const SnarlTraversal&, const Snarl&)> extend_trav =
- [&](const SnarlTraversal& trav, const Snarl& nested_snarl) {
- // exhaustive traversal finder is limited. if we find something
- // that's not an ultrabubble, not much we can do
- if (nested_snarl.type() != ULTRABUBBLE) {
- ultra_all_the_way_down = false;
- return;
- }
-        vector<SnarlTraversal> nested_travs = trav_finder->find_traversals(nested_snarl);
- for (auto& nested_trav : nested_travs) {
- SnarlTraversal extended_trav = trav;
- bool is_explicit = true;
- for (int i = 0; i < nested_trav.visit_size(); ++i) {
- if (nested_trav.visit(i).node_id() != 0) {
- Visit* visit = extended_trav.add_visit();
- *visit = nested_trav.visit(i);
- } else {
- extend_trav(extended_trav, nested_trav.visit(i).snarl());
- is_explicit = false;
- }
- }
- if (is_explicit) {
- out_travs.push_back(extended_trav);
- }
- }
- };
- SnarlTraversal trav;
- extend_trav(trav, *snarl);
- if (!ultra_all_the_way_down) {
- out_travs.clear();
- }
- return out_travs;
-}
}
diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp
index a8c66e369bb..ee537495a8e 100644
--- a/src/deconstructor.hpp
+++ b/src/deconstructor.hpp
@@ -38,33 +38,94 @@ class Deconstructor : public VCFOutputCaller {
// deconstruct the entire graph to cout.
// Not even a little bit thread safe.
-    void deconstruct(vector<string> refpaths, const PathPositionHandleGraph* grpah, SnarlManager* snarl_manager,
- bool path_restricted_traversals,
- int ploidy,
+    void deconstruct(vector<string> refpaths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager,
bool include_nested,
int context_jaccard_window,
bool untangle_traversals,
bool keep_conflicted,
bool strict_conflicts,
bool long_ref_contig,
- gbwt::GBWT* gbwt = nullptr);
+ double cluster_threshold = 1.0,
+ gbwt::GBWT* gbwt = nullptr,
+ bool nested_decomposition = false,
+ bool star_allele = false);
private:
+ // initialize the vcf and get the header
+ string get_vcf_header();
+
+ // the header needs to be initialized *before* construction for vcflib
+ // but we don't know all the non-ref contigs (in nested mode) until *after*
+ // construction. end result: this hacky function to patch them in before printing
+ string add_contigs_to_vcf_header(const string& vcf_header) const;
+
+ // deconstruct all snarls in parallel (ie nesting relationship ignored)
+ void deconstruct_graph(SnarlManager* snarl_manager);
+
+ // deconstruct all top-level snarls in parallel
+ // nested snarls are processed after their parents in the same thread
+ // (same logic as vg call)
+ void deconstruct_graph_top_down(SnarlManager* snarl_manager);
+
+ // some information we pass from parent to child site when
+ // doing nested deconstruction
+ struct NestingInfo {
+ bool has_ref;
+        vector<pair<handle_t, handle_t>> child_snarls;
+ PathInterval parent_path_interval;
+        unordered_map<string, vector<int>> sample_to_haplotypes;
+ int parent_allele;
+ int64_t parent_len;
+ int64_t parent_ref_len;
+ string lv0_ref_name;
+ int64_t lv0_ref_start;
+ int64_t lv0_ref_len;
+ int64_t lv0_alt_len;
+ };
+
// write a vcf record for the given site. returns true if a record was written
// (need to have a path going through the site)
- bool deconstruct_site(const Snarl* site) const;
+ // the nesting_info structs are optional and used to pass reference information through nested sites...
+    // the output nesting_info vector is filled in with an entry for each child snarl
+ bool deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end,
+ const NestingInfo* in_nesting_info = nullptr,
+ vector* out_nesting_infos = nullptr) const;
+
+ // get the traversals for a given site
+ // this returns a combination of embedded path traversals and gbwt traversals
+ // the embedded paths come first, and only they get trav_steps.
+ // so you can use trav_steps.size() to find the index of the first gbwt traversal...
+ void get_traversals(const handle_t& snarl_start, const handle_t& snarl_end,
+                        vector<Traversal>& out_travs,
+                        vector<string>& out_trav_path_names,
+                        vector<pair<step_handle_t, step_handle_t>>& out_trav_steps) const;
+
+ // this is a hack to add in * alleles -- these are haplotypes that we genotyped in the
+ // parent but aren't represented in any of the traversals found in the current
+ // site. *-alleles are represented as empty traversals.
+    // todo: conflicts arising from alt-cycles can lead to conflicting
+    // results -- need to overhaul code to pass more detailed traversal information
+    // from parent to child to have a chance at consistently resolving them.
+    // star traversals are appended onto travs and trav_names
+    // this function returns a map containing both parent and child haplotypes
+    unordered_map<string, vector<int>> add_star_traversals(vector<Traversal>& travs,
+                                                           vector<string>& trav_names,
+                                                           vector<vector<int>>& trav_clusters,
+                                                           vector<pair<double, int64_t>>& trav_cluster_info,
+                                                           const unordered_map<string, vector<int>>& parent_haplotypes) const;
// convert traversals to strings. returns mapping of traversal (offset in travs) to allele
     vector<int> get_alleles(vcflib::Variant& v,
-                            const pair<vector<SnarlTraversal>,
-                                       vector<pair<step_handle_t, step_handle_t>>>& path_travs,
+                            const vector<Traversal>& travs,
+                            const vector<pair<step_handle_t, step_handle_t>>& trav_steps,
                             int ref_path_idx,
-                            const vector<bool>& use_trav,
+                            const vector<vector<int>>& trav_clusters,
char prev_char, bool use_start) const;
// write traversal path names as genotypes
-    void get_genotypes(vcflib::Variant& v, const vector<string>& names, const vector<int>& trav_to_allele) const;
+    void get_genotypes(vcflib::Variant& v, const vector<string>& names, const vector<int>& trav_to_allele,
+                       const vector<pair<double, int64_t>>& trav_to_cluster_info) const;
     // given a set of traversals associated with a particular sample, select a set of size <ploidy> for the VCF
// the highest-frequency ALT traversal is chosen
@@ -74,49 +135,19 @@ class Deconstructor : public VCFOutputCaller {
const vector& trav_to_name,
const vector& gbwt_phases) const;
- // check to see if a snarl is too big to exhaustively traverse
- bool check_max_nodes(const Snarl* snarl) const;
-
- // get traversals from the exhaustive finder. if they have nested visits, fill them in (exhaustively)
- // with node visits
- vector explicit_exhaustive_traversals(const Snarl* snarl) const;
-
- // gets a sorted node id context for a given path
-    vector<nid_t> get_context(
-        const pair<vector<SnarlTraversal>,
-                   vector<pair<step_handle_t, step_handle_t>>>& path_travs,
- const int& trav_idx) const;
-
// the underlying context-getter
     vector<nid_t> get_context(
step_handle_t start_step,
step_handle_t end_step) const;
-
- // compares node contexts
-    double context_jaccard(const vector<nid_t>& target,
-                           const vector<nid_t>& query) const;
-
- // specialization for enc_vectors
- double context_jaccard(
- const dac_vector<>& target,
-        const vector<nid_t>& query) const;
- // toggle between exhaustive and path restricted traversal finder
- bool path_restricted = false;
-
- // the max ploidy we expect.
- int ploidy;
-
// the graph
const PathPositionHandleGraph* graph;
- // the snarl manager
- SnarlManager* snarl_manager;
+ // the gbwt
+ gbwt::GBWT* gbwt;
// the traversal finders. we always use a path traversal finder to get the reference path
     unique_ptr<PathTraversalFinder> path_trav_finder;
- // we optionally use another (exhaustive for now) traversal finder if we don't want to rely on paths
-    unique_ptr<TraversalFinder> trav_finder;
// we can also use a gbwt for traversals
     unique_ptr<GBWTTraversalFinder> gbwt_trav_finder;
// When using the gbwt we need some precomputed information to ask about stored paths.
@@ -128,6 +159,10 @@ class Deconstructor : public VCFOutputCaller {
// the ref paths
     set<string> ref_paths;
+ // the off-ref paths that may be found during nested deconstruction
+ // (buffered by thread)
+    mutable vector<unordered_set<path_handle_t>> off_ref_paths;
+
// keep track of reference samples
     set<string> ref_samples;
@@ -143,9 +178,6 @@ class Deconstructor : public VCFOutputCaller {
// the sample ploidys given in the phases in our path names
     unordered_map<string, int> sample_ploidys;
- // upper limit of degree-2+ nodes for exhaustive traversal
- int max_nodes_for_exhaustive = 100;
-
// target window size for determining the correct reference position for allele traversals with path jaccard
int path_jaccard_window = 10000;
@@ -161,25 +193,22 @@ class Deconstructor : public VCFOutputCaller {
// should we keep conflicted genotypes or not
bool keep_conflicted_genotypes = false;
- // warn about context jaccard not working with exhaustive traversals
-    mutable atomic<bool> exhaustive_jaccard_warning;
-};
+ // used to merge together similar traversals (to keep allele counts down)
+ // currently implemented as handle jaccard coefficient. So 1 means only
+ // merge if identical (which is what deconstruct has always done)
+ double cluster_threshold = 1.0;
-// helper for measuring set intersection and union size
-template<typename T>
-class count_back_inserter {
- size_t &count;
-public:
- typedef void value_type;
- typedef void difference_type;
- typedef void pointer;
- typedef void reference;
- typedef std::output_iterator_tag iterator_category;
- count_back_inserter(size_t &count) : count(count) {};
- void operator=(const T &){ ++count; }
- count_back_inserter &operator *(){ return *this; }
- count_back_inserter &operator++(){ return *this; }
+ // activate the new nested decomposition mode, which is like the old include_nested
+ // (which lives in vcfoutputcaller) but with more of an effort to link
+ // the parent and child snarls, as well as better support for nested insertions
+ bool nested_decomposition = false;
+
+    // use *-alleles to represent spanning alleles that do not cross the site but do go around it
+ // ex: a big containing deletion
+ // only works with nested_decomposition
+ bool star_allele = false;
};
+
}
#endif
diff --git a/src/gbwt_extender.cpp b/src/gbwt_extender.cpp
index 9a46b21e25a..ee0eed7dd9f 100644
--- a/src/gbwt_extender.cpp
+++ b/src/gbwt_extender.cpp
@@ -2206,7 +2206,14 @@ WFAAlignment WFAExtender::connect(std::string sequence, pos_t from, pos_t to) co
}
WFAAlignment WFAExtender::suffix(const std::string& sequence, pos_t from) const {
- return this->connect(sequence, from, pos_t(0, false, 0));
+ WFAAlignment result = this->connect(sequence, from, pos_t(0, false, 0));
+
+ if (!result.edits.empty() && result.length == sequence.length() && (result.edits.back().first == WFAAlignment::match || result.edits.back().first == WFAAlignment::mismatch)) {
+ // The alignment used all of the sequence and has a match/mismatch at the appropriate end
+ result.score += this->aligner->full_length_bonus;
+ }
+
+ return result;
}
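+// Worked example (parameter values hypothetical): with match score 1 and
+// full_length_bonus 5, a 10-base suffix aligned end-to-end as matches now
+// scores 10 + 5 = 15 instead of 10, so full-length extensions are preferred
+// over clipped ones.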
WFAAlignment WFAExtender::prefix(const std::string& sequence, pos_t to) const {
@@ -2219,6 +2226,10 @@ WFAAlignment WFAExtender::prefix(const std::string& sequence, pos_t to) const {
WFAAlignment result = this->connect(reverse_complement(sequence), to, pos_t(0, false, 0));
result.flip(*(this->graph), sequence);
+ if (!result.edits.empty() && result.length == sequence.length() && (result.edits.front().first == WFAAlignment::match || result.edits.front().first == WFAAlignment::mismatch)) {
+ result.score += this->aligner->full_length_bonus;
+ }
+
return result;
}
diff --git a/src/gbwt_extender.hpp b/src/gbwt_extender.hpp
index 362b8571a5e..1ea8d48710d 100644
--- a/src/gbwt_extender.hpp
+++ b/src/gbwt_extender.hpp
@@ -426,9 +426,11 @@ class WFAExtender {
* entire sequence with an acceptable score, returns the highest-scoring
* partial alignment, which may be empty.
*
+ * Applies the full-length bonus if the result ends with a match or mismatch.
+ * TODO: Use the full-length bonus to determine the optimal alignment.
+ *
* NOTE: This creates a suffix of the full alignment by aligning a
* prefix of the sequence.
- * TODO: Should we use full-length bonuses?
*/
WFAAlignment suffix(const std::string& sequence, pos_t from) const;
@@ -438,9 +440,11 @@ class WFAExtender {
* sequence with an acceptable score, returns the highest-scoring partial
* alignment, which may be empty.
*
+ * Applies the full-length bonus if the result begins with a match or mismatch.
+ * TODO: Use the full-length bonus to determine the optimal alignment.
+ *
* NOTE: This creates a prefix of the full alignment by aligning a suffix
* of the sequence.
- * TODO: Should we use full-length bonuses?
*/
WFAAlignment prefix(const std::string& sequence, pos_t to) const;
diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp
index 5dd250282c5..e3ce68321f2 100644
--- a/src/graph_caller.cpp
+++ b/src/graph_caller.cpp
@@ -238,15 +238,20 @@ string VCFOutputCaller::vcf_header(const PathHandleGraph& graph, const vector<string>& contigs,
-void VCFOutputCaller::add_variant(vcflib::Variant& var) const {
+bool VCFOutputCaller::add_variant(vcflib::Variant& var) const {
     stringstream ss;
     ss << var;
     string dest;
+    if (ss.str().length() > VCFOutputCaller::max_vcf_line_length) {
+ return false;
+ }
+ int ret = zstdutil::CompressString(ss.str(), dest);
+ assert(ret == 0);
// the Variant object is too big to keep in memory when there are many genotypes, so we
// store it in a zstd-compressed string
output_variants[omp_get_thread_num()].push_back(make_pair(make_pair(var.sequenceName, var.position), dest));
+ return true;
}
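+// Sketch of the assumed zstdutil round trip (based only on the calls above):
+//   string compressed, restored;
+//   assert(zstdutil::CompressString(line, compressed) == 0);
+//   assert(zstdutil::DecompressString(compressed, restored) == 0);
+//   assert(restored == line);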
void VCFOutputCaller::write_variants(ostream& out_stream, const SnarlManager* snarl_manager) {
@@ -265,7 +270,8 @@ void VCFOutputCaller::write_variants(ostream& out_stream, const SnarlManager* sn
});
for (auto v : all_variants) {
string dest;
- zstdutil::DecompressString(v.second, dest);
+ int ret = zstdutil::DecompressString(v.second, dest);
+ assert(ret == 0);
out_stream << dest << endl;
}
}
@@ -366,6 +372,17 @@ void VCFOutputCaller::set_nested(bool nested) {
include_nested = nested;
}
+void VCFOutputCaller::add_allele_path_to_info(const HandleGraph* graph, vcflib::Variant& v, int allele, const Traversal& trav,
+ bool reversed, bool one_based) const {
+ SnarlTraversal proto_trav;
+ for (const handle_t& handle : trav) {
+ Visit* visit = proto_trav.add_visit();
+ visit->set_node_id(graph->get_id(handle));
+ visit->set_backward(graph->get_is_reverse(handle));
+ }
+ this->add_allele_path_to_info(v, allele, proto_trav, reversed, one_based);
+}
+
void VCFOutputCaller::add_allele_path_to_info(vcflib::Variant& v, int allele, const SnarlTraversal& trav,
bool reversed, bool one_based) const {
auto& trav_info = v.info["AT"];
@@ -408,6 +425,10 @@ void VCFOutputCaller::add_allele_path_to_info(vcflib::Variant& v, int allele, co
}
prev_visit = &visit;
}
+ if (trav_info[allele].empty()) {
+ // note: * alleles get empty traversals
+ trav_info[allele] = ".";
+ }
}
string VCFOutputCaller::trav_string(const HandleGraph& graph, const SnarlTraversal& trav) const {
@@ -423,7 +444,7 @@ string VCFOutputCaller::trav_string(const HandleGraph& graph, const SnarlTravers
return seq;
}
-void VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller,
+bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller,
                                    const Snarl& snarl, const vector<SnarlTraversal>& called_traversals,
const vector