Skip to content

Commit

Permalink
Merge pull request #4529 from vgteam/minimizer-autoindex
Browse files Browse the repository at this point in the history
Minimizer autoindex
  • Loading branch information
adamnovak authored Feb 28, 2025
2 parents 1ff257b + 67637a6 commit f3d69b6
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 25 deletions.
16 changes: 16 additions & 0 deletions src/index_registry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4104,6 +4104,10 @@ IndexRegistry VGIndexes::get_vg_index_registry() {
auto zipcode_output = *constructing.rbegin();
auto& output_name_minimizer = all_outputs[0];
auto& output_name_zipcodes = all_outputs[1];

//TODO: This doesn't work because everything is const and the compiler doesn't like it
//Make sure that we're writing both the minimizers and zipcodes
//assert(!plan->registry->get_index(minimizer_output)->was_provided_directly() && !plan->registry->get_index(zipcode_output)->was_provided_directly());


ifstream infile_gbz;
Expand Down Expand Up @@ -4551,6 +4555,18 @@ void IndexRegistry::provide(const IndexName& identifier, const vector<string>& f
get_index(identifier)->provide(filenames);
}

void IndexRegistry::reset(const IndexName& identifier) {
if (IndexingParameters::verbosity >= IndexingParameters::Debug) {
cerr << "[IndexRegistry]: Reset provided: " << identifier << endl;
}
if (!index_registry.count(identifier)) {
cerr << "error:[IndexRegistry] cannot reset unregistered index: " << identifier << endl;
exit(1);
}
get_index(identifier)->reset();
}


bool IndexRegistry::available(const IndexName& identifier) const {
if (!index_registry.count(identifier)) {
// Index is not registered
Expand Down
3 changes: 3 additions & 0 deletions src/index_registry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,9 @@ class IndexRegistry {

/// Indicate a list of serialized files that contains some identified index
void provide(const IndexName& identifier, const vector<string>& filenames);

/// Remove a provided index
void reset(const IndexName& identifier);

/// Return true if the given index is available and can be require()'d, and
/// false otherwise.
Expand Down
11 changes: 7 additions & 4 deletions src/io/register_loader_saver_minimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,13 @@ void register_loader_saver_minimizer() {

Registry::register_bare_loader_saver_with_magic<gbwtgraph::DefaultMinimizerIndex>("MinimizerIndex", magic_string, [](istream& input) -> void* {
gbwtgraph::DefaultMinimizerIndex* index = new gbwtgraph::DefaultMinimizerIndex();
index->deserialize(input);

// Return the index so the caller owns it.
return static_cast<void*>(index);
if (index->deserialize(input)) {
// Return the index so the caller owns it.
return static_cast<void*>(index);
} else {
//If deserializing failed
throw std::runtime_error("Failed to load minimizer index");
}
}, [](const void* index_void, ostream& output) {
assert(index_void != nullptr);
static_cast<const gbwtgraph::DefaultMinimizerIndex*>(index_void)->serialize(output);
Expand Down
34 changes: 30 additions & 4 deletions src/subcommand/giraffe_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1598,13 +1598,15 @@ int main_giraffe(int argc, char** argv) {
// Otherwise we use the provided indexes.
for (auto& index : provided_indexes) {
registry.provide(index.first, index.second);

}
}
registry.set_prefix(index_basename);

// The IndexRegistry doesn't try to infer index files based on the
// basename, so do that here. We can have multiple extension options that
// we try in order of priority.
// When found, the index is removed from this map.
unordered_map<string, vector<string>> indexes_and_extensions = {
{"Giraffe GBZ", {"giraffe.gbz", "gbz"}},
{"XG", {"xg"}},
Expand All @@ -1623,9 +1625,10 @@ int main_giraffe(int argc, char** argv) {
// Drop anything we already got from the list
indexes_and_extensions.erase(completed);
}
for (auto& index_and_extensions : indexes_and_extensions) {
for (auto index_and_extensions = indexes_and_extensions.begin(); index_and_extensions != indexes_and_extensions.end(); ) {
// For each index type
for (auto& extension : index_and_extensions.second) {
bool found = false;
for (auto& extension : index_and_extensions->second) {
// For each extension in priority order
string inferred_filename = registry.get_prefix() + "." + extension;
if (ifstream(inferred_filename).is_open()) {
Expand All @@ -1635,13 +1638,36 @@ int main_giraffe(int argc, char** argv) {
cerr << "warning:[vg giraffe] " << inferred_filename << " exists and will be overwritten" << endl;
} else {
// Report it because this may not be desired behavior.
cerr << "Guessing that " << inferred_filename << " is " << index_and_extensions.first << endl;
registry.provide(index_and_extensions.first, inferred_filename);
cerr << "Guessing that " << inferred_filename << " is " << index_and_extensions->first << endl;
registry.provide(index_and_extensions->first, inferred_filename);
found = true;
}
// Skip other extension options for the index
break;
}
}
if (found) {
// Erase this one and go to the next one.
index_and_extensions = indexes_and_extensions.erase(index_and_extensions);
} else {
// Leave this one and go to the next one.
++index_and_extensions;
}
}

//If we're making new zipcodes, we should rebuild the minimizers too
if (!indexes_and_extensions.count(std::string("Long Read Minimizers")) && indexes_and_extensions.count(std::string("Long Read Zipcodes"))) {
cerr << "Rebuilding minimizer index to include zipcodes" << endl;
registry.reset(std::string("Long Read Minimizers"));
} else if (indexes_and_extensions.count(std::string("Long Read Minimizers")) && !indexes_and_extensions.count(std::string("Long Read Zipcodes"))) {
cerr << "Rebuilding zipcodes index to match new minimizers" << endl;
registry.reset(std::string("Long Read Zipcodes"));
} else if (!indexes_and_extensions.count(std::string("Short Read Minimizers")) && indexes_and_extensions.count(std::string("Short Read Zipcodes"))) {
cerr << "Rebuilding minimizer index to include zipcodes" << endl;
registry.reset(std::string("Short Read Minimizers"));
} else if (indexes_and_extensions.count(std::string("Short Read Minimizers")) && !indexes_and_extensions.count(std::string("Short Read Zipcodes"))) {
cerr << "Rebuilding zipcodes to match new minimizers" << endl;
registry.reset(std::string("Short Read Zipcodes"));
}

// create in-memory objects
Expand Down
43 changes: 26 additions & 17 deletions test/t/50_vg_giraffe.t
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap

PATH=../bin:$PATH # for vg

plan tests 62
plan tests 64

vg construct -a -r small/x.fa -v small/x.vcf.gz >x.vg
vg index -x x.xg x.vg
Expand All @@ -14,23 +14,32 @@ vg gbwt -o x-paths.gbwt -x x.vg --index-paths
vg gbwt -o x-merged.gbwt -m x-haps.gbwt x-paths.gbwt
vg gbwt -o x.gbwt --augment-gbwt -x x.vg x-merged.gbwt
vg index -j x.dist x.vg
vg minimizer -k 29 -w 11 -d x.dist -g x.gbwt -o x.shortread.withzip.min x.xg
vg minimizer -k 29 -w 11 -d x.dist -g x.gbwt -o x.shortread.withzip.min -z x.shortread.zipcodes x.xg

vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -d x.dist -f reads/small.middle.ref.fq > mapped1.gam
is "${?}" "0" "a read can be mapped with xg + gbwt + min + dist specified without crashing"
vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f reads/small.middle.ref.fq > mapped1.gam
is "${?}" "0" "a read can be mapped with xg + gbwt + min + zips + dist specified without crashing"
rm -f x-haps.gbwt x-paths.gbwt x-merged.gbwt

vg giraffe -Z x.giraffe.gbz -m x.shortread.withzip.min -d x.dist -f reads/small.middle.ref.fq > mapped1.gam
is "${?}" "0" "a read can be mapped with gbz + min + dist specified without crashing"
vg giraffe -Z x.giraffe.gbz -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f reads/small.middle.ref.fq > mapped1.gam
is "${?}" "0" "a read can be mapped with gbz + min + zips + dist specified without crashing"

rm -f x.giraffe.gbz
vg gbwt -x x.xg -g x.gg x.gbwt
vg giraffe -g x.gg -H x.gbwt -m x.shortread.withzip.min -d x.dist -f reads/small.middle.ref.fq > mapped1.gam
is "${?}" "0" "a read can be mapped with gg + gbwt + min + dist specified without crashing"
vg giraffe -g x.gg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f reads/small.middle.ref.fq > mapped1.gam
is "${?}" "0" "a read can be mapped with gg + gbwt + min + zips + dist specified without crashing"

rm -f x.shortread.withzip.min
rm -f x.shortread.zipcodes
vg giraffe -Z x.giraffe.gbz -d x.dist -f reads/small.middle.ref.fq > mapped1.gam
is "${?}" "0" "a read can be mapped with the minimizer index being regenerated"
is "${?}" "0" "a read can be mapped with the minimizer index and zipcodes being regenerated"

rm -f x.shortread.withzip.min
vg giraffe -Z x.giraffe.gbz -d x.dist -f reads/small.middle.ref.fq > mapped1.gam
is "${?}" "0" "a read can be mapped with the minimizer index regenerated"

rm -f x.shortread.zipcodes
vg giraffe -Z x.giraffe.gbz -d x.dist -f reads/small.middle.ref.fq > mapped1.gam
is "${?}" "0" "a read can be mapped with the zipcodes being regenerated"

vg giraffe -Z x.giraffe.gbz -f reads/small.middle.ref.fq > mapped1.gam
is "${?}" "0" "a read can be mapped with the indexes being inferred by name"
Expand Down Expand Up @@ -72,25 +81,25 @@ vg giraffe -x x.xg -H x.gbwt -m x.sync -d x.dist -f reads/small.middle.ref.fq >
is "${?}" "0" "a read can be mapped with syncmer indexes without crashing"

chmod 400 x.dist
vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -d x.dist -f reads/small.middle.ref.fq >/dev/null
vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f reads/small.middle.ref.fq >/dev/null
is "${?}" "0" "a read can be mapped when the distance index is not writable"

echo "@read" >read.fq
echo "GATTACATTAGGAGATAGCCATACGACGTAGCATCTAGCTCAGCCACA$(cat small/x.fa | head -n2 | tail -n1)" >>read.fq
echo "+" >>read.fq
echo "GATTACATTAGGAGATAGCCATACGACGTAGCATCTAGCTCAGCCACA$(cat small/x.fa | head -n2 | tail -n1)" | tr 'ACGTN' '(((((' >>read.fq

vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -d x.dist -f read.fq > read.gam
vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f read.fq > read.gam
LOOP_LINES="$(vg view -aj read.gam | jq -c 'select(.path.mapping[0].position.node_id == .path.mapping[1].position.node_id)' | wc -l | sed 's/^[[:space:]]*//')"
is "${LOOP_LINES}" "0" "a read which softclips does not appear to loop"


printf "@read1\tT1:A:t T2:i:1\t T3:f:3.5e-7\nCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTAAGACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAG\n+\nHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n" > tagged1.fq
printf "@read2 T4:Z:str T5:H:FF00\tT6:B:S,0,10 T7:B:f,8.0,5.0\nCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTAAGACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAG\n+\nHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n" > tagged2.fq
vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -d x.dist -f tagged1.fq --comments-as-tags -o BAM > t1.bam
vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -d x.dist -f tagged2.fq --comments-as-tags -o BAM > t2.bam
vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -d x.dist -f tagged1.fq -f tagged2.fq --comments-as-tags -o BAM > t3.bam
vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -d x.dist -f tagged1.fq --comments-as-tags -o GAF > t1.gaf
vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f tagged1.fq --comments-as-tags -o BAM > t1.bam
vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f tagged2.fq --comments-as-tags -o BAM > t2.bam
vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f tagged1.fq -f tagged2.fq --comments-as-tags -o BAM > t3.bam
vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f tagged1.fq --comments-as-tags -o GAF > t1.gaf


is "$(samtools view t1.bam | grep T1 | grep T2 | grep T3 | wc -l | sed 's/^[[:space:]]*//')" "1" "BAM tags are preserved on read 1"
Expand Down Expand Up @@ -231,7 +240,7 @@ is "$?" "0" "Mapping reads to named coordinates on a nontrivial graph without wa
is "$(vg view -aj mapped.gam | jq -r '.score' | grep -v "^0" | grep -v "null" | wc -l | sed 's/^[[:space:]]*//')" "200" "Reads are all mapped"
is "$(vg view -aj mapped.gam | jq -r '.path.mapping[].position.name' | grep null | wc -l | sed 's/^[[:space:]]*//')" "0" "GFA segment names are all set"

vg giraffe -Z brca.giraffe.gbz -m brca.shortread.withzip.min -d brca.dist -G reads.gam --named-coordinates -o gaf > mapped.gaf
vg giraffe -Z brca.giraffe.gbz -m brca.shortread.withzip.min -z brca.shortread.zipcodes -d brca.dist -G reads.gam --named-coordinates -o gaf > mapped.gaf
is "$?" "0" "Mapping reads as GAF to named coordinates on a nontrivial graph without walks succeeds"

vg view -aj mapped.gam | jq -r '.path.mapping[].position.name' | sort | uniq > gam_names.txt
Expand All @@ -254,5 +263,5 @@ is "$(vg view -aj longread.gam | jq -c '. | select(.annotation["filter_3_cluster
is "$(vg view -aj longread.gam | jq -c '.refpos[]' | wc -l)" "$(vg view -aj longread.gam | jq -c '.path.mapping[]' | wc -l)" "Giraffe sets refpos for each reference node"
is "$(vg view --extract-tag PARAMS_JSON longread.gam | jq '.["track-provenance"]')" "true" "Giraffe embeds parameters in GAM"

rm -f longread.gam 1mb1kgp.dist 1mb1kgp.giraffe.gbz 1mb1kgp.shortread.withzip.min log.txt
rm -f longread.gam 1mb1kgp.dist 1mb1kgp.giraffe.gbz 1mb1kgp.shortread.withzip.min 1mb1kgp.shortread.zipcodes log.txt

1 comment on commit f3d69b6

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17365 seconds

Please sign in to comment.