Skip to content

Commit

Permalink
Add presets for haplotype sampling
Browse files Browse the repository at this point in the history
  • Loading branch information
jltsiren committed Jul 3, 2024
1 parent a614f71 commit a018e65
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 15 deletions.
13 changes: 13 additions & 0 deletions src/recombinator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1099,6 +1099,19 @@ Recombinator::Recombinator(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotyp
{
}

//------------------------------------------------------------------------------

Recombinator::Parameters::Parameters(preset_t preset) {
if (preset == preset_haploid) {
this->haploid_scoring = true;
this->include_reference = true;
} else if (preset == preset_diploid) {
this->num_haplotypes = NUM_CANDIDATES;
this->diploid_sampling = true;
this->include_reference = true;
}
}

void Recombinator::Parameters::print(std::ostream& out) const {
out << "Sampling parameters:" << std::endl;
if (this->haploid_scoring) {
Expand Down
13 changes: 12 additions & 1 deletion src/recombinator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,6 @@ class Recombinator {
Recombinator(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, Verbosity verbosity);

/// Parameters for `generate_haplotypes()`.
/// TODO: We should have a single parameter for the scoring/sampling model.
struct Parameters {
/// Number of haplotypes to be generated, or the number of candidates
/// for diploid sampling.
Expand Down Expand Up @@ -495,6 +494,18 @@ class Recombinator {
/// Include named and reference paths.
bool include_reference = false;

/// Preset parameters for common use cases.
enum preset_t {
/// Default parameters.
preset_default,
/// Best practices for haploid sampling.
preset_haploid,
/// Best practices for diploid sampling.
preset_diploid
};

explicit Parameters(preset_t preset = preset_default);

/// Print a description of the parameters.
void print(std::ostream& out) const;
};
Expand Down
5 changes: 1 addition & 4 deletions src/subcommand/giraffe_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1670,10 +1670,7 @@ string sample_haplotypes(const vector<pair<string, string>>& indexes, string& ba
// Sample haplotypes.
Haplotypes::Verbosity verbosity = (progress ? Haplotypes::verbosity_basic : Haplotypes::verbosity_silent);
Recombinator recombinator(gbz, haplotypes, verbosity);
Recombinator::Parameters parameters;
parameters.num_haplotypes = Recombinator::NUM_CANDIDATES;
parameters.diploid_sampling = true;
parameters.include_reference = true;
Recombinator::Parameters parameters(Recombinator::Parameters::preset_diploid);
gbwt::GBWT sampled_gbwt;
try {
sampled_gbwt = recombinator.generate_haplotypes(kff_file, parameters);
Expand Down
37 changes: 29 additions & 8 deletions src/subcommand/haplotypes_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ void help_haplotypes(char** argv, bool developer_options) {
std::cerr << " --linear-structure extend subchains to avoid haplotypes visiting them multiple times" << std::endl;
std::cerr << std::endl;
std::cerr << "Options for sampling haplotypes:" << std::endl;
std::cerr << " --preset X use preset X (default, haploid, diploid)" << std::endl;
std::cerr << " --coverage N kmer coverage in the KFF file (default: estimate)" << std::endl;
std::cerr << " --num-haplotypes N generate N haplotypes (default: " << haplotypes_default_n() << ")" << std::endl;
std::cerr << " sample from N candidates (with --diploid-sampling; default: " << haplotypes_default_candidates() << ")" << std::endl;
Expand Down Expand Up @@ -253,14 +254,15 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) {
constexpr int OPT_WINDOW_LENGTH = 1201;
constexpr int OPT_SUBCHAIN_LENGTH = 1202;
constexpr int OPT_LINEAR_STRUCTURE = 1203;
constexpr int OPT_COVERAGE = 1300;
constexpr int OPT_NUM_HAPLOTYPES = 1301;
constexpr int OPT_PRESENT_DISCOUNT = 1302;
constexpr int OPT_HET_ADJUSTMENT = 1303;
constexpr int OPT_ABSENT_SCORE = 1304;
constexpr int OPT_HAPLOID_SCORING = 1305;
constexpr int OPT_DIPLOID_SAMPLING = 1306;
constexpr int OPT_INCLUDE_REFERENCE = 1307;
constexpr int OPT_PRESET = 1300;
constexpr int OPT_COVERAGE = 1301;
constexpr int OPT_NUM_HAPLOTYPES = 1302;
constexpr int OPT_PRESENT_DISCOUNT = 1303;
constexpr int OPT_HET_ADJUSTMENT = 1304;
constexpr int OPT_ABSENT_SCORE = 1305;
constexpr int OPT_HAPLOID_SCORING = 1306;
constexpr int OPT_DIPLOID_SAMPLING = 1307;
constexpr int OPT_INCLUDE_REFERENCE = 1308;
constexpr int OPT_VALIDATE = 1400;
constexpr int OPT_VCF_INPUT = 1500;
constexpr int OPT_CONTIG_PREFIX = 1501;
Expand All @@ -280,6 +282,7 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) {
{ "window-length", required_argument, 0, OPT_WINDOW_LENGTH },
{ "subchain-length", required_argument, 0, OPT_SUBCHAIN_LENGTH },
{ "linear-structure", no_argument, 0, OPT_LINEAR_STRUCTURE },
{ "preset", required_argument, 0, OPT_PRESET },
{ "coverage", required_argument, 0, OPT_COVERAGE },
{ "num-haplotypes", required_argument, 0, OPT_NUM_HAPLOTYPES },
{ "present-discount", required_argument, 0, OPT_PRESENT_DISCOUNT },
Expand Down Expand Up @@ -354,6 +357,24 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) {
case OPT_LINEAR_STRUCTURE:
this->partitioner_parameters.linear_structure = true;
break;

case OPT_PRESET:
{
Recombinator::Parameters::preset_t preset;
if (std::string(optarg) == "default") {
preset = Recombinator::Parameters::preset_default;
} else if (std::string(optarg) == "haploid") {
preset = Recombinator::Parameters::preset_haploid;
} else if (std::string(optarg) == "diploid") {
preset = Recombinator::Parameters::preset_diploid;
} else {
std::cerr << "error: [vg haplotypes] unknown preset: " << optarg << std::endl;
std::exit(EXIT_FAILURE);
}
this->recombinator_parameters = Recombinator::Parameters(preset);
num_haplotypes_set = true; // The preset is assumed to include the number of haplotypes.
break;
}
case OPT_COVERAGE:
this->recombinator_parameters.coverage = parse<size_t>(optarg);
break;
Expand Down
10 changes: 8 additions & 2 deletions test/t/54_vg_haplotypes.t
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap

PATH=../bin:$PATH # for vg

plan tests 19
plan tests 21

# The test graph consists of two subgraphs of the HPRC Minigraph-Cactus v1.1 graph:
# - GRCh38#chr6:31498145-31511124 (micb)
Expand Down Expand Up @@ -45,6 +45,12 @@ is $(vg gbwt -S -Z diploid.gbz) 3 "1 generated + 2 reference samples"
is $(vg gbwt -C -Z diploid.gbz) 2 "2 contigs"
is $(vg gbwt -H -Z diploid.gbz) 4 "2 generated + 2 reference haplotypes"

# Diploid sampling using a preset
vg haplotypes -i full.hapl -k haplotype-sampling/HG003.kff --preset diploid -g diploid2.gbz full.gbz
is $? 0 "diploid sampling using a preset"
cmp diploid.gbz diploid2.gbz
is $? 0 "the outputs are identical"

# Giraffe integration, guessed output name
vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \
-f haplotype-sampling/HG003.fq.gz > default.gam 2> /dev/null
Expand All @@ -63,6 +69,6 @@ is $? 0 "the sampled graphs are identical"
# Cleanup
rm -r full.gbz full.ri full.dist full.hapl
rm -f indirect.gbz direct.gbz no_ref.gbz
rm -f diploid.gbz
rm -f diploid.gbz diploid2.gbz
rm -f full.HG003.gbz full.HG003.dist full.HG003.min default.gam
rm -f sampled.003HG.gbz sampled.003HG.dist sampled.003HG.min specified.gam

0 comments on commit a018e65

Please sign in to comment.