From 065ee645ce3bb643d85088414df6097ac8d993ac Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 11 Jun 2024 13:44:27 -0400 Subject: [PATCH] make star-allele opt-in with -R --- src/deconstructor.cpp | 10 +++++++--- src/deconstructor.hpp | 8 +++++++- src/subcommand/deconstruct_main.cpp | 18 ++++++++++++++---- test/t/26_deconstruct.t | 15 ++++++++++++--- 4 files changed, 40 insertions(+), 11 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index b4936a6b03d..e6dbaff7b4d 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -974,8 +974,10 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // add in the star alleles -- these are alleles that were genotyped in the parent but not // the current allele, and are treated as *'s in VCF. - sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info, - in_nesting_info->sample_to_haplotypes); + if (this->star_allele) { + sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info, + in_nesting_info->sample_to_haplotypes); + } } @@ -1364,7 +1366,8 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand bool long_ref_contig, double cluster_threshold, gbwt::GBWT* gbwt, - bool nested_decomposition) { + bool nested_decomposition, + bool star_allele) { this->graph = graph; this->ref_paths = set(ref_paths.begin(), ref_paths.end()); @@ -1380,6 +1383,7 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand this->cluster_threshold = cluster_threshold; this->gbwt = gbwt; this->nested_decomposition = nested_decomposition; + this->star_allele = star_allele; // the need to use nesting is due to a problem with omp tasks and shared state // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 1) diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index 16c75d00c51..ee537495a8e 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -47,7 +47,8 @@ class Deconstructor : public VCFOutputCaller { bool long_ref_contig, double cluster_threshold = 1.0, gbwt::GBWT* gbwt = nullptr, - bool nested_decomposition = false); + bool nested_decomposition = false, + bool star_allele = false); private: @@ -201,6 +202,11 @@ class Deconstructor : public VCFOutputCaller { // (which lives in vcfoutputcaller) but with more of an effort to link // the parent and child snarls, as well as better support for nested insertions bool nested_decomposition = false; + + // use *-alleles to represent spanning alleles that do not cross site but do go around it + // ex: a big containing deletion + // only works with nested_decomposition + bool star_allele = false; }; diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index aa976584d87..0d381732533 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -52,7 +52,8 @@ void help_deconstruct(char** argv){ << " -S, --strict-conflicts Drop genotypes when we have more than one haplotype for any given phase (set by default when using GBWT input)." << endl << " -C, --contig-only-ref Only use the CONTIG name (and not SAMPLE#CONTIG#HAPLOTYPE etc) for the reference if possible (ie there is only one reference sample)." << endl << " -L, --cluster F Cluster traversals whose (handle) Jaccard coefficient is >= F together (default: 1.0) [experimental]" << endl - << " -n, --nested Write a nested VCF, including *-alleles and special tags. [experimental]" << endl + << " -n, --nested Write a nested VCF, including special tags. [experimental]" << endl + << " -R, --star-allele Use *-alleles to denote alleles that span but do not cross the site. Only works with -n" << endl << " -t, --threads N Use N threads" << endl << " -v, --verbose Print some status messages" << endl << endl; @@ -81,6 +82,7 @@ int main_deconstruct(int argc, char** argv){ bool contig_only_ref = false; double cluster_threshold = 1.0; bool nested = false; + bool star_allele = false; int c; optind = 2; // force optind past command positional argument @@ -104,13 +106,14 @@ int main_deconstruct(int argc, char** argv){ {"contig-only-ref", no_argument, 0, 'C'}, {"cluster", required_argument, 0, 'L'}, {"nested", no_argument, 0, 'n'}, + {"start-allele", no_argument, 0, 'R'}, {"threads", required_argument, 0, 't'}, {"verbose", no_argument, 0, 'v'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uaL:nt:v", + c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uaL:nRt:v", long_options, &option_index); // Detect the end of the options. @@ -171,6 +174,9 @@ int main_deconstruct(int argc, char** argv){ case 'n': nested = true; break; + case 'R': + star_allele = true; + break; case 't': omp_set_num_threads(parse(optarg)); break; @@ -192,9 +198,12 @@ int main_deconstruct(int argc, char** argv){ cerr << "Error [vg deconstruct]: -C cannot be used with -n" << endl; return 1; } + if (star_allele == true && nested == false) { + cerr << "Error [vg deconstruct]: -R can only be used with -n" << endl; + return 1; + } // Read the graph - unique_ptr path_handle_graph_up; unique_ptr gbz_graph; gbwt::GBWT* gbwt_index = nullptr; @@ -372,7 +381,8 @@ int main_deconstruct(int argc, char** argv){ !contig_only_ref, cluster_threshold, gbwt_index, - nested); + nested, + star_allele); return 0; } diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index ea7ed91754d..7e6d0584899 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 36 +plan tests 37 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg @@ -155,7 +155,7 @@ is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered d rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf -vg deconstruct nesting/nested_snp_in_del.gfa -p x -n > nested_snp_in_del.vcf +vg deconstruct nesting/nested_snp_in_del.gfa -p x -nR > nested_snp_in_del.vcf grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv printf "T\tA,*\t1|2\n" >> nested_snp_in_del_truth.tsv @@ -166,6 +166,15 @@ is $(grep PA=0 nested_snp_in_del.vcf | wc -l) 2 "PA=0 correctly set at both site rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv +vg deconstruct nesting/nested_snp_in_del.gfa -p x -n > nested_snp_in_del.vcf +grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv +printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv +printf "T\tA\t1|.\n" >> nested_snp_in_del_truth.tsv +diff nested_snp_in_del.tsv nested_snp_in_del_truth.tsv +is "$?" 0 "nested deconstruction gets correct allele for snp inside deletion (without -R)" + +rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv + vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n > nested_snp_in_ins.vcf grep -v ^# nested_snp_in_ins.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_ins.tsv printf "A\tT\t0|1\n" > nested_snp_in_ins_truth.tsv @@ -184,7 +193,7 @@ is "$?" 0 "nested deconstruction gets correct contigs in vcf header for snp insi rm -f nested_snp_in_ins.tsv nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv -vg deconstruct nesting/nested_snp_in_ins2.gfa -p x -n | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10 "\t" $11}' > nested_snp_in_ins2.tsv +vg deconstruct nesting/nested_snp_in_ins2.gfa -p x -nR | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10 "\t" $11}' > nested_snp_in_ins2.tsv printf "A\tT,*\t0|1\t0|2\n" > nested_snp_in_ins2._truth.tsv printf "C\tCAAG,CATG\t1|2\t1|0\n" >> nested_snp_in_ins2._truth.tsv diff nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv