Skip to content

Commit

Permalink
make star-allele opt-in with -R
Browse files Browse the repository at this point in the history
  • Loading branch information
glennhickey committed Jun 11, 2024
1 parent d746b4d commit 065ee64
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 11 deletions.
10 changes: 7 additions & 3 deletions src/deconstructor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -974,8 +974,10 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t

// add in the star alleles -- these are alleles that were genotyped in the parent but not
// the current allele, and are treated as *'s in VCF.
sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info,
in_nesting_info->sample_to_haplotypes);
if (this->star_allele) {
sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info,
in_nesting_info->sample_to_haplotypes);
}

}

Expand Down Expand Up @@ -1364,7 +1366,8 @@ void Deconstructor::deconstruct(vector<string> ref_paths, const PathPositionHand
bool long_ref_contig,
double cluster_threshold,
gbwt::GBWT* gbwt,
bool nested_decomposition) {
bool nested_decomposition,
bool star_allele) {

this->graph = graph;
this->ref_paths = set<string>(ref_paths.begin(), ref_paths.end());
Expand All @@ -1380,6 +1383,7 @@ void Deconstructor::deconstruct(vector<string> ref_paths, const PathPositionHand
this->cluster_threshold = cluster_threshold;
this->gbwt = gbwt;
this->nested_decomposition = nested_decomposition;
this->star_allele = star_allele;

// the need to use nesting is due to a problem with omp tasks and shared state
// which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 1)
Expand Down
8 changes: 7 additions & 1 deletion src/deconstructor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ class Deconstructor : public VCFOutputCaller {
bool long_ref_contig,
double cluster_threshold = 1.0,
gbwt::GBWT* gbwt = nullptr,
bool nested_decomposition = false);
bool nested_decomposition = false,
bool star_allele = false);

private:

Expand Down Expand Up @@ -201,6 +202,11 @@ class Deconstructor : public VCFOutputCaller {
// (which lives in vcfoutputcaller) but with more of an effort to link
// the parent and child snarls, as well as better support for nested insertions
bool nested_decomposition = false;

// use *-alleles to represent spanning alleles that do not cross site but do go around it
// ex: a big containing deletion
// only works with nested_decomposition
bool star_allele = false;
};


Expand Down
18 changes: 14 additions & 4 deletions src/subcommand/deconstruct_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ void help_deconstruct(char** argv){
<< " -S, --strict-conflicts Drop genotypes when we have more than one haplotype for any given phase (set by default when using GBWT input)." << endl
<< " -C, --contig-only-ref Only use the CONTIG name (and not SAMPLE#CONTIG#HAPLOTYPE etc) for the reference if possible (ie there is only one reference sample)." << endl
<< " -L, --cluster F Cluster traversals whose (handle) Jaccard coefficient is >= F together (default: 1.0) [experimental]" << endl
<< " -n, --nested Write a nested VCF, including *-alleles and special tags. [experimental]" << endl
<< " -n, --nested Write a nested VCF, including special tags. [experimental]" << endl
<< " -R, --star-allele Use *-alleles to denote alleles that span but do not cross the site. Only works with -n" << endl
<< " -t, --threads N Use N threads" << endl
<< " -v, --verbose Print some status messages" << endl
<< endl;
Expand Down Expand Up @@ -81,6 +82,7 @@ int main_deconstruct(int argc, char** argv){
bool contig_only_ref = false;
double cluster_threshold = 1.0;
bool nested = false;
bool star_allele = false;

int c;
optind = 2; // force optind past command positional argument
Expand All @@ -104,13 +106,14 @@ int main_deconstruct(int argc, char** argv){
{"contig-only-ref", no_argument, 0, 'C'},
{"cluster", required_argument, 0, 'L'},
{"nested", no_argument, 0, 'n'},
{"start-allele", no_argument, 0, 'R'},
{"threads", required_argument, 0, 't'},
{"verbose", no_argument, 0, 'v'},
{0, 0, 0, 0}
};

int option_index = 0;
c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uaL:nt:v",
c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uaL:nRt:v",
long_options, &option_index);

// Detect the end of the options.
Expand Down Expand Up @@ -171,6 +174,9 @@ int main_deconstruct(int argc, char** argv){
case 'n':
nested = true;
break;
case 'R':
star_allele = true;
break;
case 't':
omp_set_num_threads(parse<int>(optarg));
break;
Expand All @@ -192,9 +198,12 @@ int main_deconstruct(int argc, char** argv){
cerr << "Error [vg deconstruct]: -C cannot be used with -n" << endl;
return 1;
}
if (star_allele == true && nested == false) {
cerr << "Error [vg deconstruct]: -R can only be used with -n" << endl;
return 1;
}

// Read the graph

unique_ptr<PathHandleGraph> path_handle_graph_up;
unique_ptr<GBZGraph> gbz_graph;
gbwt::GBWT* gbwt_index = nullptr;
Expand Down Expand Up @@ -372,7 +381,8 @@ int main_deconstruct(int argc, char** argv){
!contig_only_ref,
cluster_threshold,
gbwt_index,
nested);
nested,
star_allele);
return 0;
}

Expand Down
15 changes: 12 additions & 3 deletions test/t/26_deconstruct.t
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap

PATH=../bin:$PATH # for vg

plan tests 36
plan tests 37

vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg
vg index hla.vg -x hla.xg
Expand Down Expand Up @@ -155,7 +155,7 @@ is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered d

rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf

vg deconstruct nesting/nested_snp_in_del.gfa -p x -n > nested_snp_in_del.vcf
vg deconstruct nesting/nested_snp_in_del.gfa -p x -nR > nested_snp_in_del.vcf
grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv
printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv
printf "T\tA,*\t1|2\n" >> nested_snp_in_del_truth.tsv
Expand All @@ -166,6 +166,15 @@ is $(grep PA=0 nested_snp_in_del.vcf | wc -l) 2 "PA=0 correctly set at both site

rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv

vg deconstruct nesting/nested_snp_in_del.gfa -p x -n > nested_snp_in_del.vcf
grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv
printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv
printf "T\tA\t1|.\n" >> nested_snp_in_del_truth.tsv
diff nested_snp_in_del.tsv nested_snp_in_del_truth.tsv
is "$?" 0 "nested deconstruction gets correct allele for snp inside deletion (without -R)"

rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv

vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n > nested_snp_in_ins.vcf
grep -v ^# nested_snp_in_ins.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_ins.tsv
printf "A\tT\t0|1\n" > nested_snp_in_ins_truth.tsv
Expand All @@ -184,7 +193,7 @@ is "$?" 0 "nested deconstruction gets correct contigs in vcf header for snp insi

rm -f nested_snp_in_ins.tsv nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv

vg deconstruct nesting/nested_snp_in_ins2.gfa -p x -n | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10 "\t" $11}' > nested_snp_in_ins2.tsv
vg deconstruct nesting/nested_snp_in_ins2.gfa -p x -nR | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10 "\t" $11}' > nested_snp_in_ins2.tsv
printf "A\tT,*\t0|1\t0|2\n" > nested_snp_in_ins2._truth.tsv
printf "C\tCAAG,CATG\t1|2\t1|0\n" >> nested_snp_in_ins2._truth.tsv
diff nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv
Expand Down

0 comments on commit 065ee64

Please sign in to comment.