
Commit

store parent allele number in PA
glennhickey committed May 31, 2024
1 parent 350b1b2 · commit 2c8db79
Showing 3 changed files with 29 additions and 3 deletions.
14 changes: 14 additions & 0 deletions src/deconstructor.cpp
@@ -966,6 +966,14 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t
// the current allele, and are treated as *'s in VCF.
sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info,
in_nesting_info->sample_to_haplotypes);

}
if (this->nested_decomposition) {
int64_t parent_allele = 0;
if (in_nesting_info != nullptr && in_nesting_info->has_ref == true) {
parent_allele = in_nesting_info->ref_path_allele;
}
v.info["PA"].push_back(std::to_string(parent_allele));
}

vector<int> trav_to_allele = get_alleles(v, travs, trav_steps,
@@ -1001,6 +1009,8 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t
out_nesting_infos->at(j).has_ref = true;
out_nesting_infos->at(j).ref_path_interval = trav_steps[child_snarl_to_trav[j]];
out_nesting_infos->at(j).sample_to_haplotypes = sample_to_haps;
out_nesting_infos->at(j).ref_path_allele = trav_to_allele[child_snarl_to_trav[j]] >= 0 ?
trav_to_allele[child_snarl_to_trav[j]] : 0;
}
}
}
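
Taken together, the two hunks above implement a small parent-to-child hand-off: when the parent site resolves one of its child snarls, it records which of its alleles the child's reference traversal lies on (ref_path_allele), and the child later emits that number as the PA INFO field, defaulting to 0 when no parent reference is available. A condensed sketch of that flow, using only the fields visible in this commit (the stand-in types and free functions below are illustrative, not the real vg/vcflib call graph):

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// stand-ins for the real vg / vcflib types, just to keep the sketch self-contained
struct NestingInfo {
    bool has_ref = false;
    int ref_path_allele = 0;   // allele of the child's reference traversal in the parent site
};
struct Variant {
    std::map<std::string, std::vector<std::string>> info;
};

// parent side (second hunk): record which allele the child's reference path sits on
void fill_child_info(NestingInfo& out_child_info, int child_trav_allele) {
    out_child_info.has_ref = true;
    out_child_info.ref_path_allele = child_trav_allele >= 0 ? child_trav_allele : 0;
}

// child side (first hunk): turn that number into the PA INFO field
void write_pa(Variant& v, const NestingInfo* in_nesting_info) {
    int64_t parent_allele = 0;
    if (in_nesting_info != nullptr && in_nesting_info->has_ref) {
        parent_allele = in_nesting_info->ref_path_allele;
    }
    v.info["PA"].push_back(std::to_string(parent_allele));
}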
@@ -1124,6 +1134,9 @@ string Deconstructor::get_vcf_header() {
stream << "##INFO=<ID=LV,Number=1,Type=Integer,Description=\"Level in the snarl tree (0=top level)\">" << endl;
stream << "##INFO=<ID=PS,Number=1,Type=String,Description=\"ID of variant corresponding to parent snarl\">" << endl;
}
if (this->nested_decomposition) {
stream << "##INFO=<ID=PA,Number=1,Type=Integer,Description=\"Allele number of the reference allele in Parent Snarl\">" << endl;
}
if (untangle_allele_traversals) {
stream << "##INFO=<ID=UT,Number=R,Type=String,Description=\"Untangled allele Traversal with reference node start and end positions, format: [>|<][id]_[start|.]_[end|.], with '.' indicating non-reference nodes.\">" << endl;
} else {
@@ -1312,6 +1325,7 @@ void Deconstructor::deconstruct(vector<string> ref_paths, const PathPositionHand
}
this->cluster_threshold = cluster_threshold;
this->gbwt = gbwt;
this->nested_decomposition = nested_decomposition;

// the need to use nesting is due to a problem with omp tasks and shared state
// which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 1)
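For concreteness, the new header line plus PA on a record would look roughly as follows. The REF/ALT/GT columns are taken from the nested_snp_in_del test expectations further down (both of that test's records are expected to carry PA=0); POS, ID, QUAL, LV and the remaining INFO fields are placeholders rather than output from a real run:

##INFO=<ID=PA,Number=1,Type=Integer,Description="Allele number of the reference allele in Parent Snarl">
x   1   .   CATG   CAAG,C   60   .   LV=0;PA=0   GT   1|2
x   2   .   T      A,*      60   .   LV=1;PA=0   GT   1|2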
6 changes: 6 additions & 0 deletions src/deconstructor.hpp
@@ -74,6 +74,7 @@ class Deconstructor : public VCFOutputCaller {
vector<pair<handle_t, handle_t>> child_snarls;
PathInterval ref_path_interval;
unordered_map<string, vector<int>> sample_to_haplotypes;
int ref_path_allele;
};

// write a vcf record for the given site. returns true if a record was written
@@ -189,6 +190,11 @@ class Deconstructor : public VCFOutputCaller {
// currently implemented as handle jaccard coefficient. So 1 means only
// merge if identical (which is what deconstruct has always done)
double cluster_threshold = 1.0;

// activate the new nested decomposition mode, which is like the old include_nested
// (which lives in vcfoutputcaller) but with more of an effort to link
// the parent and child snarls, as well as better support for nested insertions
bool nested_decomposition = false;
};
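
The cluster_threshold comment above refers to the Jaccard coefficient |A ∩ B| / |A ∪ B| over the handles of two traversals, which only reaches 1.0 when the traversals are identical. A minimal, self-contained sketch of that measure (the helper name and the use of std::set are assumptions, not the deconstructor's actual clustering code):

#include <algorithm>
#include <iterator>
#include <set>

// Jaccard similarity of two traversals' handle sets: 1.0 only when they are identical,
// which is why cluster_threshold = 1.0 reproduces the old "merge only if identical" behaviour.
template <typename Handle>
double handle_jaccard(const std::set<Handle>& a, const std::set<Handle>& b) {
    std::set<Handle> inter, uni;
    std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
                          std::inserter(inter, inter.begin()));
    std::set_union(a.begin(), a.end(), b.begin(), b.end(),
                   std::inserter(uni, uni.begin()));
    return uni.empty() ? 1.0 : double(inter.size()) / double(uni.size());
}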


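As a usage note, nested decomposition (and with it the PA field) is enabled on the command line with -n, which is how the updated tests in the next file exercise it; for example (the grep is just a hypothetical spot-check for the new header line):

vg deconstruct nesting/nested_snp_in_del.gfa -p x -n > nested_snp_in_del.vcf
grep '##INFO=<ID=PA' nested_snp_in_del.vcf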
12 changes: 9 additions & 3 deletions test/t/26_deconstruct.t
@@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap

PATH=../bin:$PATH # for vg

plan tests 29
plan tests 32

vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg
vg index hla.vg -x hla.xg
@@ -155,13 +155,16 @@ is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered d

rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf

vg deconstruct nesting/nested_snp_in_del.gfa -p x -n | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv
vg deconstruct nesting/nested_snp_in_del.gfa -p x -n > nested_snp_in_del.vcf
grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv
printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv
printf "T\tA,*\t1|2\n" >> nested_snp_in_del_truth.tsv
diff nested_snp_in_del.tsv nested_snp_in_del_truth.tsv
is "$?" 0 "nested deconstruction gets correct star-allele for snp inside deletion"

rm -f nested_snp_in_del.tsv nested_snp_in_del_truth.tsv
is $(grep PA=0 nested_snp_in_del.vcf | wc -l) 2 "PA=0 correctly set at both sites of nested deletion"

rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv

vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n > nested_snp_in_ins.vcf
grep -v ^# nested_snp_in_ins.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_ins.tsv
@@ -170,6 +173,9 @@ printf "A\tT\t0|1\n" >> nested_snp_in_ins_truth.tsv
diff nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv
is "$?" 0 "nested deconstruction gets correct allele for snp inside insert"

is $(grep PA= nested_snp_in_ins.vcf | head -1 | grep PA=0 | wc -l) 1 "PA=0 set for base allele of nested insertion"
is $(grep PA= nested_snp_in_ins.vcf | tail -1 | grep PA=1 | wc -l) 1 "PA=1 set for nested allele of nested insertion"

grep ^##contig nested_snp_in_ins.vcf > nested_snp_in_ins_contigs.tsv
printf "##contig=<ID=x,length=2>\n" > nested_snp_in_ins_contigs_truth.tsv
printf "##contig=<ID=y0,length=5>\n" >> nested_snp_in_ins_contigs_truth.tsv

1 comment on commit 2c8db79

@adamnovak (Member)


vg CI tests complete for branch deconstruct. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17449 seconds
