Skip to content

Commit

Permalink
Partial fix for variant inflation (#477)
Browse files Browse the repository at this point in the history
* rearrange filters

* solve AD based de novo failures

* small model method tweak
  • Loading branch information
MattWellie authored Feb 4, 2025
1 parent f11c77e commit 91900a0
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 84 deletions.
113 changes: 75 additions & 38 deletions src/talos/RunHailFiltering.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,13 @@ def annotate_exomiser(mt: hl.MatrixTable, exomiser: str | None = None) -> hl.Mat
Returns:
The same MatrixTable but with additional annotations
"""

if not exomiser:
if not exomiser or (
'exomiser'
in config_retrieve(
['ValidateMOI', 'ignore_categories'],
[],
)
):
get_logger().info('No exomiser table found, skipping annotation')
return mt.annotate_rows(info=mt.info.annotate(categorydetailsexomiser=MISSING_STRING))

Expand All @@ -120,6 +125,9 @@ def annotate_exomiser(mt: hl.MatrixTable, exomiser: str | None = None) -> hl.Mat
),
)

get_logger().info('No exomiser table found, skipping annotation')
return mt.annotate_rows(info=mt.info.annotate(categorydetailsexomiser=MISSING_STRING))


def annotate_codon_clinvar(mt: hl.MatrixTable, pm5_path: str | None):
"""
Expand Down Expand Up @@ -244,12 +252,24 @@ def annotate_splicevardb(mt: hl.MatrixTable, svdb_path: str | None):
Returns:
Same MT with an extra category label
"""
if svdb_path is None or (
'svdb'
in config_retrieve(
['ValidateMOI', 'ignore_categories'],
[],
)
):
get_logger().info('Skipping SVDB annotation')
return mt.annotate_rows(
info=mt.info.annotate(
categorybooleansvdb=MISSING_INT,
svdb_location=MISSING_STRING,
svdb_method=MISSING_STRING,
svdb_doi=MISSING_STRING,
),
)

if svdb_path is None:
get_logger().info('SVDB table not found, skipping annotation')
return mt.annotate_rows(info=mt.info.annotate(categorydetailsSVDB=MISSING_STRING))

# read in the codon table
# read in the codon table
get_logger().info(f'Reading SpliceVarDB data from {svdb_path}')
svdb_ht = hl.read_table(svdb_path)

Expand Down Expand Up @@ -289,13 +309,7 @@ def filter_on_quality_flags(mt: hl.MatrixTable) -> hl.MatrixTable:
"""

return mt.filter_rows(
hl.is_missing(mt.filters)
| (mt.filters.length() == 0)
| (
(mt.info.clinvar_talos == ONE_INT)
| (mt.info.categorybooleansvdb == ONE_INT)
| (mt.info.categorydetailsexomiser != MISSING_STRING)
),
hl.is_missing(mt.filters) | (mt.filters.length() == 0) | (mt.info.clinvar_talos == ONE_INT),
)


Expand Down Expand Up @@ -361,11 +375,7 @@ def filter_matrix_by_ac(mt: hl.MatrixTable, ac_threshold: float = 0.01) -> hl.Ma
min_callset_ac = 5
return mt.filter_rows(
((min_callset_ac >= mt.info.AC[0]) | (ac_threshold > mt.info.AC[0] / mt.info.AN))
| (
(mt.info.clinvar_talos == ONE_INT)
| (mt.info.categorybooleansvdb == ONE_INT)
| (mt.info.categorydetailsexomiser != MISSING_STRING)
),
| (mt.info.clinvar_talos == ONE_INT),
)


Expand All @@ -383,11 +393,7 @@ def filter_to_population_rare(mt: hl.MatrixTable) -> hl.MatrixTable:
(hl.or_else(mt.gnomad_exomes.AF, MISSING_FLOAT_LO) < rare_af_threshold)
& (hl.or_else(mt.gnomad_genomes.AF, MISSING_FLOAT_LO) < rare_af_threshold)
)
| (
(mt.info.clinvar_talos == ONE_INT)
| (mt.info.categorybooleansvdb == ONE_INT)
| (mt.info.categorydetailsexomiser != MISSING_STRING)
),
| (mt.info.clinvar_talos == ONE_INT),
)


Expand Down Expand Up @@ -429,6 +435,30 @@ def remove_variants_outside_gene_roi(mt: hl.MatrixTable, green_genes: hl.SetExpr
return mt.filter_rows(hl.len(green_genes.intersection(mt.geneIds)) > 0)


def update_wt_ad_entry(mt: hl.MatrixTable) -> hl.MatrixTable:
"""
Take the MT as it is currently, and update the AD field for WT calls to be [AD, 0] where missing
Args:
mt ():
Returns:
Same MT, with updated fields
"""
return mt.annotate_entries(
AD=hl.case()
.when(
~hl.is_missing(mt.AD),
mt.AD,
)
.when(
(~hl.is_missing(mt.DP)) & (mt.GT.is_hom_ref()),
[mt.DP, 0],
)
.default([30, 0]),
)


def split_rows_by_gene_and_filter_to_green(mt: hl.MatrixTable, green_genes: hl.SetExpression) -> hl.MatrixTable:
"""
splits each GeneId onto a new row, then filters any rows not annotating a Green PanelApp gene
Expand Down Expand Up @@ -951,11 +981,17 @@ def main(
# repartition if required - local Hail with finite resources has struggled with some really high (~120k) partitions
# this creates a local duplicate of the input data with far smaller partition counts, for less processing overhead
if mt.n_partitions() > MAX_PARTITIONS:
get_logger().info('Shrinking partitions way down with a unshuffled repartition')
get_logger().info('Shrinking partitions way down with an unshuffled repartition')
mt = mt.repartition(shuffle=False, n_partitions=number_of_cores * 10)
if checkpoint:
get_logger().info('Trying to write the result locally, might need more space on disk...')
mt = generate_a_checkpoint(mt, f'{checkpoint}_reparitioned')
mt = generate_a_checkpoint(mt, f'{checkpoint}_repartitioned')

# swap out the default clinvar annotations with private clinvar
mt = annotate_clinvarbitration(mt=mt, clinvar=clinvar)

# remove common-in-gnomad variants (also includes ClinVar annotation)
mt = filter_to_population_rare(mt=mt)

# subset to currently considered samples
mt = subselect_mt_to_pedigree(mt, pedigree=pedigree)
Expand All @@ -966,21 +1002,13 @@ def main(
# remove any rows which have no genes of interest
mt = remove_variants_outside_gene_roi(mt=mt, green_genes=green_expression)

# update WT genotypes and AD fields
if config_retrieve(['RunHailFiltering', 'update_wt_ad'], True):
mt = update_wt_ad_entry(mt)

if checkpoint:
mt = generate_a_checkpoint(mt, f'{checkpoint}_green_genes')

# swap out the default clinvar annotations with private clinvar
mt = annotate_clinvarbitration(mt=mt, clinvar=clinvar)

# annotate this MT with exomiser variants - annotated as MISSING if the table is absent
mt = annotate_exomiser(mt=mt, exomiser=exomiser)

# if a SVDB data is provided, use that to apply category annotations
mt = annotate_splicevardb(mt=mt, svdb_path=svdb)

# remove common-in-gnomad variants (also includes ClinVar annotation)
mt = filter_to_population_rare(mt=mt)

# filter out quality failures
mt = filter_on_quality_flags(mt=mt)

Expand All @@ -999,6 +1027,15 @@ def main(
if checkpoint:
mt = generate_a_checkpoint(mt, f'{checkpoint}_green_and_clean')

# annotate this MT with exomiser variants - annotated as MISSING if the table is absent
mt = annotate_exomiser(mt=mt, exomiser=exomiser)

# if a SVDB data is provided, use that to apply category annotations
mt = annotate_splicevardb(mt=mt, svdb_path=svdb)

if checkpoint:
mt = generate_a_checkpoint(mt, f'{checkpoint}_green_and_clean_w_external_tables')

# add Labels to the MT
# current logic is to apply 1, 2, 3, and 5, then 4 (de novo)
# for cat. 4, pre-filter the variants by tx-consequential or C5==1
Expand Down
6 changes: 6 additions & 0 deletions src/talos/example_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,12 @@ max_parent_ab = 0.05
minimum_depth = 10
spliceai = 0.5

# Hail's gVCF combiner is a cheaper approach to joint-calling, but drops some data regarding call quality
# one specific issue is that the 0/0 GT for Wild-types is established, but the Allele Depths are missing
# This has specific implications for our de novo filtering process, which relies heavily on PL and AD fields
# If this attribute is True, we replace all missing ADs with [DP, 0] if DP is populated, else [30, 0]
update_wt_ad = true

[RunHailFiltering.cores]
small_variants = 8

Expand Down
20 changes: 18 additions & 2 deletions src/talos/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ def category_values(self, sample: str) -> set[str]:
categories: set[str] = set()
for category in self.sample_categories:
cat_samples = self.info[category]
print(cat_samples, category)
if not isinstance(cat_samples, list):
raise TypeError(f'Sample categories should be a list: {cat_samples}')
if sample in cat_samples:
Expand Down Expand Up @@ -271,11 +270,13 @@ def get_sample_flags(self, sample: str) -> set[str]:
"""
gets all report flags for this sample - currently only one flag
"""
return self.check_ab_ratio(sample) | self.check_read_depth(sample)
return self.check_ab_ratio(sample) | self.check_read_depth_strict(sample)

def check_read_depth(self, sample: str, threshold: int = 10, var_is_cat_1: bool = False) -> set[str]:
"""
flag low read depth for this sample
this version is used to determine whether a variant should be considered for _this_ sample
in this situation we pass variants in clinvar, regardless of read depth
Args:
sample (str): sample ID matching VCF
Expand All @@ -287,6 +288,19 @@ def check_read_depth(self, sample: str, threshold: int = 10, var_is_cat_1: bool
"""
if var_is_cat_1:
return set()
return self.check_read_depth_strict(sample, threshold)

def check_read_depth_strict(self, sample: str, threshold: int = 10) -> set[str]:
"""
flag low read depth for this sample - doesn't care if it's in ClinVar
Args:
sample (str): sample ID matching VCF
threshold (int): cut-off for flagging as a failure
Returns:
return a flag if this sample has low read depth
"""
if self.depths[sample] < threshold:
return {'Low Read Depth'}
return set()
Expand Down Expand Up @@ -355,6 +369,8 @@ class ReportVariant(BaseModel):
support_vars: set[str] = Field(default_factory=set)
# log whether there was an increase in ClinVar star rating since the last run
clinvar_increase: bool = Field(default=False)
# exomiser results - I'd like to store this in a cleaner way in future
exomiser_results: list[str] = Field(default_factory=list)

def __eq__(self, other):
"""
Expand Down
21 changes: 6 additions & 15 deletions test/test_hail_categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,23 +157,18 @@ def test_green_from_panelapp():


@pytest.mark.parametrize(
'exomes,genomes,clinvar,svdb,exomiser,length',
'exomes,genomes,clinvar,length',
[
[0, 0, 0, 0, 'missing', 1],
[1.0, 0, 0, 0, 'missing', 0],
[1.0, 0, 0, 0, 'present', 1],
[1.0, 0, 0, 1, 'missing', 1],
[1.0, 0, 1, 0, 'missing', 1],
[0.0001, 0.0001, 0, 0, 'missing', 1],
[0.0001, 0.0001, 1, 0, 'missing', 1],
[0, 0, 0, 1],
[1.0, 0, 0, 0],
[0.0001, 0.0001, 0, 1],
[0.0001, 0.0001, 1, 1],
],
)
def test_filter_rows_for_rare(
exomes: float,
genomes: float,
clinvar: int,
svdb: int,
exomiser: str,
length: int,
make_a_mt,
):
Expand All @@ -183,11 +178,7 @@ def test_filter_rows_for_rare(
anno_matrix = make_a_mt.annotate_rows(
gnomad_genomes=hl.Struct(AF=genomes),
gnomad_exomes=hl.Struct(AF=exomes),
info=make_a_mt.info.annotate(
clinvar_talos=clinvar,
categorybooleansvdb=svdb,
categorydetailsexomiser=exomiser,
),
info=make_a_mt.info.annotate(clinvar_talos=clinvar),
)
matrix = filter_to_population_rare(anno_matrix)
assert matrix.count_rows() == length
Expand Down
44 changes: 15 additions & 29 deletions test/test_hail_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,21 @@


@pytest.mark.parametrize( # needs clinvar
'ac,an,clinvar,svdb,exomiser,threshold,rows',
'ac,an,clinvar,threshold,rows',
[
([1], 1, 0, 0, 'missing', 0.01, 1),
([6], 1, 0, 0, 'missing', 0.01, 0),
([6], 1, 0, 1, 'missing', 0.01, 1),
([6], 1, 0, 0, 'present', 0.01, 1),
([6], 1, 1, 0, 'missing', 0.01, 1),
([6], 70, 0, 0, 'missing', 0.1, 1),
([50], 999999, 0, 0, 'missing', 0.01, 1),
([50], 50, 0, 0, 'missing', 0.01, 0),
([50], 50, 1, 0, 'missing', 0.01, 1),
([1], 1, 0, 0.01, 1),
([6], 1, 0, 0.01, 0),
([6], 1, 1, 0.01, 1),
([6], 70, 0, 0.1, 1),
([50], 999999, 0, 0.01, 1),
([50], 50, 0, 0.01, 0),
([50], 50, 1, 0.01, 1),
],
)
def test_ac_filter_no_filt(
ac: int,
an: int,
clinvar: int,
svdb: str,
exomiser: str,
threshold: float,
rows: int,
make_a_mt: hl.MatrixTable,
Expand All @@ -41,31 +37,25 @@ def test_ac_filter_no_filt(
clinvar_talos=clinvar,
AC=ac,
AN=an,
categorybooleansvdb=svdb,
categorydetailsexomiser=exomiser,
),
)

assert filter_matrix_by_ac(matrix, threshold).count_rows() == rows


@pytest.mark.parametrize(
'filters,clinvar,svdb,exomiser,length',
'filters,clinvar,length',
[
(hl.empty_set(hl.tstr), 0, 0, 'missing', 1),
(hl.literal({'fail'}), 0, 0, 'missing', 0),
(hl.literal({'fail'}), 0, 1, 'missing', 1),
(hl.literal({'fail'}), 0, 0, 'present', 1),
(hl.literal({'fail'}), 1, 0, 'missing', 1),
(hl.literal({'VQSR'}), 0, 0, 'missing', 0),
(hl.literal({'VQSR'}), 1, 0, 'missing', 1),
(hl.empty_set(hl.tstr), 0, 1),
(hl.literal({'fail'}), 0, 0),
(hl.literal({'fail'}), 1, 1),
(hl.literal({'VQSR'}), 0, 0),
(hl.literal({'VQSR'}), 1, 1),
],
)
def test_filter_on_quality_flags(
filters: hl.set,
clinvar: int,
svdb: str,
exomiser: str,
length: int,
make_a_mt: hl.MatrixTable,
):
Expand All @@ -76,11 +66,7 @@ def test_filter_on_quality_flags(
anno_matrix = make_a_mt.key_rows_by('locus')
anno_matrix = anno_matrix.annotate_rows(
filters=filters,
info=anno_matrix.info.annotate(
clinvar_talos=clinvar,
categorybooleansvdb=svdb,
categorydetailsexomiser=exomiser,
),
info=anno_matrix.info.annotate(clinvar_talos=clinvar),
)
assert filter_on_quality_flags(anno_matrix).count_rows() == length

Expand Down

0 comments on commit 91900a0

Please sign in to comment.