Partial fix for variant inflation (#477)

* rearrange filters * solve AD based de novo failures * small model method tweak
populationgenomics · Feb 4, 2025 · 91900a0 · 91900a0
1 parent f11c77e
commit 91900a0
Show file tree

Hide file tree

Showing 5 changed files with 120 additions and 84 deletions.
diff --git a/src/talos/RunHailFiltering.py b/src/talos/RunHailFiltering.py
@@ -107,8 +107,13 @@ def annotate_exomiser(mt: hl.MatrixTable, exomiser: str | None = None) -> hl.Mat
     Returns:
         The same MatrixTable but with additional annotations
     """
-
-    if not exomiser:
+    if not exomiser or (
+        'exomiser'
+        in config_retrieve(
+            ['ValidateMOI', 'ignore_categories'],
+            [],
+        )
+    ):
         get_logger().info('No exomiser table found, skipping annotation')
         return mt.annotate_rows(info=mt.info.annotate(categorydetailsexomiser=MISSING_STRING))
 
@@ -120,6 +125,9 @@ def annotate_exomiser(mt: hl.MatrixTable, exomiser: str | None = None) -> hl.Mat
         ),
     )
 
+    get_logger().info('No exomiser table found, skipping annotation')
+    return mt.annotate_rows(info=mt.info.annotate(categorydetailsexomiser=MISSING_STRING))
+
 
 def annotate_codon_clinvar(mt: hl.MatrixTable, pm5_path: str | None):
     """
@@ -244,12 +252,24 @@ def annotate_splicevardb(mt: hl.MatrixTable, svdb_path: str | None):
     Returns:
         Same MT with an extra category label
     """
+    if svdb_path is None or (
+        'svdb'
+        in config_retrieve(
+            ['ValidateMOI', 'ignore_categories'],
+            [],
+        )
+    ):
+        get_logger().info('Skipping SVDB annotation')
+        return mt.annotate_rows(
+            info=mt.info.annotate(
+                categorybooleansvdb=MISSING_INT,
+                svdb_location=MISSING_STRING,
+                svdb_method=MISSING_STRING,
+                svdb_doi=MISSING_STRING,
+            ),
+        )
 
-    if svdb_path is None:
-        get_logger().info('SVDB table not found, skipping annotation')
-        return mt.annotate_rows(info=mt.info.annotate(categorydetailsSVDB=MISSING_STRING))
-
-    # read in the codon table
+        # read in the codon table
     get_logger().info(f'Reading SpliceVarDB data from {svdb_path}')
     svdb_ht = hl.read_table(svdb_path)
 
@@ -289,13 +309,7 @@ def filter_on_quality_flags(mt: hl.MatrixTable) -> hl.MatrixTable:
     """
 
     return mt.filter_rows(
-        hl.is_missing(mt.filters)
-        | (mt.filters.length() == 0)
-        | (
-            (mt.info.clinvar_talos == ONE_INT)
-            | (mt.info.categorybooleansvdb == ONE_INT)
-            | (mt.info.categorydetailsexomiser != MISSING_STRING)
-        ),
+        hl.is_missing(mt.filters) | (mt.filters.length() == 0) | (mt.info.clinvar_talos == ONE_INT),
     )
 
 
@@ -361,11 +375,7 @@ def filter_matrix_by_ac(mt: hl.MatrixTable, ac_threshold: float = 0.01) -> hl.Ma
     min_callset_ac = 5
     return mt.filter_rows(
         ((min_callset_ac >= mt.info.AC[0]) | (ac_threshold > mt.info.AC[0] / mt.info.AN))
-        | (
-            (mt.info.clinvar_talos == ONE_INT)
-            | (mt.info.categorybooleansvdb == ONE_INT)
-            | (mt.info.categorydetailsexomiser != MISSING_STRING)
-        ),
+        | (mt.info.clinvar_talos == ONE_INT),
     )
 
 
@@ -383,11 +393,7 @@ def filter_to_population_rare(mt: hl.MatrixTable) -> hl.MatrixTable:
             (hl.or_else(mt.gnomad_exomes.AF, MISSING_FLOAT_LO) < rare_af_threshold)
             & (hl.or_else(mt.gnomad_genomes.AF, MISSING_FLOAT_LO) < rare_af_threshold)
         )
-        | (
-            (mt.info.clinvar_talos == ONE_INT)
-            | (mt.info.categorybooleansvdb == ONE_INT)
-            | (mt.info.categorydetailsexomiser != MISSING_STRING)
-        ),
+        | (mt.info.clinvar_talos == ONE_INT),
     )
 
 
@@ -429,6 +435,30 @@ def remove_variants_outside_gene_roi(mt: hl.MatrixTable, green_genes: hl.SetExpr
     return mt.filter_rows(hl.len(green_genes.intersection(mt.geneIds)) > 0)
 
 
+def update_wt_ad_entry(mt: hl.MatrixTable) -> hl.MatrixTable:
+    """
+    Take the MT as it is currently, and update the AD field for WT calls to be [AD, 0] where missing
+
+    Args:
+        mt ():
+
+    Returns:
+        Same MT, with updated fields
+    """
+    return mt.annotate_entries(
+        AD=hl.case()
+        .when(
+            ~hl.is_missing(mt.AD),
+            mt.AD,
+        )
+        .when(
+            (~hl.is_missing(mt.DP)) & (mt.GT.is_hom_ref()),
+            [mt.DP, 0],
+        )
+        .default([30, 0]),
+    )
+
+
 def split_rows_by_gene_and_filter_to_green(mt: hl.MatrixTable, green_genes: hl.SetExpression) -> hl.MatrixTable:
     """
     splits each GeneId onto a new row, then filters any rows not annotating a Green PanelApp gene
@@ -951,11 +981,17 @@ def main(
     # repartition if required - local Hail with finite resources has struggled with some really high (~120k) partitions
     # this creates a local duplicate of the input data with far smaller partition counts, for less processing overhead
     if mt.n_partitions() > MAX_PARTITIONS:
-        get_logger().info('Shrinking partitions way down with a unshuffled repartition')
+        get_logger().info('Shrinking partitions way down with an unshuffled repartition')
         mt = mt.repartition(shuffle=False, n_partitions=number_of_cores * 10)
         if checkpoint:
             get_logger().info('Trying to write the result locally, might need more space on disk...')
-            mt = generate_a_checkpoint(mt, f'{checkpoint}_reparitioned')
+            mt = generate_a_checkpoint(mt, f'{checkpoint}_repartitioned')
+
+    # swap out the default clinvar annotations with private clinvar
+    mt = annotate_clinvarbitration(mt=mt, clinvar=clinvar)
+
+    # remove common-in-gnomad variants (also includes ClinVar annotation)
+    mt = filter_to_population_rare(mt=mt)
 
     # subset to currently considered samples
     mt = subselect_mt_to_pedigree(mt, pedigree=pedigree)
@@ -966,21 +1002,13 @@ def main(
     # remove any rows which have no genes of interest
     mt = remove_variants_outside_gene_roi(mt=mt, green_genes=green_expression)
 
+    # update WT genotypes and AD fields
+    if config_retrieve(['RunHailFiltering', 'update_wt_ad'], True):
+        mt = update_wt_ad_entry(mt)
+
     if checkpoint:
         mt = generate_a_checkpoint(mt, f'{checkpoint}_green_genes')
 
-    # swap out the default clinvar annotations with private clinvar
-    mt = annotate_clinvarbitration(mt=mt, clinvar=clinvar)
-
-    # annotate this MT with exomiser variants - annotated as MISSING if the table is absent
-    mt = annotate_exomiser(mt=mt, exomiser=exomiser)
-
-    # if a SVDB data is provided, use that to apply category annotations
-    mt = annotate_splicevardb(mt=mt, svdb_path=svdb)
-
-    # remove common-in-gnomad variants (also includes ClinVar annotation)
-    mt = filter_to_population_rare(mt=mt)
-
     # filter out quality failures
     mt = filter_on_quality_flags(mt=mt)
 
@@ -999,6 +1027,15 @@ def main(
     if checkpoint:
         mt = generate_a_checkpoint(mt, f'{checkpoint}_green_and_clean')
 
+    # annotate this MT with exomiser variants - annotated as MISSING if the table is absent
+    mt = annotate_exomiser(mt=mt, exomiser=exomiser)
+
+    # if a SVDB data is provided, use that to apply category annotations
+    mt = annotate_splicevardb(mt=mt, svdb_path=svdb)
+
+    if checkpoint:
+        mt = generate_a_checkpoint(mt, f'{checkpoint}_green_and_clean_w_external_tables')
+
     # add Labels to the MT
     # current logic is to apply 1, 2, 3, and 5, then 4 (de novo)
     # for cat. 4, pre-filter the variants by tx-consequential or C5==1

diff --git a/src/talos/example_config.toml b/src/talos/example_config.toml
@@ -57,6 +57,12 @@ max_parent_ab = 0.05
 minimum_depth = 10
 spliceai = 0.5
 
+# Hail's gVCF combiner is a cheaper approach to joint-calling, but drops some data regarding call quality
+# one specific issue is that the 0/0 GT for Wild-types is established, but the Allele Depths are missing
+# This has specific implications for our de novo filtering process, which relies heavily on PL and AD fields
+# If this attribute is True, we replace all missing ADs with [DP, 0] if DP is populated, else [30, 0]
+update_wt_ad = true
+
 [RunHailFiltering.cores]
 small_variants = 8
 

diff --git a/src/talos/models.py b/src/talos/models.py
@@ -183,7 +183,6 @@ def category_values(self, sample: str) -> set[str]:
         categories: set[str] = set()
         for category in self.sample_categories:
             cat_samples = self.info[category]
-            print(cat_samples, category)
             if not isinstance(cat_samples, list):
                 raise TypeError(f'Sample categories should be a list: {cat_samples}')
             if sample in cat_samples:
@@ -271,11 +270,13 @@ def get_sample_flags(self, sample: str) -> set[str]:
         """
         gets all report flags for this sample - currently only one flag
         """
-        return self.check_ab_ratio(sample) | self.check_read_depth(sample)
+        return self.check_ab_ratio(sample) | self.check_read_depth_strict(sample)
 
     def check_read_depth(self, sample: str, threshold: int = 10, var_is_cat_1: bool = False) -> set[str]:
         """
         flag low read depth for this sample
+        this version is used to determine whether a variant should be considered for _this_ sample
+        in this situation we pass variants in clinvar, regardless of read depth
 
         Args:
             sample (str): sample ID matching VCF
@@ -287,6 +288,19 @@ def check_read_depth(self, sample: str, threshold: int = 10, var_is_cat_1: bool
         """
         if var_is_cat_1:
             return set()
+        return self.check_read_depth_strict(sample, threshold)
+
+    def check_read_depth_strict(self, sample: str, threshold: int = 10) -> set[str]:
+        """
+        flag low read depth for this sample - doesn't care if it's in ClinVar
+
+        Args:
+            sample (str): sample ID matching VCF
+            threshold (int): cut-off for flagging as a failure
+
+        Returns:
+            return a flag if this sample has low read depth
+        """
         if self.depths[sample] < threshold:
             return {'Low Read Depth'}
         return set()
@@ -355,6 +369,8 @@ class ReportVariant(BaseModel):
     support_vars: set[str] = Field(default_factory=set)
     # log whether there was an increase in ClinVar star rating since the last run
     clinvar_increase: bool = Field(default=False)
+    # exomiser results - I'd like to store this in a cleaner way in future
+    exomiser_results: list[str] = Field(default_factory=list)
 
     def __eq__(self, other):
         """

diff --git a/test/test_hail_categories.py b/test/test_hail_categories.py
@@ -157,23 +157,18 @@ def test_green_from_panelapp():
 
 
 @pytest.mark.parametrize(
-    'exomes,genomes,clinvar,svdb,exomiser,length',
+    'exomes,genomes,clinvar,length',
     [
-        [0, 0, 0, 0, 'missing', 1],
-        [1.0, 0, 0, 0, 'missing', 0],
-        [1.0, 0, 0, 0, 'present', 1],
-        [1.0, 0, 0, 1, 'missing', 1],
-        [1.0, 0, 1, 0, 'missing', 1],
-        [0.0001, 0.0001, 0, 0, 'missing', 1],
-        [0.0001, 0.0001, 1, 0, 'missing', 1],
+        [0, 0, 0, 1],
+        [1.0, 0, 0, 0],
+        [0.0001, 0.0001, 0, 1],
+        [0.0001, 0.0001, 1, 1],
     ],
 )
 def test_filter_rows_for_rare(
     exomes: float,
     genomes: float,
     clinvar: int,
-    svdb: int,
-    exomiser: str,
     length: int,
     make_a_mt,
 ):
@@ -183,11 +178,7 @@ def test_filter_rows_for_rare(
     anno_matrix = make_a_mt.annotate_rows(
         gnomad_genomes=hl.Struct(AF=genomes),
         gnomad_exomes=hl.Struct(AF=exomes),
-        info=make_a_mt.info.annotate(
-            clinvar_talos=clinvar,
-            categorybooleansvdb=svdb,
-            categorydetailsexomiser=exomiser,
-        ),
+        info=make_a_mt.info.annotate(clinvar_talos=clinvar),
     )
     matrix = filter_to_population_rare(anno_matrix)
     assert matrix.count_rows() == length

diff --git a/test/test_hail_filters.py b/test/test_hail_filters.py
@@ -9,25 +9,21 @@
 
 
 @pytest.mark.parametrize(  # needs clinvar
-    'ac,an,clinvar,svdb,exomiser,threshold,rows',
+    'ac,an,clinvar,threshold,rows',
     [
-        ([1], 1, 0, 0, 'missing', 0.01, 1),
-        ([6], 1, 0, 0, 'missing', 0.01, 0),
-        ([6], 1, 0, 1, 'missing', 0.01, 1),
-        ([6], 1, 0, 0, 'present', 0.01, 1),
-        ([6], 1, 1, 0, 'missing', 0.01, 1),
-        ([6], 70, 0, 0, 'missing', 0.1, 1),
-        ([50], 999999, 0, 0, 'missing', 0.01, 1),
-        ([50], 50, 0, 0, 'missing', 0.01, 0),
-        ([50], 50, 1, 0, 'missing', 0.01, 1),
+        ([1], 1, 0, 0.01, 1),
+        ([6], 1, 0, 0.01, 0),
+        ([6], 1, 1, 0.01, 1),
+        ([6], 70, 0, 0.1, 1),
+        ([50], 999999, 0, 0.01, 1),
+        ([50], 50, 0, 0.01, 0),
+        ([50], 50, 1, 0.01, 1),
     ],
 )
 def test_ac_filter_no_filt(
     ac: int,
     an: int,
     clinvar: int,
-    svdb: str,
-    exomiser: str,
     threshold: float,
     rows: int,
     make_a_mt: hl.MatrixTable,
@@ -41,31 +37,25 @@ def test_ac_filter_no_filt(
             clinvar_talos=clinvar,
             AC=ac,
             AN=an,
-            categorybooleansvdb=svdb,
-            categorydetailsexomiser=exomiser,
         ),
     )
 
     assert filter_matrix_by_ac(matrix, threshold).count_rows() == rows
 
 
 @pytest.mark.parametrize(
-    'filters,clinvar,svdb,exomiser,length',
+    'filters,clinvar,length',
     [
-        (hl.empty_set(hl.tstr), 0, 0, 'missing', 1),
-        (hl.literal({'fail'}), 0, 0, 'missing', 0),
-        (hl.literal({'fail'}), 0, 1, 'missing', 1),
-        (hl.literal({'fail'}), 0, 0, 'present', 1),
-        (hl.literal({'fail'}), 1, 0, 'missing', 1),
-        (hl.literal({'VQSR'}), 0, 0, 'missing', 0),
-        (hl.literal({'VQSR'}), 1, 0, 'missing', 1),
+        (hl.empty_set(hl.tstr), 0, 1),
+        (hl.literal({'fail'}), 0, 0),
+        (hl.literal({'fail'}), 1, 1),
+        (hl.literal({'VQSR'}), 0, 0),
+        (hl.literal({'VQSR'}), 1, 1),
     ],
 )
 def test_filter_on_quality_flags(
     filters: hl.set,
     clinvar: int,
-    svdb: str,
-    exomiser: str,
     length: int,
     make_a_mt: hl.MatrixTable,
 ):
@@ -76,11 +66,7 @@ def test_filter_on_quality_flags(
     anno_matrix = make_a_mt.key_rows_by('locus')
     anno_matrix = anno_matrix.annotate_rows(
         filters=filters,
-        info=anno_matrix.info.annotate(
-            clinvar_talos=clinvar,
-            categorybooleansvdb=svdb,
-            categorydetailsexomiser=exomiser,
-        ),
+        info=anno_matrix.info.annotate(clinvar_talos=clinvar),
     )
     assert filter_on_quality_flags(anno_matrix).count_rows() == length