From 863e68c0f008c928c8b82a78316e86b4c317f54e Mon Sep 17 00:00:00 2001
From: Jure Zmrzlikar
Date: Fri, 28 Oct 2022 09:55:58 +0200
Subject: [PATCH] Rework VariantTables

---
 docs/CHANGELOG.rst                 |   8 +
 docs/resdk-tables.rst              |  72 ++---
 src/resdk/tables/__init__.py       |  19 ++
 src/resdk/tables/variant.py        | 429 ++++++++++++++---------------
 tests/unit/test_tables_variants.py | 178 ++----------
 5 files changed, 280 insertions(+), 426 deletions(-)

diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst
index a5655f01..f7a2e9c0 100644
--- a/docs/CHANGELOG.rst
+++ b/docs/CHANGELOG.rst
@@ -26,6 +26,14 @@ Changed
 - ``Sample.get_cuffquant``
 - ``Sample.get_expression``
 
+- **BACKWARD INCOMPATIBLE:** Rework ``VariantTables``:
+
+  - Index in VariantTables.variants is simplified and does not include the
+    amino-acid change anymore.
+  - Argument ``mutations`` in ``VariantTables`` constructor is renamed to
+    ``geneset``. Besides holding a list of genes, this can also be a valid ID /
+    slug of a Geneset object on the Genialis Platform.
+
 Fixed
 -----
 - Fix ``Sample.get_reads()`` utility method
diff --git a/docs/resdk-tables.rst b/docs/resdk-tables.rst
index 6440227f..1019068f 100644
--- a/docs/resdk-tables.rst
+++ b/docs/resdk-tables.rst
@@ -153,49 +153,23 @@ access of variant data present in Data of type ``data:mutationstable``::
 
 The output of the above would look something like this:
 
-========= ===================== =====================
-sample_id chr1_123_C>T_Gly11Asp chr1_126_T>C_Asp12Gly
-========= ===================== =====================
-101       2                     0
-102       0                     1
-========= ===================== =====================
+========= ============ ============
+sample_id chr1_123_C>T chr1_126_T>C
+========= ============ ============
+101       2            NaN
+102       0            2
+========= ============ ============
 
 In rows, there are sample ID's. In columns there are variants where
 each variant is given as:
-``<chromosome>_<position>_<nucleotide change>_<amino acid change>``.
-Values in table can be 0 (no mutation), 1 (heterozygous mutation) or 2
-(homozygous mutation).
+``<chromosome>_<position>_<nucleotide change>``.
+Values in table can be:
 
-The above example gives an ideal situation where the mutation status for
-each position is known. However, this is not always the case.
-
-
-Missing values and ``discard_fakes`` argument
----------------------------------------------
-
-Very often, there is no info about a certain variant / sample, so values
-can also be ``NaN`` (unknown). Other common case is just the info that
-there is no mutation on a given position. This is a valid information
-also. Given the above, a more realistic example of output is:
-
-========= ===================== ===================== ========
-sample_id chr1_123_C>T_Gly11Asp chr1_126_T>C_Asp12Gly chr1_127
-========= ===================== ===================== ========
-101       2                     NaN                   0
-102       0                     1                     NaN
-========= ===================== ===================== ========
-
-One can se that for some combination of variants / samples there is no
-information: a value in table is ``NaN``. It is up to a user if this is
-interpreted as no variant or something else. In the first case, one can
-quickly convert ``NaN`` to 0 with ``vt.variants.fillna(0)``. One can
-also see that there is a column (chr1_127) that is not actually
-representing a variant. One may call this a "fake" variant. It is a way
-of signalling the absence of variant on a given position. Usually this
-is not useful, but is some cases it is. If you would like your output to
-contain such fake variants please specify ``discard_fakes=False`` in
-``VariantTables`` constructor.
+ - 0 (wild-type / no mutation)
+ - 1 (heterozygous mutation)
+ - 2 (homozygous mutation)
+ - NaN (QC filters are failing - mutation status is unreliable)
 
 
 Inspecting depth
 ----------------
@@ -217,18 +191,18 @@ worth inspecting the depth or depth per base::
 
 Filtering mutations
 -------------------
-Process ``mutations-table`` accepts an input ``mutations`` which
-specifies the gene (and optionally amino acid change) of interest. It
-restricts the scope of mutation to just a given gene or amino acid.
+Process ``mutations-table`` on the Genialis Platform accepts either a ``mutations``
+or a ``geneset`` input, which specifies the genes of interest. It restricts the
+scope of the mutation search to just the given genes.
 
-However, it can happen that not all the samples have the same
-``mutations`` input. In such cases, it makes little sense to merge the
-information about mutations from multiple samples. By default,
-``VariantTables`` checks that all Data is computed with same
-``mutations`` input. If this is not true, it will raise an error.
+However, it can happen that not all the samples have the same ``mutations`` or
+``geneset`` input. In such cases, it makes little sense to merge the information
+about mutations from multiple samples. By default, ``VariantTables`` checks that
+all Data is computed with the same ``mutations`` / ``geneset`` input. If this is
+not true, it will raise an error.
 
-But if you provide additional argument ``mutations`` it will limit the
-mutations to only those in the given gene. An example::
+But if you provide the additional argument ``geneset``, it will limit the
+mutations to only those in the given geneset. An example::
 
     # Sample 101 has mutations input "FHIT, BRCA2"
     # Sample 102 has mutations input "BRCA2"
 
     # This would fail
     vt = resdk.tables.VariantTables()
     vt.variants
 
     # This would limit the variants to just the ones in BRCA2 gene.
-    vt = resdk.tables.VariantTables( mutations=["BRCA2"])
+    vt = resdk.tables.VariantTables(geneset=["BRCA2"])
     vt.variants
diff --git a/src/resdk/tables/__init__.py b/src/resdk/tables/__init__.py
index c8ad4136..2c37d291 100644
--- a/src/resdk/tables/__init__.py
+++ b/src/resdk/tables/__init__.py
@@ -9,12 +9,31 @@
 Table classes
 =============
 
+.. autoclass:: resdk.tables.microarray.MATables
+    :members:
+
+    .. automethod:: __init__
+
+.. autoclass:: resdk.tables.ml_ready.MLTables
+    :members:
+
+    .. automethod:: __init__
+
 .. autoclass:: resdk.tables.rna.RNATables
     :members:
 
+    .. automethod:: __init__
+
 .. autoclass:: resdk.tables.methylation.MethylationTables
    :members:
 
+    .. automethod:: __init__
+
+.. autoclass:: resdk.tables.variant.VariantTables
+    :members:
+
+    .. 
automethod:: __init__ + """ from .methylation import MethylationTables # noqa from .microarray import MATables # noqa diff --git a/src/resdk/tables/variant.py b/src/resdk/tables/variant.py index b29609d5..cb5af889 100644 --- a/src/resdk/tables/variant.py +++ b/src/resdk/tables/variant.py @@ -11,13 +11,12 @@ import re import warnings from functools import lru_cache -from itertools import groupby -from typing import Callable, List, Optional +from typing import Callable, List, Optional, Union import numpy as np import pandas as pd -from resdk.resources import Collection, Data +from resdk.resources import Collection, Data, Geneset from .base import BaseTables @@ -54,6 +53,7 @@ class VariantTables(BaseTables): DEPTH_C = "depth_c" DEPTH_G = "depth_g" DEPTH_T = "depth_t" + FILTER = "FILTER" data_type_to_field_name = { VARIANTS: "tsv", @@ -62,6 +62,7 @@ class VariantTables(BaseTables): DEPTH_C: "tsv", DEPTH_G: "tsv", DEPTH_T: "tsv", + FILTER: "tsv", } DATA_FIELDS = [ @@ -79,81 +80,182 @@ class VariantTables(BaseTables): def __init__( self, collection: Collection, - discard_fakes: Optional[bool] = True, - mutations: Optional[List[str]] = None, + geneset: Optional[List[str]] = None, + filtering: bool = True, cache_dir: Optional[str] = None, progress_callable: Optional[Callable] = None, ): """Initialize class. - :param collection: collection to use - :param mutations: only consider these mutations - :param discard_fakes: discard fake variants - :param cache_dir: cache directory location, if not specified system specific - cache directory is used - :param progress_callable: custom callable that can be used to report - progress. By default, progress is written to - stderr with tqdm + :param collection: Collection to use. + :param geneset: Only consider mutations from this gene-set. + Can be a list of gene symbols or a valid geneset Data + object id / slug. + :param filtering: Only show variants that pass QC filters. + :param cache_dir: Cache directory location, if not specified + system specific cache directory is used. + :param progress_callable: Custom callable that can be used to + report progress. By default, progress is written to stderr + with tqdm. """ super().__init__(collection, cache_dir, progress_callable) + self.filtering = filtering - self.discard_fakes = discard_fakes + self._geneset = None + if geneset is None: + self._check_heterogeneous_mutations() + # Assign geneset from the Genialis Platform + self.geneset = self._get_obj_geneset(self._data[0]) + else: + self.geneset = geneset + + @property + def geneset(self): + """Get geneset.""" + return self._geneset + + @geneset.setter + def geneset(self, value: Union[str, int, Geneset, List[str]]): + """Set geneset. - self.mutations = mutations - if mutations is not None: - if not isinstance(mutations, list): - raise ValueError("Mutations must be a list of strings") + Geneset can be set only once. On attempt to re-set, ValueError is raised. 
+        """
+        # Geneset can be set only once, prevent modifications
+        if self._geneset is not None:
+            raise ValueError("It is not allowed to change geneset value.")
+
+        if value is None:
+            return
+
+        # If id / slug of a geneset is given, get it from the Resolwe server
+        if isinstance(value, (int, str)):
+            gs = self.resolwe.geneset.get(value)
+            value = gs.genes
+        elif isinstance(value, Geneset):
+            value = value.genes
+
+        if isinstance(value, (list, set, tuple, pd.Series)):
+            self._geneset = set(value)
         else:
-            # Check for heterogeneous mutations only if mutations is not specified
-            self.check_heterogeneous_mutations()
+            raise ValueError(f'Unsupported type of "geneset" input: {value}.')
+
+    @property
+    @lru_cache()
+    def variants(self) -> pd.DataFrame:
+        """Get variants table.
+
+        There are 4 possible values:
+
+        - 0 - wild-type, no variant
+        - 1 - heterozygous mutation
+        - 2 - homozygous mutation
+        - NaN - QC filters are failing - mutation status is unreliable
+
+        """
+        df = self._load_fetch(self.VARIANTS)
+
+        # Variants that are not reported (NaN) were not detected:
+        # they are wild type.
+        df = df.fillna(0)
+
+        if self.filtering:
+            # Keep values where .filter == PASS or .variants == 0
+            df = df.where((self.filter == "PASS") | (df == 0), other=np.nan)
 
-    def check_heterogeneous_mutations(self):
-        """Ensure variants are selected for the same mutations in all samples."""
-        mutations = {str(d.input["mutations"]) for d in self._data}
+        return df
+
+    @property
+    @lru_cache()
+    def depth(self) -> pd.DataFrame:
+        """Get depth table."""
+        return self._load_fetch(self.DEPTH)
+
+    @property
+    @lru_cache()
+    def depth_a(self) -> pd.DataFrame:
+        """Get depth table for adenine."""
+        return self._load_fetch(self.DEPTH_A)
+
+    @property
+    @lru_cache()
+    def depth_c(self) -> pd.DataFrame:
+        """Get depth table for cytosine."""
+        return self._load_fetch(self.DEPTH_C)
+
+    @property
+    @lru_cache()
+    def depth_g(self) -> pd.DataFrame:
+        """Get depth table for guanine."""
+        return self._load_fetch(self.DEPTH_G)
+
+    @property
+    @lru_cache()
+    def depth_t(self) -> pd.DataFrame:
+        """Get depth table for thymine."""
+        return self._load_fetch(self.DEPTH_T)
+
+    # TODO: consider better name
+    @property
+    @lru_cache()
+    def filter(self) -> pd.DataFrame:
+        """Get filter table.
+
+        Values can be:
+
+        - PASS - variant has passed filters; otherwise one of these failure codes:
+            - DP: insufficient read depth (< 10.0)
+            - QD: insufficient quality normalized by depth (< 2.0)
+            - FS: insufficient phred-scaled p-value using Fisher's exact
+              test to detect strand bias (> 30.0)
+            - SnpCluster: variant is part of a SNP cluster
+
+        For example, if a variant has read depth 8, GATK will mark it as DP.
+
+        """
+        return self._load_fetch(self.FILTER)
+
+    def _check_heterogeneous_mutations(self):
+        """Check that there are no heterogeneous mutations / genesets.
+
+        Genes for which mutations are computed are given either with the
+        mutations (list of genes) or the geneset (geneset ID) input. Ensure
+        all the Data have the same value of this input.
+        """
+        # Currently, the frontend assigns an empty list if this value is not entered.
+        mutations = {str(d.input.get("mutations", [])) for d in self._data}
+        genesets = {str(d.input.get("geneset", "")) for d in self._data}
         if len(mutations) > 1:
-            raise ValueError(
-                "Variants should be computed with the same mutation input. 
" - f"Variants of samples in collection {self.collection.name} " - f"have been computed with {', '.join(list(mutations))}.\n" - "Use mutations filter in the VariantTables constructor.\n" - ) + name = "mutations" + multiple = mutations + elif len(genesets) > 1: + name = "genesets" + multiple = genesets + else: + return - def parse_mutations_string(self, mutations): - """Parse mutations string.""" - db = {} - for mutation in mutations: - if ":" not in mutation: - # Only gene is given - gene = mutation - aa_changes = set() - else: - gene, aa_changes = mutation.split(":") - aa_changes = set(map(str.strip, aa_changes.split(","))) - - db[gene] = aa_changes - - return db - - def is_mutation_subset(self, first, second): - """Check if mutations given in first are a a subset of second.""" - db_first = self.parse_mutations_string(first) - db_second = self.parse_mutations_string(second) - - for first_gene, first_aas in db_first.items(): - if first_gene not in db_second: - return False - if not db_second[first_gene]: - # If the second covers the whole gene, all is good - continue - second_aas = db_second[first_gene] - if not first_aas and second_aas: - # First covers the whole gene, second does not - return False - if not first_aas.issubset(second_aas): - return False + raise ValueError( + f"Variants should be computed with the same {name} input. " + f"Variants of samples in collection {self.collection.name} " + f"have been computed with {', '.join(list(multiple))}.\n" + "Use geneset filter in the VariantTables constructor.\n" + ) + + def _get_obj_geneset(self, obj): + """Get genes for which mutations are computed in an object.""" + obj_geneset = set(obj.input.get("mutations", [])) + if not obj_geneset: + # Geneset is given via geneset input: + gs = self.resolwe.geneset.get(obj.input["geneset"]) + obj_geneset = set(gs.genes) + + # Convert to gene symbols in case genes are given as feature ID's + if gs.output["source"] != "UCSC": + qs = self.resolwe.feature.filter(feature_id__in=list(obj_geneset)) + id_2_name = {obj.feature_id: obj.name for obj in qs} + obj_geneset = set([id_2_name[gene] for gene in obj_geneset]) - return True + return obj_geneset @property @lru_cache() @@ -161,8 +263,7 @@ def _data(self) -> List[Data]: """Fetch data objects. Fetch Data of type ``self.process_type`` from given collection - and cache the results in memory. Only the needed subset of - fields is fetched. + and cache the results in memory. :return: list of Data objects """ @@ -174,18 +275,21 @@ def _data(self) -> List[Data]: ordering="-created", fields=self.DATA_FIELDS, ): - # 1 Filter by mutations, if required - if self.mutations: - if not self.is_mutation_subset( - self.mutations, datum.input["mutations"] - ): - continue - - # 2 Filter by newest datum in the sample + # 1 Filter by newest datum in the sample if datum.sample.id in sample_ids: repeated_sample_ids.add(datum.sample.id) continue + # 2 Filter by genes, if geneset is given + if self.geneset: + obj_geneset = self._get_obj_geneset(datum) + if not self.geneset.issubset(obj_geneset): + warnings.warn( + f"Sample {datum.sample} (Data {datum.id}) does not " + "contain the genes requested in geneset input." 
+ ) + continue + sample_ids.add(datum.sample.id) data.append(datum) @@ -205,94 +309,29 @@ def _data(self) -> List[Data]: return data - @property - @lru_cache() - def variants(self) -> pd.DataFrame: - """Return variants table as a pandas DataFrame object.""" - return self._load_fetch(self.VARIANTS) - - @property - @lru_cache() - def depth(self) -> pd.DataFrame: - """Return depth table as a pandas DataFrame object.""" - return self._load_fetch(self.DEPTH) - - @property - @lru_cache() - def depth_a(self) -> pd.DataFrame: - """Return depth table for adenine as a pandas DataFrame object.""" - return self._load_fetch(self.DEPTH_A) - - @property - @lru_cache() - def depth_c(self) -> pd.DataFrame: - """Return depth table for cytosine as a pandas DataFrame object.""" - return self._load_fetch(self.DEPTH_C) - - @property - @lru_cache() - def depth_g(self) -> pd.DataFrame: - """Return depth table for guanine as a pandas DataFrame object.""" - return self._load_fetch(self.DEPTH_G) - - @property - @lru_cache() - def depth_t(self) -> pd.DataFrame: - """Return depth table for thymine as a pandas DataFrame object.""" - return self._load_fetch(self.DEPTH_T) - def _download_qc(self) -> pd.DataFrame: """Download sample QC data and transform into table.""" # No QC is given for variants data - return empty DataFrame return pd.DataFrame() - def strip_amino_acid(self, amino_acid_string, remove_alt=False): - """Remove "p." prefix of the amino acid string.""" - # Example Amino acid change: "p.Gly12Asp" - if pd.isna(amino_acid_string): - return "" - aa_match = re.match( - r"^p?\.?([A-Z][a-z]{2}\d+[A-Z][a-z]{2})$", amino_acid_string - ) - if aa_match: - if remove_alt: - return aa_match.group(1)[:-3] - return aa_match.group(1) - return "" - - def construct_index(self, row) -> str: + def _construct_index(self, row) -> str: """ Construct index of the variants table. Index should have the form: - ___ - E.g. chr2_1234567_C>T_Gly12Asp - - In cases where nucleotide change or amino-acid change is - missing, this info is left out from the index. + __ + E.g. chr2_1234567_C>T """ chrom = row["CHROM"] pos = row["POS"] - - index = f"{chrom}_{pos}" - - # REF & ALT ref = row["REF"] - alt = row.get("ALT", "?") - if alt in ["A", "C", "G", "T"]: - index += f"_{ref}>{alt}" + alt = row["ALT"] - # Amino acid change - aa_change = row.get("HGVS.p", "?") - aa_change = self.strip_amino_acid(aa_change) - if aa_change: - index = f"{index}_{aa_change}" - - return index + return f"{chrom}_{pos}_{ref}>{alt}" @staticmethod - def encode_mutation(row) -> int: + def _encode_mutation(row) -> int: """Encode mutation to numerical value. Mutations are given as /, e.g. T/T or C/T @@ -302,25 +341,10 @@ def encode_mutation(row) -> int: - 1 for heterozygous mutation - 2 for homozygous mutation """ - allele_line = row.get("SAMPLENAME1.GT", np.nan) - hgvs_line = row.get("HGVS.p", np.nan) - - # Handle the obvious cases first: - if row["REF"] == row["ALT"]: - return 0 - elif row["ALT"] is None or row["ALT"] == "" or pd.isna(row["ALT"]): - return 0 - elif allele_line is None or allele_line == "" or pd.isna(allele_line): - return 0 - elif ( - hgvs_line is None - or hgvs_line.startswith("no mutation at") - or pd.isna(hgvs_line) - ): - return 0 - try: - allele1, allele2 = re.match(r"([ACGT])/([ACGT])", allele_line).group(1, 2) + allele_line = row.get("SAMPLENAME1.GT", np.nan) + allele_re = r"^([ACGT]+)/([ACGT]+)$" + allele1, allele2 = re.match(allele_re, allele_line).group(1, 2) except AttributeError: # AttributeError is raised when there is no match, e.g. 
            # there is a string value for column "SAMPLENAME1.GT" but
@@ -337,35 +361,25 @@ def encode_mutation(row) -> int:
 
     def _parse_file(self, file_obj, sample_id, data_type) -> pd.Series:
         """Parse file - get encoded variants / depth of a single sample."""
-        sample_data = pd.read_csv(file_obj, sep="\t")
+        sample_data = pd.read_csv(file_obj, sep="\t", low_memory=False)
 
         # Filter mutations if specified
-        if self.mutations:
-            keep = set()
-            for gene, aa_change in self.parse_mutations_string(self.mutations).items():
-                for sample, gene2, aa_change2 in zip(
-                    sample_data.index, sample_data["Gene_Name"], sample_data["HGVS.p"]
-                ):
-                    if not aa_change:
-                        if gene == gene2:
-                            keep.add(sample)
-                    else:
-                        if gene == gene2 and aa_change == {
-                            self.strip_amino_acid(aa_change2, remove_alt=True)
-                        }:
-                            keep.add(sample)
-
-            sample_data = sample_data.loc[keep]
+        if self.geneset:
+            out = sample_data.loc[sample_data["Gene_Name"].isin(self.geneset)]
+            if out.empty:
+                out = sample_data.loc[sample_data["Feature_ID"].isin(self.geneset)]
+
+            sample_data = out
 
         # Construct index
         sample_data["index"] = sample_data.apply(
-            self.construct_index, axis=1, result_type="reduce"
+            self._construct_index, axis=1, result_type="reduce"
         )
         sample_data.set_index("index", inplace=True)
         sample_data.index.name = None
 
         if data_type == self.VARIANTS:
-            s = sample_data.apply(self.encode_mutation, axis=1, result_type="reduce")
+            s = sample_data.apply(self._encode_mutation, axis=1, result_type="reduce")
         elif data_type == self.DEPTH:
             # Depth, as computed by GATK is reported by "DP" column
             s = sample_data["Total_depth"]
         elif data_type == self.DEPTH_A:
             s = sample_data["Base_A"]
         elif data_type == self.DEPTH_C:
             s = sample_data["Base_C"]
         elif data_type == self.DEPTH_G:
             s = sample_data["Base_G"]
         elif data_type == self.DEPTH_T:
             s = sample_data["Base_T"]
+        elif data_type == self.FILTER:
+            s = sample_data["FILTER"]
 
         s.name = sample_id
-        return s
-
-    def postprocess_variants_table(self, df, data_type):
-        """
-        Propagate info from "fake" to the "real" variants.
-
-        "fake variants" are referred as positions for which we know that
-        do not contain variants. The absence of variants is still a valuable
-        information and should be propagated further when mergeing info
-        from multiple samples in matrix form.
-        """
-
-        def chrom_pos(column_name):
-            """Return only chromosome and position from index."""
-            chrom, pos = column_name.split("_")[:2]
-            return f"{chrom}_{pos}"
+        # Sometimes mutations_table.tsv contains duplicate variants.
+        # For now, keep the first one and drop the rest of them.
+ s = s[~s.index.duplicated(keep="first")] - to_discard = list() - - for _, group in groupby(df.columns, key=chrom_pos): - # Sorting moves fake variant to the first position - group = sorted(group) - if len(group) <= 1: - continue - - fake_variant = group[0] - true_variants = group[1:] - to_discard.append(fake_variant) - - # Propagate info from fake variants to the true ones - for dst in true_variants: - for sample, fake, true in zip(df.index, df[fake_variant], df[dst]): - if data_type == self.VARIANTS: - if fake == 0.0 and pd.isna(true): - # Copy 0.0 from all fake to the true ones - df.loc[sample, dst] = fake - else: - if not pd.isna(fake) and pd.isna(true): - # Copy depth from all false to the true ones - df.loc[sample, dst] = fake - - if self.discard_fakes: - df = df.drop(columns=to_discard) - - return df - - async def _download_data(self, data_type: str) -> pd.DataFrame: - df = await super()._download_data(data_type) - return self.postprocess_variants_table(df, data_type=data_type) + return s diff --git a/tests/unit/test_tables_variants.py b/tests/unit/test_tables_variants.py index 657dbfa5..ed5f02d2 100644 --- a/tests/unit/test_tables_variants.py +++ b/tests/unit/test_tables_variants.py @@ -22,8 +22,8 @@ def setUp(self): "REF", "ALT", "DP", - "HGVS.p", "SAMPLENAME1.GT", + "Gene_Name", "Base_A", "Base_C", "Base_G", @@ -43,8 +43,8 @@ def setUp(self): self.variants_data1 = pd.DataFrame( columns=mutation_header, data=[ - ["chr2", 120, "C", "T", 24, "p.Gly12Asp", "C/T", 0, 10, 0, 55, 65], - ["chr2", 123, "C", "T", 44, "p.Gly13Asp", "T/T", 0, 10, 0, 41, 51], + ["chr2", 120, "C", "T", 24, "C/T", "FHIT", 0, 10, 0, 55, 65], + ["chr2", 123, "C", "T", 44, "T/T", "FHIT", 0, 10, 0, 41, 51], ], ) self.variants_data1.to_csv(self.variants_file1, sep="\t") @@ -61,7 +61,7 @@ def setUp(self): self.variants_data2 = pd.DataFrame( columns=mutation_header, data=[ - ["chr2", 120, "C", "T", 24, "p.Gly12Asp", "C/T", 0, 10, 0, 640, 650], + ["chr2", 120, "C", "T", 24, "C/T", "FHIT", 0, 10, 0, 640, 650], ], ) self.variants_data2.to_csv(self.variants_file2, sep="\t") @@ -92,19 +92,12 @@ def slow(*args, **kwargs): def tearDown(self): shutil.rmtree(self.tmp_dir) - @patch.object(VariantTables, "check_heterogeneous_mutations") - def test_init(self, hetero_check): + def test_init(self): vt = VariantTables(self.collection) - self.assertEqual(vt.mutations, None) - self.assertEqual(vt.discard_fakes, True) - hetero_check.assert_called_once() + self.assertEqual(vt.geneset, {"FHIT"}) - hetero_check.reset_mock() - - vt = VariantTables(self.collection, mutations=["BRCA2"], discard_fakes=True) - self.assertEqual(vt.mutations, ["BRCA2"]) - self.assertEqual(vt.discard_fakes, True) - hetero_check.assert_not_called() + vt = VariantTables(self.collection, geneset=["BRCA2"]) + self.assertEqual(vt.geneset, {"BRCA2"}) @patch.object(VariantTables, "_data", new_callable=PropertyMock) def test_check_heterogeneous_mutations(self, data_mock): @@ -116,44 +109,7 @@ def test_check_heterogeneous_mutations(self, data_mock): with self.assertRaisesRegex(ValueError, r"Variants should be computed .*"): VariantTables(self.collection) - VariantTables(self.collection, mutations=["BRCA2"]) - - def test_parse_mutations_string(self): - vt = VariantTables(self.collection) - - self.assertEqual( - vt.parse_mutations_string(["BRCA2"]), - {"BRCA2": set()}, - ) - self.assertEqual( - vt.parse_mutations_string(["BRCA2: Gly3Thy, Gly7Thy"]), - {"BRCA2": {"Gly3Thy", "Gly7Thy"}}, - ) - self.assertEqual( - vt.parse_mutations_string(["BRCA2: Gly3Thy, Gly7Thy", 
"FHIT"]), - { - "BRCA2": {"Gly3Thy", "Gly7Thy"}, - "FHIT": set(), - }, - ) - - def test_is_mutation_subset(self): - vt = VariantTables(self.collection) - - self.assertTrue(vt.is_mutation_subset(["BRCA2"], ["BRCA2"])) - self.assertTrue(vt.is_mutation_subset(["BRCA2: Thy12Gly"], ["BRCA2"])) - self.assertTrue( - vt.is_mutation_subset(["BRCA2: Thy12Gly"], ["BRCA2: Thy12Gly, Thy15Gly"]) - ) - self.assertTrue( - vt.is_mutation_subset( - ["BRCA2: Thy12Gly"], ["BRCA2: Thy12Gly", "BRCA3: Thy15Gly"] - ) - ) - self.assertFalse(vt.is_mutation_subset(["BRCA2"], ["BRCA2: Thy12Gly"])) - self.assertFalse( - vt.is_mutation_subset(["BRCA2: Thy12Gly"], ["BRCA2: Thy15Gly"]) - ) + VariantTables(self.collection, geneset=["BRCA2"]) def test_data(self): self.collection.data.filter = self.web_request([self.data1]) @@ -183,7 +139,14 @@ async def read_mock(): @patch("resdk.tables.base.aiohttp.ClientSession.get") @patch.object(VariantTables, "_get_data_uri") - def test_variants(self, data_uri_mock, get_mock): + @patch.object(VariantTables, "filter", new_callable=PropertyMock) + def test_variants(self, filter_mock, data_uri_mock, get_mock): + filter_mock.return_value = pd.DataFrame( + [["PASS", "PASS"], ["PASS", "DP"]], + index=[self.sample1.id, self.sample2.id], + columns=["chr2_120_C>T", "chr2_123_C>T"], + ) + # Mock the response of _get_data_uri data_uri_mock.side_effect = [ f"{self.data1.sample.id}/{self.variants_file1}", @@ -194,9 +157,9 @@ def test_variants(self, data_uri_mock, get_mock): vt = VariantTables(self.collection) expected = pd.DataFrame( - [[1.0, 2.0], [1.0, np.nan]], + [[1.0, 2.0], [1.0, 0.0]], index=[self.sample1.id, self.sample2.id], - columns=["chr2_120_C>T_Gly12Asp", "chr2_123_C>T_Gly13Asp"], + columns=["chr2_120_C>T", "chr2_123_C>T"], ) expected.index.name = "sample_id" pd.testing.assert_frame_equal(vt.variants, expected) @@ -214,7 +177,7 @@ def test_depth(self, data_uri_mock, get_mock): expected = pd.DataFrame( [[65.0, 51.0], [650.0, np.nan]], index=[self.sample1.id, self.sample2.id], - columns=["chr2_120_C>T_Gly12Asp", "chr2_123_C>T_Gly13Asp"], + columns=["chr2_120_C>T", "chr2_123_C>T"], ) expected.index.name = "sample_id" pd.testing.assert_frame_equal(vt.depth, expected) @@ -232,70 +195,28 @@ def test_depth_t(self, data_uri_mock, get_mock): expected = pd.DataFrame( [[55.0, 41.0], [640.0, np.nan]], index=[self.sample1.id, self.sample2.id], - columns=["chr2_120_C>T_Gly12Asp", "chr2_123_C>T_Gly13Asp"], + columns=["chr2_120_C>T", "chr2_123_C>T"], ) expected.index.name = "sample_id" pd.testing.assert_frame_equal(vt.depth_t, expected) - def test_strip_amino_acid(self): - vt = VariantTables(self.collection) - - self.assertEqual(vt.strip_amino_acid("Gly12Asp"), "Gly12Asp") - self.assertEqual(vt.strip_amino_acid("p.Gly12Asp"), "Gly12Asp") - self.assertEqual(vt.strip_amino_acid("_p.Gly12Asp"), "") - self.assertEqual(vt.strip_amino_acid("p.Gly12Asp_"), "") - self.assertEqual( - vt.strip_amino_acid( - "foobar", - ), - "", - ) - self.assertEqual(vt.strip_amino_acid("p.Gly12Asp", remove_alt=True), "Gly12") - def test_construct_index(self): vt = VariantTables(self.collection) - row = pd.Series({"CHROM": "chr2", "POS": 7, "REF": "C"}) - self.assertEqual(vt.construct_index(row), "chr2_7") - row = pd.Series({"CHROM": "chr2", "POS": 7, "REF": "C", "ALT": "T"}) - self.assertEqual(vt.construct_index(row), "chr2_7_C>T") - - row = pd.Series({"CHROM": "chr2", "POS": 7, "REF": "C", "ALT": "X"}) - self.assertEqual(vt.construct_index(row), "chr2_7") - - row = pd.Series( - {"CHROM": "chr2", "POS": 7, "REF": "C", "ALT": 
"T", "HGVS.p": "p.Gly3Asp"} - ) - self.assertEqual(vt.construct_index(row), "chr2_7_C>T_Gly3Asp") - - row = pd.Series( - {"CHROM": "chr2", "POS": 7, "REF": "C", "ALT": "T", "HGVS.p": "foobar"} - ) - self.assertEqual(vt.construct_index(row), "chr2_7_C>T") + self.assertEqual(vt._construct_index(row), "chr2_7_C>T") def test_encode_mutation(self): vt = VariantTables(self.collection) - row = pd.Series( - {"REF": "C", "ALT": "T", "HGVS.p": "p.Gly3Asp", "SAMPLENAME1.GT": "T/T"} - ) - self.assertEqual(vt.encode_mutation(row), 2) + row = pd.Series({"REF": "C", "ALT": "T", "SAMPLENAME1.GT": "T/T"}) + self.assertEqual(vt._encode_mutation(row), 2) - row = pd.Series( - {"REF": "C", "ALT": "T", "HGVS.p": "p.Gly3Asp", "SAMPLENAME1.GT": "C/T"} - ) - self.assertEqual(vt.encode_mutation(row), 1) + row = pd.Series({"REF": "C", "ALT": "T", "SAMPLENAME1.GT": "C/T"}) + self.assertEqual(vt._encode_mutation(row), 1) - row = pd.Series( - {"REF": "C", "ALT": "T", "HGVS.p": "p.Gly3Asp", "SAMPLENAME1.GT": "C/C"} - ) - self.assertEqual(vt.encode_mutation(row), 0) - - row = pd.Series( - {"REF": "C", "ALT": np.nan, "HGVS.p": np.nan, "SAMPLENAME1.GT": np.nan} - ) - self.assertEqual(vt.encode_mutation(row), 0) + row = pd.Series({"REF": "C", "ALT": "T", "SAMPLENAME1.GT": "C/C"}) + self.assertEqual(vt._encode_mutation(row), 0) def test_parse_file(self): vt = VariantTables(self.collection) @@ -304,46 +225,5 @@ def test_parse_file(self): # mostly covered in other tests. pd.testing.assert_series_equal( vt._parse_file(self.variants_file1, 7, "variants"), - pd.Series( - [1, 2], index=["chr2_120_C>T_Gly12Asp", "chr2_123_C>T_Gly13Asp"], name=7 - ), - ) - - def test_postprocess_variants_table(self): - # Keep fakes - vt = VariantTables(self.collection, discard_fakes=False) - - df = pd.DataFrame( - [ - [2.0, np.nan, 1.0], - [np.nan, 0.0, 1.0], - ], - index=[101, 102], - columns=["chr2_10_C>T_Thy4Asp", "chr2_10", "chr2_13_C>T_Thy4Asp"], - ) - - df2 = vt.postprocess_variants_table(df, "variants") - np.testing.assert_array_equal( - df2.values.tolist(), - [ - [2.0, np.nan, 1.0], - [0.0, 0.0, 1.0], - ], - ) - pd.testing.assert_index_equal(df2.index, df.index) - pd.testing.assert_index_equal(df2.columns, df.columns) - - # Remove fakes - vt = VariantTables(self.collection, discard_fakes=True) - df2 = vt.postprocess_variants_table(df, "variants") - np.testing.assert_array_equal( - df2.values.tolist(), - [ - [2.0, 1.0], - [0.0, 1.0], - ], - ) - pd.testing.assert_index_equal(df2.index, df.index) - np.testing.assert_array_equal( - df2.columns, ["chr2_10_C>T_Thy4Asp", "chr2_13_C>T_Thy4Asp"] + pd.Series([1, 2], index=["chr2_120_C>T", "chr2_123_C>T"], name=7), )