From 863e68c0f008c928c8b82a78316e86b4c317f54e Mon Sep 17 00:00:00 2001
From: Jure Zmrzlikar
Date: Fri, 28 Oct 2022 09:55:58 +0200
Subject: [PATCH] Rework VariantTables

---
 docs/CHANGELOG.rst                 |   8 +
 docs/resdk-tables.rst              |  72 ++---
 src/resdk/tables/__init__.py       |  19 ++
 src/resdk/tables/variant.py        | 429 ++++++++++++++---------------
 tests/unit/test_tables_variants.py | 178 ++----------
 5 files changed, 280 insertions(+), 426 deletions(-)

diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst
index a5655f01..f7a2e9c0 100644
--- a/docs/CHANGELOG.rst
+++ b/docs/CHANGELOG.rst
@@ -26,6 +26,14 @@ Changed
 - ``Sample.get_cuffquant``
 - ``Sample.get_expression``
 
+- **BACKWARD INCOMPATIBLE:** Rework ``VariantTables``:
+
+  - Index in VariantTables.variants is simplified and does not include the
+    amino-acid change anymore.
+  - Argument ``mutations`` in ``VariantTables`` constructor is renamed to
+    ``geneset``. Besides holding a list of genes, this can also be a valid ID /
+    slug of a Geneset object on the Genialis Platform.
+
 Fixed
 -----
 - Fix ``Sample.get_reads()`` utility method
diff --git a/docs/resdk-tables.rst b/docs/resdk-tables.rst
index 6440227f..1019068f 100644
--- a/docs/resdk-tables.rst
+++ b/docs/resdk-tables.rst
@@ -153,49 +153,23 @@ access of variant data present in Data of type ``data:mutationstable``::
 
 The output of the above would look something like this:
 
-========= ===================== =====================
-sample_id chr1_123_C>T_Gly11Asp chr1_126_T>C_Asp12Gly
-========= ===================== =====================
-101       2                     0
-102       0                     1
-========= ===================== =====================
+========= ============ ============
+sample_id chr1_123_C>T chr1_126_T>C
+========= ============ ============
+101       2            NaN
+102       0            2
+========= ============ ============
 
 In rows, there are sample ID's. In columns there are variants where
 each variant is given as:
-``<chromosome>_<position>_<nucleotide change>_<amino acid change>``.
-Values in table can be 0 (no mutation), 1 (heterozygous mutation) or 2
-(homozygous mutation).
+``<chromosome>_<position>_<nucleotide change>``.
+Values in table can be:
 
-The above example gives an ideal situation where the mutation status for
-each position is known. However, this is not always the case.
-
-
-Missing values and ``discard_fakes`` argument
----------------------------------------------
-
-Very often, there is no info about a certain variant / sample, so values
-can also be ``NaN`` (unknown). Other common case is just the info that
-there is no mutation on a given position. This is a valid information
-also. Given the above, a more realistic example of output is:
-
-========= ===================== ===================== ========
-sample_id chr1_123_C>T_Gly11Asp chr1_126_T>C_Asp12Gly chr1_127
-========= ===================== ===================== ========
-101       2                     NaN                   0
-102       0                     1                     NaN
-========= ===================== ===================== ========
-
-One can se that for some combination of variants / samples there is no
-information: a value in table is ``NaN``. It is up to a user if this is
-interpreted as no variant or something else. In the first case, one can
-quickly convert ``NaN`` to 0 with ``vt.variants.fillna(0)``. One can
-also see that there is a column (chr1_127) that is not actually
-representing a variant. One may call this a "fake" variant. It is a way
-of signalling the absence of variant on a given position. Usually this
-is not useful, but is some cases it is. If you would like your output to
-contain such fake variants please specify ``discard_fakes=False`` in
-``VariantTables`` constructor.
+ - 0 (wild-type / no mutation)
+ - 1 (heterozygous mutation)
+ - 2 (homozygous mutation)
+ - NaN (QC filters are failing - mutation status is unreliable)
 
 
 Inspecting depth
 ----------------
@@ -217,18 +191,18 @@ worth inspecting the depth or depth per base::
 
 Filtering mutations
 -------------------
-Process ``mutations-table`` accepts an input ``mutations`` which
-specifies the gene (and optionally amino acid change) of interest. It
-restricts the scope of mutation to just a given gene or amino acid.
+Process ``mutations-table`` on the Genialis Platform accepts either a ``mutations``
+or a ``geneset`` input, which specifies the genes of interest. It restricts the
+scope of the mutation search to just the given genes.
 
-However, it can happen that not all the samples have the same
-``mutations`` input. In such cases, it makes little sense to merge the
-information about mutations from multiple samples. By default,
-``VariantTables`` checks that all Data is computed with same
-``mutations`` input. If this is not true, it will raise an error.
+However, it can happen that not all the samples have the same ``mutations`` or
+``geneset`` input. In such cases, it makes little sense to merge the information
+about mutations from multiple samples. By default, ``VariantTables`` checks that
+all Data is computed with the same ``mutations`` / ``geneset`` input. If this is
+not true, it will raise an error.
 
-But if you provide additional argument ``mutations`` it will limit the
-mutations to only those in the given gene. An example::
+But if you provide the additional argument ``geneset``, it will limit the
+mutations to only those in the given geneset. An example::
 
     # Sample 101 has mutations input "FHIT, BRCA2"
     # Sample 102 has mutations input "BRCA2"
 
     # This would fail
     vt = resdk.tables.VariantTables()
     vt.variants
 
     # This would limit the variants to just the ones in BRCA2 gene.
-    vt = resdk.tables.VariantTables( mutations=["BRCA2"])
+    vt = resdk.tables.VariantTables(geneset=["BRCA2"])
     vt.variants
diff --git a/src/resdk/tables/__init__.py b/src/resdk/tables/__init__.py
index c8ad4136..2c37d291 100644
--- a/src/resdk/tables/__init__.py
+++ b/src/resdk/tables/__init__.py
@@ -9,12 +9,31 @@
 Table classes
 =============
 
+.. autoclass:: resdk.tables.microarray.MATables
+    :members:
+
+    .. automethod:: __init__
+
+.. autoclass:: resdk.tables.ml_ready.MLTables
+    :members:
+
+    .. automethod:: __init__
+
 .. autoclass:: resdk.tables.rna.RNATables
     :members:
 
+    .. automethod:: __init__
+
 .. autoclass:: resdk.tables.methylation.MethylationTables
    :members:
 
+    .. automethod:: __init__
+
+.. autoclass:: resdk.tables.variant.VariantTables
+    :members:
+
+    .. 
automethod:: __init__ + """ from .methylation import MethylationTables # noqa from .microarray import MATables # noqa diff --git a/src/resdk/tables/variant.py b/src/resdk/tables/variant.py index b29609d5..cb5af889 100644 --- a/src/resdk/tables/variant.py +++ b/src/resdk/tables/variant.py @@ -11,13 +11,12 @@ import re import warnings from functools import lru_cache -from itertools import groupby -from typing import Callable, List, Optional +from typing import Callable, List, Optional, Union import numpy as np import pandas as pd -from resdk.resources import Collection, Data +from resdk.resources import Collection, Data, Geneset from .base import BaseTables @@ -54,6 +53,7 @@ class VariantTables(BaseTables): DEPTH_C = "depth_c" DEPTH_G = "depth_g" DEPTH_T = "depth_t" + FILTER = "FILTER" data_type_to_field_name = { VARIANTS: "tsv", @@ -62,6 +62,7 @@ class VariantTables(BaseTables): DEPTH_C: "tsv", DEPTH_G: "tsv", DEPTH_T: "tsv", + FILTER: "tsv", } DATA_FIELDS = [ @@ -79,81 +80,182 @@ class VariantTables(BaseTables): def __init__( self, collection: Collection, - discard_fakes: Optional[bool] = True, - mutations: Optional[List[str]] = None, + geneset: Optional[List[str]] = None, + filtering: bool = True, cache_dir: Optional[str] = None, progress_callable: Optional[Callable] = None, ): """Initialize class. - :param collection: collection to use - :param mutations: only consider these mutations - :param discard_fakes: discard fake variants - :param cache_dir: cache directory location, if not specified system specific - cache directory is used - :param progress_callable: custom callable that can be used to report - progress. By default, progress is written to - stderr with tqdm + :param collection: Collection to use. + :param geneset: Only consider mutations from this gene-set. + Can be a list of gene symbols or a valid geneset Data + object id / slug. + :param filtering: Only show variants that pass QC filters. + :param cache_dir: Cache directory location, if not specified + system specific cache directory is used. + :param progress_callable: Custom callable that can be used to + report progress. By default, progress is written to stderr + with tqdm. """ super().__init__(collection, cache_dir, progress_callable) + self.filtering = filtering - self.discard_fakes = discard_fakes + self._geneset = None + if geneset is None: + self._check_heterogeneous_mutations() + # Assign geneset from the Genialis Platform + self.geneset = self._get_obj_geneset(self._data[0]) + else: + self.geneset = geneset + + @property + def geneset(self): + """Get geneset.""" + return self._geneset + + @geneset.setter + def geneset(self, value: Union[str, int, Geneset, List[str]]): + """Set geneset. - self.mutations = mutations - if mutations is not None: - if not isinstance(mutations, list): - raise ValueError("Mutations must be a list of strings") + Geneset can be set only once. On attempt to re-set, ValueError is raised. 
+        """
+        # Geneset can be set only once, prevent modifications
+        if self._geneset is not None:
+            raise ValueError("It is not allowed to change geneset value.")
+
+        if value is None:
+            return
+
+        # If id / slug of a geneset is given, get it from the Resolwe server
+        if isinstance(value, (int, str)):
+            gs = self.resolwe.geneset.get(value)
+            value = gs.genes
+        elif isinstance(value, Geneset):
+            value = value.genes
+
+        if isinstance(value, (list, set, tuple, pd.Series)):
+            self._geneset = set(value)
         else:
-            # Check for heterogeneous mutations only if mutations is not specified
-            self.check_heterogeneous_mutations()
+            raise ValueError(f'Unsupported type of "geneset" input: {value}.')
+
+    @property
+    @lru_cache()
+    def variants(self) -> pd.DataFrame:
+        """Get variants table.
+
+        There are 4 possible values:
+
+        - 0 - wild-type, no variant
+        - 1 - heterozygous mutation
+        - 2 - homozygous mutation
+        - NaN - QC filters are failing - mutation status is unreliable
+
+        """
+        df = self._load_fetch(self.VARIANTS)
+
+        # Variants that are not reported (NaN) were not detected:
+        # they are wild type.
+        df = df.fillna(0)
+
+        if self.filtering:
+            # Keep values where .filter == PASS or .variants == 0
+            df = df.where((self.filter == "PASS") | (df == 0), other=np.nan)
 
-    def check_heterogeneous_mutations(self):
-        """Ensure variants are selected for the same mutations in all samples."""
-        mutations = {str(d.input["mutations"]) for d in self._data}
+        return df
+
+    @property
+    @lru_cache()
+    def depth(self) -> pd.DataFrame:
+        """Get depth table."""
+        return self._load_fetch(self.DEPTH)
+
+    @property
+    @lru_cache()
+    def depth_a(self) -> pd.DataFrame:
+        """Get depth table for adenine."""
+        return self._load_fetch(self.DEPTH_A)
+
+    @property
+    @lru_cache()
+    def depth_c(self) -> pd.DataFrame:
+        """Get depth table for cytosine."""
+        return self._load_fetch(self.DEPTH_C)
+
+    @property
+    @lru_cache()
+    def depth_g(self) -> pd.DataFrame:
+        """Get depth table for guanine."""
+        return self._load_fetch(self.DEPTH_G)
+
+    @property
+    @lru_cache()
+    def depth_t(self) -> pd.DataFrame:
+        """Get depth table for thymine."""
+        return self._load_fetch(self.DEPTH_T)
+
+    # TODO: consider better name
+    @property
+    @lru_cache()
+    def filter(self) -> pd.DataFrame:
+        """Get filter table.
+
+        Values can be:
+
+        - PASS - variant has passed filters; otherwise one of these failure codes:
+            - DP: insufficient read depth (< 10.0)
+            - QD: insufficient quality normalized by depth (< 2.0)
+            - FS: insufficient phred-scaled p-value using Fisher's exact
+              test to detect strand bias (> 30.0)
+            - SnpCluster: variant is part of a SNP cluster
+
+        For example, if a variant has read depth 8, GATK will mark it as DP.
+
+        """
+        return self._load_fetch(self.FILTER)
+
+    def _check_heterogeneous_mutations(self):
+        """Check that there are no heterogeneous mutations / genesets.
+
+        Genes for which mutations are computed are given either with the
+        mutations (list of genes) or the geneset (geneset ID) input. Ensure
+        all the Data have the same value of this input.
+        """
+        # Currently, the frontend assigns an empty list if this value is not entered.
+        mutations = {str(d.input.get("mutations", [])) for d in self._data}
+        genesets = {str(d.input.get("geneset", "")) for d in self._data}
         if len(mutations) > 1:
-            raise ValueError(
-                "Variants should be computed with the same mutation input. 
" - f"Variants of samples in collection {self.collection.name} " - f"have been computed with {', '.join(list(mutations))}.\n" - "Use mutations filter in the VariantTables constructor.\n" - ) + name = "mutations" + multiple = mutations + elif len(genesets) > 1: + name = "genesets" + multiple = genesets + else: + return - def parse_mutations_string(self, mutations): - """Parse mutations string.""" - db = {} - for mutation in mutations: - if ":" not in mutation: - # Only gene is given - gene = mutation - aa_changes = set() - else: - gene, aa_changes = mutation.split(":") - aa_changes = set(map(str.strip, aa_changes.split(","))) - - db[gene] = aa_changes - - return db - - def is_mutation_subset(self, first, second): - """Check if mutations given in first are a a subset of second.""" - db_first = self.parse_mutations_string(first) - db_second = self.parse_mutations_string(second) - - for first_gene, first_aas in db_first.items(): - if first_gene not in db_second: - return False - if not db_second[first_gene]: - # If the second covers the whole gene, all is good - continue - second_aas = db_second[first_gene] - if not first_aas and second_aas: - # First covers the whole gene, second does not - return False - if not first_aas.issubset(second_aas): - return False + raise ValueError( + f"Variants should be computed with the same {name} input. " + f"Variants of samples in collection {self.collection.name} " + f"have been computed with {', '.join(list(multiple))}.\n" + "Use geneset filter in the VariantTables constructor.\n" + ) + + def _get_obj_geneset(self, obj): + """Get genes for which mutations are computed in an object.""" + obj_geneset = set(obj.input.get("mutations", [])) + if not obj_geneset: + # Geneset is given via geneset input: + gs = self.resolwe.geneset.get(obj.input["geneset"]) + obj_geneset = set(gs.genes) + + # Convert to gene symbols in case genes are given as feature ID's + if gs.output["source"] != "UCSC": + qs = self.resolwe.feature.filter(feature_id__in=list(obj_geneset)) + id_2_name = {obj.feature_id: obj.name for obj in qs} + obj_geneset = set([id_2_name[gene] for gene in obj_geneset]) - return True + return obj_geneset @property @lru_cache() @@ -161,8 +263,7 @@ def _data(self) -> List[Data]: """Fetch data objects. Fetch Data of type ``self.process_type`` from given collection - and cache the results in memory. Only the needed subset of - fields is fetched. + and cache the results in memory. :return: list of Data objects """ @@ -174,18 +275,21 @@ def _data(self) -> List[Data]: ordering="-created", fields=self.DATA_FIELDS, ): - # 1 Filter by mutations, if required - if self.mutations: - if not self.is_mutation_subset( - self.mutations, datum.input["mutations"] - ): - continue - - # 2 Filter by newest datum in the sample + # 1 Filter by newest datum in the sample if datum.sample.id in sample_ids: repeated_sample_ids.add(datum.sample.id) continue + # 2 Filter by genes, if geneset is given + if self.geneset: + obj_geneset = self._get_obj_geneset(datum) + if not self.geneset.issubset(obj_geneset): + warnings.warn( + f"Sample {datum.sample} (Data {datum.id}) does not " + "contain the genes requested in geneset input." 
+ ) + continue + sample_ids.add(datum.sample.id) data.append(datum) @@ -205,94 +309,29 @@ def _data(self) -> List[Data]: return data - @property - @lru_cache() - def variants(self) -> pd.DataFrame: - """Return variants table as a pandas DataFrame object.""" - return self._load_fetch(self.VARIANTS) - - @property - @lru_cache() - def depth(self) -> pd.DataFrame: - """Return depth table as a pandas DataFrame object.""" - return self._load_fetch(self.DEPTH) - - @property - @lru_cache() - def depth_a(self) -> pd.DataFrame: - """Return depth table for adenine as a pandas DataFrame object.""" - return self._load_fetch(self.DEPTH_A) - - @property - @lru_cache() - def depth_c(self) -> pd.DataFrame: - """Return depth table for cytosine as a pandas DataFrame object.""" - return self._load_fetch(self.DEPTH_C) - - @property - @lru_cache() - def depth_g(self) -> pd.DataFrame: - """Return depth table for guanine as a pandas DataFrame object.""" - return self._load_fetch(self.DEPTH_G) - - @property - @lru_cache() - def depth_t(self) -> pd.DataFrame: - """Return depth table for thymine as a pandas DataFrame object.""" - return self._load_fetch(self.DEPTH_T) - def _download_qc(self) -> pd.DataFrame: """Download sample QC data and transform into table.""" # No QC is given for variants data - return empty DataFrame return pd.DataFrame() - def strip_amino_acid(self, amino_acid_string, remove_alt=False): - """Remove "p." prefix of the amino acid string.""" - # Example Amino acid change: "p.Gly12Asp" - if pd.isna(amino_acid_string): - return "" - aa_match = re.match( - r"^p?\.?([A-Z][a-z]{2}\d+[A-Z][a-z]{2})$", amino_acid_string - ) - if aa_match: - if remove_alt: - return aa_match.group(1)[:-3] - return aa_match.group(1) - return "" - - def construct_index(self, row) -> str: + def _construct_index(self, row) -> str: """ Construct index of the variants table. Index should have the form: - ___ - E.g. chr2_1234567_C>T_Gly12Asp - - In cases where nucleotide change or amino-acid change is - missing, this info is left out from the index. + __ + E.g. chr2_1234567_C>T """ chrom = row["CHROM"] pos = row["POS"] - - index = f"{chrom}_{pos}" - - # REF & ALT ref = row["REF"] - alt = row.get("ALT", "?") - if alt in ["A", "C", "G", "T"]: - index += f"_{ref}>{alt}" + alt = row["ALT"] - # Amino acid change - aa_change = row.get("HGVS.p", "?") - aa_change = self.strip_amino_acid(aa_change) - if aa_change: - index = f"{index}_{aa_change}" - - return index + return f"{chrom}_{pos}_{ref}>{alt}" @staticmethod - def encode_mutation(row) -> int: + def _encode_mutation(row) -> int: """Encode mutation to numerical value. Mutations are given as /, e.g. T/T or C/T @@ -302,25 +341,10 @@ def encode_mutation(row) -> int: - 1 for heterozygous mutation - 2 for homozygous mutation """ - allele_line = row.get("SAMPLENAME1.GT", np.nan) - hgvs_line = row.get("HGVS.p", np.nan) - - # Handle the obvious cases first: - if row["REF"] == row["ALT"]: - return 0 - elif row["ALT"] is None or row["ALT"] == "" or pd.isna(row["ALT"]): - return 0 - elif allele_line is None or allele_line == "" or pd.isna(allele_line): - return 0 - elif ( - hgvs_line is None - or hgvs_line.startswith("no mutation at") - or pd.isna(hgvs_line) - ): - return 0 - try: - allele1, allele2 = re.match(r"([ACGT])/([ACGT])", allele_line).group(1, 2) + allele_line = row.get("SAMPLENAME1.GT", np.nan) + allele_re = r"^([ACGT]+)/([ACGT]+)$" + allele1, allele2 = re.match(allele_re, allele_line).group(1, 2) except AttributeError: # AttributeError is raised when there is no match, e.g. 
            # there is a string value for column "SAMPLENAME1.GT" but
@@ -337,35 +361,25 @@ def encode_mutation(row) -> int:
 
     def _parse_file(self, file_obj, sample_id, data_type) -> pd.Series:
         """Parse file - get encoded variants / depth of a single sample."""
-        sample_data = pd.read_csv(file_obj, sep="\t")
+        sample_data = pd.read_csv(file_obj, sep="\t", low_memory=False)
 
         # Filter mutations if specified
-        if self.mutations:
-            keep = set()
-            for gene, aa_change in self.parse_mutations_string(self.mutations).items():
-                for sample, gene2, aa_change2 in zip(
-                    sample_data.index, sample_data["Gene_Name"], sample_data["HGVS.p"]
-                ):
-                    if not aa_change:
-                        if gene == gene2:
-                            keep.add(sample)
-                    else:
-                        if gene == gene2 and aa_change == {
-                            self.strip_amino_acid(aa_change2, remove_alt=True)
-                        }:
-                            keep.add(sample)
-
-            sample_data = sample_data.loc[keep]
+        if self.geneset:
+            out = sample_data.loc[sample_data["Gene_Name"].isin(self.geneset)]
+            if out.empty:
+                out = sample_data.loc[sample_data["Feature_ID"].isin(self.geneset)]
+
+            sample_data = out
 
         # Construct index
         sample_data["index"] = sample_data.apply(
-            self.construct_index, axis=1, result_type="reduce"
+            self._construct_index, axis=1, result_type="reduce"
         )
         sample_data.set_index("index", inplace=True)
         sample_data.index.name = None
 
         if data_type == self.VARIANTS:
-            s = sample_data.apply(self.encode_mutation, axis=1, result_type="reduce")
+            s = sample_data.apply(self._encode_mutation, axis=1, result_type="reduce")
         elif data_type == self.DEPTH:
             # Depth, as computed by GATK is reported by "DP" column
             s = sample_data["Total_depth"]
         elif data_type == self.DEPTH_A:
             s = sample_data["Base_A"]
         elif data_type == self.DEPTH_C:
             s = sample_data["Base_C"]
         elif data_type == self.DEPTH_G:
             s = sample_data["Base_G"]
         elif data_type == self.DEPTH_T:
             s = sample_data["Base_T"]
+        elif data_type == self.FILTER:
+            s = sample_data["FILTER"]
 
         s.name = sample_id
-        return s
-
-    def postprocess_variants_table(self, df, data_type):
-        """
-        Propagate info from "fake" to the "real" variants.
-
-        "fake variants" are referred as positions for which we know that
-        do not contain variants. The absence of variants is still a valuable
-        information and should be propagated further when mergeing info
-        from multiple samples in matrix form.
-        """
-
-        def chrom_pos(column_name):
-            """Return only chromosome and position from index."""
-            chrom, pos = column_name.split("_")[:2]
-            return f"{chrom}_{pos}"
+        # Sometimes mutations_table.tsv contains duplicate variants.
+        # For now, keep the first one and drop the rest of them.
+ s = s[~s.index.duplicated(keep="first")] - to_discard = list() - - for _, group in groupby(df.columns, key=chrom_pos): - # Sorting moves fake variant to the first position - group = sorted(group) - if len(group) <= 1: - continue - - fake_variant = group[0] - true_variants = group[1:] - to_discard.append(fake_variant) - - # Propagate info from fake variants to the true ones - for dst in true_variants: - for sample, fake, true in zip(df.index, df[fake_variant], df[dst]): - if data_type == self.VARIANTS: - if fake == 0.0 and pd.isna(true): - # Copy 0.0 from all fake to the true ones - df.loc[sample, dst] = fake - else: - if not pd.isna(fake) and pd.isna(true): - # Copy depth from all false to the true ones - df.loc[sample, dst] = fake - - if self.discard_fakes: - df = df.drop(columns=to_discard) - - return df - - async def _download_data(self, data_type: str) -> pd.DataFrame: - df = await super()._download_data(data_type) - return self.postprocess_variants_table(df, data_type=data_type) + return s diff --git a/tests/unit/test_tables_variants.py b/tests/unit/test_tables_variants.py index 657dbfa5..ed5f02d2 100644 --- a/tests/unit/test_tables_variants.py +++ b/tests/unit/test_tables_variants.py @@ -22,8 +22,8 @@ def setUp(self): "REF", "ALT", "DP", - "HGVS.p", "SAMPLENAME1.GT", + "Gene_Name", "Base_A", "Base_C", "Base_G", @@ -43,8 +43,8 @@ def setUp(self): self.variants_data1 = pd.DataFrame( columns=mutation_header, data=[ - ["chr2", 120, "C", "T", 24, "p.Gly12Asp", "C/T", 0, 10, 0, 55, 65], - ["chr2", 123, "C", "T", 44, "p.Gly13Asp", "T/T", 0, 10, 0, 41, 51], + ["chr2", 120, "C", "T", 24, "C/T", "FHIT", 0, 10, 0, 55, 65], + ["chr2", 123, "C", "T", 44, "T/T", "FHIT", 0, 10, 0, 41, 51], ], ) self.variants_data1.to_csv(self.variants_file1, sep="\t") @@ -61,7 +61,7 @@ def setUp(self): self.variants_data2 = pd.DataFrame( columns=mutation_header, data=[ - ["chr2", 120, "C", "T", 24, "p.Gly12Asp", "C/T", 0, 10, 0, 640, 650], + ["chr2", 120, "C", "T", 24, "C/T", "FHIT", 0, 10, 0, 640, 650], ], ) self.variants_data2.to_csv(self.variants_file2, sep="\t") @@ -92,19 +92,12 @@ def slow(*args, **kwargs): def tearDown(self): shutil.rmtree(self.tmp_dir) - @patch.object(VariantTables, "check_heterogeneous_mutations") - def test_init(self, hetero_check): + def test_init(self): vt = VariantTables(self.collection) - self.assertEqual(vt.mutations, None) - self.assertEqual(vt.discard_fakes, True) - hetero_check.assert_called_once() + self.assertEqual(vt.geneset, {"FHIT"}) - hetero_check.reset_mock() - - vt = VariantTables(self.collection, mutations=["BRCA2"], discard_fakes=True) - self.assertEqual(vt.mutations, ["BRCA2"]) - self.assertEqual(vt.discard_fakes, True) - hetero_check.assert_not_called() + vt = VariantTables(self.collection, geneset=["BRCA2"]) + self.assertEqual(vt.geneset, {"BRCA2"}) @patch.object(VariantTables, "_data", new_callable=PropertyMock) def test_check_heterogeneous_mutations(self, data_mock): @@ -116,44 +109,7 @@ def test_check_heterogeneous_mutations(self, data_mock): with self.assertRaisesRegex(ValueError, r"Variants should be computed .*"): VariantTables(self.collection) - VariantTables(self.collection, mutations=["BRCA2"]) - - def test_parse_mutations_string(self): - vt = VariantTables(self.collection) - - self.assertEqual( - vt.parse_mutations_string(["BRCA2"]), - {"BRCA2": set()}, - ) - self.assertEqual( - vt.parse_mutations_string(["BRCA2: Gly3Thy, Gly7Thy"]), - {"BRCA2": {"Gly3Thy", "Gly7Thy"}}, - ) - self.assertEqual( - vt.parse_mutations_string(["BRCA2: Gly3Thy, Gly7Thy", 
"FHIT"]), - { - "BRCA2": {"Gly3Thy", "Gly7Thy"}, - "FHIT": set(), - }, - ) - - def test_is_mutation_subset(self): - vt = VariantTables(self.collection) - - self.assertTrue(vt.is_mutation_subset(["BRCA2"], ["BRCA2"])) - self.assertTrue(vt.is_mutation_subset(["BRCA2: Thy12Gly"], ["BRCA2"])) - self.assertTrue( - vt.is_mutation_subset(["BRCA2: Thy12Gly"], ["BRCA2: Thy12Gly, Thy15Gly"]) - ) - self.assertTrue( - vt.is_mutation_subset( - ["BRCA2: Thy12Gly"], ["BRCA2: Thy12Gly", "BRCA3: Thy15Gly"] - ) - ) - self.assertFalse(vt.is_mutation_subset(["BRCA2"], ["BRCA2: Thy12Gly"])) - self.assertFalse( - vt.is_mutation_subset(["BRCA2: Thy12Gly"], ["BRCA2: Thy15Gly"]) - ) + VariantTables(self.collection, geneset=["BRCA2"]) def test_data(self): self.collection.data.filter = self.web_request([self.data1]) @@ -183,7 +139,14 @@ async def read_mock(): @patch("resdk.tables.base.aiohttp.ClientSession.get") @patch.object(VariantTables, "_get_data_uri") - def test_variants(self, data_uri_mock, get_mock): + @patch.object(VariantTables, "filter", new_callable=PropertyMock) + def test_variants(self, filter_mock, data_uri_mock, get_mock): + filter_mock.return_value = pd.DataFrame( + [["PASS", "PASS"], ["PASS", "DP"]], + index=[self.sample1.id, self.sample2.id], + columns=["chr2_120_C>T", "chr2_123_C>T"], + ) + # Mock the response of _get_data_uri data_uri_mock.side_effect = [ f"{self.data1.sample.id}/{self.variants_file1}", @@ -194,9 +157,9 @@ def test_variants(self, data_uri_mock, get_mock): vt = VariantTables(self.collection) expected = pd.DataFrame( - [[1.0, 2.0], [1.0, np.nan]], + [[1.0, 2.0], [1.0, 0.0]], index=[self.sample1.id, self.sample2.id], - columns=["chr2_120_C>T_Gly12Asp", "chr2_123_C>T_Gly13Asp"], + columns=["chr2_120_C>T", "chr2_123_C>T"], ) expected.index.name = "sample_id" pd.testing.assert_frame_equal(vt.variants, expected) @@ -214,7 +177,7 @@ def test_depth(self, data_uri_mock, get_mock): expected = pd.DataFrame( [[65.0, 51.0], [650.0, np.nan]], index=[self.sample1.id, self.sample2.id], - columns=["chr2_120_C>T_Gly12Asp", "chr2_123_C>T_Gly13Asp"], + columns=["chr2_120_C>T", "chr2_123_C>T"], ) expected.index.name = "sample_id" pd.testing.assert_frame_equal(vt.depth, expected) @@ -232,70 +195,28 @@ def test_depth_t(self, data_uri_mock, get_mock): expected = pd.DataFrame( [[55.0, 41.0], [640.0, np.nan]], index=[self.sample1.id, self.sample2.id], - columns=["chr2_120_C>T_Gly12Asp", "chr2_123_C>T_Gly13Asp"], + columns=["chr2_120_C>T", "chr2_123_C>T"], ) expected.index.name = "sample_id" pd.testing.assert_frame_equal(vt.depth_t, expected) - def test_strip_amino_acid(self): - vt = VariantTables(self.collection) - - self.assertEqual(vt.strip_amino_acid("Gly12Asp"), "Gly12Asp") - self.assertEqual(vt.strip_amino_acid("p.Gly12Asp"), "Gly12Asp") - self.assertEqual(vt.strip_amino_acid("_p.Gly12Asp"), "") - self.assertEqual(vt.strip_amino_acid("p.Gly12Asp_"), "") - self.assertEqual( - vt.strip_amino_acid( - "foobar", - ), - "", - ) - self.assertEqual(vt.strip_amino_acid("p.Gly12Asp", remove_alt=True), "Gly12") - def test_construct_index(self): vt = VariantTables(self.collection) - row = pd.Series({"CHROM": "chr2", "POS": 7, "REF": "C"}) - self.assertEqual(vt.construct_index(row), "chr2_7") - row = pd.Series({"CHROM": "chr2", "POS": 7, "REF": "C", "ALT": "T"}) - self.assertEqual(vt.construct_index(row), "chr2_7_C>T") - - row = pd.Series({"CHROM": "chr2", "POS": 7, "REF": "C", "ALT": "X"}) - self.assertEqual(vt.construct_index(row), "chr2_7") - - row = pd.Series( - {"CHROM": "chr2", "POS": 7, "REF": "C", "ALT": 
"T", "HGVS.p": "p.Gly3Asp"} - ) - self.assertEqual(vt.construct_index(row), "chr2_7_C>T_Gly3Asp") - - row = pd.Series( - {"CHROM": "chr2", "POS": 7, "REF": "C", "ALT": "T", "HGVS.p": "foobar"} - ) - self.assertEqual(vt.construct_index(row), "chr2_7_C>T") + self.assertEqual(vt._construct_index(row), "chr2_7_C>T") def test_encode_mutation(self): vt = VariantTables(self.collection) - row = pd.Series( - {"REF": "C", "ALT": "T", "HGVS.p": "p.Gly3Asp", "SAMPLENAME1.GT": "T/T"} - ) - self.assertEqual(vt.encode_mutation(row), 2) + row = pd.Series({"REF": "C", "ALT": "T", "SAMPLENAME1.GT": "T/T"}) + self.assertEqual(vt._encode_mutation(row), 2) - row = pd.Series( - {"REF": "C", "ALT": "T", "HGVS.p": "p.Gly3Asp", "SAMPLENAME1.GT": "C/T"} - ) - self.assertEqual(vt.encode_mutation(row), 1) + row = pd.Series({"REF": "C", "ALT": "T", "SAMPLENAME1.GT": "C/T"}) + self.assertEqual(vt._encode_mutation(row), 1) - row = pd.Series( - {"REF": "C", "ALT": "T", "HGVS.p": "p.Gly3Asp", "SAMPLENAME1.GT": "C/C"} - ) - self.assertEqual(vt.encode_mutation(row), 0) - - row = pd.Series( - {"REF": "C", "ALT": np.nan, "HGVS.p": np.nan, "SAMPLENAME1.GT": np.nan} - ) - self.assertEqual(vt.encode_mutation(row), 0) + row = pd.Series({"REF": "C", "ALT": "T", "SAMPLENAME1.GT": "C/C"}) + self.assertEqual(vt._encode_mutation(row), 0) def test_parse_file(self): vt = VariantTables(self.collection) @@ -304,46 +225,5 @@ def test_parse_file(self): # mostly covered in other tests. pd.testing.assert_series_equal( vt._parse_file(self.variants_file1, 7, "variants"), - pd.Series( - [1, 2], index=["chr2_120_C>T_Gly12Asp", "chr2_123_C>T_Gly13Asp"], name=7 - ), - ) - - def test_postprocess_variants_table(self): - # Keep fakes - vt = VariantTables(self.collection, discard_fakes=False) - - df = pd.DataFrame( - [ - [2.0, np.nan, 1.0], - [np.nan, 0.0, 1.0], - ], - index=[101, 102], - columns=["chr2_10_C>T_Thy4Asp", "chr2_10", "chr2_13_C>T_Thy4Asp"], - ) - - df2 = vt.postprocess_variants_table(df, "variants") - np.testing.assert_array_equal( - df2.values.tolist(), - [ - [2.0, np.nan, 1.0], - [0.0, 0.0, 1.0], - ], - ) - pd.testing.assert_index_equal(df2.index, df.index) - pd.testing.assert_index_equal(df2.columns, df.columns) - - # Remove fakes - vt = VariantTables(self.collection, discard_fakes=True) - df2 = vt.postprocess_variants_table(df, "variants") - np.testing.assert_array_equal( - df2.values.tolist(), - [ - [2.0, 1.0], - [0.0, 1.0], - ], - ) - pd.testing.assert_index_equal(df2.index, df.index) - np.testing.assert_array_equal( - df2.columns, ["chr2_10_C>T_Thy4Asp", "chr2_13_C>T_Thy4Asp"] + pd.Series([1, 2], index=["chr2_120_C>T", "chr2_123_C>T"], name=7), )