diff --git a/helpers/report_hunter.py b/helpers/report_hunter.py index 0158b3e5..4800f527 100644 --- a/helpers/report_hunter.py +++ b/helpers/report_hunter.py @@ -20,7 +20,7 @@ from cpg_utils.config import get_config from metamist.graphql import gql, query -from reanalysis.utils import get_logger +from reanalysis.static_values import get_logger JINJA_TEMPLATE_DIR = Path(__file__).absolute().parent / 'templates' PROJECT_QUERY = gql( diff --git a/reanalysis/hail_filter_sv.py b/reanalysis/hail_filter_sv.py index c68a88c8..0a041878 100644 --- a/reanalysis/hail_filter_sv.py +++ b/reanalysis/hail_filter_sv.py @@ -22,7 +22,8 @@ ONE_INT, MISSING_INT, ) -from reanalysis.utils import get_logger, read_json_from_path +from reanalysis.utils import read_json_from_path +from reanalysis.static_values import get_logger def filter_matrix_by_af( diff --git a/reanalysis/interpretation_runner.py b/reanalysis/interpretation_runner.py index f708195e..2dae2d1f 100644 --- a/reanalysis/interpretation_runner.py +++ b/reanalysis/interpretation_runner.py @@ -41,11 +41,10 @@ seqr_loader, ) from reanalysis.utils import ( - FileTypes, identify_file_type, - get_granular_date, - get_logger, ) +from reanalysis.models import FileTypes +from reanalysis.static_values import get_granular_date, get_logger # region: CONSTANTS # exact time that this run occurred diff --git a/reanalysis/models.py b/reanalysis/models.py index 11779a7a..25696e2c 100644 --- a/reanalysis/models.py +++ b/reanalysis/models.py @@ -4,13 +4,33 @@ from enum import Enum from pydantic import BaseModel, Field -from reanalysis.utils import get_granular_date - +from reanalysis.static_values import get_granular_date NON_HOM_CHROM = ['X', 'Y', 'MT', 'M'] CHROM_ORDER = list(map(str, range(1, 23))) + NON_HOM_CHROM +class VariantType(Enum): + """ + enumeration of permitted variant types + """ + + SMALL = 'SMALL' + SV = 'SV' + + +class FileTypes(Enum): + """ + enumeration of permitted input file types + """ + + HAIL_TABLE = '.ht' + MATRIX_TABLE = '.mt' + VCF = '.vcf' + VCF_GZ = '.vcf.gz' + VCF_BGZ = '.vcf.bgz' + + class Coordinates(BaseModel): """ A representation of genomic coordinates @@ -61,24 +81,13 @@ def __eq__(self, other) -> bool: ) -class VariantType(Enum): - """ - enumeration of permitted variant types - """ - - SMALL = 'SMALL' - SV = 'SV' - - class Variant(BaseModel): """ the abstracted representation of a variant from any source - todo move some more of the parsing logic into here as an init? """ coordinates: Coordinates = Field(repr=True) - info: dict[str, str | int | float] = Field(default_factory=dict) - categories: list[str] = Field(default_factory=list) + info: dict[str, str | int | float | list[str] | bool] = Field(default_factory=dict) het_samples: set[str] = Field(default_factory=set, exclude=True) hom_samples: set[str] = Field(default_factory=set, exclude=True) boolean_categories: list[str] = Field(default_factory=list, exclude=True) @@ -153,7 +162,9 @@ def sample_support_only(self, sample_id: str) -> bool: Returns: True if support only """ - return self.has_support and not self.sample_categorised_check(sample_id) + return self.has_support and not ( + self.category_non_support or self.sample_categorised_check(sample_id) + ) def category_values(self, sample: str) -> list[str]: """ @@ -227,8 +238,9 @@ def sample_category_check(self, sample_id: str, allow_support: bool = True) -> b class SmallVariant(Variant): depths: dict[str, int] = Field(default_factory=dict, exclude=True) ab_ratios: dict[str, float] = Field(default_factory=dict, exclude=True) - transcript_consequences: list[dict[str, str]] = Field(default_factory=list) - var_type: str = VariantType.SMALL.value + transcript_consequences: list[dict[str, str | float | int]] = Field( + default_factory=list + ) def get_sample_flags(self, sample: str) -> list[str]: """ @@ -274,9 +286,6 @@ def check_ab_ratio(self, sample: str) -> list[str]: class StructuralVariant(Variant): - - var_type: str = VariantType.SV.value - def check_ab_ratio(self, *args, **kwargs) -> list[str]: """ dummy method for AB ratio checking - not implemented for SVs @@ -314,14 +323,7 @@ class ReportVariant(BaseModel): phenotypes: list[str] = Field(default_factory=list) labels: list[str] = Field(default_factory=list) first_seen: str = Field(default=get_granular_date()) - independent: bool = False - - @property - def is_independent(self): - """ - check if this variant acts independently - """ - return len(self.support_vars) == 0 + independent: bool = Field(default=False) def __eq__(self, other): """ diff --git a/reanalysis/moi_tests.py b/reanalysis/moi_tests.py index e287dacb..ef30ba64 100644 --- a/reanalysis/moi_tests.py +++ b/reanalysis/moi_tests.py @@ -12,14 +12,8 @@ from cpg_utils.config import get_config -from reanalysis.utils import ( - AbstractVariant, - CompHetDict, - MinimalVariant, - ReportedVariant, - VariantType, - X_CHROMOSOME, -) +from reanalysis.models import SmallVariant, StructuralVariant, ReportVariant +from reanalysis.utils import CompHetDict, X_CHROMOSOME # config keys to use for dominant MOI tests CALLSET_AF_SV_DOMINANT = 'callset_af_sv_dominant' @@ -38,7 +32,7 @@ def check_for_second_hit( first_variant: str, comp_hets: CompHetDict, sample: str -) -> list[AbstractVariant]: +) -> list[SmallVariant | StructuralVariant]: """ checks for a second hit partner in this gene @@ -46,10 +40,10 @@ def check_for_second_hit( { "SampleID": { "12-52287177-T-C": [ - AbstractVariant(12-52287180-TGG-T) + Variant(12-52287180-TGG-T) ], "12-52287180-TGG-T": [ - AbstractVariant(12-52287177-T-C) + Variant(12-52287177-T-C) ] } ... } @@ -124,7 +118,7 @@ def run( principal_var, comp_het: CompHetDict | None = None, partial_pen: bool = False, - ) -> list[ReportedVariant]: + ) -> list[ReportVariant]: """ run method - triggers each relevant inheritance model @@ -165,10 +159,10 @@ def __init__(self, pedigree: Ped, applied_moi: str): @abstractmethod def run( self, - principal: AbstractVariant, + principal: SmallVariant | StructuralVariant, comp_het: CompHetDict | None = None, partial_pen: bool = False, - ) -> list[ReportedVariant]: + ) -> list[ReportVariant]: """ run all applicable inheritance patterns and finds good fits """ @@ -215,12 +209,12 @@ def check_familial_inheritance( return True def get_family_genotypes( - self, variant: AbstractVariant, sample_id: str + self, variant: SmallVariant | StructuralVariant, sample_id: str ) -> dict[str, str]: """ Args: - variant (AbstractVariant): + variant (SmallVariant | StructuralVariant): sample_id (str): the sample ID to gather genotypes for Returns: @@ -238,7 +232,7 @@ def get_sample_genotype(member_id: str, sex: str) -> str: str: text representation of this genotype """ - if variant.coords.chrom in X_CHROMOSOME: + if variant.coordinates.chrom in X_CHROMOSOME: if sex == 'male' and ( member_id in variant.het_samples or member_id in variant.hom_samples ): @@ -281,7 +275,10 @@ def check_frequency_passes(info: dict, thresholds: dict[str, int | float]) -> bo return all({info.get(key, 0) <= test for key, test in thresholds.items()}) def check_comp_het( - self, sample_id: str, variant_1: AbstractVariant, variant_2: AbstractVariant + self, + sample_id: str, + variant_1: SmallVariant | StructuralVariant, + variant_2: SmallVariant | StructuralVariant, ) -> bool: """ use parents to accept or dismiss the comp-het @@ -293,8 +290,8 @@ def check_comp_het( Args: sample_id (str): sample ID to check for - variant_1 (AbstractVariant): first variant of comp-het pair - variant_2 (AbstractVariant): second variant of comp-het pair + variant_1 (SmallVariant | StructuralVariant): first variant of comp-het pair + variant_2 (SmallVariant | StructuralVariant): second variant of comp-het pair Returns: bool: True if these two variants form a comp-het @@ -339,12 +336,12 @@ def __init__( # prepare the AF test dicts self.freq_tests = { - VariantType.SMALL: {key: self.hom_threshold for key in INFO_HOMS} + SmallVariant.__name__: {key: self.hom_threshold for key in INFO_HOMS} | { 'gnomad_ac': self.ac_threshold, 'gnomad_af': self.ad_threshold, }, - VariantType.SV: { + StructuralVariant.__name__: { 'af': self.sv_af_threshold, SV_AF_KEY: self.sv_af_threshold, }, @@ -353,10 +350,10 @@ def __init__( def run( self, - principal: AbstractVariant, + principal: SmallVariant | StructuralVariant, comp_het: CompHetDict | None = None, partial_pen: bool = False, - ) -> list[ReportedVariant]: + ) -> list[ReportVariant]: """ Simplest MOI, exclusions based on HOM count and AF Args: @@ -370,7 +367,7 @@ def run( # reject support for dominant MOI, apply checks based on var type if principal.support_only or not ( self.check_frequency_passes( - principal.info, self.freq_tests[principal.info['var_type']] + principal.info, self.freq_tests[principal.__class__.__name__] ) ): return classifications @@ -400,16 +397,18 @@ def run( continue classifications.append( - ReportedVariant( + ReportVariant( sample=sample_id, family=self.pedigree[sample_id].family_id, gene=principal.info.get('gene_id'), - var_data=MinimalVariant(variant=principal, sample=sample_id), + var_data=principal, + categories=principal.category_values(sample_id), reasons={self.applied_moi}, genotypes=self.get_family_genotypes( variant=principal, sample_id=sample_id ), flags=principal.get_sample_flags(sample_id), + independent=True, ) ) @@ -432,21 +431,21 @@ def __init__( def run( self, - principal: AbstractVariant, + principal: SmallVariant | StructuralVariant, comp_het: CompHetDict | None = None, partial_pen: bool = False, - ) -> list[ReportedVariant]: + ) -> list[ReportVariant]: """ valid if present as compound het counts as being phased if a compound het is split between parents Args: - principal (AbstractVariant): main variant being evaluated + principal (SmallVariant | StructuralVariant): main variant being evaluated comp_het (dict): comp-het partners partial_pen (bool): Returns: - list[ReportedVariant]: data object if RecessiveAutosomal fits + list[ReportVariant]: data object if RecessiveAutosomal fits """ if comp_het is None: @@ -471,7 +470,7 @@ def run( continue for partner_variant in check_for_second_hit( - first_variant=principal.coords.string_format, + first_variant=principal.coordinates.string_format, comp_hets=comp_het, sample=sample_id, ): @@ -500,18 +499,20 @@ def run( continue classifications.append( - ReportedVariant( + ReportVariant( sample=sample_id, family=self.pedigree[sample_id].family_id, gene=principal.info.get('gene_id'), - var_data=MinimalVariant(principal, sample_id), + var_data=principal, + categories=principal.category_values(sample_id), reasons={self.applied_moi}, genotypes=self.get_family_genotypes( variant=principal, sample_id=sample_id ), - support_vars={partner_variant.coords.string_format}, + support_vars={partner_variant.coordinates.string_format}, flags=principal.get_sample_flags(sample_id) + partner_variant.get_sample_flags(sample_id), + independent=False, ), ) @@ -532,27 +533,27 @@ def __init__( """ """ self.hom_threshold = get_config()['moi_tests'][GNOMAD_REC_HOM_THRESHOLD] self.freq_tests = { - VariantType.SMALL.value: {key: self.hom_threshold for key in INFO_HOMS}, - VariantType.SV.value: {key: self.hom_threshold for key in SV_HOMS}, + SmallVariant.__name__: {key: self.hom_threshold for key in INFO_HOMS}, + StructuralVariant.__name__: {key: self.hom_threshold for key in SV_HOMS}, } super().__init__(pedigree=pedigree, applied_moi=applied_moi) def run( self, - principal: AbstractVariant, + principal: SmallVariant | StructuralVariant, comp_het: CompHetDict | None = None, partial_pen: bool = False, - ) -> list[ReportedVariant]: + ) -> list[ReportVariant]: """ explicitly tests HOMs Args: - principal (AbstractVariant): main variant being evaluated + principal (SmallVariant | StructuralVariant): main variant being evaluated comp_het (dict): comp-het partners partial_pen (bool): Returns: - list[ReportedVariant]: data object if RecessiveAutosomal fits + list[ReportVariant]: data object if RecessiveAutosomal fits """ classifications = [] @@ -560,7 +561,7 @@ def run( # remove if too many homs are present in population databases if principal.support_only or not ( self.check_frequency_passes( - principal.info, self.freq_tests[principal.var_type] + principal.info, self.freq_tests[principal.__class__.__name__] ) or principal.info.get('categoryboolean1') ): @@ -590,18 +591,19 @@ def run( ): continue - # todo make this a pydantic model classifications.append( - ReportedVariant( + ReportVariant( sample=sample_id, family=self.pedigree[sample_id].family_id, gene=principal.info.get('gene_id'), - var_data=MinimalVariant(principal, sample_id), + var_data=principal, + categories=principal.category_values(sample_id), genotypes=self.get_family_genotypes( variant=principal, sample_id=sample_id ), reasons={self.applied_moi}, flags=principal.get_sample_flags(sample_id), + independent=True, ) ) @@ -630,13 +632,13 @@ def __init__(self, pedigree: Ped, applied_moi: str = 'X_Dominant'): self.hemi_threshold = get_config()['moi_tests'][GNOMAD_HEMI_THRESHOLD] self.freq_tests = { - VariantType.SMALL: {key: self.hom_threshold for key in INFO_HOMS} + SmallVariant.__name__: {key: self.hom_threshold for key in INFO_HOMS} | {key: self.hemi_threshold for key in INFO_HEMI} | { - 'gnomad_ad': self.ad_threshold, 'gnomad_ac': self.ac_threshold, + 'gnomad_af': self.ad_threshold, }, - VariantType.SV: {key: self.hom_threshold for key in SV_HOMS} + StructuralVariant.__name__: {key: self.hom_threshold for key in SV_HOMS} | {key: self.hemi_threshold for key in SV_HEMI}, } @@ -644,10 +646,10 @@ def __init__(self, pedigree: Ped, applied_moi: str = 'X_Dominant'): def run( self, - principal: AbstractVariant, + principal: SmallVariant | StructuralVariant, comp_het: CompHetDict | None = None, partial_pen: bool = False, - ) -> list[ReportedVariant]: + ) -> list[ReportVariant]: """ if variant is present and sufficiently rare, we take it discarded if support @@ -666,7 +668,7 @@ def run( # never apply dominant MOI to support variants # more stringent Pop.Freq checks for dominant - hemi restriction if not self.check_frequency_passes( - principal.info, self.freq_tests[principal.info['var_type']] + principal.info, self.freq_tests[principal.__class__.__name__] ): return classifications @@ -698,16 +700,18 @@ def run( continue classifications.append( - ReportedVariant( + ReportVariant( sample=sample_id, family=self.pedigree[sample_id].family_id, gene=principal.info.get('gene_id'), - var_data=MinimalVariant(principal, sample_id), + var_data=principal, + categories=principal.category_values(sample_id), reasons={self.applied_moi}, genotypes=self.get_family_genotypes( variant=principal, sample_id=sample_id ), flags=principal.get_sample_flags(sample_id), + independent=True, ) ) return classifications @@ -736,9 +740,9 @@ def __init__( self.hemi_threshold = get_config()['moi_tests'][GNOMAD_HEMI_THRESHOLD] self.freq_tests = { - VariantType.SMALL: {key: self.hom_dom_threshold for key in INFO_HOMS} + SmallVariant.__name__: {key: self.hom_dom_threshold for key in INFO_HOMS} | {key: self.hemi_threshold for key in INFO_HEMI}, - VariantType.SV: {key: self.hom_dom_threshold for key in SV_HOMS} + StructuralVariant.__name__: {key: self.hom_dom_threshold for key in SV_HOMS} | {key: self.hemi_threshold for key in SV_HEMI}, } @@ -746,10 +750,10 @@ def __init__( def run( self, - principal: AbstractVariant, + principal: SmallVariant | StructuralVariant, comp_het: CompHetDict | None = None, partial_pen: bool = False, - ) -> list[ReportedVariant]: + ) -> list[ReportVariant]: """ Args: principal (): @@ -761,7 +765,7 @@ def run( # remove from analysis if too many homs are present in population databases if not self.check_frequency_passes( - principal.info, self.freq_tests[principal.info['var_type']] + principal.info, self.freq_tests[principal.__class__.__name__] ): return classifications @@ -797,16 +801,18 @@ def run( continue classifications.append( - ReportedVariant( + ReportVariant( sample=sample_id, family=self.pedigree[sample_id].family_id, gene=principal.info.get('gene_id'), - var_data=MinimalVariant(principal, sample_id), + var_data=principal, + categories=principal.category_values(sample_id), genotypes=self.get_family_genotypes( variant=principal, sample_id=sample_id ), reasons={self.applied_moi}, flags=principal.get_sample_flags(sample_id), + independent=True, ) ) return classifications @@ -832,17 +838,19 @@ def __init__( self.hom_rec_threshold = get_config()['moi_tests'][GNOMAD_REC_HOM_THRESHOLD] self.freq_tests = { - VariantType.SMALL: {key: self.hom_rec_threshold for key in INFO_HOMS}, - VariantType.SV: {key: self.hom_rec_threshold for key in SV_HOMS}, + SmallVariant.__name__: {key: self.hom_rec_threshold for key in INFO_HOMS}, + StructuralVariant.__name__: { + key: self.hom_rec_threshold for key in SV_HOMS + }, } super().__init__(pedigree=pedigree, applied_moi=applied_moi) def run( self, - principal: AbstractVariant, + principal: SmallVariant | StructuralVariant, comp_het: CompHetDict | None = None, partial_pen: bool = False, - ) -> list[ReportedVariant]: + ) -> list[ReportVariant]: """ Args: @@ -856,7 +864,7 @@ def run( # remove from analysis if too many homs are present in population databases if principal.support_only or not ( self.check_frequency_passes( - principal.info, self.freq_tests[principal.info['var_type']] + principal.info, self.freq_tests[principal.__class__.__name__] ) or principal.info.get('categoryboolean1') ): @@ -890,16 +898,18 @@ def run( continue classifications.append( - ReportedVariant( + ReportVariant( sample=sample_id, family=self.pedigree[sample_id].family_id, gene=principal.info.get('gene_id'), - var_data=MinimalVariant(principal, sample_id), + var_data=principal, + categories=principal.category_values(sample_id), genotypes=self.get_family_genotypes( variant=principal, sample_id=sample_id ), reasons={self.applied_moi}, flags=principal.get_sample_flags(sample_id), + independent=True, ) ) return classifications @@ -925,17 +935,19 @@ def __init__( self.hom_rec_threshold = get_config()['moi_tests'][GNOMAD_REC_HOM_THRESHOLD] self.freq_tests = { - VariantType.SMALL: {key: self.hom_rec_threshold for key in INFO_HOMS}, - VariantType.SV: {key: self.hom_rec_threshold for key in SV_HOMS}, + SmallVariant.__name__: {key: self.hom_rec_threshold for key in INFO_HOMS}, + StructuralVariant.__name__: { + key: self.hom_rec_threshold for key in SV_HOMS + }, } super().__init__(pedigree=pedigree, applied_moi=applied_moi) def run( self, - principal: AbstractVariant, + principal: SmallVariant | StructuralVariant, comp_het: CompHetDict | None = None, partial_pen: bool = False, - ) -> list[ReportedVariant]: + ) -> list[ReportVariant]: """ Args: @@ -946,18 +958,16 @@ def run( if comp_het is None: comp_het = {} - classifications = [] # remove from analysis if too many homs are present in population databases if not ( self.check_frequency_passes( - principal.info, self.freq_tests[principal.info['var_type']] + principal.info, self.freq_tests[principal.__class__.__name__] ) or principal.info.get('categoryboolean1') ): return classifications - het_females = { sam for sam in principal.het_samples if self.pedigree[sam].sex == 'female' } @@ -979,7 +989,7 @@ def run( continue for partner in check_for_second_hit( - first_variant=principal.coords.string_format, + first_variant=principal.coordinates.string_format, comp_hets=comp_het, sample=sample_id, ): @@ -989,7 +999,7 @@ def run( not partner.sample_category_check(sample_id, allow_support=True) or not ( self.check_frequency_passes( - partner.info, self.freq_tests[partner.info['var_type']] + partner.info, self.freq_tests[partner.__class__.__name__] ) or partner.info.get('categoryboolean1') ) @@ -1011,18 +1021,20 @@ def run( continue classifications.append( - ReportedVariant( + ReportVariant( sample=sample_id, family=self.pedigree[sample_id].family_id, gene=principal.info.get('gene_id'), - var_data=MinimalVariant(principal, sample_id), + var_data=principal, + categories=principal.category_values(sample_id), reasons={self.applied_moi}, genotypes=self.get_family_genotypes( variant=principal, sample_id=sample_id ), - support_vars={partner.coords.string_format}, + support_vars={partner.coordinates.string_format}, flags=principal.get_sample_flags(sample_id) + partner.get_sample_flags(sample_id), + independent=False, ) ) diff --git a/reanalysis/static_values.py b/reanalysis/static_values.py new file mode 100644 index 00000000..d70f025a --- /dev/null +++ b/reanalysis/static_values.py @@ -0,0 +1,68 @@ +""" +This is a placeholder, completely base class to prevent circular imports +""" +import logging +import sys + +from datetime import datetime + +from cpg_utils.config import get_config + + +_GRANULAR_DATE: str | None = None +LOGGER = None + + +def get_granular_date(): + """ + cached getter/setter + """ + global _GRANULAR_DATE + if _GRANULAR_DATE is None: + # allow an override here - synthetic historic runs + try: + if fake_date := get_config().get('workflow', {}).get('fake_date'): + _GRANULAR_DATE = fake_date + except AssertionError: + get_logger().warning('No date set in config, falling back to real Date') + if _GRANULAR_DATE is None: + _GRANULAR_DATE = datetime.now().strftime('%Y-%m-%d') + return _GRANULAR_DATE + + +def get_logger( + logger_name: str = 'AIP-logger', log_level: int = logging.INFO +) -> logging.Logger: + """ + creates a logger instance (so as not to use the root logger) + + Args: + logger_name (str): + log_level (): + + Returns: + a logger instance, or the global logger if already defined + """ + global LOGGER + + if LOGGER is None: + # this very verbose logging is to ensure that the log level requested (INFO) + # doesn't cause the unintentional logging of every Metamist query + # create a named logger + LOGGER = logging.getLogger(logger_name) + LOGGER.setLevel(log_level) + + # create a stream handler to write output + stream_handler = logging.StreamHandler(sys.stdout) + stream_handler.setLevel(log_level) + + # create format string for messages + formatter = logging.Formatter( + '%(asctime)s - %(name)s %(lineno)d - %(levelname)s - %(message)s' + ) + stream_handler.setFormatter(formatter) + + # set the logger to use this handler + LOGGER.addHandler(stream_handler) + + return LOGGER diff --git a/reanalysis/utils.py b/reanalysis/utils.py index eccb5058..b5794901 100644 --- a/reanalysis/utils.py +++ b/reanalysis/utils.py @@ -2,11 +2,9 @@ classes and methods shared across reanalysis components """ -import logging -import sys import time from collections import defaultdict -from dataclasses import dataclass, is_dataclass, field +from dataclasses import dataclass, is_dataclass from datetime import datetime from enum import Enum from itertools import chain, combinations_with_replacement, islice @@ -16,11 +14,22 @@ import json import re + +import cyvcf2 import requests from cpg_utils import to_path, Path as CPGPathType from cpg_utils.config import get_config +from reanalysis.models import ( + Coordinates, + ReportVariant, + SmallVariant, + StructuralVariant, + VariantType, + FileTypes, +) +from reanalysis.static_values import get_granular_date, get_logger HOMREF: int = 0 HETALT: int = 1 @@ -35,8 +44,6 @@ X_CHROMOSOME = {'X'} TODAY = datetime.now().strftime('%Y-%m-%d_%H:%M') -_GRANULAR_DATE: str | None = None - # most lenient to most conservative # usage = if we have two MOIs for the same gene, take the broadest ORDERED_MOIS = [ @@ -56,84 +63,6 @@ # CONFIG_FIELDS = ['workflow'] # , 'filter', 'panels', 'categories'] # assert all(field in get_config(False).keys() for field in CONFIG_FIELDS) -LOGGER = None - - -def get_logger( - logger_name: str = 'AIP-logger', log_level: int = logging.INFO -) -> logging.Logger: - """ - creates a logger instance (so as not to use the root logger) - - Args: - logger_name (str): - log_level (): - - Returns: - a logger instance, or the global logger if already defined - """ - global LOGGER - - if LOGGER is None: - # this very verbose logging is to ensure that the log level requested (INFO) - # doesn't cause the unintentional logging of every Metamist query - # create a named logger - LOGGER = logging.getLogger(logger_name) - LOGGER.setLevel(log_level) - - # create a stream handler to write output - stream_handler = logging.StreamHandler(sys.stdout) - stream_handler.setLevel(log_level) - - # create format string for messages - formatter = logging.Formatter( - '%(asctime)s - %(name)s %(lineno)d - %(levelname)s - %(message)s' - ) - stream_handler.setFormatter(formatter) - - # set the logger to use this handler - LOGGER.addHandler(stream_handler) - - return LOGGER - - -def get_granular_date(): - """ - cached getter/setter - """ - global _GRANULAR_DATE - if _GRANULAR_DATE is None: - # allow an override here - synthetic historic runs - try: - if fake_date := get_config().get('workflow', {}).get('fake_date'): - _GRANULAR_DATE = fake_date - except AssertionError: - get_logger().info(f'No config loaded, falling back to {_GRANULAR_DATE}') - if _GRANULAR_DATE is None: - _GRANULAR_DATE = datetime.now().strftime('%Y-%m-%d') - return _GRANULAR_DATE - - -class VariantType(Enum): - """ - enumeration of permitted variant types - """ - - SMALL = 'SMALL' - SV = 'SV' - - -class FileTypes(Enum): - """ - enumeration of permitted input file types - """ - - HAIL_TABLE = '.ht' - MATRIX_TABLE = '.mt' - VCF = '.vcf' - VCF_GZ = '.vcf.gz' - VCF_BGZ = '.vcf.bgz' - def chunks(iterable, chunk_size): """ @@ -200,55 +129,56 @@ def identify_file_type(file_path: str) -> FileTypes | Exception: raise TypeError(f'File cannot be definitively typed: {str(extensions)}') -@dataclass -class Coordinates: - """ - a home for the positional variant attributes - """ - - chrom: str - pos: int - ref: str - alt: str - - @property - def string_format(self) -> str: - """ - forms a string representation: chr-pos-ref-alt - """ - return f'{self.chrom}-{self.pos}-{self.ref}-{self.alt}' - - def __lt__(self, other) -> bool: - """ - enables positional sorting - """ - # this will return False for same chrom and position - if self.chrom == other.chrom: - return self.pos < other.pos - # otherwise take the relative index from sorted chromosomes list - if self.chrom in CHROM_ORDER and other.chrom in CHROM_ORDER: - return CHROM_ORDER.index(self.chrom) < CHROM_ORDER.index(other.chrom) - # if self is on a canonical chromosome, sort before HLA/Decoy etc. - if self.chrom in CHROM_ORDER: - return True - return False - - def __eq__(self, other) -> bool: - """ - equivalence check - Args: - other (Coordinates): - - Returns: - true if self == other - - """ - return ( - self.chrom == other.chrom - and self.pos == other.pos - and self.ref == other.ref - and self.alt == other.alt - ) +# +# @dataclass +# class Coordinates: +# """ +# a home for the positional variant attributes +# """ +# +# chrom: str +# pos: int +# ref: str +# alt: str +# +# @property +# def string_format(self) -> str: +# """ +# forms a string representation: chr-pos-ref-alt +# """ +# return f'{self.chrom}-{self.pos}-{self.ref}-{self.alt}' +# +# def __lt__(self, other) -> bool: +# """ +# enables positional sorting +# """ +# # this will return False for same chrom and position +# if self.chrom == other.chrom: +# return self.pos < other.pos +# # otherwise take the relative index from sorted chromosomes list +# if self.chrom in CHROM_ORDER and other.chrom in CHROM_ORDER: +# return CHROM_ORDER.index(self.chrom) < CHROM_ORDER.index(other.chrom) +# # if self is on a canonical chromosome, sort before HLA/Decoy etc. +# if self.chrom in CHROM_ORDER: +# return True +# return False +# +# def __eq__(self, other) -> bool: +# """ +# equivalence check +# Args: +# other (Coordinates): +# +# Returns: +# true if self == other +# +# """ +# return ( +# self.chrom == other.chrom +# and self.pos == other.pos +# and self.ref == other.ref +# and self.alt == other.alt +# ) def get_json_response(url, max_retries=4, base_delay=1, max_delay=32): @@ -426,6 +356,188 @@ def get_phase_data(samples, var) -> dict[str, dict[int, str]]: return dict(phased_dict) +def organise_pm5(info_dict: dict[str, Any]) -> dict[str, Any]: + """ + method dedicated to handling the new pm5 annotations + + e.g. categorydetailsPM5=27037::Pathogenic::1+27048::Pathogenic::1; + 1. break into component allele data + + Returns: + None, updates self. attributes + """ + + if 'categorydetailspm5' not in info_dict: + return info_dict + + pm5_content = info_dict.pop('categorydetailspm5') + + # nothing to do here + if pm5_content == 'missing': + info_dict['categorybooleanpm5'] = 0 + return info_dict + + # current clinvar annotation, if any + current_clinvar = str(info_dict.get('clinvar_allele', 'not_this')) + + # instantiate a dict to store csq-matched results + pm5_data = {} + + # break the strings into a set + pm5_strings = set(pm5_content.split('+')) + for clinvar_entry in pm5_strings: + + # fragment each entry + allele_id, stars = clinvar_entry.split('::') + + # never consider the exact match, pm5 is always separate + if allele_id == current_clinvar: + continue + + # if non-self, add to the dict + pm5_data[allele_id] = stars + + # case where no non-self alleles were found + # assigning False and not-assigning are equivalent, just return + if pm5_data: + # set boolean category and specific data + info_dict['categorybooleanpm5'] = 1 + info_dict['pm5_data'] = pm5_data + else: + info_dict['categorybooleanpm5'] = 0 + + return info_dict + + +def create_small_variant( + var: cyvcf2.Variant, + samples: list[str], + as_singletons=False, + new_genes: dict[str, str] | None = None, +): + """ + takes a small variant and creates a Model from it + + Args: + var (): + samples (): + as_singletons (): + new_genes (): + """ + coordinates = Coordinates( + chrom=var.CHROM.replace('chr', ''), pos=var.POS, ref=var.REF, alt=var.ALT[0] + ) + depths = dict(zip(samples, map(float, var.gt_depths))) # type: ignore + info: dict[str, Any] = {x.lower(): y for x, y in var.INFO} | { + 'seqr_link': coordinates.string_format + } + het_samples, hom_samples = get_non_ref_samples(variant=var, samples=samples) + + # hot-swap cat 2 from a boolean to a sample list - if appropriate + if info.get('categoryboolean2', 0): + new_gene_samples = new_genes.get(info.get('gene_id'), '') + + # if 'all', keep cohort-wide boolean flag + if new_gene_samples == 'all': + get_logger().debug('New applies to all samples') + + # otherwise assign only a specific sample list + elif new_gene_samples: + _boolcat = info.pop('categoryboolean2') + info['categorysample2'] = new_gene_samples + + # else just remove it - shouldn't happen in prod + else: + _boolcat = info.pop('categoryboolean2') + + # set the class attributes + boolean_categories = [ + key for key in info.keys() if key.startswith('categoryboolean') + ] + sample_categories = [key for key in info.keys() if key.startswith('categorysample')] + sample_support = [key for key in info.keys() if key.startswith('categorysupport')] + + # overwrite with true booleans + for cat in sample_support + boolean_categories: + info[cat] = info.get(cat, 0) == 1 + + # sample categories are a list of strings or 'missing' + # if cohort runs as singletons, remove possibility of de novo + # if not singletons, split each into a list of sample IDs + for sam_cat in sample_categories: + if as_singletons and sam_cat in REMOVE_IN_SINGLETONS: + info[sam_cat] = [] + else: + info[sam_cat] = ( + info[sam_cat].split(',') if info[sam_cat] != 'missing' else [] + ) + + # organise PM5 + info = organise_pm5(info) + phased = get_phase_data(samples, var) + ab_ratios = dict(zip(samples, map(float, var.gt_alt_freqs))) + transcript_consequences = extract_csq(csq_contents=info.pop('csq', [])) + + return SmallVariant( + coordinates=coordinates, + info=info, + het_samples=het_samples, + hom_samples=hom_samples, + boolean_categories=boolean_categories, + sample_categories=sample_categories, + sample_support=sample_support, + phased=phased, + depths=depths, + ab_ratios=ab_ratios, + transcript_consequences=transcript_consequences, + ) + + +def create_structural_variant(var: cyvcf2.Variant, samples: list[str]): + """ + takes an SV and creates a Model from it + far less complicated than the SmallVariant model + + Args: + var (): + samples (): + """ + + info: dict[str, Any] = {x.lower(): y for x, y in var.INFO} + + # this is the right ID for Seqr + info['seqr_link'] = info['variantid'] + + coordinates = Coordinates( + chrom=var.CHROM.replace('chr', ''), + pos=var.POS, + ref=var.ALT[0], + alt=info['svlen'], + ) + + het_samples, hom_samples = get_non_ref_samples(variant=var, samples=samples) + + # set the class attributes + boolean_categories = [ + key for key in info.keys() if key.startswith('categoryboolean') + ] + + # overwrite with true booleans + for cat in boolean_categories: + info[cat] = info.get(cat, 0) == 1 + + phased = get_phase_data(samples, var) + + return StructuralVariant( + coordinates=coordinates, + info=info, + het_samples=het_samples, + hom_samples=hom_samples, + boolean_categories=boolean_categories, + phased=phased, + ) + + @dataclass class AbstractVariant: """ @@ -438,7 +550,6 @@ def __init__( samples: list[str], as_singletons=False, new_genes: dict[str, str] | None = None, - var_type: VariantType = VariantType.SMALL, ): """ Intention - this works for both small and structural variants @@ -454,10 +565,6 @@ def __init__( # overwrite the non-standard cyvcf2 representation self.info: dict[str, Any] = {x.lower(): y for x, y in var.INFO} - # presumption of small variant/indel unless otherwise specified - # we could bulk this out as index, snv, etc... - self.info['var_type'] = var_type - # extract the coordinates into a separate object # bump depths for SV calls if 'svtype' in self.info: @@ -467,7 +574,6 @@ def __init__( # artificial depths used to trick logic self.depths = {sam: 999 for sam in samples} self.info['seqr_link'] = self.info['variantid'] - self.info['var_type'] = VariantType.SV else: self.coords = Coordinates( @@ -782,82 +888,10 @@ def check_ab_ratio(self, sample: str) -> list[str]: return [] -class MinimalVariant: - """ - subset of the AbstractVariant data type - todo this is redundant with a model_dump exclude - https://docs.pydantic.dev/latest/concepts/serialization/#advanced-include-and-exclude - """ - - def __init__(self, variant: AbstractVariant, sample: str): - self.coords: Coordinates = variant.coords - self.categories: list[str] = variant.category_values(sample) - # no need to carry these though to the report - avoid_flags = ( - variant.sample_categories - + variant.boolean_categories - + variant.sample_support - ) - self.info: dict[str, Any] = { - key: value for key, value in variant.info.items() if key not in avoid_flags - } - self.transcript_consequences = variant.transcript_consequences - self.phased = variant.phased - - # CompHetDict structure: {sample: {variant_string: [variant, ...]}} # sample: string, e,g, CGP12345 -CompHetDict = dict[str, dict[str, list[AbstractVariant]]] -GeneDict = dict[str, list[AbstractVariant]] - - -@dataclass -class ReportedVariant: - """ - minimal model representing variant categorisation event - the initial variant (minimised) - the MOI applicable - the support ing variant(s), if any - allows for the presence of flags e.g. Borderline AB ratio - - todo should self.categories (vardata.categories) - todo actually be populated here instead of in the variant? - """ - - sample: str - family: str - gene: str - var_data: MinimalVariant - reasons: set[str] - genotypes: dict[str, str] - support_vars: set[str] = field(default_factory=set) - flags: list[str] = field(default_factory=list) - panels: dict[str, str | list[int]] = field(default_factory=dict) - phenotypes: list[str] = field(default_factory=list) - labels: list[str] = field(default_factory=list) - first_seen: str = get_granular_date() - independent: bool = False - - @property - def is_independent(self): - """ - check if this variant acts independently - """ - return len(self.support_vars) == 0 - - def __eq__(self, other): - """ - makes reported variants comparable - """ - # self_supvar = set(self.support_vars) - # other_supvar = set(other.support_vars) - return ( - self.sample == other.sample - and self.var_data.coords == other.var_data.coords - ) - - def __lt__(self, other): - return self.var_data.coords < other.var_data.coords +CompHetDict = dict[str, dict[str, list[SmallVariant | StructuralVariant]]] +GeneDict = dict[str, list[SmallVariant | StructuralVariant]] def canonical_contigs_from_vcf(reader) -> set[str]: @@ -921,16 +955,16 @@ def gather_gene_dict_from_contig( # if contig has no variants, prints an error and returns [] for variant in variant_source(contig): - abs_var = AbstractVariant( + abs_var = create_small_variant( var=variant, samples=variant_source.samples, as_singletons=singletons, new_genes=new_gene_map, ) - if abs_var.coords.string_format in blacklist: + if abs_var.coordinates.string_format in blacklist: get_logger().info( - f'Skipping blacklisted variant: {abs_var.coords.string_format}' + f'Skipping blacklisted variant: {abs_var.coordinates.string_format}' ) continue @@ -948,8 +982,8 @@ def gather_gene_dict_from_contig( second_source_variants = 0 for variant in second_source(contig): # create an abstract SV variant - abs_var = AbstractVariant( - var=variant, samples=second_source.samples, as_singletons=singletons + abs_var = create_structural_variant( + var=variant, samples=second_source.samples ) # update the variant count second_source_variants += 1 @@ -1109,6 +1143,7 @@ def extract_csq(csq_contents) -> list[dict]: return txc_dict +# todo remove completely, use pydantic class CustomEncoder(json.JSONEncoder): """ to be used as a JSON encoding class @@ -1126,7 +1161,7 @@ def default(self, o): o (): python object being JSON encoded """ - if is_dataclass(o) or isinstance(o, MinimalVariant): + if is_dataclass(o) or isinstance(o, (SmallVariant, StructuralVariant)): return o.__dict__ if isinstance(o, set): return list(o) @@ -1135,7 +1170,9 @@ def default(self, o): return json.JSONEncoder.default(self, o) -def find_comp_hets(var_list: list[AbstractVariant], pedigree) -> CompHetDict: +def find_comp_hets( + var_list: list[SmallVariant | StructuralVariant], pedigree +) -> CompHetDict: """ manual implementation to find compound hets variants provided in the format @@ -1150,7 +1187,7 @@ def find_comp_hets(var_list: list[AbstractVariant], pedigree) -> CompHetDict: } Args: - var_list (list[AbstractVariant]): all variants in this gene + var_list (list[SmallVariant | StructuralVariant]): all variants in this gene pedigree (): Peddy.ped """ @@ -1299,7 +1336,7 @@ def find_latest_file( def date_annotate_results( - current: dict[str, dict | list[ReportedVariant]], historic: dict | None = None + current: dict[str, dict | list[ReportVariant]], historic: dict | None = None ) -> tuple[dict, dict]: """ takes the current data, and annotates with previous dates if found diff --git a/reanalysis/validate_categories.py b/reanalysis/validate_categories.py index 4d0b1ec2..55d6a153 100644 --- a/reanalysis/validate_categories.py +++ b/reanalysis/validate_categories.py @@ -24,6 +24,7 @@ from cpg_utils import to_path from cpg_utils.config import get_config +from reanalysis.models import ReportVariant from reanalysis.moi_tests import MOIRunner, PEDDY_AFFECTED from reanalysis.utils import ( canonical_contigs_from_vcf, @@ -31,14 +32,12 @@ find_comp_hets, gather_gene_dict_from_contig, get_cohort_config, - get_granular_date, - get_logger, get_new_gene_map, read_json_from_path, CustomEncoder, GeneDict, - ReportedVariant, ) +from reanalysis.static_values import get_granular_date, get_logger AMBIGUOUS_FLAG = 'Ambiguous Cat.1 MOI' MALE_FEMALE = {'male', 'female'} @@ -98,7 +97,7 @@ def apply_moi_to_variants( moi_lookup: dict[str, MOIRunner], panelapp_data: dict[str, dict[str, str | bool]], pedigree: Ped, -) -> list[ReportedVariant]: +) -> list[ReportVariant]: """ take all variants on a given contig & MOI filters find all variants/compound hets which fit the PanelApp MOI @@ -145,7 +144,7 @@ def apply_moi_to_variants( variant_results = runner.run( principal_var=variant, comp_het=comp_het_dict, - partial_pen=variant.info.get('categoryboolean1', False), + partial_pen=bool(variant.info.get('categoryboolean1', False)), ) # Flag! If this is a Category 1 (ClinVar) variant, and we are @@ -172,11 +171,11 @@ def apply_moi_to_variants( def clean_and_filter( results_holder: dict, - result_list: list[ReportedVariant], + result_list: list[ReportVariant], panelapp_data: dict, dataset: str, participant_panels: dict | None = None, -) -> dict[str, list[ReportedVariant]]: +) -> dict[str, list[ReportVariant]]: """ It's possible 1 variant can be classified multiple ways e.g. different MOIs (dominant and comp het) @@ -190,7 +189,7 @@ def clean_and_filter( Args: results_holder (): container for all results data - result_list (): list of all ReportedVariant events + result_list (): list of all ReportVariant events panelapp_data (): dataset (str): dataset to use for getting the config portion participant_panels (): @@ -215,8 +214,6 @@ def clean_and_filter( for each_event in result_list: - each_event.independent = each_event.is_independent - # grab some attributes from the event sample = each_event.sample gene = each_event.gene diff --git a/test/conftest.py b/test/conftest.py index 7289537e..db0b59af 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -12,6 +12,7 @@ from cpg_utils.config import set_config_paths + # force this to come first PWD = Path(__file__).parent INPUT = PWD / 'input' @@ -27,9 +28,9 @@ SneakyTable, ) -from reanalysis.utils import AbstractVariant, read_json_from_path # noqa: E402 +from reanalysis.utils import read_json_from_path, create_small_variant # noqa: E402 -LABELLED = INPUT / '1_labelled_variant.vcf.bgz' +LABELLED = str(INPUT / '1_labelled_variant.vcf.bgz') AIP_OUTPUT = INPUT / 'aip_output_example.json' DE_NOVO_PED = INPUT / 'de_novo_ped.fam' FAKE_OBO = INPUT / 'hpo_test.obo' @@ -150,7 +151,7 @@ def fixture_phased_trio_variants(): """path to the phased trio VCF""" vcf_reader = VCFReader(PHASED_TRIO) - two_variants = [AbstractVariant(var, vcf_reader.samples) for var in vcf_reader] + two_variants = [create_small_variant(var, vcf_reader.samples) for var in vcf_reader] return two_variants @@ -177,7 +178,17 @@ def fixture_trio_abs_variant(): vcf_reader = VCFReader(LABELLED) cyvcf_var = next(vcf_reader) - return AbstractVariant(cyvcf_var, vcf_reader.samples) + return create_small_variant(cyvcf_var, vcf_reader.samples) + + +@pytest.fixture(name='cyvcf_example_variant') +def fixture_cyvcf_variant(): + """ + sends the location of the Trio Pedigree (PLINK) + Cat. 3, and Cat. 4 for PROBAND only + """ + vcf_reader = VCFReader(LABELLED) + return next(vcf_reader) @pytest.fixture(name='two_trio_abs_variants') @@ -188,7 +199,7 @@ def fixture_two_trio_abs_variants(): 2) Cat. 1 + 3, and Cat. 4 for PROBAND only """ vcf_reader = VCFReader(LABELLED) - two_variants = [AbstractVariant(var, vcf_reader.samples) for var in vcf_reader] + two_variants = [create_small_variant(var, vcf_reader.samples) for var in vcf_reader] return two_variants diff --git a/test/test_moi_tests.py b/test/test_moi_tests.py index e0a2009c..5df782e0 100644 --- a/test/test_moi_tests.py +++ b/test/test_moi_tests.py @@ -1,9 +1,6 @@ """ tests relating to the MOI filters """ -# mypy: ignore-errors -from dataclasses import dataclass, field -from typing import Any, Dict, List from unittest import mock @@ -17,7 +14,6 @@ MOIRunner, RecessiveAutosomalCH, RecessiveAutosomalHomo, - VariantType, XDominant, XRecessiveMale, XRecessiveFemaleCH, @@ -31,130 +27,6 @@ TEST_COORDS_X_2 = Coordinates(chrom='X', pos=2, ref='G', alt='T') -@dataclass -class SimpleVariant: - """ - a fake version of AbstractVariant - """ - - info: Dict[str, Any] - coords: Coordinates - het_samples: set[str] = field(default_factory=set) - hom_samples: set[str] = field(default_factory=set) - categoryboolean1: bool = True - categorysample4: list[str] = field(default_factory=list) - ab_ratios = {'nobody': 1.0} - depths = {'female': 11, 'male': 11} - sample_categories = ['categorysample4'] - boolean_categories = ['categoryboolean1'] - sample_support = [] - transcript_consequences = [] - phased = {} - var_type = VariantType.SMALL.value - - def sample_category_check(self, sample, allow_support=True): - """ - :param sample: - :param allow_support: - """ - _phony = allow_support - return self.categoryboolean1 or sample in self.categorysample4 - - def get_sample_flags(self, *args, **kwargs): - """ - dummy method - """ - if args and kwargs and self: - pass - return [] - - @staticmethod - def category_values(sample): - """ - quick mock method - """ - return [sample] - - @property - def support_only(self): - """pass""" - return False - - -@dataclass -class RecessiveSimpleVariant: - """ - a fake version of AbstractVariant - """ - - coords: Coordinates - ab_ratios: dict[str, float] - info: dict[str, Any] = field(default_factory=dict) - depths = {'female': 11, 'male': 11} - het_samples: set[str] = field(default_factory=set) - hom_samples: set[str] = field(default_factory=set) - categorysample4: list[str] = field(default_factory=list) - categoryboolean1: bool = True - boolean_categories = ['categoryboolean1'] - sample_categories = ['categorysample4'] - sample_support: list = field(default_factory=list) - transcript_consequences: list = field(default_factory=list) - phased: dict = field(default_factory=dict) - var_type = VariantType.SMALL - - def sample_de_novo(self, sample): - """ - :param sample: - """ - return sample in self.categorysample4 - - def sample_category_check(self, sample, allow_support: bool = False): - """ - Args: - sample (): - allow_support (bool): just for the consistent API - """ - _phony = allow_support - return (sample in self.categorysample4) or self.categoryboolean1 - - def check_ab_ratio(self, sample) -> list[str]: - """ - pass - """ - - het = sample in self.het_samples - hom = sample in self.hom_samples - variant_ab = self.ab_ratios.get(sample, 0.0) - if ( - (variant_ab <= 0.15) - or (het and not 0.25 <= variant_ab <= 0.75) - or (hom and variant_ab <= 0.85) - ): - return ['AB Ratio'] - return [] - - def get_sample_flags(self, sample: str): - """ - gets all report flags for this sample - """ - return self.check_ab_ratio(sample) - - def category_values(self, sample): - """ - quick mock method - """ - return [sample] - - @property - def support_only(self): - """pass""" - return False - - def sample_support_only(self, sample_id: str) -> bool: - """dummy method - this will cause issues""" - return sample_id == 'dumdum' - - @pytest.mark.parametrize( 'first,comp_hets,sample,values', ( @@ -205,7 +77,7 @@ def test_check_second_hit(first, comp_hets, sample, values): ('Hemi_Bi_In_Female', ['XRecessive']), ), ) -def test_moi_runner(moi_string: str, filters: List[str], peddy_ped): +def test_moi_runner(moi_string: str, filters: list[str], peddy_ped): """ :param moi_string: @@ -231,19 +103,18 @@ def test_dominant_autosomal_fails_on_depth(peddy_ped): 'gnomad_af': 0.0001, 'gnomad_ac': 0, 'gnomad_hom': 0, - 'var_type': VariantType.SMALL, + 'gene_id': 'TEST1', } dom = DominantAutosomal(pedigree=peddy_ped) # passes with heterozygous - shallow_variant = SimpleVariant( + shallow_variant = SmallVariant( info=info_dict, het_samples={'male'}, - hom_samples=set(), - coords=TEST_COORDS, + coordinates=TEST_COORDS, + depths={'male': 1}, ) - shallow_variant.depths = {'male': 1} results = dom.run(principal=shallow_variant) # noqa assert len(results) == 0 @@ -258,30 +129,44 @@ def test_dominant_autosomal_passes(peddy_ped): 'gnomad_af': 0.0001, 'gnomad_ac': 0, 'gnomad_hom': 0, - 'var_type': VariantType.SMALL, + 'cat1': True, + 'gene_id': 'TEST1', } + # attributes relating to categorisation + boolean_categories = ['cat1'] + dom = DominantAutosomal(pedigree=peddy_ped) # passes with heterozygous - passing_variant = SimpleVariant( - info=info_dict, het_samples={'male'}, hom_samples=set(), coords=TEST_COORDS + passing_variant = SmallVariant( + info=info_dict, + het_samples={'male'}, + coordinates=TEST_COORDS, + boolean_categories=boolean_categories, + depths={'male': 999}, ) results = dom.run(principal=passing_variant) assert len(results) == 1 assert results[0].reasons == {'Autosomal Dominant'} # also passes with homozygous - passing_variant = SimpleVariant( - info=info_dict, het_samples=set(), hom_samples={'male'}, coords=TEST_COORDS + passing_variant = SmallVariant( + info=info_dict, + hom_samples={'male'}, + coordinates=TEST_COORDS, + boolean_categories=boolean_categories, + depths={'male': 999}, ) results = dom.run(principal=passing_variant) assert len(results) == 1 assert results[0].reasons == {'Autosomal Dominant'} # no results if no samples - passing_variant = SimpleVariant( - info=info_dict, het_samples=set(), hom_samples=set(), coords=TEST_COORDS + passing_variant = SmallVariant( + info=info_dict, + coordinates=TEST_COORDS, + boolean_categories=boolean_categories, ) assert len(dom.run(principal=passing_variant)) == 0 @@ -289,8 +174,8 @@ def test_dominant_autosomal_passes(peddy_ped): @pytest.mark.parametrize( 'info', [ - {'gnomad_af': 0.1, 'var_type': VariantType.SMALL}, - {'gnomad_hom': 2, 'var_type': VariantType.SMALL}, + {'gnomad_af': 0.1}, + {'gnomad_hom': 2}, ], ) def test_dominant_autosomal_fails(info, peddy_ped): @@ -303,8 +188,8 @@ def test_dominant_autosomal_fails(info, peddy_ped): dom = DominantAutosomal(pedigree=peddy_ped) # fails due to high af - failing_variant = SimpleVariant( - info=info, het_samples={'male'}, hom_samples=set(), coords=TEST_COORDS + failing_variant = SmallVariant( + info=info, het_samples={'male'}, coordinates=TEST_COORDS ) assert not dom.run(principal=failing_variant) @@ -316,11 +201,11 @@ def test_recessive_autosomal_hom_passes(peddy_ped): """ passing_variant = SmallVariant( hom_samples={'male'}, - info={'categoryboolean1': True}, coordinates=TEST_COORDS, ab_ratios={'male': 1.0}, depths={'male': 15}, boolean_categories=['categoryboolean1'], + info={'categoryboolean1': True, 'gene_id': 'TEST1'}, ) rec = RecessiveAutosomalHomo(pedigree=peddy_ped) results = rec.run(passing_variant) @@ -334,11 +219,13 @@ def test_recessive_autosomal_hom_passes_with_ab_flag(peddy_ped): we accept a homozygous variant as a Recessive """ - passing_variant = RecessiveSimpleVariant( + passing_variant = SmallVariant( hom_samples={'male'}, - coords=TEST_COORDS, + coordinates=TEST_COORDS, ab_ratios={'male': 0.4}, - info={'var_type': VariantType.SMALL.value}, + depths={'male': 40}, + boolean_categories=['categoryboolean1'], + info={'categoryboolean1': True, 'gene_id': 'TEST1'}, ) rec = RecessiveAutosomalHomo(pedigree=peddy_ped) results = rec.run(passing_variant) @@ -354,17 +241,21 @@ def test_recessive_autosomal_comp_het_male_passes(peddy_ped): we accept a heterozygous variant as a Comp-Het """ - passing_variant = RecessiveSimpleVariant( + passing_variant = SmallVariant( het_samples={'male'}, - coords=TEST_COORDS, + coordinates=TEST_COORDS, ab_ratios={'male': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'male': 50}, + boolean_categories=['categoryboolean1'], + info={'gene_id': 'TEST1', 'categoryboolean1': True}, ) - passing_variant2 = RecessiveSimpleVariant( + passing_variant2 = SmallVariant( het_samples={'male'}, - coords=TEST_COORDS2, + coordinates=TEST_COORDS2, ab_ratios={'male': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'male': 50}, + boolean_categories=['categoryboolean1'], + info={'gene_id': 'TEST1', 'categoryboolean1': True}, ) comp_hets = {'male': {TEST_COORDS.string_format: [passing_variant2]}} rec = RecessiveAutosomalCH(pedigree=peddy_ped) @@ -380,17 +271,21 @@ def test_recessive_autosomal_comp_het_male_passes_partner_flag(peddy_ped): we accept a heterozygous variant as a Comp-Het """ - passing_variant = RecessiveSimpleVariant( + passing_variant = SmallVariant( het_samples={'male'}, - coords=TEST_COORDS, + coordinates=TEST_COORDS, ab_ratios={'male': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'male': 50}, + boolean_categories=['categoryboolean1'], + info={'gene_id': 'TEST1', 'categoryboolean1': True}, ) - passing_variant2 = RecessiveSimpleVariant( + passing_variant2 = SmallVariant( het_samples={'male'}, - coords=TEST_COORDS2, + coordinates=TEST_COORDS2, ab_ratios={'male': 1.0}, - info={'var_type': VariantType.SMALL}, + depths={'male': 50}, + boolean_categories=['categoryboolean1'], + info={'gene_id': 'TEST1', 'categoryboolean1': True}, ) comp_hets = {'male': {TEST_COORDS.string_format: [passing_variant2]}} rec = RecessiveAutosomalCH(pedigree=peddy_ped) @@ -408,17 +303,21 @@ def test_recessive_autosomal_comp_het_female_passes(peddy_ped): :return: """ - passing_variant = RecessiveSimpleVariant( + passing_variant = SmallVariant( het_samples={'female'}, - coords=TEST_COORDS, + coordinates=TEST_COORDS, ab_ratios={'female': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'female': 50}, + boolean_categories=['categoryboolean1'], + info={'gene_id': 'TEST1', 'categoryboolean1': True}, ) - passing_variant2 = RecessiveSimpleVariant( + passing_variant2 = SmallVariant( het_samples={'female'}, - coords=TEST_COORDS2, + coordinates=TEST_COORDS2, ab_ratios={'female': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'female': 50}, + boolean_categories=['categoryboolean1'], + info={'gene_id': 'TEST1', 'categoryboolean1': True}, ) comp_hets = {'female': {TEST_COORDS.string_format: [passing_variant2]}} rec = RecessiveAutosomalCH(pedigree=peddy_ped) @@ -437,11 +336,11 @@ def test_recessive_autosomal_comp_het_fails_no_ch_return(peddy_ped): :return: """ - failing_variant = SimpleVariant( - info={'var_type': VariantType.SMALL}, + failing_variant = SmallVariant( + info={'gene_id': 'TEST1'}, het_samples={'male'}, - hom_samples=set(), - coords=TEST_COORDS, + depths={'male': 50}, + coordinates=TEST_COORDS, ) rec = RecessiveAutosomalCH(pedigree=peddy_ped) assert not rec.run(failing_variant) @@ -456,17 +355,19 @@ def test_recessive_autosomal_comp_het_fails_no_paired_call(peddy_ped): :return: """ - failing_variant = RecessiveSimpleVariant( + failing_variant = SmallVariant( het_samples={'male'}, - coords=TEST_COORDS, + coordinates=TEST_COORDS, ab_ratios={'male': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'male': 50}, + info={'gene_id': 'TEST1'}, ) - failing_variant2 = RecessiveSimpleVariant( + failing_variant2 = SmallVariant( het_samples={'female'}, - coords=TEST_COORDS2, + coordinates=TEST_COORDS2, ab_ratios={'female': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'female': 50}, + info={'gene_id': 'TEST1'}, ) rec = RecessiveAutosomalCH(pedigree=peddy_ped) @@ -477,7 +378,7 @@ def test_recessive_autosomal_comp_het_fails_no_paired_call(peddy_ped): @pytest.mark.parametrize( - 'info', [{'gnomad_hom': 3, 'var_type': VariantType.SMALL}] + 'info', [{'gnomad_hom': 3, 'gene_id': 'TEST1'}] ) # threshold is 2 def test_recessive_autosomal_hom_fails(info, peddy_ped): """ @@ -485,8 +386,8 @@ def test_recessive_autosomal_hom_fails(info, peddy_ped): we have no confirmed MOI """ - failing_variant = SimpleVariant( - info=info, het_samples={'male'}, hom_samples={'male'}, coords=TEST_COORDS + failing_variant = SmallVariant( + info=info, het_samples={'male'}, hom_samples={'male'}, coordinates=TEST_COORDS ) rec = RecessiveAutosomalHomo(pedigree=peddy_ped) assert not rec.run(failing_variant) @@ -497,10 +398,12 @@ def test_x_dominant_female_and_male_het_passes(peddy_ped): check that a male is accepted as a het :return: """ - passing_variant = SimpleVariant( - info={'gnomad_hemi': 0, 'var_type': VariantType.SMALL}, + passing_variant = SmallVariant( + boolean_categories=['categoryboolean1'], + info={'gnomad_hemi': 0, 'gene_id': 'TEST1', 'categoryboolean1': True}, het_samples={'female', 'male'}, - coords=TEST_COORDS_X_1, + depths={'female': 50, 'male': 50}, + coordinates=TEST_COORDS_X_1, ) x_dom = XDominant(pedigree=peddy_ped) results = x_dom.run(passing_variant) @@ -515,10 +418,13 @@ def test_x_dominant_female_hom_passes(peddy_ped): check that a male is accepted as a het :return: """ - passing_variant = SimpleVariant( - info={'gnomad_hemi': 0, 'var_type': VariantType.SMALL}, + passing_variant = SmallVariant( + boolean_categories=['categoryboolean1'], + info={'gene_id': 'TEST1', 'categoryboolean1': True}, hom_samples={'female'}, - coords=TEST_COORDS_X_1, + depths={'female': 100}, + ab_ratios={'female': 0.5}, + coordinates=TEST_COORDS_X_1, ) x_dom = XDominant(pedigree=peddy_ped) results = x_dom.run(passing_variant) @@ -531,10 +437,12 @@ def test_x_dominant_male_hom_passes(peddy_ped): check that a male is accepted as a het :return: """ - passing_variant = SimpleVariant( - info={'gnomad_hemi': 0, 'var_type': VariantType.SMALL}, + passing_variant = SmallVariant( + boolean_categories=['categoryboolean1'], + info={'gene_id': 'TEST1', 'categoryboolean1': True}, hom_samples={'male'}, - coords=TEST_COORDS_X_1, + depths={'male': 100}, + coordinates=TEST_COORDS_X_1, ) x_dom = XDominant(pedigree=peddy_ped) results = x_dom.run(passing_variant) @@ -545,23 +453,20 @@ def test_x_dominant_male_hom_passes(peddy_ped): @pytest.mark.parametrize( 'info', [ - {'gnomad_af': 0.1, 'var_type': VariantType.SMALL}, - {'gnomad_hom': 2, 'var_type': VariantType.SMALL}, - {'gnomad_hemi': 3, 'var_type': VariantType.SMALL}, + {'gnomad_af': 0.1, 'gene_id': 'TEST1', 'categoryboolean1': True}, + {'gnomad_hom': 2, 'gene_id': 'TEST1', 'categoryboolean1': True}, + {'gnomad_hemi': 3, 'gene_id': 'TEST1', 'categoryboolean1': True}, ], ) def test_x_dominant_info_fails(info, peddy_ped): """ check for info dict exclusions - :param info: - :return: """ - passing_variant = SimpleVariant( + passing_variant = SmallVariant( info=info, hom_samples={'male'}, - het_samples=set(), - coords=TEST_COORDS_X_1, - categoryboolean1=False, + coordinates=TEST_COORDS_X_1, + boolean_categories=['categoryboolean1'], ) x_dom = XDominant(pedigree=peddy_ped) assert len(x_dom.run(passing_variant)) == 0 @@ -573,11 +478,13 @@ def test_x_recessive_male_hom_passes(peddy_ped): :return: """ - passing_variant = RecessiveSimpleVariant( + passing_variant = SmallVariant( hom_samples={'female', 'male'}, - coords=TEST_COORDS_X_1, + coordinates=TEST_COORDS_X_1, ab_ratios={'female': 1.0, 'male': 1.0}, - info={'var_type': VariantType.SMALL}, + depths={'female': 100, 'male': 100}, + boolean_categories=['categoryboolean1'], + info={'gene_id': 'TEST1', 'categoryboolean1': True}, ) x_rec = XRecessiveMale(pedigree=peddy_ped) results = x_rec.run(passing_variant, comp_het={}) @@ -590,11 +497,13 @@ def test_x_recessive_female_hom_passes(peddy_ped): :return: """ - passing_variant = RecessiveSimpleVariant( + passing_variant = SmallVariant( hom_samples={'female', 'male'}, - coords=TEST_COORDS_X_1, + coordinates=TEST_COORDS_X_1, ab_ratios={'female': 1.0, 'male': 1.0}, - info={'var_type': VariantType.SMALL}, + depths={'female': 100, 'male': 100}, + boolean_categories=['categoryboolean1'], + info={'gene_id': 'TEST1', 'categoryboolean1': True}, ) x_rec = XRecessiveFemaleHom(pedigree=peddy_ped) results = x_rec.run(passing_variant, comp_het={}) @@ -607,11 +516,13 @@ def test_x_recessive_male_het_passes(peddy_ped): :return: """ - passing_variant = RecessiveSimpleVariant( + passing_variant = SmallVariant( het_samples={'male'}, - coords=TEST_COORDS_X_1, + coordinates=TEST_COORDS_X_1, ab_ratios={'male': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'male': 50}, + boolean_categories=['categoryboolean1'], + info={'gene_id': 'TEST1', 'categoryboolean1': True}, ) x_rec = XRecessiveMale(pedigree=peddy_ped) results = x_rec.run(passing_variant) @@ -625,19 +536,27 @@ def test_x_recessive_female_het_passes(peddy_ped): :return: """ - passing_variant = RecessiveSimpleVariant( + passing_variant = SmallVariant( het_samples={'female'}, - coords=TEST_COORDS_X_1, - categorysample4=['female'], + coordinates=TEST_COORDS_X_1, ab_ratios={'female': 0.5}, - info={'var_type': VariantType.SMALL}, - ) - passing_variant_2 = RecessiveSimpleVariant( + depths={'female': 50}, + sample_categories=['categorysample4'], + info={ + 'gene_id': 'TEST1', + 'categorysample4': ['female'], + }, + ) + passing_variant_2 = SmallVariant( het_samples={'female'}, - coords=TEST_COORDS_X_2, - categorysample4=['female'], + coordinates=TEST_COORDS_X_2, ab_ratios={'female': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'female': 50}, + sample_categories=['categorysample4'], + info={ + 'gene_id': 'TEST1', + 'categorysample4': ['female'], + }, ) comp_hets = {'female': {'X-1-G-T': [passing_variant_2]}} x_rec = XRecessiveFemaleCH(pedigree=peddy_ped) @@ -646,18 +565,19 @@ def test_x_recessive_female_het_passes(peddy_ped): assert results[0].reasons == {'X_RecessiveFemaleCompHet'} -def test_het_de_novo_het_passes(peddy_ped): +def test_het_de_novo_passes(peddy_ped): """ :return: """ - passing_variant = RecessiveSimpleVariant( + passing_variant = SmallVariant( het_samples={'female'}, - coords=TEST_COORDS_X_1, - categorysample4=['female'], + coordinates=TEST_COORDS_X_1, + sample_categories=['categorysample4'], ab_ratios={'female': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'female': 99}, + info={'gene_id': 'TEST1', 'categorysample4': ['female']}, ) dom_a = DominantAutosomal(pedigree=peddy_ped) results = dom_a.run(passing_variant) @@ -672,12 +592,13 @@ def test_het_de_novo_het_passes_flagged(peddy_ped): :return: """ - passing_variant = RecessiveSimpleVariant( + passing_variant = SmallVariant( het_samples={'female'}, - coords=TEST_COORDS_X_1, - categorysample4=['female'], + coordinates=TEST_COORDS_X_1, + sample_categories=['categorysample4'], ab_ratios={'female': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'female': 99}, + info={'gene_id': 'TEST1', 'categorysample4': ['female']}, ) dom_a = DominantAutosomal(pedigree=peddy_ped) results = dom_a.run(passing_variant) @@ -690,19 +611,27 @@ def test_x_recessive_female_het_fails(peddy_ped): :return: """ - passing_variant = RecessiveSimpleVariant( + passing_variant = SmallVariant( het_samples={'female'}, - coords=TEST_COORDS_X_1, - categorysample4=['male'], + coordinates=TEST_COORDS_X_1, ab_ratios={'female': 0.5}, - info={'var_type': VariantType.SMALL}, - ) - passing_variant_2 = RecessiveSimpleVariant( + depths={'female': 50}, + sample_categories=['categorysample4'], + info={ + 'gene_id': 'TEST1', + 'categorysample4': ['male'], + }, + ) + passing_variant_2 = SmallVariant( het_samples={'male'}, - coords=TEST_COORDS_X_2, - categorysample4=['male'], + coordinates=TEST_COORDS_X_2, ab_ratios={'male': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'male': 50}, + sample_categories=['categorysample4'], + info={ + 'gene_id': 'TEST1', + 'categorysample4': ['male'], + }, ) comp_hets = {'female': {'x-2-A-C': [passing_variant_2]}} x_rec = XRecessiveFemaleCH(pedigree=peddy_ped) @@ -711,18 +640,21 @@ def test_x_recessive_female_het_fails(peddy_ped): @mock.patch('reanalysis.moi_tests.check_for_second_hit') -def test_x_recessive_female_het_no_pair_fails(second_hit: mock.patch, peddy_ped): - """ - :return: - """ +def test_x_recessive_female_het_no_pair_fails(second_hit: mock.Mock, peddy_ped): + """ """ - second_hit.return_value = [] - passing_variant = RecessiveSimpleVariant( + passing_variant = SmallVariant( het_samples={'female'}, - coords=TEST_COORDS_X_1, + coordinates=TEST_COORDS_X_1, ab_ratios={'female': 0.5}, - info={'var_type': VariantType.SMALL}, + depths={'female': 50}, + info={ + 'gene_id': 'TEST1', + 'categorysample1': True, + 'boolean_categories': 'categorysample1', + }, ) + second_hit.return_value = [] x_rec = XRecessiveFemaleCH(pedigree=peddy_ped) assert not x_rec.run(passing_variant) @@ -850,10 +782,13 @@ def test_genotype_calls(peddy_ped): 'gnomad_af': 0.0001, 'gnomad_ac': 0, 'gnomad_hom': 0, - 'var_type': VariantType.SMALL, + 'gene_id': 'TEST1', } - variant = SimpleVariant( - info=info_dict, het_samples={'male'}, hom_samples={'female'}, coords=TEST_COORDS + variant = SmallVariant( + info=info_dict, + het_samples={'male'}, + hom_samples={'female'}, + coordinates=TEST_COORDS, ) assert base_moi.get_family_genotypes(variant, 'male') == { 'father_1': 'WT', @@ -865,11 +800,10 @@ def test_genotype_calls(peddy_ped): 'female': 'Hom', 'mother_2': 'WT', } - x_variant = SimpleVariant( + x_variant = SmallVariant( info=info_dict, het_samples={'male', 'female'}, - hom_samples=set(), - coords=TEST_COORDS_X_1, + coordinates=TEST_COORDS_X_1, ) assert base_moi.get_family_genotypes(x_variant, 'male') == { 'father_1': 'WT', @@ -882,11 +816,10 @@ def test_genotype_calls(peddy_ped): 'mother_2': 'WT', } - x_variant_2 = SimpleVariant( + x_variant_2 = SmallVariant( info=info_dict, - het_samples=set(), hom_samples={'male', 'female'}, - coords=TEST_COORDS_X_1, + coordinates=TEST_COORDS_X_1, ) assert base_moi.get_family_genotypes(x_variant_2, 'male') == { 'father_1': 'WT', @@ -899,8 +832,8 @@ def test_genotype_calls(peddy_ped): 'mother_2': 'WT', } - variant_missing = SimpleVariant( - info=info_dict, het_samples=set(), hom_samples=set(), coords=TEST_COORDS + variant_missing = SmallVariant( + info=info_dict, het_samples=set(), hom_samples=set(), coordinates=TEST_COORDS ) assert base_moi.get_family_genotypes(variant_missing, 'male') == { 'father_1': 'WT', diff --git a/test/test_results_comparison.py b/test/test_results_comparison.py index 4dd4e489..7b0f490f 100644 --- a/test/test_results_comparison.py +++ b/test/test_results_comparison.py @@ -10,12 +10,9 @@ from cpg_utils.config import get_config -from reanalysis.utils import ( - date_annotate_results, - find_latest_file, - get_granular_date, - Coordinates, -) +from reanalysis.utils import date_annotate_results, find_latest_file +from reanalysis.models import Coordinates +from reanalysis.static_values import get_granular_date CATEGORY_META = get_config()['categories'] @@ -42,8 +39,8 @@ class MiniReport: independent: bool = False -COORD_1 = Coordinates('1', 1, 'A', 'G') -COORD_2 = Coordinates('2', 2, 'A', 'G') +COORD_1 = Coordinates(chrom='1', pos=1, ref='A', alt='G') +COORD_2 = Coordinates(chrom='2', pos=2, ref='A', alt='G') GENERIC_REPORT = MiniReport(MiniVariant(categories=['1'], coords=COORD_1)) GENERIC_REPORT_12 = MiniReport(MiniVariant(categories=['1', '2'], coords=COORD_1)) diff --git a/test/test_utils.py b/test/test_utils.py index 3e6d8975..c98f2f21 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -3,22 +3,22 @@ """ from copy import deepcopy -from dataclasses import dataclass -from typing import List import pytest from cyvcf2 import VCFReader from reanalysis.utils import ( - AbstractVariant, - Coordinates, find_comp_hets, gather_gene_dict_from_contig, get_new_gene_map, get_non_ref_samples, get_simple_moi, identify_file_type, +) +from reanalysis.models import ( FileTypes, - MinimalVariant, - ReportedVariant, + ReportVariant, + Coordinates, + SmallVariant, + StructuralVariant, ) @@ -26,16 +26,16 @@ def test_coord_sorting(): """ check that coord sorting methods work """ - coord_1 = Coordinates('4', 20, 'A', 'C') - coord_1b = Coordinates('4', 21, 'A', 'C') - coord_1c = Coordinates('4', 21, 'A', 'C') - coord_2 = Coordinates('5', 20, 'A', 'C') + coord_1 = Coordinates(chrom='4', pos=20, ref='A', alt='C') + coord_1b = Coordinates(chrom='4', pos=21, ref='A', alt='C') + coord_1c = Coordinates(chrom='4', pos=21, ref='A', alt='C') + coord_2 = Coordinates(chrom='5', pos=20, ref='A', alt='C') assert coord_1 < coord_2 assert coord_1 < coord_1b assert not coord_1b < coord_1c -def test_abs_var_sorting(two_trio_abs_variants: list[AbstractVariant]): +def test_abs_var_sorting(two_trio_abs_variants: list[SmallVariant]): """ test sorting and equivalence at the AbsVar level """ @@ -53,7 +53,7 @@ def test_reported_variant_ordering(trio_abs_variant): """ test that equivalence between Report objects works as exp. """ - report_1 = ReportedVariant( + report_1 = ReportVariant( sample='1', family='1', gene='2', @@ -61,7 +61,7 @@ def test_reported_variant_ordering(trio_abs_variant): reasons={'test'}, genotypes={}, ) - report_2 = ReportedVariant( + report_2 = ReportVariant( sample='1', family='1', gene='2', @@ -131,26 +131,19 @@ def test_get_simple_moi(string: str, expected: str, chrom: str): assert get_simple_moi(string, chrom) == expected -def test_get_non_ref_samples(): +def test_get_non_ref_samples(cyvcf_example_variant): """ this simple test can be done without the use of a cyvcf2 object :return: """ - @dataclass - class SuperSimple: - """test_fixture""" - - gt_types: List[int] - - samples = ['a', 'b', 'c', 'd', 'e'] - variant = SuperSimple([0, 1, 2, 3, 1]) - het, hom = get_non_ref_samples(variant=variant, samples=samples) - assert het == {'b', 'e'} - assert hom == {'d'} + samples = ['male', 'father', 'mother'] + het, hom = get_non_ref_samples(variant=cyvcf_example_variant, samples=samples) + assert het == {'male'} + assert hom == {} -def test_av_categories(trio_abs_variant: AbstractVariant): +def test_av_categories(trio_abs_variant: SmallVariant | StructuralVariant): """ Cat. 3, and Cat. 4 for PROBAND only: """ @@ -164,7 +157,7 @@ def test_av_categories(trio_abs_variant: AbstractVariant): assert not trio_abs_variant.sample_categorised_check('father_1') -def test_av_phase(trio_abs_variant: AbstractVariant): +def test_av_phase(trio_abs_variant: SmallVariant): """ nothing here yet :param trio_abs_variant: @@ -189,12 +182,12 @@ def test_gene_dict(two_trio_variants_vcf): assert len(var_dict['ENSG00000075043']) == 2 -def test_comp_hets(two_trio_abs_variants: list[AbstractVariant], peddy_ped): +def test_comp_hets(two_trio_abs_variants: list[SmallVariant], peddy_ped): """ { 'male': { - '20-63406931-C-CGG': [AbstractVariant()], - '20-63406991-C-CGG': [AbstractVariant()] + '20-63406931-C-CGG': [Variant()], + '20-63406991-C-CGG': [Variant()] } } :param two_trio_abs_variants: @@ -228,7 +221,7 @@ def test_phased_dict(phased_vcf_path): assert variant.phased['mother_1'] == {420: '0|1'} -def test_phased_comp_hets(phased_variants: list[AbstractVariant], peddy_ped): +def test_phased_comp_hets(phased_variants: list[SmallVariant], peddy_ped): """ phased variants shouldn't form a comp-het 'mother_1' is het for both variants, but phase-set is same for both @@ -315,11 +308,3 @@ def test_new_gene_map_complex(): 'ENSG3': 'sam2', 'ENSG4': 'sam,sam2', } - - -def test_minimise(trio_abs_variant: AbstractVariant): - """ - check the variant minimiser - """ - minvar = MinimalVariant(trio_abs_variant, 'male') - assert sorted(minvar.categories) == ['3', '4']