Skip to content

Commit

Permalink
poke poke poke
Browse files Browse the repository at this point in the history
  • Loading branch information
MattWellie committed Nov 28, 2023
1 parent f6f69ee commit 18f5738
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 47 deletions.
2 changes: 1 addition & 1 deletion reanalysis/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ class HistoricSampleVariant(BaseModel):
default_factory=list,
description='supporting variants if this has been identified in a comp-he',
)
independent: bool = True
independent: bool = Field(default=True)


class HistoricVariants(BaseModel):
Expand Down
36 changes: 19 additions & 17 deletions reanalysis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@ def gather_gene_dict_from_contig(
variant_source,
new_gene_map: dict[str, str],
singletons: bool = False,
second_source=None,
sv_source=None,
) -> GeneDict:
"""
takes a cyvcf2.VCFReader instance, and a specified chromosome
Expand All @@ -582,7 +582,7 @@ def gather_gene_dict_from_contig(
variant_source (): the VCF reader instance
new_gene_map ():
singletons ():
second_source (): an optional second VCF (SV)
sv_source (): an optional second VCF (SV)
Returns:
A lookup in the form
Expand All @@ -606,45 +606,47 @@ def gather_gene_dict_from_contig(
# if contig has no variants, prints an error and returns []
for variant in variant_source(contig):

abs_var = create_small_variant(
small_variant = create_small_variant(
var=variant,
samples=variant_source.samples,
as_singletons=singletons,
new_genes=new_gene_map,
)

if abs_var.coordinates.string_format in blacklist:
if small_variant.coordinates.string_format in blacklist:
get_logger().info(
f'Skipping blacklisted variant: {abs_var.coordinates.string_format}'
f'Skipping blacklisted variant: {small_variant.coordinates.string_format}'
)
continue

# if unclassified, skip the whole variant
if not abs_var.is_classified:
if not small_variant.is_classified:
continue

# update the variant count
contig_variants += 1

# update the gene index dictionary
contig_dict[abs_var.info.get('gene_id')].append(abs_var)
contig_dict[small_variant.info.get('gene_id')].append(small_variant)

if sv_source:
structural_variants = 0
for variant in sv_source(contig):

if second_source:
second_source_variants = 0
for variant in second_source(contig):
# create an abstract SV variant
abs_var = create_structural_variant(
var=variant, samples=second_source.samples
structural_variant = create_structural_variant(
var=variant, samples=sv_source.samples
)

# update the variant count
second_source_variants += 1
structural_variants += 1

# update the gene index dictionary
contig_dict[abs_var.info.get('gene_id')].append(abs_var)
contig_dict[structural_variant.info.get('gene_id')].append(
structural_variant
)

get_logger().info(
f'Contig {contig} contained {second_source_variants} variants'
)
get_logger().info(f'Contig {contig} contained {structural_variants} variants')

get_logger().info(f'Contig {contig} contained {contig_variants} variants')
get_logger().info(f'Contig {contig} contained {len(contig_dict)} genes')
Expand Down
50 changes: 21 additions & 29 deletions reanalysis/validate_categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,18 +224,14 @@ def clean_and_filter(
# shouldn't be possible
assert each_event.categories, f'No categories for {each_event}'

# grab some attributes from the event
sample = each_event.sample
gene = each_event.gene

# find all panels for this gene
if gene in gene_details:
all_panels = gene_details[gene]
if each_event.gene in gene_details:
all_panels = gene_details[each_event.gene]

else:
# don't re-cast sets for every single variant
all_panels = set(panelapp_data.genes[gene].panels)
gene_details[gene] = all_panels
all_panels = set(panelapp_data.genes[each_event.gene].panels)
gene_details[each_event.gene] = all_panels

# get all forced panels this gene intersects with
cohort_intersection: set[int] = cohort_panels.intersection(all_panels)
Expand All @@ -246,9 +242,9 @@ def clean_and_filter(
if participant_panels is not None:

# intersection to find participant phenotype-matched panels
phenotype_intersection: set[int] = participant_panels[sample].intersection(
all_panels
)
phenotype_intersection: set[int] = participant_panels[
each_event.sample
].intersection(all_panels)

# is this gene relevant for this participant?
# this test includes matched, cohort-level, and core panel
Expand All @@ -271,12 +267,12 @@ def clean_and_filter(
# If this variant and that variant have same sample/pos, equivalent
# If either was independent, set that flag to True
# Add a union of all Support Variants from both events
if each_event not in results_holder[sample].variants:
results_holder[sample].variants.append(each_event)
if each_event not in results_holder[each_event.sample].variants:
results_holder[each_event.sample].variants.append(each_event)

else:
prev_event = results_holder[sample].variants[
results_holder[sample].variants.index(each_event)
prev_event = results_holder[each_event.sample].variants[
results_holder[each_event.sample].variants.index(each_event)
]

# if this is independent, set independent to True
Expand Down Expand Up @@ -402,7 +398,7 @@ def prepare_results_shell(

# find the solved cases in this project
solved_cases = get_cohort_config(dataset).get('solved_cases', [])
panel_meta = {content['id']: content['name'] for content in panelapp.metadata}
panel_meta = {content.id: content.name for content in panelapp.metadata}

# limit to affected samples present in both Pedigree and VCF
for sample in [
Expand Down Expand Up @@ -500,17 +496,15 @@ def main(
out_json_path = to_path(out_json)

# parse the pedigree from the file
pedigree_digest = Ped(pedigree)
ped = Ped(pedigree)

# parse panelapp data from dict
raw_panel_json = read_json_from_path(panelapp, {'metadata': [], 'genes': dict()})
assert isinstance(raw_panel_json, dict)
panelapp_data = PanelApp.model_construct(raw_panel_json)

# set up the inheritance checks
moi_lookup = set_up_moi_filters(
panelapp_data=panelapp_data, pedigree=pedigree_digest
)
moi_lookup = set_up_moi_filters(panelapp_data=panelapp_data, pedigree=ped)

# todo model this
pheno_panels = read_json_from_path(participant_panels)
Expand All @@ -519,21 +513,21 @@ def main(
# create the new gene map
new_gene_map = get_new_gene_map(panelapp_data, pheno_panels, dataset)

result_list = []
result_list: list[ReportVariant] = []

# open the small variant VCF using a cyvcf2 reader
vcf_opened = VCFReader(labelled_vcf)

# optional SV behaviour
sv_opened = VCFReader(labelled_sv) if labelled_sv is not None else None
sv_opened = VCFReader(labelled_sv) if labelled_sv else None

# obtain a set of all contigs with variants
for contig in canonical_contigs_from_vcf(vcf_opened):
# assemble {gene: [var1, var2, ..]}
contig_dict = gather_gene_dict_from_contig(
contig=contig,
variant_source=vcf_opened,
second_source=sv_opened,
sv_source=sv_opened,
new_gene_map=new_gene_map,
singletons=bool('singleton' in pedigree),
)
Expand All @@ -542,15 +536,15 @@ def main(
apply_moi_to_variants(
variant_dict=contig_dict,
moi_lookup=moi_lookup,
panelapp_data=panelapp_data['genes'],
pedigree=pedigree_digest,
panelapp_data=panelapp_data.genes,
pedigree=ped,
)
)

# create a shell to store results in
results_shell = prepare_results_shell(
vcf_samples=vcf_opened.samples,
pedigree=pedigree_digest,
pedigree=ped,
panel_data=pheno_panels,
dataset=dataset,
panelapp=panelapp_data,
Expand Down Expand Up @@ -578,9 +572,7 @@ def main(
**{
'input_file': input_path,
'cohort': dataset,
'family_breakdown': count_families(
pedigree_digest, samples=vcf_opened.samples
),
'family_breakdown': count_families(ped, samples=vcf_opened.samples),
'panels': panelapp_data.metadata,
'container': get_config()['workflow']['driver_image'],
}
Expand Down

0 comments on commit 18f5738

Please sign in to comment.