poke poke poke

populationgenomics · Nov 28, 2023 · 18f5738 · 18f5738
1 parent f6f69ee
commit 18f5738
Show file tree

Hide file tree

Showing 3 changed files with 41 additions and 47 deletions.
diff --git a/reanalysis/models.py b/reanalysis/models.py
@@ -396,7 +396,7 @@ class HistoricSampleVariant(BaseModel):
         default_factory=list,
         description='supporting variants if this has been identified in a comp-he',
     )
-    independent: bool = True
+    independent: bool = Field(default=True)
 
 
 class HistoricVariants(BaseModel):

diff --git a/reanalysis/utils.py b/reanalysis/utils.py
@@ -569,7 +569,7 @@ def gather_gene_dict_from_contig(
     variant_source,
     new_gene_map: dict[str, str],
     singletons: bool = False,
-    second_source=None,
+    sv_source=None,
 ) -> GeneDict:
     """
     takes a cyvcf2.VCFReader instance, and a specified chromosome
@@ -582,7 +582,7 @@ def gather_gene_dict_from_contig(
         variant_source (): the VCF reader instance
         new_gene_map ():
         singletons ():
-        second_source (): an optional second VCF (SV)
+        sv_source (): an optional second VCF (SV)
 
     Returns:
         A lookup in the form
@@ -606,45 +606,47 @@ def gather_gene_dict_from_contig(
     # if contig has no variants, prints an error and returns []
     for variant in variant_source(contig):
 
-        abs_var = create_small_variant(
+        small_variant = create_small_variant(
             var=variant,
             samples=variant_source.samples,
             as_singletons=singletons,
             new_genes=new_gene_map,
         )
 
-        if abs_var.coordinates.string_format in blacklist:
+        if small_variant.coordinates.string_format in blacklist:
             get_logger().info(
-                f'Skipping blacklisted variant: {abs_var.coordinates.string_format}'
+                f'Skipping blacklisted variant: {small_variant.coordinates.string_format}'
             )
             continue
 
         # if unclassified, skip the whole variant
-        if not abs_var.is_classified:
+        if not small_variant.is_classified:
             continue
 
         # update the variant count
         contig_variants += 1
 
         # update the gene index dictionary
-        contig_dict[abs_var.info.get('gene_id')].append(abs_var)
+        contig_dict[small_variant.info.get('gene_id')].append(small_variant)
+
+    if sv_source:
+        structural_variants = 0
+        for variant in sv_source(contig):
 
-    if second_source:
-        second_source_variants = 0
-        for variant in second_source(contig):
             # create an abstract SV variant
-            abs_var = create_structural_variant(
-                var=variant, samples=second_source.samples
+            structural_variant = create_structural_variant(
+                var=variant, samples=sv_source.samples
             )
+
             # update the variant count
-            second_source_variants += 1
+            structural_variants += 1
 
             # update the gene index dictionary
-            contig_dict[abs_var.info.get('gene_id')].append(abs_var)
+            contig_dict[structural_variant.info.get('gene_id')].append(
+                structural_variant
+            )
 
-        get_logger().info(
-            f'Contig {contig} contained {second_source_variants} variants'
-        )
+        get_logger().info(f'Contig {contig} contained {structural_variants} variants')
 
     get_logger().info(f'Contig {contig} contained {contig_variants} variants')
     get_logger().info(f'Contig {contig} contained {len(contig_dict)} genes')

diff --git a/reanalysis/validate_categories.py b/reanalysis/validate_categories.py
@@ -224,18 +224,14 @@ def clean_and_filter(
         # shouldn't be possible
         assert each_event.categories, f'No categories for {each_event}'
 
-        # grab some attributes from the event
-        sample = each_event.sample
-        gene = each_event.gene
-
         # find all panels for this gene
-        if gene in gene_details:
-            all_panels = gene_details[gene]
+        if each_event.gene in gene_details:
+            all_panels = gene_details[each_event.gene]
 
         else:
             # don't re-cast sets for every single variant
-            all_panels = set(panelapp_data.genes[gene].panels)
-            gene_details[gene] = all_panels
+            all_panels = set(panelapp_data.genes[each_event.gene].panels)
+            gene_details[each_event.gene] = all_panels
 
         # get all forced panels this gene intersects with
         cohort_intersection: set[int] = cohort_panels.intersection(all_panels)
@@ -246,9 +242,9 @@ def clean_and_filter(
         if participant_panels is not None:
 
             # intersection to find participant phenotype-matched panels
-            phenotype_intersection: set[int] = participant_panels[sample].intersection(
-                all_panels
-            )
+            phenotype_intersection: set[int] = participant_panels[
+                each_event.sample
+            ].intersection(all_panels)
 
             # is this gene relevant for this participant?
             # this test includes matched, cohort-level, and core panel
@@ -271,12 +267,12 @@ def clean_and_filter(
         # If this variant and that variant have same sample/pos, equivalent
         # If either was independent, set that flag to True
         # Add a union of all Support Variants from both events
-        if each_event not in results_holder[sample].variants:
-            results_holder[sample].variants.append(each_event)
+        if each_event not in results_holder[each_event.sample].variants:
+            results_holder[each_event.sample].variants.append(each_event)
 
         else:
-            prev_event = results_holder[sample].variants[
-                results_holder[sample].variants.index(each_event)
+            prev_event = results_holder[each_event.sample].variants[
+                results_holder[each_event.sample].variants.index(each_event)
             ]
 
             # if this is independent, set independent to True
@@ -402,7 +398,7 @@ def prepare_results_shell(
 
     # find the solved cases in this project
     solved_cases = get_cohort_config(dataset).get('solved_cases', [])
-    panel_meta = {content['id']: content['name'] for content in panelapp.metadata}
+    panel_meta = {content.id: content.name for content in panelapp.metadata}
 
     # limit to affected samples present in both Pedigree and VCF
     for sample in [
@@ -500,17 +496,15 @@ def main(
     out_json_path = to_path(out_json)
 
     # parse the pedigree from the file
-    pedigree_digest = Ped(pedigree)
+    ped = Ped(pedigree)
 
     # parse panelapp data from dict
     raw_panel_json = read_json_from_path(panelapp, {'metadata': [], 'genes': dict()})
     assert isinstance(raw_panel_json, dict)
     panelapp_data = PanelApp.model_construct(raw_panel_json)
 
     # set up the inheritance checks
-    moi_lookup = set_up_moi_filters(
-        panelapp_data=panelapp_data, pedigree=pedigree_digest
-    )
+    moi_lookup = set_up_moi_filters(panelapp_data=panelapp_data, pedigree=ped)
 
     # todo model this
     pheno_panels = read_json_from_path(participant_panels)
@@ -519,21 +513,21 @@ def main(
     # create the new gene map
     new_gene_map = get_new_gene_map(panelapp_data, pheno_panels, dataset)
 
-    result_list = []
+    result_list: list[ReportVariant] = []
 
     # open the small variant VCF using a cyvcf2 reader
     vcf_opened = VCFReader(labelled_vcf)
 
     # optional SV behaviour
-    sv_opened = VCFReader(labelled_sv) if labelled_sv is not None else None
+    sv_opened = VCFReader(labelled_sv) if labelled_sv else None
 
     # obtain a set of all contigs with variants
     for contig in canonical_contigs_from_vcf(vcf_opened):
         # assemble {gene: [var1, var2, ..]}
         contig_dict = gather_gene_dict_from_contig(
             contig=contig,
             variant_source=vcf_opened,
-            second_source=sv_opened,
+            sv_source=sv_opened,
             new_gene_map=new_gene_map,
             singletons=bool('singleton' in pedigree),
         )
@@ -542,15 +536,15 @@ def main(
             apply_moi_to_variants(
                 variant_dict=contig_dict,
                 moi_lookup=moi_lookup,
-                panelapp_data=panelapp_data['genes'],
-                pedigree=pedigree_digest,
+                panelapp_data=panelapp_data.genes,
+                pedigree=ped,
             )
         )
 
     # create a shell to store results in
     results_shell = prepare_results_shell(
         vcf_samples=vcf_opened.samples,
-        pedigree=pedigree_digest,
+        pedigree=ped,
         panel_data=pheno_panels,
         dataset=dataset,
         panelapp=panelapp_data,
@@ -578,9 +572,7 @@ def main(
                 **{
                     'input_file': input_path,
                     'cohort': dataset,
-                    'family_breakdown': count_families(
-                        pedigree_digest, samples=vcf_opened.samples
-                    ),
+                    'family_breakdown': count_families(ped, samples=vcf_opened.samples),
                     'panels': panelapp_data.metadata,
                     'container': get_config()['workflow']['driver_image'],
                 }