Skip to content

Commit

Permalink
Improved report summary script (#474)
Browse files Browse the repository at this point in the history
* improved report summary script
  • Loading branch information
MattWellie authored Jan 28, 2025
1 parent b7ff4a2 commit f11c77e
Showing 1 changed file with 79 additions and 7 deletions.
86 changes: 79 additions & 7 deletions src/talos/SummariseReport.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,81 @@

from talos.utils import read_json_from_path

# dictionary key used for the per-sample mean in each category summary
MEAN_SLASH_SAMPLE = 'Mean/sample'


class NoVariantsFoundError(Exception):
    """raise if a report subset contains no data"""


def get_variant_summary(results: ResultData) -> dict:
    """
    Summarise the variants in a report, overall and per category

    Each variant is identified by its coordinate string. Within one sample
    a coordinate string is counted at most once per category, regardless of
    how many times it appears in that sample's variant list. The pseudo
    category 'any' collects every variant, whatever its categories.

    NOTE(review): nothing in this function pairs comp-het partners; every
    distinct coordinate string is counted separately. Confirm this matches
    the intended "one event per primary-secondary pairing" counting.

    Args:
        results (ResultData): the results object in full

    Returns:
        a dictionary with one entry per category (plus 'any'), each holding
        'Description', 'Total' (sum of the per-sample counts), 'Unique'
        (distinct coordinate strings across all samples), 'Peak #/sample'
        and 'Mean/sample'; plus a 'samples_no_variants' entry holding the
        number of samples with zero variants

    Raises:
        NoVariantsFoundError: if the report contains no samples at all
        KeyError: if a variant carries a category which is absent from
            results.metadata.categories
    """

    # with no samples every per-sample list would be empty, so max() and the
    # mean below would fail - raise the dedicated exception up front instead
    if not results.results:
        raise NoVariantsFoundError('No categorised variants found')

    # get the categories this report was aware of, led by the catch-all
    categories = results.metadata.categories
    ordered_categories = ['any', *categories]

    # per category: one count per sample, and the variants seen in any sample
    per_sample_counts: dict[str, list[int]] = {key: [] for key in ordered_categories}
    unique_variants: dict[str, set[str]] = {key: set() for key in ordered_categories}

    for sample_data in results.results.values():
        sample_variants: dict[str, set[str]] = {key: set() for key in ordered_categories}

        for variant in sample_data.variants:
            var_string = variant.var_data.coordinates.string_format

            # every variant counts towards 'any', then towards each of its categories
            for key in ('any', *variant.categories):
                unique_variants[key].add(var_string)
                sample_variants[key].add(var_string)

        # update the global lists with this sample's counts (zeros included)
        for key, counts in per_sample_counts.items():
            counts.append(len(sample_variants[key]))

    summary_dicts: dict = {}
    for key in ordered_categories:
        counts = per_sample_counts[key]
        total = sum(counts)
        summary_dicts[key] = {
            'Description': categories.get(key, 'All Variants'),
            'Total': total,
            'Unique': len(unique_variants[key]),
            'Peak #/sample': max(counts),
            MEAN_SLASH_SAMPLE: total / len(counts),
        }

    # a zero in the 'any' list is a sample without a single variant
    summary_dicts['samples_no_variants'] = per_sample_counts['any'].count(0)

    return summary_dicts


def main(input_path: str, output_path: str | None = None, prefix: int | None = None):
"""
read the target report, and summarise the number of affected samples involved
read the target report, and summarise the content:
- the number of affected samples involved
- the number of variants in each category
- the number of samples with no variants
Args:
input_path (str): where to read the report from
Expand All @@ -37,21 +108,22 @@ def main(input_path: str, output_path: str | None = None, prefix: int | None = N
# read the report file, local or cloud
report = read_json_from_path(input_path, return_model=ResultData)

# this is a simple overview
family_breakdown = report.metadata.family_breakdown
summarised_content: dict = {'family_breakdown': report.metadata.family_breakdown}

if prefix:
# set up a section in the dictionary for this
family_breakdown['grouped_by_prefix'] = defaultdict(int)
summarised_content['family_breakdown']['grouped_by_prefix'] = defaultdict(int)
for proband in report.results.values():
family_breakdown['grouped_by_prefix'][proband.metadata.ext_id[:prefix]] += 1
summarised_content['family_breakdown']['grouped_by_prefix'][proband.metadata.ext_id[:prefix]] += 1

summarised_content['variant_summary'] = get_variant_summary(report)

print(json.dumps(family_breakdown, indent=4))
print(json.dumps(summarised_content, indent=4))

if output_path:
# write the output to file
with to_path(output_path).open('w') as handle:
json.dump(family_breakdown, handle, indent=4)
json.dump(summarised_content, handle, indent=4)


def cli_main():
Expand Down

0 comments on commit f11c77e

Please sign in to comment.