Skip to content

Commit

Permalink
seqr minimiser produces a second pheno-matched JSON
Browse files Browse the repository at this point in the history
  • Loading branch information
MattWellie committed Mar 1, 2024
1 parent eae8bdb commit a09fa54
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 5 deletions.
13 changes: 8 additions & 5 deletions helpers/prepare_aip_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@

PED_QUERY = gql(
"""
query PedAndSGs($project: String!) {
query PedAndSGs($project: String!, $type: String!) {
project(name: $project) {
pedigree
sequencingGroups(activeOnly: {eq: true}) {
sequencingGroups(type: {eq: $type}, activeOnly: {eq: true}) {
id
sample {
participant {
Expand Down Expand Up @@ -218,19 +218,20 @@ def process_pedigree(


def get_pedigree_for_project(
project: str,
project: str, seq_type: str
) -> tuple[list[dict[str, str]], dict[str, str]]:
"""
fetches the project pedigree from sample-metadata
list, one dict per participant
Args:
project (str): project/dataset to use in query
seq_type (str): exome or genome
Returns:
All API returned content
"""
response = query(PED_QUERY, variables={'project': project})
response = query(PED_QUERY, variables={'project': project, 'type': seq_type})
pedigree = response['project']['pedigree']
lookup = {
sg['sample']['participant']['externalId']: [sg['id']]
Expand Down Expand Up @@ -281,7 +282,9 @@ def main(

# get the list of all pedigree members as list of dictionaries
logging.info('Pulling all pedigree members')
pedigree_dicts, ext_lookup = get_pedigree_for_project(project=project)
pedigree_dicts, ext_lookup = get_pedigree_for_project(
project=project, seq_type=exome_or_genome
)

# endpoint gives list of tuples e.g. [['A1234567_proband', 'CPGABCDE']]
# parser returns a dictionary, arbitrary # sample IDs per participant
Expand Down
108 changes: 108 additions & 0 deletions reanalysis/minimise_output_for_seqr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""
a script to post-process the AIP output data
produces a minimised representation of the output data,
containing only the data required for the SEQR app.
- Metadata, containing the description for each flag
- Results, containing per-individual results
- Individual ID
- Variant ID
- Categories (list)
- Support Variants (list)
Also produce a second version of the same, limited to phenotype-matches
"""

import json
import logging
from argparse import ArgumentParser

from reanalysis.models import MiniForSeqr, MiniVariant, ResultData


def coord_to_string(coord: dict) -> str:
"""
converts a coordinate dict to a string
Args:
coord (dict): a coordinate dict
Returns:
str: a string representation of the coordinate dict
"""
return f"{coord['chrom']}-{coord['pos']}-{coord['ref']}-{coord['alt']}"


def main(
input_file: str, output: str, ext_map: str | None = None, pheno_match: bool = False
):
"""
reads in the input file, shrinks it, and writes the output file
Args:
input_file (str):
output (str):
ext_map (str): optional mapping of internal to external IDs for seqr
pheno_match (bool): whether to limit to phenotype-matching variants
"""

with open(input_file, encoding='utf-8') as f:
data = ResultData.model_validate(json.load(f))

lil_data = MiniForSeqr(
**{
'metadata': {'categories': data.metadata.categories},
}
)
ext_map_dict = None
if ext_map:
with open(ext_map, encoding='utf-8') as f:
ext_map_dict = json.load(f)

for individual, details in data.results.items():
# optionally update to point to Seqr identities
if ext_map_dict:
individual = ext_map_dict.get(individual, individual)

lil_data.results[individual] = {}
for variant in details.variants:
var_data = variant.var_data
if pheno_match and not variant.panels.matched:
continue
lil_data.results[individual][var_data.info['seqr_link']] = MiniVariant(
**{
'categories': variant.categories,
'support_vars': variant.support_vars,
}
)
additional_string = 'phenotype-matched' if pheno_match else ''
if not any(lil_data.results.values()):
logging.info(f'No {additional_string} results found')
return
with open(output, 'w', encoding='utf-8') as f:
f.write(MiniForSeqr.model_validate(lil_data).model_dump_json(indent=4))

logging.info(f'Wrote {additional_string} output to {output}')


if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('input_file', help='the input file to process')
parser.add_argument('output_file', help='the output file to write to')
parser.add_argument(
'--external_map',
help='mapping of internal to external IDs for seqr',
default=None,
type=str,
)
args = parser.parse_args()

main(input_file=args.input_file, output=args.output_file, ext_map=args.external_map)
main(
input_file=args.input_file,
output=args.output_file,
ext_map=args.external_map,
pheno_match=True,
)

0 comments on commit a09fa54

Please sign in to comment.