Swap out semsimian for hpo3 #458

Open
wants to merge 9 commits into main
5 changes: 1 addition & 4 deletions requirements.txt
@@ -2,18 +2,15 @@ cloudpathlib[all]>=0.16.0
cyvcf2>=0.30.18
dill>=0.3.7
hail==0.2.133
hpo3
httpx>=0.27.0
Jinja2>=3.1.3
networkx>=3
obonet>=1
pandas>=2
peds>=1.2.0
phenopackets>=2
protobuf==3.20.2
pydantic>=2.5.2
pyspark>=3.5.1
python-dateutil>=2
# this is a rust package with a python interface, limited to specific python versions
semsimian
tabulate>=0.8.9
toml==0.10.2
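
For context: hpo3 is a Rust-backed drop-in replacement for the pure-Python PyHPO package. It is installed as hpo3 but imported as pyhpo, and it bundles its own copy of the HPO ontology, which is what lets this PR drop obonet, networkx, and the external obo file. A minimal sketch of the API surface the PR leans on (the example term ID is illustrative):

from pyhpo import Ontology

# instantiating Ontology once loads the ontology data bundled with the hpo3 wheel;
# afterwards the class itself acts as a global lookup table
_ = Ontology()

# fetch a term by its HPO ID (illustrative term: HP:0001250, Seizure)
term = Ontology.get_hpo_object('HP:0001250')
print(term.id, term.name)

# all_parents exposes every ancestor of a term, which is what the MOI filtering
# and the panel matching in this PR rely on
print(len(term.all_parents))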
67 changes: 28 additions & 39 deletions src/talos/CPG/MakePhenopackets.py
@@ -15,17 +15,17 @@

import re
from argparse import ArgumentParser
from collections import defaultdict

import networkx as nx
import phenopackets.schema.v2 as pps2
from google.protobuf.json_format import MessageToJson
from obonet import read_obo
from pyhpo import Ontology

from metamist.graphql import gql, query
from talos.static_values import get_logger

HPO_KEY = 'HPO Terms (present)'
HPO_RE = re.compile(r'HP:[0-9]+')
HPO_RE = re.compile(r'HP:\d+')
PARTICIPANT_QUERY = gql(
"""
query MyQuery($project: String!, $sequencing_type: String!, $technology: String!) {
@@ -48,11 +48,14 @@
}""",
)

# create an ontology object, once
_ = Ontology()

# map the integer reported sex values to the enum
reported_sex_map = {1: pps2.Sex.MALE, 2: pps2.Sex.FEMALE}


def find_hpo_labels(metamist_data: dict, hpo_file: str | None = None) -> dict[str, list[dict[str, str]]]:
def find_hpo_labels(metamist_data: dict) -> dict[str, list[dict[str, str]]]:
"""
match HPO terms to their plaintext names

@@ -62,46 +62,34 @@ def find_hpo_labels(metamist_data: dict, hpo_file: str | None = None) -> dict[st
disease genes and their MOI terms directly, which messes with what we are trying to do:
associate patient _phenotypes_ with variant _genes_.

As we're now using hpo3/pyHPO, which ships with a built-in ontology, we can assume the presence of a valid Ontology,
simplifying the code

Args:
metamist_data ():
hpo_file ():

Returns:
dict, participant IDs to HPO:labels
"""
all_hpos: set[str] = set()
per_sg_hpos: dict[str, set[str]] = {}

moi_nodes: set[str] = set()
hpo_graph = None
if hpo_file:
# create a graph of HPO terms
hpo_graph = read_obo(hpo_file, ignore_obsolete=False)
moi_nodes = nx.ancestors(hpo_graph, 'HP:0000005')
per_sg_hpos: dict[str, list[dict[str, str]]] = defaultdict(list)

for sg in metamist_data['project']['sequencingGroups']:
hpos = set(HPO_RE.findall(sg['sample']['participant']['phenotypes'].get(HPO_KEY, '')))

# groom out any strictly MOI related terms
hpos -= moi_nodes

all_hpos.update(hpos)
per_sg_hpos[sg['id']] = hpos

# no label obtained, that's fine...
if not (hpo_file and hpo_graph):
return {sg: [{'id': hp, 'label': 'Unknown'} for hp in hpos] for sg, hpos in per_sg_hpos.items()}
# select all HPO terms, so long as they are not a child of 'Mode of Inheritance' (HP:0000005)
hpos = {
hpo_term
for hpo_term in HPO_RE.findall(sg['sample']['participant']['phenotypes'].get(HPO_KEY, ''))
if 'HP:0000005' not in Ontology.get_hpo_object(hpo_term).all_parents
}

# create a dictionary of HPO terms to their text
hpo_to_text: dict[str, str] = {}
for hpo in all_hpos:
if not hpo_graph.has_node(hpo):
get_logger(__file__).error(f'HPO term was absent from the tree: {hpo}')
hpo_to_text[hpo] = 'Unknown'
else:
hpo_to_text[hpo] = hpo_graph.nodes[hpo]['name']
# allow for HPO terms to be missing from this edition of the ontology
for hpo in hpos:
try:
per_sg_hpos[sg['id']].append({'id': hpo, 'label': Ontology.get_hpo_object(hpo).name})
except ValueError:
get_logger(__file__).error(f'HPO term was absent from the tree: {hpo}')
per_sg_hpos[sg['id']].append({'id': hpo, 'label': 'Unknown'})

return {sg: [{'id': hp, 'label': hpo_to_text[hp]} for hp in hpos] for sg, hpos in per_sg_hpos.items()}
return per_sg_hpos
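
As a standalone illustration of the new logic: a term is kept only when 'Mode of Inheritance' (HP:0000005) is not among its ancestors, and labelling tolerates terms missing from the bundled ontology edition. A minimal sketch, assuming hpo3 raises ValueError for unknown terms (as the except clause above implies; term IDs are illustrative):

from pyhpo import Ontology

_ = Ontology()

def label_non_moi_terms(term_ids: list[str]) -> list[dict[str, str]]:
    # keep a term unless 'Mode of inheritance' (HP:0000005) is one of its ancestors
    labelled = []
    for term_id in term_ids:
        try:
            term = Ontology.get_hpo_object(term_id)
        except ValueError:
            # absent from this edition of the ontology - keep the ID, flag the label
            labelled.append({'id': term_id, 'label': 'Unknown'})
            continue
        if any(parent.id == 'HP:0000005' for parent in term.all_parents):
            continue
        labelled.append({'id': term_id, 'label': term.name})
    return labelled

# HP:0001250 (Seizure) survives; HP:0000006 (Autosomal dominant inheritance) is dropped
print(label_non_moi_terms(['HP:0001250', 'HP:0000006']))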


def assemble_phenopackets(dataset: str, metamist_data: dict, hpo_lookup: dict[str, list[dict[str, str]]]):
@@ -170,15 +161,14 @@ def assemble_phenopackets(dataset: str, metamist_data: dict, hpo_lookup: dict[st
return cohort


def main(output: str, dataset: str, seq_type: str, tech: str = 'short-read', hpo_file: str | None = None):
def main(output: str, dataset: str, seq_type: str, tech: str = 'short-read'):
"""
Assemble a cohort phenopacket from the metamist data
Args:
output (str): where to write the phenopacket
dataset (str): the dataset to query for
seq_type (str): exome/genome
tech (str): type of sequence data to query for
hpo_file (str): path to an obo file containing HPO tree
"""

# pull all the relevant data from metamist
@@ -188,7 +178,7 @@ def main(output: str, dataset: str, seq_type: str, tech: str = 'short-read', hpo
)

# match names to HPO terms
labelled_hpos = find_hpo_labels(hpo_file=hpo_file, metamist_data=metamist_data)
labelled_hpos = find_hpo_labels(metamist_data=metamist_data)

# build the cohort
cohort = assemble_phenopackets(dataset=dataset, metamist_data=metamist_data, hpo_lookup=labelled_hpos)
@@ -203,11 +193,10 @@ def cli_main():
parser.add_argument('--dataset', help='The dataset to query for')
parser.add_argument('--output', help='The output file')
parser.add_argument('--type', help='Sequencing type (exome or genome)')
parser.add_argument('--hpo', help='HPO ontology file')
parser.add_argument('--tech', help='Sequencing technology', default='short-read')
args = parser.parse_args()

main(dataset=args.dataset, output=args.output, seq_type=args.type, hpo_file=args.hpo, tech=args.tech)
main(dataset=args.dataset, output=args.output, seq_type=args.type, tech=args.tech)


if __name__ == '__main__':
3 changes: 2 additions & 1 deletion src/talos/FindGeneSymbolMap.py
@@ -34,7 +34,8 @@ async def match_ensgs_to_symbols(genes: list[str], session: ClientSession) -> di
r.raise_for_status()
json_reponse = await r.json()
# match symbol to the ENSG (or Unknown if the key is missing, or has a None value)
return {value.get('display_name'): key for key, value in json_reponse.items() if value}
# we want the key to be ENSG
return {key: value.get('display_name') for key, value in json_reponse.items() if value}


async def match_symbol_to_ensg(gene_symbol: str, session: ClientSession) -> tuple[str, str]:
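
The one-line fix above inverts the mapping direction: the Ensembl stable ID becomes the key and the display name the value. A toy demonstration with a mocked payload (the response shape is an assumption based on Ensembl's POST lookup/id endpoint, where unresolved IDs come back as null):

# mocked Ensembl lookup response: ID -> record, with None for unresolved IDs
json_reponse = {
    'ENSG00000157764': {'display_name': 'BRAF'},
    'ENSG00000141510': {'display_name': 'TP53'},
    'ENSG00000000000': None,
}

# before: {'BRAF': 'ENSG00000157764', ...} - symbol-keyed, the wrong way around
before = {value.get('display_name'): key for key, value in json_reponse.items() if value}

# after: {'ENSG00000157764': 'BRAF', ...} - ENSG-keyed, as callers expect
after = {key: value.get('display_name') for key, value in json_reponse.items() if value}

assert after['ENSG00000157764'] == 'BRAF'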
88 changes: 27 additions & 61 deletions src/talos/GeneratePanelData.py
@@ -12,8 +12,7 @@

import phenopackets.schema.v2 as pps2
from google.protobuf.json_format import ParseDict
from networkx import MultiDiGraph
from obonet import read_obo
from pyhpo import Ontology

from talos.config import config_retrieve
from talos.models import ParticipantHPOPanels, PhenoPacketHpo, PhenotypeMatchedPanels
@@ -33,6 +32,9 @@
PANELS_ENDPOINT = PANELAPP_HARD_CODED_DEFAULT
DEFAULT_PANEL = PANELAPP_HARD_CODED_BASE_PANEL

# create an ontology object, once
_ = Ontology()


def get_panels(endpoint: str = PANELS_ENDPOINT) -> dict[str, set[int]]:
"""
@@ -89,73 +91,39 @@ def set_up_cohort_pmp(cohort: pps2.Cohort) -> tuple[PhenotypeMatchedPanels, set[
return hpo_dict, all_hpos


def match_hpo_terms(
panel_map: dict[str, set[int]],
hpo_tree: MultiDiGraph,
hpo_str: str,
selections: set[int] | None = None,
) -> set[int]:
"""
get panels relevant for this HPO using a recursive edge traversal
for live terms we recurse on all parents
if a term is obsolete we instead check each replacement term

relevant usage guide:
https://github.com/dhimmel/obonet/blob/main/examples/go-obonet.ipynb

Args:
panel_map (dict):
hpo_tree (): a graph object representing the HPO tree
hpo_str (str): the query HPO term
selections (set[int]): collected panel IDs so far

Returns:
set: panel IDs relating to this HPO term
"""

if selections is None:
selections = set()

# identify identical match and select the panel
if hpo_str in panel_map:
selections.update(panel_map[hpo_str])

# if a node is invalid, recursively call this method for each replacement D:
# there are simpler ways, just none that are as fun to write
if not hpo_tree.has_node(hpo_str):
get_logger().error(f'HPO term was absent from the tree: {hpo_str}')
return selections

hpo_node = hpo_tree.nodes[hpo_str]
if hpo_node.get('is_obsolete', 'false') == 'true':
for hpo_term in hpo_node.get('replaced_by', []):
selections.update(match_hpo_terms(panel_map, hpo_tree, hpo_term, selections))
# search for parent(s), even if the term is obsolete
for hpo_term in hpo_node.get('is_a', []):
selections.update(match_hpo_terms(panel_map, hpo_tree, hpo_term, selections))
return selections


def match_hpos_to_panels(hpo_panel_map: dict[str, set[int]], hpo_file: str, all_hpos: set[str]) -> dict[str, set[int]]:
def match_hpos_to_panels(hpo_panel_map: dict[str, set[int]], all_hpos: set[str]) -> dict[str, set[int]]:
"""
take the HPO terms from the participant metadata, and match to panels

greatly simplified implementation, as we're now using hpo3/pyHPO, which ships with a built-in ontology
we're checking for clashes between PanelApp contents and each exact term, or its ancestors

Args:
hpo_panel_map (dict): panel IDs to all related panels
hpo_file (str): path to an obo file containing HPO tree
all_hpos (set[str]): collection of all unique HPO terms

Returns:
a dictionary linking all HPO terms to a corresponding set of Panel IDs
a second dictionary linking all HPO terms to their plaintext names
"""

hpo_graph = read_obo(hpo_file, ignore_obsolete=False)

hpo_to_panels = {}
hpo_to_panels: dict[str, set[int]] = {}
for hpo in all_hpos:
panel_ids = match_hpo_terms(panel_map=hpo_panel_map, hpo_tree=hpo_graph, hpo_str=hpo)
hpo_to_panels[hpo] = panel_ids
# set must exist, even if empty - defaultdict doesn't do this
hpo_to_panels[hpo] = set()

# if we have an exact match, use that as a starting point
if hpo in hpo_panel_map:
hpo_to_panels[hpo] = hpo_panel_map[hpo]

# try and fetch the node in the ontology
try:
hpo_node = Ontology.get_hpo_object(hpo)
for parent in hpo_node.all_parents:
if parent.id in hpo_panel_map:
hpo_to_panels[hpo].update(hpo_panel_map[parent.id])
except ValueError:
get_logger(__file__).error(f'HPO term was absent from the tree: {hpo}')

return hpo_to_panels
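
The net effect of this rewrite: the recursive obonet edge traversal (which also chased replaced_by links for obsolete terms) becomes a flat union over the exact term and its complete ancestor set. A runnable toy under the same assumptions (the panel map entries and IDs are made up; in production they come from PanelApp):

from pyhpo import Ontology

_ = Ontology()

# hypothetical PanelApp-style map: HPO term -> panel IDs tagged with it
hpo_panel_map: dict[str, set[int]] = {
    'HP:0000118': {137},   # Phenotypic abnormality
    'HP:0012638': {245},   # Abnormal nervous system physiology
}

def panels_for_term(hpo: str) -> set[int]:
    # start from the exact match, if any
    panels: set[int] = set(hpo_panel_map.get(hpo, set()))
    try:
        term = Ontology.get_hpo_object(hpo)
    except ValueError:
        return panels  # unknown term: exact matches only
    # any panel tagged with an ancestor of this term is also relevant
    for parent in term.all_parents:
        panels |= hpo_panel_map.get(parent.id, set())
    return panels

# HP:0001250 (Seizure) sits beneath both mapped terms, so both panels match
print(panels_for_term('HP:0001250'))  # {137, 245}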

@@ -187,12 +155,11 @@ def cli_main():
parser = ArgumentParser()
parser.add_argument('--input', help='GA4GH Cohort/PhenoPacket File')
parser.add_argument('--output', help='Path to write PhenotypeMatchedPanels to (JSON)')
parser.add_argument('--hpo', help='Local copy of HPO obo file', required=True)
args = parser.parse_args()
main(ga4gh_cohort_file=args.input, panel_out=args.output, hpo_file=args.hpo)
main(ga4gh_cohort_file=args.input, panel_out=args.output)


def main(ga4gh_cohort_file: str, panel_out: str, hpo_file: str):
def main(ga4gh_cohort_file: str, panel_out: str):
"""
query PanelApp - get all panels and their assc. HPOs
read Cohort/PhenoPacket file
@@ -202,7 +169,6 @@ def main(ga4gh_cohort_file: str, panel_out: str, hpo_file: str):
Args:
ga4gh_cohort_file (str): path to GA4GH Cohort/PhenoPacket file
panel_out (str): where to write PhenotypeMatchedPanels file
hpo_file (str): path to an obo file containing HPO tree
"""

# read the Cohort/PhenoPacket file as a JSON
@@ -217,7 +183,7 @@

# match HPO terms to panel IDs
# returns a lookup of each HPO term in the cohort, and panels it is associated with
hpo_to_panels = match_hpos_to_panels(hpo_panel_map=panels_by_hpo, all_hpos=all_hpos, hpo_file=hpo_file)
hpo_to_panels = match_hpos_to_panels(hpo_panel_map=panels_by_hpo, all_hpos=all_hpos)

match_participants_to_panels(pmp_dict, hpo_to_panels)
