Swap out semsimian for hpo3 #458

Open
wants to merge 9 commits into main
5 changes: 1 addition & 4 deletions requirements.txt
@@ -2,18 +2,15 @@ cloudpathlib[all]>=0.16.0
cyvcf2>=0.30.18
dill>=0.3.7
hail==0.2.133
hpo3
httpx>=0.27.0
Jinja2>=3.1.3
networkx>=3
obonet>=1
pandas>=2
peds>=1.2.0
phenopackets>=2
protobuf==3.20.2
pydantic>=2.5.2
pyspark>=3.5.1
python-dateutil>=2
# this is a rust package with a python interface, limited to specific python versions
semsimian
tabulate>=0.8.9
toml==0.10.2
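
For context: hpo3 is a Rust-backed drop-in replacement for the pure-Python PyHPO package. It is installed as hpo3 but imported as pyhpo, and it bundles its own copy of the HPO ontology, which is what lets this PR drop obonet, networkx, and the external obo file. A minimal sketch of the API surface the PR leans on (the example term ID is illustrative):

from pyhpo import Ontology

# instantiating Ontology once loads the ontology data bundled with the hpo3 wheel;
# afterwards the class itself acts as a global lookup table
_ = Ontology()

# fetch a term by its HPO ID (illustrative term: HP:0001250, Seizure)
term = Ontology.get_hpo_object('HP:0001250')
print(term.id, term.name)

# all_parents exposes every ancestor of a term, which is what the MOI filtering
# and the panel matching in this PR rely on
print(len(term.all_parents))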
67 changes: 28 additions & 39 deletions src/talos/CPG/MakePhenopackets.py
@@ -15,17 +15,17 @@

import re
from argparse import ArgumentParser
from collections import defaultdict

import networkx as nx
import phenopackets.schema.v2 as pps2
from google.protobuf.json_format import MessageToJson
from obonet import read_obo
from pyhpo import Ontology

from metamist.graphql import gql, query
from talos.static_values import get_logger

HPO_KEY = 'HPO Terms (present)'
HPO_RE = re.compile(r'HP:[0-9]+')
HPO_RE = re.compile(r'HP:\d+')
PARTICIPANT_QUERY = gql(
"""
query MyQuery($project: String!, $sequencing_type: String!, $technology: String!) {
@@ -48,11 +48,14 @@
}""",
)

# create an ontology object, once
_ = Ontology()

# map the integer reported sex values to the enum
reported_sex_map = {1: pps2.Sex.MALE, 2: pps2.Sex.FEMALE}


def find_hpo_labels(metamist_data: dict, hpo_file: str | None = None) -> dict[str, list[dict[str, str]]]:
def find_hpo_labels(metamist_data: dict) -> dict[str, list[dict[str, str]]]:
"""
match HPO terms to their plaintext names

@@ -62,46 +62,34 @@ def find_hpo_labels(metamist_data: dict, hpo_file: str | None = None) -> dict[st
disease genes and their MOI terms directly, which messes with what we are trying to do:
associate patient _phenotypes_ with variant _genes_.

As we're now using hpo3/pyHPO, which ships with a built-in ontology, we can assume the presence of a valid Ontology,
simplifying the code

Args:
metamist_data ():
hpo_file ():

Returns:
dict, participant IDs to HPO:labels
"""
all_hpos: set[str] = set()
per_sg_hpos: dict[str, set[str]] = {}

moi_nodes: set[str] = set()
hpo_graph = None
if hpo_file:
# create a graph of HPO terms
hpo_graph = read_obo(hpo_file, ignore_obsolete=False)
moi_nodes = nx.ancestors(hpo_graph, 'HP:0000005')
per_sg_hpos: dict[str, list[dict[str, str]]] = defaultdict(list)

for sg in metamist_data['project']['sequencingGroups']:
hpos = set(HPO_RE.findall(sg['sample']['participant']['phenotypes'].get(HPO_KEY, '')))

# groom out any strictly MOI related terms
hpos -= moi_nodes

all_hpos.update(hpos)
per_sg_hpos[sg['id']] = hpos

# no label obtained, that's fine...
if not (hpo_file and hpo_graph):
return {sg: [{'id': hp, 'label': 'Unknown'} for hp in hpos] for sg, hpos in per_sg_hpos.items()}
# select all HPO terms, so long as they are not a child of 'Mode of Inheritance' (HP:0000005)
hpos = {
hpo_term
for hpo_term in HPO_RE.findall(sg['sample']['participant']['phenotypes'].get(HPO_KEY, ''))
if 'HP:0000005' not in Ontology.get_hpo_object(hpo_term).all_parents
}

# create a dictionary of HPO terms to their text
hpo_to_text: dict[str, str] = {}
for hpo in all_hpos:
if not hpo_graph.has_node(hpo):
get_logger(__file__).error(f'HPO term was absent from the tree: {hpo}')
hpo_to_text[hpo] = 'Unknown'
else:
hpo_to_text[hpo] = hpo_graph.nodes[hpo]['name']
# allow for HPO terms to be missing from this edition of the ontology
for hpo in hpos:
try:
per_sg_hpos[sg['id']].append({'id': hpo, 'label': Ontology.get_hpo_object(hpo).name})
except ValueError:
get_logger(__file__).error(f'HPO term was absent from the tree: {hpo}')
per_sg_hpos[sg['id']].append({'id': hpo, 'label': 'Unknown'})

return {sg: [{'id': hp, 'label': hpo_to_text[hp]} for hp in hpos] for sg, hpos in per_sg_hpos.items()}
return per_sg_hpos
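
As a standalone illustration of the new logic: a term is kept only when 'Mode of Inheritance' (HP:0000005) is not among its ancestors, and labelling tolerates terms missing from the bundled ontology edition. A minimal sketch, assuming hpo3 raises ValueError for unknown terms (as the except clause above implies; term IDs are illustrative):

from pyhpo import Ontology

_ = Ontology()

def label_non_moi_terms(term_ids: list[str]) -> list[dict[str, str]]:
    # keep a term unless 'Mode of inheritance' (HP:0000005) is one of its ancestors
    labelled = []
    for term_id in term_ids:
        try:
            term = Ontology.get_hpo_object(term_id)
        except ValueError:
            # absent from this edition of the ontology - keep the ID, flag the label
            labelled.append({'id': term_id, 'label': 'Unknown'})
            continue
        if any(parent.id == 'HP:0000005' for parent in term.all_parents):
            continue
        labelled.append({'id': term_id, 'label': term.name})
    return labelled

# HP:0001250 (Seizure) survives; HP:0000006 (Autosomal dominant inheritance) is dropped
print(label_non_moi_terms(['HP:0001250', 'HP:0000006']))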


def assemble_phenopackets(dataset: str, metamist_data: dict, hpo_lookup: dict[str, list[dict[str, str]]]):
@@ -170,15 +161,14 @@ def assemble_phenopackets(dataset: str, metamist_data: dict, hpo_lookup: dict[st
return cohort


def main(output: str, dataset: str, seq_type: str, tech: str = 'short-read', hpo_file: str | None = None):
def main(output: str, dataset: str, seq_type: str, tech: str = 'short-read'):
"""
Assemble a cohort phenopacket from the metamist data
Args:
output (str): where to write the phenopacket
dataset (str): the dataset to query for
seq_type (str): exome/genome
tech (str): type of sequence data to query for
hpo_file (str): path to an obo file containing HPO tree
"""

# pull all the relevant data from metamist
@@ -188,7 +178,7 @@ def main(output: str, dataset: str, seq_type: str, tech: str = 'short-read', hpo
)

# match names to HPO terms
labelled_hpos = find_hpo_labels(hpo_file=hpo_file, metamist_data=metamist_data)
labelled_hpos = find_hpo_labels(metamist_data=metamist_data)

# build the cohort
cohort = assemble_phenopackets(dataset=dataset, metamist_data=metamist_data, hpo_lookup=labelled_hpos)
@@ -203,11 +193,10 @@ def cli_main():
parser.add_argument('--dataset', help='The dataset to query for')
parser.add_argument('--output', help='The output file')
parser.add_argument('--type', help='Sequencing type (exome or genome)')
parser.add_argument('--hpo', help='HPO ontology file')
parser.add_argument('--tech', help='Sequencing technology', default='short-read')
args = parser.parse_args()

main(dataset=args.dataset, output=args.output, seq_type=args.type, hpo_file=args.hpo, tech=args.tech)
main(dataset=args.dataset, output=args.output, seq_type=args.type, tech=args.tech)


if __name__ == '__main__':
3 changes: 2 additions & 1 deletion src/talos/FindGeneSymbolMap.py
@@ -34,7 +34,8 @@ async def match_ensgs_to_symbols(genes: list[str], session: ClientSession) -> di
r.raise_for_status()
json_reponse = await r.json()
# match symbol to the ENSG (or Unknown if the key is missing, or has a None value)
return {value.get('display_name'): key for key, value in json_reponse.items() if value}
# we want the key to be ENSG
return {key: value.get('display_name') for key, value in json_reponse.items() if value}


async def match_symbol_to_ensg(gene_symbol: str, session: ClientSession) -> tuple[str, str]:
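
The one-line fix above inverts the mapping direction: the Ensembl stable ID becomes the key and the display name the value. A toy demonstration with a mocked payload (the response shape is an assumption based on Ensembl's POST lookup/id endpoint, where unresolved IDs come back as null):

# mocked Ensembl lookup response: ID -> record, with None for unresolved IDs
json_reponse = {
    'ENSG00000157764': {'display_name': 'BRAF'},
    'ENSG00000141510': {'display_name': 'TP53'},
    'ENSG00000000000': None,
}

# before: {'BRAF': 'ENSG00000157764', ...} - symbol-keyed, the wrong way around
before = {value.get('display_name'): key for key, value in json_reponse.items() if value}

# after: {'ENSG00000157764': 'BRAF', ...} - ENSG-keyed, as callers expect
after = {key: value.get('display_name') for key, value in json_reponse.items() if value}

assert after['ENSG00000157764'] == 'BRAF'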
88 changes: 27 additions & 61 deletions src/talos/GeneratePanelData.py
@@ -12,8 +12,7 @@

import phenopackets.schema.v2 as pps2
from google.protobuf.json_format import ParseDict
from networkx import MultiDiGraph
from obonet import read_obo
from pyhpo import Ontology

from talos.config import config_retrieve
from talos.models import ParticipantHPOPanels, PhenoPacketHpo, PhenotypeMatchedPanels
@@ -33,6 +32,9 @@
PANELS_ENDPOINT = PANELAPP_HARD_CODED_DEFAULT
DEFAULT_PANEL = PANELAPP_HARD_CODED_BASE_PANEL

# create an ontology object, once
_ = Ontology()


def get_panels(endpoint: str = PANELS_ENDPOINT) -> dict[str, set[int]]:
"""
@@ -89,73 +91,39 @@ def set_up_cohort_pmp(cohort: pps2.Cohort) -> tuple[PhenotypeMatchedPanels, set[
return hpo_dict, all_hpos


def match_hpo_terms(
panel_map: dict[str, set[int]],
hpo_tree: MultiDiGraph,
hpo_str: str,
selections: set[int] | None = None,
) -> set[int]:
"""
get panels relevant for this HPO using a recursive edge traversal
for live terms we recurse on all parents
if a term is obsolete we instead check each replacement term

relevant usage guide:
https://github.com/dhimmel/obonet/blob/main/examples/go-obonet.ipynb

Args:
panel_map (dict):
hpo_tree (): a graph object representing the HPO tree
hpo_str (str): the query HPO term
selections (set[int]): collected panel IDs so far

Returns:
set: panel IDs relating to this HPO term
"""

if selections is None:
selections = set()

# identify identical match and select the panel
if hpo_str in panel_map:
selections.update(panel_map[hpo_str])

# if a node is invalid, recursively call this method for each replacement D:
# there are simpler ways, just none that are as fun to write
if not hpo_tree.has_node(hpo_str):
get_logger().error(f'HPO term was absent from the tree: {hpo_str}')
return selections

hpo_node = hpo_tree.nodes[hpo_str]
if hpo_node.get('is_obsolete', 'false') == 'true':
for hpo_term in hpo_node.get('replaced_by', []):
selections.update(match_hpo_terms(panel_map, hpo_tree, hpo_term, selections))
# search for parent(s), even if the term is obsolete
for hpo_term in hpo_node.get('is_a', []):
selections.update(match_hpo_terms(panel_map, hpo_tree, hpo_term, selections))
return selections


def match_hpos_to_panels(hpo_panel_map: dict[str, set[int]], hpo_file: str, all_hpos: set[str]) -> dict[str, set[int]]:
def match_hpos_to_panels(hpo_panel_map: dict[str, set[int]], all_hpos: set[str]) -> dict[str, set[int]]:
"""
take the HPO terms from the participant metadata, and match to panels

greatly simplified implementation, as we're now using hpo3/pyHPO, which ships with a built-in ontology
we're checking for clashes between PanelApp contents and each exact term, or its ancestors

Args:
hpo_panel_map (dict): panel IDs to all related panels
hpo_file (str): path to an obo file containing HPO tree
all_hpos (set[str]): collection of all unique HPO terms

Returns:
a dictionary linking all HPO terms to a corresponding set of Panel IDs
a second dictionary linking all HPO terms to their plaintext names
"""

hpo_graph = read_obo(hpo_file, ignore_obsolete=False)

hpo_to_panels = {}
hpo_to_panels: dict[str, set[int]] = {}
for hpo in all_hpos:
panel_ids = match_hpo_terms(panel_map=hpo_panel_map, hpo_tree=hpo_graph, hpo_str=hpo)
hpo_to_panels[hpo] = panel_ids
# set must exist, even if empty - defaultdict doesn't do this
hpo_to_panels[hpo] = set()

# if we have an exact match, use that as a starting point
if hpo in hpo_panel_map:
hpo_to_panels[hpo] = hpo_panel_map[hpo]

# try and fetch the node in the ontology
try:
hpo_node = Ontology.get_hpo_object(hpo)
for parent in hpo_node.all_parents:
if parent.id in hpo_panel_map:
hpo_to_panels[hpo].update(hpo_panel_map[parent.id])
except ValueError:
get_logger(__file__).error(f'HPO term was absent from the tree: {hpo}')

return hpo_to_panels
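
The net effect of this rewrite: the recursive obonet edge traversal (which also chased replaced_by links for obsolete terms) becomes a flat union over the exact term and its complete ancestor set. A runnable toy under the same assumptions (the panel map entries and IDs are made up; in production they come from PanelApp):

from pyhpo import Ontology

_ = Ontology()

# hypothetical PanelApp-style map: HPO term -> panel IDs tagged with it
hpo_panel_map: dict[str, set[int]] = {
    'HP:0000118': {137},   # Phenotypic abnormality
    'HP:0012638': {245},   # Abnormal nervous system physiology
}

def panels_for_term(hpo: str) -> set[int]:
    # start from the exact match, if any
    panels: set[int] = set(hpo_panel_map.get(hpo, set()))
    try:
        term = Ontology.get_hpo_object(hpo)
    except ValueError:
        return panels  # unknown term: exact matches only
    # any panel tagged with an ancestor of this term is also relevant
    for parent in term.all_parents:
        panels |= hpo_panel_map.get(parent.id, set())
    return panels

# HP:0001250 (Seizure) sits beneath both mapped terms, so both panels match
print(panels_for_term('HP:0001250'))  # {137, 245}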

@@ -187,12 +155,11 @@ def cli_main():
parser = ArgumentParser()
parser.add_argument('--input', help='GA4GH Cohort/PhenoPacket File')
parser.add_argument('--output', help='Path to write PhenotypeMatchedPanels to (JSON)')
parser.add_argument('--hpo', help='Local copy of HPO obo file', required=True)
args = parser.parse_args()
main(ga4gh_cohort_file=args.input, panel_out=args.output, hpo_file=args.hpo)
main(ga4gh_cohort_file=args.input, panel_out=args.output)


def main(ga4gh_cohort_file: str, panel_out: str, hpo_file: str):
def main(ga4gh_cohort_file: str, panel_out: str):
"""
query PanelApp - get all panels and their assc. HPOs
read Cohort/PhenoPacket file
@@ -202,7 +169,6 @@ def main(ga4gh_cohort_file: str, panel_out: str, hpo_file: str):
Args:
ga4gh_cohort_file (str): path to GA4GH Cohort/PhenoPacket file
panel_out (str): where to write PhenotypeMatchedPanels file
hpo_file (str): path to an obo file containing HPO tree
"""

# read the Cohort/PhenoPacket file as a JSON
@@ -217,7 +183,7 @@

# match HPO terms to panel IDs
# returns a lookup of each HPO term in the cohort, and panels it is associated with
hpo_to_panels = match_hpos_to_panels(hpo_panel_map=panels_by_hpo, all_hpos=all_hpos, hpo_file=hpo_file)
hpo_to_panels = match_hpos_to_panels(hpo_panel_map=panels_by_hpo, all_hpos=all_hpos)

match_participants_to_panels(pmp_dict, hpo_to_panels)
