From 0a21463af2c78e2374c427053957553f4a49f49c Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 1 Oct 2024 11:56:11 -0400 Subject: [PATCH 1/2] Add SQLite-based ontology --- indra/ontology/bio/sqlite_ontology.py | 149 ++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 indra/ontology/bio/sqlite_ontology.py diff --git a/indra/ontology/bio/sqlite_ontology.py b/indra/ontology/bio/sqlite_ontology.py new file mode 100644 index 0000000000..202913a944 --- /dev/null +++ b/indra/ontology/bio/sqlite_ontology.py @@ -0,0 +1,149 @@ +"""This module implements an SQLite back end to the +INDRA BioOntology.""" +import os +import sqlite3 +import logging +from collections import defaultdict +from indra.ontology.ontology_graph import IndraOntology +from indra.ontology.bio.ontology import CACHE_DIR +from indra.ontology.bio import bio_ontology + + +logger = logging.getLogger(__name__) + + +DEFAULT_SQLITE_ONTOLOGY = os.path.join(CACHE_DIR, 'bio_ontology.db') + + +class SqliteOntology(IndraOntology): + def __init__(self, db_path=DEFAULT_SQLITE_ONTOLOGY): + super().__init__() + self.db_path = db_path + build_sqlite_ontology(db_path) + conn = sqlite3.connect(db_path) + self.cur = conn.cursor() + + def initialize(self): + self._initialized = True + + def isa_or_partof(self, ns1, id1, ns2, id2): + q = """SELECT 1 FROM relationships + WHERE child_id=? AND child_ns=? AND parent_id=? AND parent_ns=? + LIMIT 1;""" + self.cur.execute(q, (id1, ns1, id2, ns2)) + return self.cur.fetchone() is not None + + def child_rel(self, ns, id, rel_types): + q = """SELECT children FROM child_lookup + WHERE parent_id=? AND parent_ns=? + LIMIT 1;""" + self.cur.execute(q, (id, ns)) + res = self.cur.fetchone() + if res is None: + yield from [] + else: + yield from [tuple(x.split(':', 1)) for x in res[0].split(',')] + + def get_parents(self, ns, id): + return list(self.parent_rel(ns, id, {'isa', 'partof'})) + + def get_children(self, ns, id): + return list(self.child_rel(ns, id, {'isa', 'partof'})) + + def parent_rel(self, ns, id, rel_types): + q = """SELECT parents FROM parent_lookup + WHERE child_id=? AND child_ns=? + LIMIT 1;""" + self.cur.execute(q, (id, ns)) + res = self.cur.fetchone() + if res is None: + yield from [] + else: + yield from [tuple(x.split(':', 1)) for x in res[0].split(',')] + + def get_node_property(self, ns, id, property): + return None + + def get_id_from_name(self, ns, name): + return None + + +def build_sqlite_ontology(db_path=DEFAULT_SQLITE_ONTOLOGY, force=False): + # If the database already exists and we are not forcing a rebuild, return + if os.path.exists(db_path) and not force: + return + + # Initialize the bio ontology and build the transitive closure + bio_ontology.initialize() + bio_ontology._build_transitive_closure() + + # Set up connection + conn = sqlite3.connect(db_path) + cur = conn.cursor() + + logger.info('Building SQLite ontology at %s' % db_path) + # First, we create the relationships table and populate + # it with child/parent pairs + q = """CREATE TABLE relationships ( + child_id TEXT NOT NULL, + child_ns TEXT NOT NULL, + parent_id TEXT NOT NULL, + parent_ns TEXT NOT NULL, + UNIQUE (child_id, child_ns, parent_id, parent_ns) + );""" + cur.execute(q) + + # Insert into the database in chunks + chunk_size = 10000 + tc = sorted(bio_ontology.transitive_closure) + all_children = defaultdict(set) + all_parents = defaultdict(set) + for i in range(0, len(tc), chunk_size): + chunk = tc[i:i+chunk_size] + chunk_values = [(child.split(':', 1)[1], child.split(':')[0], + parent.split(':', 1)[1], parent.split(':')[0]) + for child, parent in chunk] + for cid, cns, pid, pns in chunk_values: + all_children[(pid, pns)].add('%s:%s' % (cns, cid)) + all_parents[(cid, cns)].add('%s:%s' % (pns, pid)) + cur.executemany("""INSERT INTO relationships (child_id, + child_ns, parent_id, parent_ns) + VALUES (?, ?, ?, ?);""", chunk_values) + q = """CREATE INDEX idx_child_parent ON relationships + (child_id, child_ns, parent_id, parent_ns);""" + cur.execute(q) + + # Next, create child and parent lookup tables and populate them + q = """CREATE TABLE child_lookup ( + parent_id TEXT NOT NULL, + parent_ns TEXT NOT NULL, + children TEXT NOT NULL, + UNIQUE (parent_id, parent_ns) + );""" + cur.execute(q) + q = """CREATE TABLE parent_lookup ( + child_id TEXT NOT NULL, + child_ns TEXT NOT NULL, + parents TEXT NOT NULL, + UNIQUE (child_id, child_ns) + );""" + cur.execute(q) + for (pid, pns), children in all_children.items(): + cur.execute("INSERT INTO child_lookup (parent_id, parent_ns, children) " + "VALUES (?, ?, ?);", + (pid, pns, ','.join(children))) + for (cid, cns), parents in all_parents.items(): + cur.execute("INSERT INTO parent_lookup (child_id, child_ns, parents) " + "VALUES (?, ?, ?);", + (cid, cns, ','.join(parents))) + # Now add indices to the lookup tables + q = """CREATE INDEX idx_child_lookup ON child_lookup + (parent_id, parent_ns);""" + cur.execute(q) + q = """CREATE INDEX idx_parent_lookup ON parent_lookup + (child_id, child_ns);""" + cur.execute(q) + + conn.commit() + conn.close() + logger.info('Finished building SQLite ontology') From 190e02acfc314c446f098a8f87f17ce9db0380df Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 19 Nov 2024 13:38:04 -0500 Subject: [PATCH 2/2] Implement node properties --- indra/ontology/bio/__init__.py | 2 +- indra/ontology/bio/sqlite_ontology.py | 47 +++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/indra/ontology/bio/__init__.py b/indra/ontology/bio/__init__.py index 3c88976ca8..969efcca60 100644 --- a/indra/ontology/bio/__init__.py +++ b/indra/ontology/bio/__init__.py @@ -8,4 +8,4 @@ indra_ontology_url = get_config('INDRA_ONTOLOGY_URL') bio_ontology = BioOntology() if not indra_ontology_url else \ - VirtualOntology(url=indra_ontology_url) \ No newline at end of file + VirtualOntology(url=indra_ontology_url) diff --git a/indra/ontology/bio/sqlite_ontology.py b/indra/ontology/bio/sqlite_ontology.py index 202913a944..e0cd8871e6 100644 --- a/indra/ontology/bio/sqlite_ontology.py +++ b/indra/ontology/bio/sqlite_ontology.py @@ -1,6 +1,8 @@ """This module implements an SQLite back end to the INDRA BioOntology.""" + import os +import json import sqlite3 import logging from collections import defaultdict @@ -23,9 +25,6 @@ def __init__(self, db_path=DEFAULT_SQLITE_ONTOLOGY): conn = sqlite3.connect(db_path) self.cur = conn.cursor() - def initialize(self): - self._initialized = True - def isa_or_partof(self, ns1, id1, ns2, id2): q = """SELECT 1 FROM relationships WHERE child_id=? AND child_ns=? AND parent_id=? AND parent_ns=? @@ -47,8 +46,12 @@ def child_rel(self, ns, id, rel_types): def get_parents(self, ns, id): return list(self.parent_rel(ns, id, {'isa', 'partof'})) - def get_children(self, ns, id): - return list(self.child_rel(ns, id, {'isa', 'partof'})) + def get_children(self, ns, id, ns_filter=None): + children = list(self.child_rel(ns, id, {'isa', 'partof'})) + if ns_filter: + children = [(cns, cid) for cns, cid in children + if cns in ns_filter] + return children def parent_rel(self, ns, id, rel_types): q = """SELECT parents FROM parent_lookup @@ -62,7 +65,15 @@ def parent_rel(self, ns, id, rel_types): yield from [tuple(x.split(':', 1)) for x in res[0].split(',')] def get_node_property(self, ns, id, property): - return None + q = """SELECT properties FROM node_properties + WHERE id=? AND ns=? + LIMIT 1;""" + self.cur.execute(q, (id, ns)) + res = self.cur.fetchone() + if res is None: + return None + props = json.loads(res[0]) + return props.get(property) def get_id_from_name(self, ns, name): return None @@ -73,6 +84,13 @@ def build_sqlite_ontology(db_path=DEFAULT_SQLITE_ONTOLOGY, force=False): if os.path.exists(db_path) and not force: return + if force: + try: + logger.info('Removing existing SQLite ontology at %s' % db_path) + os.remove(db_path) + except FileNotFoundError: + pass + # Initialize the bio ontology and build the transitive closure bio_ontology.initialize() bio_ontology._build_transitive_closure() @@ -144,6 +162,23 @@ def build_sqlite_ontology(db_path=DEFAULT_SQLITE_ONTOLOGY, force=False): (child_id, child_ns);""" cur.execute(q) + # Create node property table + # Here we just keep track of the namespace and ID, + # and then put all the data into a json string + q = """CREATE TABLE node_properties ( + id TEXT NOT NULL, + ns TEXT NOT NULL, + properties TEXT NOT NULL, + UNIQUE (id, ns) + );""" + cur.execute(q) + + for node in bio_ontology.nodes: + ns, id = bio_ontology.get_ns_id(node) + props = json.dumps(bio_ontology.nodes[node]) + cur.execute("INSERT INTO node_properties (id, ns, properties) " + "VALUES (?, ?, ?);", (id, ns, props)) + conn.commit() conn.close() logger.info('Finished building SQLite ontology')