Merge pull request #20 from microbiomedata/19-refgraph-process-schema-directly-instead-of-requiring-refscan-generated-tsv-file

`refgraph`: Read schema directly instead of requiring TSV file generated by `refscan`
microbiomedata · Sep 6, 2024 · 62b9de1 · 62b9de1
2 parents 7f197f6 + 5ea3b9d
commit 62b9de1
Show file tree

Hide file tree

Showing 8 changed files with 248 additions and 52 deletions.
diff --git a/refscan/lib/Reference.py b/refscan/lib/Reference.py
@@ -16,3 +16,22 @@ class Reference:
     source_field_name: str = field()  # e.g. "part_of"
     target_collection_name: str = field()  # e.g. "study_set" (reminder: a study can be part of another study)
     target_class_name: str = field()  # e.g. "Study"
+
+    def __eq__(self, other):
+        r"""
+        Determines whether an instance of this class is equal to the specified "other" value.
+
+        Note: For a value that is not a `Reference`, this returns `NotImplemented` (not `False`), so
+              Python can try the other operand's `__eq__` before falling back to an identity check.
+              Reference: https://docs.python.org/3/reference/datamodel.html#object.__eq__
+        """
+
+        if not isinstance(other, Reference):
+            return NotImplemented
+        return (
+            self.source_collection_name == other.source_collection_name
+            and self.source_class_name == other.source_class_name
+            and self.source_field_name == other.source_field_name
+            and self.target_collection_name == other.target_collection_name
+            and self.target_class_name == other.target_class_name
+        )
diff --git a/refscan/lib/helpers.py b/refscan/lib/helpers.py
@@ -6,6 +6,8 @@
 from rich.console import Console
 from rich.progress import Progress, TextColumn, MofNCompleteColumn, BarColumn, TimeElapsedColumn, TimeRemainingColumn
 
+from refscan.lib.ReferenceList import ReferenceList
+from refscan.lib.Reference import Reference
 from refscan.lib.constants import DATABASE_CLASS_NAME, console
 
 
@@ -174,3 +176,47 @@ def get_names_of_classes_in_effective_range_of_slot(
     names_of_eligible_target_classes = list(set(names_of_eligible_target_classes))
 
     return names_of_eligible_target_classes
+
+
+def identify_references(
+    schema_view: SchemaView,
+    collection_name_to_class_names: dict[str, list[str]],
+) -> ReferenceList:
+    r"""
+    Builds a `ReferenceList` of every inter-document reference that the schema allows to exist.
+
+    A reference is cataloged for each (source collection, source class, slot, target collection, target class)
+    combination in which the target class is in the slot's effective range and can be stored in a collection.
+
+    Note: This list is derived from the schema alone. There is no database involved.
+    """
+
+    found_references = ReferenceList()
+
+    # Visit the source collections in the order determined by the `get_lowercase_key` sort key.
+    for collection_name, class_names in sorted(collection_name_to_class_names.items(), key=get_lowercase_key):
+        for class_name in class_names:
+            for slot_name in schema_view.class_slots(class_name):
+                # Resolve the slot as it is used on this particular class, then get its "effective" range.
+                induced_slot = schema_view.induced_slot(slot_name=slot_name, class_name=class_name)
+                target_class_names = get_names_of_classes_in_effective_range_of_slot(
+                    schema_view=schema_view,
+                    slot_definition=induced_slot,
+                )
+
+                # Catalog one reference per (target class, collection that can store that target class) pair.
+                for target_class_name in target_class_names:
+                    for target_collection_name, storable_class_names in collection_name_to_class_names.items():
+                        if target_class_name not in storable_class_names:
+                            continue
+                        found_references.append(
+                            Reference(
+                                source_collection_name=collection_name,
+                                source_class_name=class_name,
+                                source_field_name=slot_name,
+                                target_collection_name=target_collection_name,
+                                target_class_name=target_class_name,
+                            )
+                        )
+
+    return found_references
diff --git a/refscan/refgraph.py b/refscan/refgraph.py
@@ -2,21 +2,24 @@
 from pathlib import Path
 from typing import Optional
 from typing_extensions import Annotated
-import csv
 import json
 import base64
 from importlib import resources
 
 import typer
+import linkml_runtime
 
 from refscan.lib.constants import console
 from refscan.lib.helpers import (
     print_section_header,
+    get_collection_names_from_schema,
+    get_names_of_classes_eligible_for_collection,
+    identify_references,
 )
-from refscan.lib.Reference import Reference
+from refscan.refscan import display_app_version_and_exit
 
 app = typer.Typer(
-    help="Generates an interactive graph (network diagram) of a reference report.",
+    help="Generates an interactive graph (network diagram) of the references described by a schema.",
     add_completion=False,  # hides the shell completion options from `--help` output
     rich_markup_mode="markdown",  # enables use of Markdown in docstrings and CLI help
 )
@@ -45,15 +48,16 @@ def load_template(resource_path: str) -> str:
 
 @app.command("graph")
 def graph(
-    reference_report_file_path: Annotated[
+    # Reference: https://typer.tiangolo.com/tutorial/parameter-types/path/
+    schema_file_path: Annotated[
         Path,
         typer.Option(
-            "--reference-report",
+            "--schema",
             dir_okay=False,
             writable=False,
             readable=True,
             resolve_path=True,
-            help="Filesystem path at which the reference report resides.",
+            help="Filesystem path at which the YAML file representing the schema is located.",
         ),
     ],
     graph_file_path: Annotated[
@@ -82,21 +86,51 @@ def graph(
             help="Show verbose output.",
         ),
     ] = False,
+    version: Annotated[
+        Optional[bool],
+        typer.Option(
+            "--version",
+            callback=display_app_version_and_exit,
+            is_eager=True,  # tells Typer to process this option first
+            help="Show version number and exit.",
+        ),
+    ] = None,
 ):
     r"""
-    Generates an interactive graph (network diagram) of a reference report.
+    Generates an interactive graph (network diagram) of the references described by a schema.
     """
 
-    print_section_header(console, text="Reading reference report")
+    print_section_header(console, text="Reading schema")
 
-    # Instantiate a `Reference` for each data row in the reference report.
-    references = []
-    with open(reference_report_file_path, "r") as f:
-        reader = csv.DictReader(f, delimiter="\t")  # automatically gets field names from first row
-        for reference_dict in reader:
-            assert isinstance(reference_dict, dict)  # added because, otherwise, PyCharm thinks it's a `str`
-            reference = Reference(**reference_dict)  # uses dict (i.e. keys and their values) as kwargs
-            references.append(reference)
+    # Instantiate a `linkml_runtime.SchemaView` bound to the specified schema.
+    if verbose:
+        console.print(f"Schema YAML file: {schema_file_path}")
+    schema_view = linkml_runtime.SchemaView(schema_file_path)
+
+    # Show high-level information about the schema.
+    console.print(f"Schema version: {schema_view.schema.version}")
+
+    # Show a header on the console, to tell the user which stage of execution we're entering.
+    print_section_header(console, text="Identifying references")
+
+    # Get a list of collection names (technically, `Database` slot names) from the schema.
+    # e.g. ["study_set", ...]
+    collection_names = get_collection_names_from_schema(schema_view)
+    console.print(f"Collections described by schema: {len(collection_names)}")
+
+    # For each collection, determine the names of the classes whose instances can be stored in that collection.
+    collection_name_to_class_names = {}  # example: { "study_set": ["Study"] }
+    for collection_name in sorted(collection_names):
+        collection_name_to_class_names[collection_name] = get_names_of_classes_eligible_for_collection(
+            schema_view=schema_view,
+            collection_name=collection_name,
+        )
+
+    # Identify the inter-document references that the schema allows to exist.
+    references = identify_references(
+        schema_view=schema_view, collection_name_to_class_names=collection_name_to_class_names
+    )
+    console.print(f"References described by schema: {len(references)}")
 
     console.print(f"References: {len(references)}")
     if verbose:
@@ -152,8 +186,8 @@ def graph(
     graph_data_json_bytes = graph_data_json_str.encode("utf-8")
     graph_data_json_base64 = base64.b64encode(graph_data_json_bytes)
     graph_data_json_base64_str = graph_data_json_base64.decode("utf-8")
-    placeholder = "{{ graph_data_json_base64 }}"
-    html_result = html_template.replace(placeholder, graph_data_json_base64_str)
+    html_result = html_template.replace("{{ schema_version }}", str(schema_view.schema.version))  # may be `None`
+    html_result = html_result.replace("{{ graph_data_json_base64 }}", graph_data_json_base64_str)
 
     if verbose:
         console.print(html_result)

diff --git a/refscan/refscan.py b/refscan/refscan.py
@@ -15,10 +15,8 @@
     get_lowercase_key,
     print_section_header,
     get_names_of_classes_eligible_for_collection,
-    get_names_of_classes_in_effective_range_of_slot,
+    identify_references,
 )
-from refscan.lib.Reference import Reference
-from refscan.lib.ReferenceList import ReferenceList
 from refscan.lib.Violation import Violation
 from refscan.lib.ViolationList import ViolationList
 from refscan import get_package_metadata
@@ -176,37 +174,10 @@ def scan(
             collection_name=collection_name,
         )
 
-    # Initialize the list of references. A reference is effectively a "foreign key" (i.e. a pointer).
-    references = ReferenceList()
-
-    # For each class whose instances can be stored in each collection, determine which of its slots can be a reference.
-    sorted_collection_names_to_class_names = sorted(collection_name_to_class_names.items(), key=get_lowercase_key)
-    for collection_name, class_names in sorted_collection_names_to_class_names:
-        for class_name in class_names:
-            for slot_name in schema_view.class_slots(class_name):
-
-                # Get the slot definition in the context of its use on this particular class.
-                slot_definition = schema_view.induced_slot(slot_name=slot_name, class_name=class_name)
-
-                # Determine the slot's "effective" range, taking into account its `any_of` constraint (if it has one).
-                names_of_eligible_target_classes = get_names_of_classes_in_effective_range_of_slot(
-                    schema_view=schema_view,
-                    slot_definition=slot_definition,
-                )
-
-                # For each of those classes whose instances can be stored in any collection, catalog a reference.
-                for name_of_eligible_target_class in names_of_eligible_target_classes:
-                    for target_collection_name, class_names_in_collection in collection_name_to_class_names.items():
-                        if name_of_eligible_target_class in class_names_in_collection:
-                            reference = Reference(
-                                source_collection_name=collection_name,
-                                source_class_name=class_name,
-                                source_field_name=slot_name,
-                                target_collection_name=target_collection_name,
-                                target_class_name=name_of_eligible_target_class,
-                            )
-                            references.append(reference)
-
+    # Identify the inter-document references that the schema allows to exist.
+    references = identify_references(
+        schema_view=schema_view, collection_name_to_class_names=collection_name_to_class_names
+    )
     console.print(f"References described by schema: {len(references)}")
 
     num_collections_having_references = references.count_source_collections()

diff --git a/refscan/templates/graph.template.html b/refscan/templates/graph.template.html
@@ -29,6 +29,9 @@
 
 <script>
     (function () {
+        // Note: A Python script will replace the `{{ ... }}` placeholder.
+        console.log("Schema version: {{ schema_version }}");
+
         // Extract the graph data, base64decode it, and parse it as JSON.
         // Reference: https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/dataset
         let graphData = []

diff --git a/tests/schemas/database_with_references.yaml b/tests/schemas/database_with_references.yaml
@@ -0,0 +1,32 @@
+id: my-schema
+name: MySchema
+
+classes:
+  Database:
+    slots:
+      - company_set
+      - employee_set
+  Company:
+    slots:
+      - employs
+  Employee:
+    slots:
+      - works_for
+      - managed_by
+
+slots:
+  company_set:
+    inlined_as_list: true
+    multivalued: true
+    range: Company
+  employee_set:
+    inlined_as_list: true
+    multivalued: true
+    range: Employee
+  works_for:
+    range: Company
+  employs:
+    range: Employee
+  managed_by:
+    range: Employee
+
diff --git a/tests/test_Reference.py b/tests/test_Reference.py
@@ -0,0 +1,49 @@
+from refscan.lib.Reference import Reference
+
+
+def test_eq():
+    r"""
+    Checks that `==`, `!=`, and `in` compare `Reference` instances by their field values, whereas `is`
+    compares them by identity.
+    """
+
+    def make_reference(target_class_name: str) -> Reference:
+        r"""Returns a `Reference` in which only the `target_class_name` field varies."""
+        return Reference(
+            source_collection_name="foo_set",
+            source_class_name="Foo",
+            source_field_name="owns",
+            target_collection_name="bar_set",
+            target_class_name=target_class_name,
+        )
+
+    # `ref_a` and `ref_c` have the same field values; `ref_b` differs from them in one field.
+    ref_a = make_reference("Bar")
+    ref_b = make_reference("Baz")  # not "Bar"
+    ref_c = make_reference("Bar")
+    all_refs = [ref_a, ref_b, ref_c]
+
+    # Focus on `==`.
+    assert ref_a == ref_a
+    assert ref_b == ref_b
+    assert ref_c == ref_c
+    assert ref_a == ref_c
+    assert ref_c == ref_a
+
+    # Focus on `!=` (in both operand orders).
+    assert ref_a != ref_b
+    assert ref_b != ref_a
+    assert ref_b != ref_c
+    assert ref_c != ref_b
+
+    # Focus on `in`.
+    assert ref_a in all_refs
+    assert ref_b in all_refs
+    assert ref_c in all_refs
+
+    # A list that contains only `ref_b` holds nothing that is identical or equal to `ref_a`.
+    assert ref_a not in [ref_b]
+
+    # Focus on `is`.
+    assert ref_a is ref_a
+    assert ref_a is not ref_c  # equal, but not identical