Skip to content

Commit

Permalink
Merge pull request #20 from microbiomedata/19-refgraph-process-schema…
Browse files Browse the repository at this point in the history
…-directly-instead-of-requiring-refscan-generated-tsv-file

`refgraph`: Read schema directly instead of requiring TSV file generated by `refscan`
  • Loading branch information
eecavanna authored Sep 6, 2024
2 parents 7f197f6 + 5ea3b9d commit 62b9de1
Show file tree
Hide file tree
Showing 8 changed files with 248 additions and 52 deletions.
19 changes: 19 additions & 0 deletions refscan/lib/Reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,22 @@ class Reference:
source_field_name: str = field() # e.g. "part_of"
target_collection_name: str = field() # e.g. "study_set" (reminder: a study can be part of another study)
target_class_name: str = field() # e.g. "Study"

def __eq__(self, other):
    r"""
    Determines whether an instance of this class is equal to the specified "other" value.

    Two instances are considered equal when their source collection name, source class name,
    source field name, target collection name, and target class name all match.

    Note: This method dictates what will happen under the hood when the `==` operator is used.
    Note: When "other" is not a `Reference`, this method returns `NotImplemented` (instead of `False`),
          which lets Python try the other operand's comparison before it falls back to its default
          behavior. The result of `==` and `!=` against unrelated types remains `False` and `True`.
    Reference: https://docs.python.org/3/reference/datamodel.html#object.__eq__
    """

    # Decline to compare against values of other types; Python handles the fallback.
    if not isinstance(other, Reference):
        return NotImplemented

    return (
        self.source_collection_name == other.source_collection_name
        and self.source_class_name == other.source_class_name
        and self.source_field_name == other.source_field_name
        and self.target_collection_name == other.target_collection_name
        and self.target_class_name == other.target_class_name
    )
46 changes: 46 additions & 0 deletions refscan/lib/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from rich.console import Console
from rich.progress import Progress, TextColumn, MofNCompleteColumn, BarColumn, TimeElapsedColumn, TimeRemainingColumn

from refscan.lib.ReferenceList import ReferenceList
from refscan.lib.Reference import Reference
from refscan.lib.constants import DATABASE_CLASS_NAME, console


Expand Down Expand Up @@ -174,3 +176,47 @@ def get_names_of_classes_in_effective_range_of_slot(
names_of_eligible_target_classes = list(set(names_of_eligible_target_classes))

return names_of_eligible_target_classes


def identify_references(
    schema_view: SchemaView,
    collection_name_to_class_names: dict[str, list[str]],
) -> ReferenceList:
    r"""
    Builds a `ReferenceList` cataloging every inter-document reference that the schema allows to exist.

    A reference is cataloged for each combination of (source collection, source class, slot, target class,
    target collection) in which the target class is in the slot's effective range and is also listed,
    in `collection_name_to_class_names`, among the classes of the target collection.

    Note: The list is derived from the schema alone. There is no database involved.

    :param schema_view: The `SchemaView` used to look up each class's slots and their induced definitions.
    :param collection_name_to_class_names: Maps each collection name to the names of the classes whose
                                           instances can be stored in that collection.
    :return: The `ReferenceList` containing one `Reference` per combination found.
    """

    result = ReferenceList()

    # Process the source collections in sorted order, using `get_lowercase_key` as the sort key.
    collections_in_order = sorted(collection_name_to_class_names.items(), key=get_lowercase_key)

    for source_collection, source_classes in collections_in_order:
        for source_class in source_classes:
            for field_name in schema_view.class_slots(source_class):

                # Look at the slot as it is used on this particular class.
                induced = schema_view.induced_slot(slot_name=field_name, class_name=source_class)

                # Resolve the slot's "effective" range (the helper accounts for `any_of`).
                target_classes = get_names_of_classes_in_effective_range_of_slot(
                    schema_view=schema_view,
                    slot_definition=induced,
                )

                # Pair each target class with every collection that can store its instances.
                for target_class in target_classes:
                    for target_collection, members in collection_name_to_class_names.items():
                        if target_class not in members:
                            continue

                        result.append(
                            Reference(
                                source_collection_name=source_collection,
                                source_class_name=source_class,
                                source_field_name=field_name,
                                target_collection_name=target_collection,
                                target_class_name=target_class,
                            )
                        )

    return result
70 changes: 52 additions & 18 deletions refscan/refgraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,24 @@
from pathlib import Path
from typing import Optional
from typing_extensions import Annotated
import csv
import json
import base64
from importlib import resources

import typer
import linkml_runtime

from refscan.lib.constants import console
from refscan.lib.helpers import (
print_section_header,
get_collection_names_from_schema,
get_names_of_classes_eligible_for_collection,
identify_references,
)
from refscan.lib.Reference import Reference
from refscan.refscan import display_app_version_and_exit

app = typer.Typer(
help="Generates an interactive graph (network diagram) of a reference report.",
help="Generates an interactive graph (network diagram) of the references described by a schema.",
add_completion=False, # hides the shell completion options from `--help` output
rich_markup_mode="markdown", # enables use of Markdown in docstrings and CLI help
)
Expand Down Expand Up @@ -45,15 +48,16 @@ def load_template(resource_path: str) -> str:

@app.command("graph")
def graph(
reference_report_file_path: Annotated[
# Reference: https://typer.tiangolo.com/tutorial/parameter-types/path/
schema_file_path: Annotated[
Path,
typer.Option(
"--reference-report",
"--schema",
dir_okay=False,
writable=False,
readable=True,
resolve_path=True,
help="Filesystem path at which the reference report resides.",
help="Filesystem path at which the YAML file representing the schema is located.",
),
],
graph_file_path: Annotated[
Expand Down Expand Up @@ -82,21 +86,51 @@ def graph(
help="Show verbose output.",
),
] = False,
version: Annotated[
Optional[bool],
typer.Option(
"--version",
callback=display_app_version_and_exit,
is_eager=True, # tells Typer to process this option first
help="Show version number and exit.",
),
] = None,
):
r"""
Generates an interactive graph (network diagram) of a reference report.
Generates an interactive graph (network diagram) of the references described by a schema.
"""

print_section_header(console, text="Reading reference report")
print_section_header(console, text="Reading schema")

# Instantiate a `Reference` for each data row in the reference report.
references = []
with open(reference_report_file_path, "r") as f:
reader = csv.DictReader(f, delimiter="\t") # automatically gets field names from first row
for reference_dict in reader:
assert isinstance(reference_dict, dict) # added because, otherwise, PyCharm thinks it's a `str`
reference = Reference(**reference_dict) # uses dict (i.e. keys and their values) as kwargs
references.append(reference)
# Instantiate a `linkml_runtime.SchemaView` bound to the specified schema.
if verbose:
console.print(f"Schema YAML file: {schema_file_path}")
schema_view = linkml_runtime.SchemaView(schema_file_path)

# Show high-level information about the schema.
console.print(f"Schema version: {schema_view.schema.version}")

# Show a header on the console, to tell the user which stage of execution we're entering.
print_section_header(console, text="Identifying references")

# Get a list of collection names (technically, `Database` slot names) from the schema.
# e.g. ["study_set", ...]
collection_names = get_collection_names_from_schema(schema_view)
console.print(f"Collections described by schema: {len(collection_names)}")

# For each collection, determine the names of the classes whose instances can be stored in that collection.
collection_name_to_class_names = {} # example: { "study_set": ["Study"] }
for collection_name in sorted(collection_names):
collection_name_to_class_names[collection_name] = get_names_of_classes_eligible_for_collection(
schema_view=schema_view,
collection_name=collection_name,
)

# Identify the inter-document references that the schema allows to exist.
references = identify_references(
schema_view=schema_view, collection_name_to_class_names=collection_name_to_class_names
)
console.print(f"References described by schema: {len(references)}")

console.print(f"References: {len(references)}")
if verbose:
Expand Down Expand Up @@ -152,8 +186,8 @@ def graph(
graph_data_json_bytes = graph_data_json_str.encode("utf-8")
graph_data_json_base64 = base64.b64encode(graph_data_json_bytes)
graph_data_json_base64_str = graph_data_json_base64.decode("utf-8")
placeholder = "{{ graph_data_json_base64 }}"
html_result = html_template.replace(placeholder, graph_data_json_base64_str)
html_result = html_template.replace("{{ schema_version }}", schema_view.schema.version)
html_result = html_result.replace("{{ graph_data_json_base64 }}", graph_data_json_base64_str)

if verbose:
console.print(html_result)
Expand Down
39 changes: 5 additions & 34 deletions refscan/refscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@
get_lowercase_key,
print_section_header,
get_names_of_classes_eligible_for_collection,
get_names_of_classes_in_effective_range_of_slot,
identify_references,
)
from refscan.lib.Reference import Reference
from refscan.lib.ReferenceList import ReferenceList
from refscan.lib.Violation import Violation
from refscan.lib.ViolationList import ViolationList
from refscan import get_package_metadata
Expand Down Expand Up @@ -176,37 +174,10 @@ def scan(
collection_name=collection_name,
)

# Initialize the list of references. A reference is effectively a "foreign key" (i.e. a pointer).
references = ReferenceList()

# For each class whose instances can be stored in each collection, determine which of its slots can be a reference.
sorted_collection_names_to_class_names = sorted(collection_name_to_class_names.items(), key=get_lowercase_key)
for collection_name, class_names in sorted_collection_names_to_class_names:
for class_name in class_names:
for slot_name in schema_view.class_slots(class_name):

# Get the slot definition in the context of its use on this particular class.
slot_definition = schema_view.induced_slot(slot_name=slot_name, class_name=class_name)

# Determine the slot's "effective" range, taking into account its `any_of` constraint (if it has one).
names_of_eligible_target_classes = get_names_of_classes_in_effective_range_of_slot(
schema_view=schema_view,
slot_definition=slot_definition,
)

# For each of those classes whose instances can be stored in any collection, catalog a reference.
for name_of_eligible_target_class in names_of_eligible_target_classes:
for target_collection_name, class_names_in_collection in collection_name_to_class_names.items():
if name_of_eligible_target_class in class_names_in_collection:
reference = Reference(
source_collection_name=collection_name,
source_class_name=class_name,
source_field_name=slot_name,
target_collection_name=target_collection_name,
target_class_name=name_of_eligible_target_class,
)
references.append(reference)

# Identify the inter-document references that the schema allows to exist.
references = identify_references(
schema_view=schema_view, collection_name_to_class_names=collection_name_to_class_names
)
console.print(f"References described by schema: {len(references)}")

num_collections_having_references = references.count_source_collections()
Expand Down
3 changes: 3 additions & 0 deletions refscan/templates/graph.template.html
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@

<script>
(function () {
// Note: A Python script will replace the `{{ ... }}` placeholder.
console.log("Schema version: {{ schema_version }}");

// Extract the graph data, base64decode it, and parse it as JSON.
// Reference: https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/dataset
let graphData = []
Expand Down
32 changes: 32 additions & 0 deletions tests/schemas/database_with_references.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Small schema stored under `tests/schemas/` (presumably a fixture for the test suite; confirm against the tests).
# It describes a `Database` with two collection slots and three slots whose ranges are classes.
id: my-schema
name: MySchema

# Classes: `Database` lists the collection slots; `Company` and `Employee` list the class-ranged slots.
classes:
Database:
slots:
- company_set
- employee_set
Company:
slots:
- employs
Employee:
slots:
- works_for
- managed_by

# Slot definitions.
slots:
# Collection slot whose range is `Company`.
company_set:
inlined_as_list: true
multivalued: true
range: Company
# Collection slot whose range is `Employee`.
employee_set:
inlined_as_list: true
multivalued: true
range: Employee
# Used by `Employee`; range is `Company`.
works_for:
range: Company
# Used by `Company`; range is `Employee`.
employs:
range: Employee
# Used by `Employee`; range is `Employee` (i.e. the same class that uses the slot).
managed_by:
range: Employee

49 changes: 49 additions & 0 deletions tests/test_Reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from refscan.lib.Reference import Reference


def test_eq():
    r"""
    Verifies that `Reference` instances are compared by their field values, not by identity,
    and that a `Reference` is never equal to a value that is not a `Reference`.
    """

    ref_a = Reference(
        source_collection_name="foo_set",
        source_class_name="Foo",
        source_field_name="owns",
        target_collection_name="bar_set",
        target_class_name="Bar",
    )
    ref_b = Reference(
        source_collection_name="foo_set",
        source_class_name="Foo",
        source_field_name="owns",
        target_collection_name="bar_set",
        target_class_name="Baz",  # not "Bar"
    )
    ref_c = Reference(
        source_collection_name="foo_set",
        source_class_name="Foo",
        source_field_name="owns",
        target_collection_name="bar_set",
        target_class_name="Bar",
    )

    # Focus on `==`.
    assert ref_a == ref_a
    assert ref_a != ref_b
    assert ref_a == ref_c

    assert ref_b != ref_a
    assert ref_b == ref_b
    assert ref_b != ref_c

    assert ref_c == ref_a
    assert ref_c != ref_b
    assert ref_c == ref_c

    # Focus on `in`.
    assert ref_a in [ref_a, ref_b, ref_c]
    assert ref_b in [ref_a, ref_b, ref_c]
    assert ref_c in [ref_a, ref_b, ref_c]

    assert ref_a not in [ref_b]

    # Focus on `is`.
    assert ref_a is ref_a
    assert ref_a is not ref_c  # equal, but not identical

    # Focus on values that are not `Reference` instances (in both operand positions).
    assert ref_a != "Bar"
    assert "Bar" != ref_a
    assert not (ref_a == "Bar")
    assert ref_a not in ["Bar", 123]
Loading

0 comments on commit 62b9de1

Please sign in to comment.