
fhir #9

Merged: 5 commits, Nov 18, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
# data files, use -f to git-add(1)
*.csv

.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Binary file added arc-fhir/ARC_pre_1.0.0_preset_dengue.xlsx
4 changes: 4 additions & 0 deletions arc-fhir/README.md
@@ -0,0 +1,4 @@
This folder contains mapping files from ARC to FHIR.

The current file is taken from https://docs.google.com/spreadsheets/d/1GnpJzQ9rm2AbWUIDsmx_M0tkQbpOi6bcFsbzWqCPlEo/edit?gid=462215516#gid=462215516
(snapshot taken on 2024-11-08)
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -8,7 +8,7 @@ dependencies = [
"chardet>=5.2.0",
"dash-bootstrap-components>=1.6.0",
"dash>=2.18.1",
"pandas>=2.2.2",
"pandas[excel]>=2.2.2",
"scikit-learn>=1.5.1",
"sentence-transformers>=3.2.1",
"waitress>=3.0.0",
@@ -22,4 +22,5 @@ build-backend = "hatchling.build"
dev = [
"pytest-cov>=6.0.0",
"pytest>=8.3.3",
"syrupy>=4.7.2",
]
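The `[excel]` extra matters because `pandas.read_excel` depends on an optional engine (`openpyxl` for `.xlsx` files) that a plain `pandas` install does not pull in. A minimal sketch of what the extra enables, using the spreadsheet added in this PR:

```python
import pandas as pd

# Without the [excel] extra this raises
# ImportError: Missing optional dependency 'openpyxl'
df = pd.read_excel("arc-fhir/ARC_pre_1.0.0_preset_dengue.xlsx")
print(df.columns.tolist())
```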
1 change: 0 additions & 1 deletion src/arcmapper/__init__.py
@@ -11,7 +11,6 @@
from .app import app
from .arc import read_arc_schema
from .dictionary import read_data_dictionary
-from .strategies import map

__version__ = "0.1.0"

13 changes: 10 additions & 3 deletions src/arcmapper/app.py
@@ -8,7 +8,7 @@
from .components import arc_form, upload_form
from .util import read_upload_data
from .dictionary import read_data_dictionary
-from .strategies import map as map_data_dictionary_to_arc
+from .strategies import use_map
from .arc import read_arc_schema

app = dash.Dash("arcmapper", external_stylesheets=[dbc.themes.BOOTSTRAP])
@@ -137,7 +137,7 @@ def invoke_map_arc(data, _, version, method, num_matches):
arc = read_arc_schema(version)
dictionary = pd.read_json(data)

-    mapped_data = map_data_dictionary_to_arc(method, dictionary, arc, num_matches)
+    mapped_data = use_map(method, dictionary, arc, num_matches)
data = mapped_data.to_dict("records")
for i, row in enumerate(data):
row["id"] = i
@@ -147,7 +147,14 @@ def invoke_map_arc(data, _, version, method, num_matches):
data=data,
columns=[
{"name": i, "id": i, "editable": i != "status"}
-            for i in mapped_data.columns
+            for i in [
+                "status",
+                "raw_variable",
+                "raw_description",
+                "arc_variable",
+                "arc_description",
+                "rank",
+            ]
],
editable=True,
style_data={
6 changes: 3 additions & 3 deletions src/arcmapper/arc.py
@@ -15,12 +15,12 @@ def read_arc_schema(
arc_version_or_file: str, preset: str | None = None
) -> pd.DataFrame:
types_mapping: dict[str, DataType] = {
"radio": "categorical",
"radio": "enum",
"number": "number",
"text": "string",
"date_dmy": "date",
"checkbox": "categorical",
"dropdown": "categorical",
"checkbox": "multiselect",
"dropdown": "enum",
"datetime_dmy": "date",
}
arc_location = (
94 changes: 94 additions & 0 deletions src/arcmapper/fhir.py
@@ -0,0 +1,94 @@
"""Final mapping of data dictionary to FHIR

Mapping file MUST conform to specification at
https://fhirflat.readthedocs.io/en/latest/spec/mapping.html
"""

import warnings
from pathlib import Path

import pandas as pd

from .strategies import infer_response_mapping

VALID_FHIR_RESOURCES = [
"Condition",
"DiagnosticReport",
"Encounter",
"Immunization",
"MedicationAdministration",
"MedicationStatement",
"Observation",
"Patient",
"Procedure",
"Specimen",
]


class FHIRMapping:
"Loads mapping file from a Excel (XLSX) sheet"

def __init__(self, file: str | Path):
path = Path(file)
if path.suffix != ".xlsx":
raise ValueError("FHIRMapping only supports Excel sheets at the moment")
index = pd.read_excel(path)
if "Resources" not in index.columns:
raise ValueError(
"Required 'Resources' column not present in FHIR mapping file"
)
self.resources = sorted(set(index.Resources) & set(VALID_FHIR_RESOURCES))
if "Patient" not in self.resources:
raise ValueError(
"Required FHIR mapping for FHIR resource 'Patient' not found in mapping file"
)
self.path = path

def get_resource(self, resource: str) -> pd.DataFrame:
"Gets resource from FHIR mapping Excel sheet"
        resource = resource[0].upper() + resource[1:]  # uppercase only the first letter (str.capitalize would lowercase the rest)
if resource not in self.resources:
raise ValueError(
f"Resource '{resource}' not found, valid resources: {self.resources}"
)
df = pd.read_excel(self.path, sheet_name=resource)

# forward fill NaNs to enable merge with mapping frame
df["raw_variable"] = df["raw_variable"].ffill()
return df.rename(
columns={"raw_variable": "arc_variable", "raw_response": "arc_response"}
)


def merge(
draft: pd.DataFrame, mapping: FHIRMapping, resources: list[str] = []
) -> dict[str, pd.DataFrame]:
out = {}
    # first generate choice responses for each mapping
    draft = infer_response_mapping(draft)
    for resource in resources or mapping.resources:
        if resource not in mapping.resources:
            warnings.warn(
                f"Resource requested to be mapped but not found in mapping file: {resource}"
            )
            continue  # skip unknown resources instead of failing in get_resource
        out[resource] = draft.merge(
            mapping.get_resource(resource), on=["arc_variable", "arc_response"]
        )
return out


def format_merge(merged_data, selected_columns: list[str] | None = None):
out = ""
selected_columns = selected_columns or [
"raw_variable",
"raw_response",
"arc_variable",
"arc_response",
"raw_description",
"arc_description",
]
for resource in merged_data:
out += "{{{ resource " + resource + "\n"
out += merged_data[resource][selected_columns].to_csv(index=False, sep="\t")
out += "}}}\n"
return out.strip()
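For context, a usage sketch of the new module; the `draft` frame is illustrative (shaped like the output of the strategies module), and whether the final merge produces rows depends on the contents of the mapping spreadsheet:

```python
import pandas as pd

from arcmapper.fhir import FHIRMapping, format_merge, merge

# Illustrative draft mapping, normally produced by the mapping strategies
draft = pd.DataFrame(
    {
        "raw_variable": ["sex"],
        "raw_description": ["Sex at birth"],
        "raw_response": ["[('1', 'man'), ('2', 'woman')]"],
        "arc_variable": ["demog_sex"],
        "arc_description": ["Sex at birth"],
        "arc_response": ["[('1', 'male'), ('2', 'female')]"],
        "arc_type": ["enum"],
    }
)

mapping = FHIRMapping("arc-fhir/ARC_pre_1.0.0_preset_dengue.xlsx")
merged = merge(draft, mapping, resources=["Patient"])
# Tab-separated rows wrapped in {{{ resource Patient ... }}} blocks
print(format_merge(merged))
```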
153 changes: 135 additions & 18 deletions src/arcmapper/strategies.py
@@ -1,13 +1,23 @@
import ast
from collections import namedtuple

import pandas as pd
import numpy as np
import numpy.typing
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

SBERT_MODEL = "all-MiniLM-L6-v2"

NULL_RESPONSES = ["none", "na", "nk", "n/a", "n/k"]
Response = namedtuple("Response", ["val", "text"])
Response.__str__ = lambda self: f"{self.val}, {self.text}"


def get_match_dataframe_from_similarity_matrix(
dictionary: pd.DataFrame,
arc: pd.DataFrame,
-    similarity_matrix: np.array,
+    similarity_matrix: numpy.typing.ArrayLike,
num_matches: int,
threshold: float,
) -> pd.DataFrame:
@@ -45,8 +55,11 @@ def get_match_dataframe_from_similarity_matrix(
"status",
"raw_variable",
"raw_description",
"raw_response",
"arc_variable",
"arc_description",
"arc_response",
"arc_type",
"rank",
],
data=sum(
@@ -56,12 +69,15 @@ def get_match_dataframe_from_similarity_matrix(
"-",
dictionary.iloc[i].variable,
dictionary.iloc[i].description,
dictionary.iloc[i].responses,
arc.iloc[k].variable,
arc.iloc[k].description,
arc.iloc[k].responses,
arc.iloc[k].type,
j,
]
for j, k in enumerate(S[i])
-                    if similarity_matrix[i, S[i, j]] > threshold
+                    if similarity_matrix[i, S[i, j]] > threshold  # type: ignore
]
for i in range(len(dictionary))
],
@@ -85,39 +101,140 @@ def get_match_dataframe_from_similarity_matrix(
return match_df


-def get_categorical_mapping(
-    source: list[str], target: list[str], similarity_matrix: np.array
-) -> dict[str, str]:
+def match_responses(
+    source: list[Response], target: list[Response], sbert_model: str = SBERT_MODEL
+) -> list[tuple[Response, Response]]:
"""Returns mapping of categorical values from source list to target list.
Finds the closest match in target for each string in the source list. This
is used to map categorical values from the source dictionary to ARC

Example: in the source data dictionary, there is a `sex` variable which
takes the values `man` and `woman`. ARC has a `demog_sex` variable which
-    takes the values `male`, `female` and `unknown`. Then this function, given
-    an appropriate similarity matrix between [man, woman] and [male, female]
+    takes the values `male`, `female` and `unknown`. Then this function
+    constructs a similarity matrix between [man, woman] and [male, female]
    would return

-    ```json
-    { "man": "male", "woman": "female" }
-    ```
+    .. code::
+
+        [(("2", "man"), ("1", "male")), (("1", "woman"), ("2", "female"))]

Parameters
----------
source
-        Source list of strings
+        Source mapping of response description to response, e.g.
+        ``[("male", "1"), ("female", "2")]``
    target
-        Target list of strings, usually from the ARC `responses` key
-    similarity_matrix
-        Similarity matrix to use to determine categorical mapping
+        e.g. ``[("men", "2"), ("woman", "1")]``
+    sbert_model
+        SBERT model to use (optional)

Returns
-------
-    dict[str, str]
-        Dictionary of source string to target string mappings
+    list[tuple[tuple[str, str], tuple[str, str]]]
+        List of pairs of mappings of dictionary to ARC
"""
model = SentenceTransformer(sbert_model)
source_embeddings = model.encode([i.text for i in source])
target_embeddings = model.encode([i.text for i in target])
source_map: dict[str, str] = {v: k for k, v in source}
target_map: dict[str, str] = {v: k for k, v in target}
S = model.similarity(source_embeddings, target_embeddings).numpy()
max_idx = np.argmax(S, axis=1)
return [
(
Response(source_map[source[i].text], source[i].text),
Response(target_map[target[max_idx[i]].text], target[max_idx[i]].text),
)
for i in range(len(source))
]
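As an aside, a direct call to `match_responses` looks like this (the SBERT model is downloaded on first use; values follow the `Response(val, text)` layout defined at the top of the module):

```python
source = [Response("1", "man"), Response("2", "woman")]
target = [Response("1", "male"), Response("2", "female"), Response("99", "unknown")]
# Each source response is paired with its nearest target by embedding similarity
for src, tgt in match_responses(source, target):
    print(src, "->", tgt)  # expected: "1, man -> 1, male" and "2, woman -> 2, female"
```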


def has_valid_response(row) -> bool:
return isinstance(row.raw_response, str) and isinstance(row.arc_response, str)


def infer_response_mapping(
m: pd.DataFrame, sbert_model: str = SBERT_MODEL
) -> pd.DataFrame:
"""Infer response mapping from data dicitonary to ARC.

This is a simplified version of the mapping that takes place in strategies
"""
-    max_idx = np.argmax(similarity_matrix, axis=1)
-    return {source[i]: target[max_idx[i]] for i in range(len(source))}
# data schema for m:
# raw_variable, raw_description, raw_response,
# arc_variable, arc_description, arc_response,
out = []
    # sbert_model is passed through by name; match_responses loads the model
    for row in m.itertuples():
        if has_valid_response(row):
            raw_response = (
                row.raw_response
                if isinstance(row.raw_response, list)
                else ast.literal_eval(row.raw_response)
            )
            arc_response = (
                row.arc_response
                if isinstance(row.arc_response, list)
                else ast.literal_eval(row.arc_response)
            )
            s = [Response(*r) for r in raw_response]
            t = [Response(*r) for r in arc_response]
            if row.arc_type != "multiselect":
                out.extend(
                    [
                        (
                            row.raw_variable,
                            row.raw_description,
                            str(sr),
                            row.arc_variable,
                            row.arc_description,
                            str(tr),
                        )
                        for sr, tr in match_responses(s, t, sbert_model)
                    ]
                )
            else:
                # checkbox (multiselect) responses expand to one REDCap-style
                # dummy variable per matched option, e.g. arc_variable___1
                out.extend(
                    [
                        (
                            row.raw_variable,
                            row.raw_description,
                            str(sr),
                            row.arc_variable + "___" + tr.val,
                            row.arc_description,
                            "1, " + str(tr.text),
                        )
                        for sr, tr in match_responses(s, t, sbert_model)
                        if sr.text.lower() not in NULL_RESPONSES
                    ]
                )
else:
out.append(
(
row.raw_variable,
row.raw_description,
None,
row.arc_variable,
row.arc_description,
None,
)
)
df = pd.DataFrame(
out,
columns=[
"raw_variable",
"raw_description",
"raw_response",
"arc_variable",
"arc_description",
"arc_response",
],
)
return df
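To make the `multiselect` branch concrete: each matched checkbox option becomes a REDCap-style dummy variable (`arc_variable___<val>`) with response `1, <text>`, and null-ish options are dropped. A sketch with illustrative variable names:

```python
import pandas as pd

draft = pd.DataFrame(
    {
        "raw_variable": ["symptoms"],
        "raw_description": ["Symptoms at admission"],
        "raw_response": ["[('1', 'fever'), ('2', 'cough')]"],
        "arc_variable": ["adsym_symptoms"],
        "arc_description": ["Signs and symptoms"],
        "arc_response": ["[('1', 'Fever'), ('2', 'Cough')]"],
        "arc_type": ["multiselect"],
    }
)
# Expands 'symptoms' responses to e.g. 'adsym_symptoms___1' with
# arc_response '1, Fever'
print(infer_response_mapping(draft))
```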


def tf_idf(
@@ -221,7 +338,7 @@ def sbert(
)


-def map(
+def use_map(
method: str,
dictionary: pd.DataFrame,
arc: pd.DataFrame,