
fhir #9

Merged: 5 commits, Nov 18, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
# data files, use -f to git-add(1)
*.csv

.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Binary file added arc-fhir/ARC_pre_1.0.0_preset_dengue.xlsx
4 changes: 4 additions & 0 deletions arc-fhir/README.md
@@ -0,0 +1,4 @@
This folder contains mapping files from ARC to FHIR.

The current file is taken from https://docs.google.com/spreadsheets/d/1GnpJzQ9rm2AbWUIDsmx_M0tkQbpOi6bcFsbzWqCPlEo/edit?gid=462215516#gid=462215516
(snapshot taken on 2024-11-08)
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -8,7 +8,7 @@ dependencies = [
"chardet>=5.2.0",
"dash-bootstrap-components>=1.6.0",
"dash>=2.18.1",
"pandas>=2.2.2",
"pandas[excel]>=2.2.2",
"scikit-learn>=1.5.1",
"sentence-transformers>=3.2.1",
"waitress>=3.0.0",
@@ -22,4 +22,5 @@ build-backend = "hatchling.build"
dev = [
"pytest-cov>=6.0.0",
"pytest>=8.3.3",
"syrupy>=4.7.2",
]
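The `[excel]` extra matters because `pandas.read_excel` depends on an optional engine (`openpyxl` for `.xlsx` files) that a plain `pandas` install does not pull in. A minimal sketch of what the extra enables, using the spreadsheet added in this PR:

```python
import pandas as pd

# Without the [excel] extra this raises
# ImportError: Missing optional dependency 'openpyxl'
df = pd.read_excel("arc-fhir/ARC_pre_1.0.0_preset_dengue.xlsx")
print(df.columns.tolist())
```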
1 change: 0 additions & 1 deletion src/arcmapper/__init__.py
@@ -11,7 +11,6 @@
from .app import app
from .arc import read_arc_schema
from .dictionary import read_data_dictionary
-from .strategies import map

__version__ = "0.1.0"

13 changes: 10 additions & 3 deletions src/arcmapper/app.py
@@ -8,7 +8,7 @@
from .components import arc_form, upload_form
from .util import read_upload_data
from .dictionary import read_data_dictionary
-from .strategies import map as map_data_dictionary_to_arc
+from .strategies import use_map
from .arc import read_arc_schema

app = dash.Dash("arcmapper", external_stylesheets=[dbc.themes.BOOTSTRAP])
@@ -137,7 +137,7 @@ def invoke_map_arc(data, _, version, method, num_matches):
arc = read_arc_schema(version)
dictionary = pd.read_json(data)

-    mapped_data = map_data_dictionary_to_arc(method, dictionary, arc, num_matches)
+    mapped_data = use_map(method, dictionary, arc, num_matches)
data = mapped_data.to_dict("records")
for i, row in enumerate(data):
row["id"] = i
@@ -147,7 +147,14 @@ def invoke_map_arc(data, _, version, method, num_matches):
data=data,
columns=[
{"name": i, "id": i, "editable": i != "status"}
-            for i in mapped_data.columns
+            for i in [
+                "status",
+                "raw_variable",
+                "raw_description",
+                "arc_variable",
+                "arc_description",
+                "rank",
+            ]
],
editable=True,
style_data={
6 changes: 3 additions & 3 deletions src/arcmapper/arc.py
@@ -15,12 +15,12 @@ def read_arc_schema(
arc_version_or_file: str, preset: str | None = None
) -> pd.DataFrame:
types_mapping: dict[str, DataType] = {
"radio": "categorical",
"radio": "enum",
"number": "number",
"text": "string",
"date_dmy": "date",
"checkbox": "categorical",
"dropdown": "categorical",
"checkbox": "multiselect",
"dropdown": "enum",
"datetime_dmy": "date",
}
arc_location = (
94 changes: 94 additions & 0 deletions src/arcmapper/fhir.py
@@ -0,0 +1,94 @@
"""Final mapping of data dictionary to FHIR

Mapping file MUST conform to specification at
https://fhirflat.readthedocs.io/en/latest/spec/mapping.html
"""

import warnings
from pathlib import Path

import pandas as pd

from .strategies import infer_response_mapping

VALID_FHIR_RESOURCES = [
"Condition",
"DiagnosticReport",
"Encounter",
"Immunization",
"MedicationAdministration",
"MedicationStatement",
"Observation",
"Patient",
"Procedure",
"Specimen",
]


class FHIRMapping:
"Loads mapping file from a Excel (XLSX) sheet"

def __init__(self, file: str | Path):
path = Path(file)
if path.suffix != ".xlsx":
raise ValueError("FHIRMapping only supports Excel sheets at the moment")
index = pd.read_excel(path)
if "Resources" not in index.columns:
raise ValueError(
"Required 'Resources' column not present in FHIR mapping file"
)
self.resources = sorted(set(index.Resources) & set(VALID_FHIR_RESOURCES))
if "Patient" not in self.resources:
raise ValueError(
"Required FHIR mapping for FHIR resource 'Patient' not found in mapping file"
)
self.path = path

def get_resource(self, resource: str) -> pd.DataFrame:
"Gets resource from FHIR mapping Excel sheet"
        resource = resource[0].upper() + resource[1:]  # uppercase only the first letter (str.capitalize would lowercase the rest)
if resource not in self.resources:
raise ValueError(
f"Resource '{resource}' not found, valid resources: {self.resources}"
)
df = pd.read_excel(self.path, sheet_name=resource)

# forward fill NaNs to enable merge with mapping frame
df["raw_variable"] = df["raw_variable"].ffill()
return df.rename(
columns={"raw_variable": "arc_variable", "raw_response": "arc_response"}
)


def merge(
draft: pd.DataFrame, mapping: FHIRMapping, resources: list[str] = []
) -> dict[str, pd.DataFrame]:
out = {}
    # first generate choice responses for each mapping
    draft = infer_response_mapping(draft)
    for resource in resources or mapping.resources:
        if resource not in mapping.resources:
            warnings.warn(
                f"Resource requested to be mapped but not found in mapping file: {resource}"
            )
            continue  # skip unknown resources instead of failing in get_resource
        out[resource] = draft.merge(
            mapping.get_resource(resource), on=["arc_variable", "arc_response"]
        )
return out


def format_merge(merged_data, selected_columns: list[str] | None = None):
out = ""
selected_columns = selected_columns or [
"raw_variable",
"raw_response",
"arc_variable",
"arc_response",
"raw_description",
"arc_description",
]
for resource in merged_data:
out += "{{{ resource " + resource + "\n"
out += merged_data[resource][selected_columns].to_csv(index=False, sep="\t")
out += "}}}\n"
return out.strip()
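For context, a usage sketch of the new module; the `draft` frame is illustrative (shaped like the output of the strategies module), and whether the final merge produces rows depends on the contents of the mapping spreadsheet:

```python
import pandas as pd

from arcmapper.fhir import FHIRMapping, format_merge, merge

# Illustrative draft mapping, normally produced by the mapping strategies
draft = pd.DataFrame(
    {
        "raw_variable": ["sex"],
        "raw_description": ["Sex at birth"],
        "raw_response": ["[('1', 'man'), ('2', 'woman')]"],
        "arc_variable": ["demog_sex"],
        "arc_description": ["Sex at birth"],
        "arc_response": ["[('1', 'male'), ('2', 'female')]"],
        "arc_type": ["enum"],
    }
)

mapping = FHIRMapping("arc-fhir/ARC_pre_1.0.0_preset_dengue.xlsx")
merged = merge(draft, mapping, resources=["Patient"])
# Tab-separated rows wrapped in {{{ resource Patient ... }}} blocks
print(format_merge(merged))
```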
153 changes: 135 additions & 18 deletions src/arcmapper/strategies.py
@@ -1,13 +1,23 @@
import ast
from collections import namedtuple

import pandas as pd
import numpy as np
import numpy.typing
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

SBERT_MODEL = "all-MiniLM-L6-v2"

NULL_RESPONSES = ["none", "na", "nk", "n/a", "n/k"]
Response = namedtuple("Response", ["val", "text"])
Response.__str__ = lambda self: f"{self.val}, {self.text}"


def get_match_dataframe_from_similarity_matrix(
dictionary: pd.DataFrame,
arc: pd.DataFrame,
-    similarity_matrix: np.array,
+    similarity_matrix: numpy.typing.ArrayLike,
num_matches: int,
threshold: float,
) -> pd.DataFrame:
@@ -45,8 +55,11 @@ def get_match_dataframe_from_similarity_matrix(
"status",
"raw_variable",
"raw_description",
"raw_response",
"arc_variable",
"arc_description",
"arc_response",
"arc_type",
"rank",
],
data=sum(
@@ -56,12 +69,15 @@ def get_match_dataframe_from_similarity_matrix(
"-",
dictionary.iloc[i].variable,
dictionary.iloc[i].description,
dictionary.iloc[i].responses,
arc.iloc[k].variable,
arc.iloc[k].description,
arc.iloc[k].responses,
arc.iloc[k].type,
j,
]
for j, k in enumerate(S[i])
-                    if similarity_matrix[i, S[i, j]] > threshold
+                    if similarity_matrix[i, S[i, j]] > threshold  # type: ignore
]
for i in range(len(dictionary))
],
@@ -85,39 +101,140 @@ def get_match_dataframe_from_similarity_matrix(
return match_df


-def get_categorical_mapping(
-    source: list[str], target: list[str], similarity_matrix: np.array
-) -> dict[str, str]:
+def match_responses(
+    source: list[Response], target: list[Response], sbert_model: str = SBERT_MODEL
+) -> list[tuple[Response, Response]]:
"""Returns mapping of categorical values from source list to target list.
Finds the closest match in target for each string in the source list. This
is used to map categorical values from the source dictionary to ARC

Example: in the source data dictionary, there is a `sex` variable which
takes the values `man` and `woman`. ARC has a `demog_sex` variable which
-    takes the values `male`, `female` and `unknown`. Then this function, given
-    an appropriate similarity matrix between [man, woman] and [male, female]
+    takes the values `male`, `female` and `unknown`. Then this function
+    constructs a similarity matrix between [man, woman] and [male, female]
    would return

-    ```json
-    { "man": "male", "woman": "female" }
-    ```
+    .. code::
+
+        [(("2", "man"), ("1", "male")), (("1", "woman"), ("2", "female"))]

Parameters
----------
source
-        Source list of strings
+        Source mapping of response description to response, e.g.
+        ``[("male", "1"), ("female", "2")]``
    target
-        Target list of strings, usually from the ARC `responses` key
-    similarity_matrix
-        Similarity matrix to use to determine categorical mapping
+        e.g. ``[("men", "2"), ("woman", "1")]``
+    sbert_model
+        SBERT model to use (optional)

Returns
-------
-    dict[str, str]
-        Dictionary of source string to target string mappings
+    list[tuple[tuple[str, str], tuple[str, str]]]
+        List of pairs of mappings of dictionary to ARC
"""
model = SentenceTransformer(sbert_model)
source_embeddings = model.encode([i.text for i in source])
target_embeddings = model.encode([i.text for i in target])
source_map: dict[str, str] = {v: k for k, v in source}
target_map: dict[str, str] = {v: k for k, v in target}
S = model.similarity(source_embeddings, target_embeddings).numpy()
max_idx = np.argmax(S, axis=1)
return [
(
Response(source_map[source[i].text], source[i].text),
Response(target_map[target[max_idx[i]].text], target[max_idx[i]].text),
)
for i in range(len(source))
]
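As an aside, a direct call to `match_responses` looks like this (the SBERT model is downloaded on first use; values follow the `Response(val, text)` layout defined at the top of the module):

```python
source = [Response("1", "man"), Response("2", "woman")]
target = [Response("1", "male"), Response("2", "female"), Response("99", "unknown")]
# Each source response is paired with its nearest target by embedding similarity
for src, tgt in match_responses(source, target):
    print(src, "->", tgt)  # expected: "1, man -> 1, male" and "2, woman -> 2, female"
```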


def has_valid_response(row) -> bool:
return isinstance(row.raw_response, str) and isinstance(row.arc_response, str)


def infer_response_mapping(
m: pd.DataFrame, sbert_model: str = SBERT_MODEL
) -> pd.DataFrame:
"""Infer response mapping from data dicitonary to ARC.

This is a simplified version of the mapping that takes place in strategies
"""
-    max_idx = np.argmax(similarity_matrix, axis=1)
-    return {source[i]: target[max_idx[i]] for i in range(len(source))}
# data schema for m:
# raw_variable, raw_description, raw_response,
# arc_variable, arc_description, arc_response,
out = []
    # sbert_model is passed through by name; match_responses loads the model
    for row in m.itertuples():
        if has_valid_response(row):
            raw_response = (
                row.raw_response
                if isinstance(row.raw_response, list)
                else ast.literal_eval(row.raw_response)
            )
            arc_response = (
                row.arc_response
                if isinstance(row.arc_response, list)
                else ast.literal_eval(row.arc_response)
            )
            s = [Response(*r) for r in raw_response]
            t = [Response(*r) for r in arc_response]
            if row.arc_type != "multiselect":
                out.extend(
                    [
                        (
                            row.raw_variable,
                            row.raw_description,
                            str(sr),
                            row.arc_variable,
                            row.arc_description,
                            str(tr),
                        )
                        for sr, tr in match_responses(s, t, sbert_model)
                    ]
                )
            else:
                # checkbox (multiselect) responses expand to one REDCap-style
                # dummy variable per matched option, e.g. arc_variable___1
                out.extend(
                    [
                        (
                            row.raw_variable,
                            row.raw_description,
                            str(sr),
                            row.arc_variable + "___" + tr.val,
                            row.arc_description,
                            "1, " + str(tr.text),
                        )
                        for sr, tr in match_responses(s, t, sbert_model)
                        if sr.text.lower() not in NULL_RESPONSES
                    ]
                )
else:
out.append(
(
row.raw_variable,
row.raw_description,
None,
row.arc_variable,
row.arc_description,
None,
)
)
df = pd.DataFrame(
out,
columns=[
"raw_variable",
"raw_description",
"raw_response",
"arc_variable",
"arc_description",
"arc_response",
],
)
return df
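To make the `multiselect` branch concrete: each matched checkbox option becomes a REDCap-style dummy variable (`arc_variable___<val>`) with response `1, <text>`, and null-ish options are dropped. A sketch with illustrative variable names:

```python
import pandas as pd

draft = pd.DataFrame(
    {
        "raw_variable": ["symptoms"],
        "raw_description": ["Symptoms at admission"],
        "raw_response": ["[('1', 'fever'), ('2', 'cough')]"],
        "arc_variable": ["adsym_symptoms"],
        "arc_description": ["Signs and symptoms"],
        "arc_response": ["[('1', 'Fever'), ('2', 'Cough')]"],
        "arc_type": ["multiselect"],
    }
)
# Expands 'symptoms' responses to e.g. 'adsym_symptoms___1' with
# arc_response '1, Fever'
print(infer_response_mapping(draft))
```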


def tf_idf(
@@ -221,7 +338,7 @@ def sbert(
)


-def map(
+def use_map(
method: str,
dictionary: pd.DataFrame,
arc: pd.DataFrame,