111 make current beta compatible with 010 (#112)
* Bump up version

* Add query_to_dataframe function to ensure compatibility with v0.1.0

* Fix text in examples

* Fix parameter inconsistency in query_to_dataframe

* Update tests with query_to_dataframe

* Adjust tests
giuliabaldini authored Feb 15, 2023
1 parent 0136e9b commit 9c07aaa
Showing 4 changed files with 164 additions and 75 deletions.
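The headline change is the new `query_to_dataframe` wrapper in `fhir_pyrate/pirate.py`, which restores the 0.1.0-style entry point on top of the current `*_to_dataframe` methods. A minimal sketch of the restored call, assuming an already initialised `Pirate` instance named `search` (the parameters mirror the updated tests and are purely illustrative):

```python
# Sketch: the 0.1.0-style wrapper restored by this commit.
# `search` is assumed to be an already initialised Pirate instance.
obs_df = search.query_to_dataframe(
    bundles_function=search.steal_bundles,  # any bundle-returning Pirate method
    resource_type="Observation",
    num_pages=5,
)
```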
39 changes: 8 additions & 31 deletions examples/1-simple-json-to-df.ipynb
@@ -9,10 +9,7 @@
"First, we initialize the needed classes."
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
"collapsed": false
}
},
{
@@ -52,26 +49,18 @@
")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"We can now start with our actual query. Let's assume, for this very simple case, that we know the\n",
" Observation ID. Then, we just need to call the following function with the Observation resource\n",
" and the request parameters to obtain our result.\n",
"\n",
"`query_to_dataframe` is a wrapper function that downloads a list of bundles using any function specified in `bundles_function`, and then uses a process function to build a DataFrame with the requested information. The default processing function returns the entire structure of the resource.\n"
" and the request parameters to obtain our result."
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
"collapsed": false
}
},
{
@@ -112,10 +101,7 @@
"observation_all"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@@ -129,10 +115,7 @@
"`fhir_paths` parameter."
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
"collapsed": false
}
},
{
@@ -181,10 +164,7 @@
"observation_values"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@@ -200,10 +180,7 @@
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
"collapsed": false
}
}
],
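The markdown cell above describes `query_to_dataframe` as a wrapper that first downloads a list of bundles with the function passed as `bundles_function` and then applies a processing function to build the DataFrame (by default flattening the whole resource structure). A minimal sketch of the query the notebook describes, assuming an initialised `Pirate` instance `search` and a placeholder Observation ID (the `_id` request parameter is an illustrative choice, not taken from the notebook):

```python
# Sketch only: fetch one known Observation and flatten it into a DataFrame.
observation_all = search.query_to_dataframe(
    bundles_function=search.steal_bundles,
    resource_type="Observation",
    request_params={"_id": "example-observation-id"},  # placeholder ID
    num_pages=1,
)
```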
125 changes: 91 additions & 34 deletions fhir_pyrate/pirate.py
@@ -392,38 +392,6 @@ def trade_rows_for_bundles(
tqdm_df_build=False,
)

def trade_rows_for_dataframe_with_ref(
self,
df: pd.DataFrame,
resource_type: str,
df_constraints: Dict[
str, Union[Union[str, Tuple[str, str]], List[Union[str, Tuple[str, str]]]]
],
process_function: Callable[[FHIRObj], Any] = flatten_data,
fhir_paths: List[Union[str, Tuple[str, str]]] = None,
request_params: Dict[str, Any] = None,
num_pages: int = -1,
merge_on: str = None,
) -> pd.DataFrame:
"""
Deprecated, use trade_rows_for_dataframe(..., with_ref=True) instead.
"""
warnings.warn(
"The trade_rows_for_dataframe_with_ref function is deprecated, please use "
"trade_rows_for_dataframe(..., with_ref=True) instead."
)
return self.trade_rows_for_dataframe(
df=df,
resource_type=resource_type,
df_constraints=df_constraints,
process_function=process_function,
fhir_paths=fhir_paths,
request_params=request_params,
num_pages=num_pages,
with_ref=True,
merge_on=merge_on,
)

def trade_rows_for_dataframe(
self,
df: pd.DataFrame,
@@ -435,7 +403,7 @@ def trade_rows_for_dataframe(
fhir_paths: List[Union[str, Tuple[str, str]]] = None,
request_params: Dict[str, Any] = None,
num_pages: int = -1,
with_ref: bool = False,
with_ref: bool = True,
with_columns: List[Union[str, Tuple[str, str]]] = None,
merge_on: str = None,
build_df_after_query: bool = False,
@@ -899,7 +867,12 @@ def _adjust_df_constraints(
if isinstance(column_constraint, str)
else (
column_constraint[0]
+ ("%7C" if "http" in column_constraint[0] else ""),
+ (
"%7C"
if "http" in column_constraint[0]
and "%7C" not in column_constraint[0]
else ""
),
column_constraint[1],
)
for column_constraint in list_of_constraints
@@ -1498,3 +1471,87 @@ def wrap(
)

return wrap

def query_to_dataframe(
self,
bundles_function: Callable,
process_function: Callable[[FHIRObj], Any] = flatten_data,
fhir_paths: List[Union[str, Tuple[str, str]]] = None,
build_df_after_query: bool = False,
merge_on: str = None,
**kwargs: Any,
) -> pd.DataFrame:
"""
Wrapper function that, given any of the functions that return bundles, builds the
DataFrame straight away.
:param bundles_function: The function that should be used to get the bundles,
e.g. self.sail_through_search_space, trade_rows_for_bundles
:param process_function: The transformation function that goes through the entries and
stores the values to save
:param fhir_paths: A list of FHIR paths (https://hl7.org/fhirpath/) to be used to build the
DataFrame; alternatively, a list of tuples can be used to specify the name of the
future column with (column_name, fhir_path). Please refer to the `bundles_to_dataframe`
function for notes on how to use the FHIR paths.
:param build_df_after_query: Whether the DataFrame should be built after all bundles have
been collected, or whether the bundles should be transformed right after retrieval
:param merge_on: The column to merge the results on after the computation. This is
useful when using includes: if you store the IDs in the same column, you can use that column
to merge all the rows into one; see the example below
:param kwargs: The arguments that will be passed to the `bundles_function` function;
please refer to the documentation of the respective methods.
:return: A pandas DataFrame containing the queried information
The following example will initially return one row for each entry, but with
`merge_on="patient_id"` we choose a column to run the merge on. Rows whose columns
complement each other (values present in one, empty in the other) are then merged,
so that each patient is represented by a single row.
```
df = search.query_to_dataframe(
bundles_function=search.steal_bundles,
resource_type="Patient",
request_params={
"_sort": "_id",
"_count": 10,
"birthdate": "ge1990",
"_revinclude": "Condition:subject",
},
fhir_paths=[
("patient_id", "Patient.id"),
("patient_id", "Condition.subject.reference.replace('Patient/', '')"),
"Patient.gender",
"Condition.code.coding.code",
],
num_pages=1,
merge_on="patient_id"
)
```
"""
if bundles_function == self.steal_bundles:
return self.steal_bundles_to_dataframe(
**kwargs,
process_function=process_function,
fhir_paths=fhir_paths,
merge_on=merge_on,
build_df_after_query=build_df_after_query,
)
elif bundles_function == self.sail_through_search_space:
return self.sail_through_search_space_to_dataframe(
**kwargs,
process_function=process_function,
fhir_paths=fhir_paths,
merge_on=merge_on,
build_df_after_query=build_df_after_query,
)
elif bundles_function == self.trade_rows_for_bundles:
return self.trade_rows_for_dataframe(
**kwargs,
process_function=process_function,
fhir_paths=fhir_paths,
with_ref=False,
merge_on=merge_on,
build_df_after_query=build_df_after_query,
)
else:
raise ValueError(
f"The given function {bundles_function.__name__} "
f"cannot be used to obtain a dataframe."
)
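
As the dispatch above makes explicit, `query_to_dataframe` simply forwards to the matching `*_to_dataframe` method (and, for `trade_rows_for_bundles`, to `trade_rows_for_dataframe` with `with_ref=False`). A sketch of the equivalence that the updated tests assert, with illustrative parameters:

```python
# Both calls are expected to produce the same DataFrame; the wrapper only dispatches.
df_wrapper = search.query_to_dataframe(
    bundles_function=search.sail_through_search_space,
    resource_type="Observation",
    time_attribute_name="_lastUpdated",
    date_init="2021-01-01",
    date_end="2022-01-01",
)
df_direct = search.sail_through_search_space_to_dataframe(
    resource_type="Observation",
    time_attribute_name="_lastUpdated",
    date_init="2021-01-01",
    date_end="2022-01-01",
)
```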
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,9 +1,9 @@
[tool.poetry]
name = "fhir-pyrate"
version = "0.2.0-beta.6"
version = "0.2.0-beta.7"
description = "FHIR-PYrate is a package that provides a high-level API to query FHIR Servers for bundles of resources and return the structured information as pandas DataFrames. It can also be used to filter resources using RegEx and SpaCy and download DICOM studies and series."
license = "MIT"
authors = ["Giulia Baldini <giulia.baldini@uk-essen.de>", "Rene Hosch <rene.hosch@uk-essen.de>"]
authors = ["Rene Hosch <rene.hosch@uk-essen.de>", "Giulia Baldini <giulia.baldini@uk-essen.de>"]
readme = "README.md"
repository = "https://github.com/UMEssen/FHIR-PYrate"
keywords = ["python", "fhir", "data-science", "fhirpath", "healthcare"]
71 changes: 63 additions & 8 deletions tests/test_public.py
@@ -8,6 +8,7 @@

import pandas as pd
from bs4 import BeautifulSoup
from pandas.testing import assert_frame_equal

from fhir_pyrate import Ahoy, Miner, Pirate
from fhir_pyrate.util import FHIRObj
@@ -394,12 +395,19 @@ def testStealBundles(self) -> None:
with self.subTest(
msg=f"build_after_query_{build_after_query}"
):
obs_df = search.steal_bundles_to_dataframe(
obs_df1 = search.steal_bundles_to_dataframe(
resource_type="Observation",
num_pages=5,
build_df_after_query=build_after_query,
)
assert len(obs_df) == first_length
assert len(obs_df1) == first_length
obs_df2 = search.query_to_dataframe(
bundles_function=search.steal_bundles,
resource_type="Observation",
num_pages=5,
build_df_after_query=build_after_query,
)
assert obs_df1.equals(obs_df2)
search.close()

def testSail(self) -> None:
@@ -429,16 +437,39 @@ def testSail(self) -> None:
first_length = len(obs_df)
for build_after_query in [True, False]:
with self.subTest(
msg=f"build_after_query_{build_after_query}"
msg=f"cache_{cache_val}_req_{d_requests}_build_{d_build}_"
f"build_after_query_{build_after_query}"
):
obs_df = search.sail_through_search_space_to_dataframe(
obs_df1 = search.sail_through_search_space_to_dataframe(
resource_type="Observation",
time_attribute_name="_lastUpdated",
date_init="2021-01-01",
date_end="2022-01-01",
build_df_after_query=build_after_query,
)
assert len(obs_df) == first_length
assert len(obs_df1) == first_length
obs_df2 = search.query_to_dataframe(
bundles_function=search.sail_through_search_space,
resource_type="Observation",
time_attribute_name="_lastUpdated",
date_init="2021-01-01",
date_end="2022-01-01",
build_df_after_query=build_after_query,
)
sorted_obs1 = (
obs_df1.sort_index(axis=1)
.sort_values(by="id")
.reset_index(drop=True)
)
sorted_obs2 = (
obs_df2.sort_index(axis=1)
.sort_values(by="id")
.reset_index(drop=True)
)

assert_frame_equal(
sorted_obs1, sorted_obs2, check_dtype=False
)

def testTrade(self) -> None:
trade_df = pd.DataFrame(["18262-6", "2571-8"], columns=["code"])
@@ -477,16 +508,40 @@ def testTrade(self) -> None:
assert len(obs_df) == first_length
for build_after_query in [True, False]:
with self.subTest(
msg=f"build_after_query_{build_after_query}"
msg=f"cache_{cache_val}_req_{d_requests}_build_{d_build}_"
f"build_after_query_{build_after_query}"
):
obs_df = search.trade_rows_for_dataframe(
obs_df1 = search.trade_rows_for_dataframe(
trade_df,
resource_type="Observation",
df_constraints={"code": "code"},
request_params={"_lastUpdated": "ge2020"},
build_df_after_query=build_after_query,
with_ref=False,
)
assert len(obs_df1) == first_length
obs_df2 = search.query_to_dataframe(
df=trade_df,
bundles_function=search.trade_rows_for_bundles,
resource_type="Observation",
df_constraints={"code": "code"},
request_params={"_lastUpdated": "ge2020"},
build_df_after_query=build_after_query,
)
sorted_obs1 = (
obs_df1.sort_index(axis=1)
.sort_values(by="id")
.reset_index(drop=True)
)
sorted_obs2 = (
obs_df2.sort_index(axis=1)
.sort_values(by="id")
.reset_index(drop=True)
)

assert_frame_equal(
sorted_obs1, sorted_obs2, check_dtype=False
)
assert len(obs_df) == first_length


class ContraintsTest(unittest.TestCase):
