diff --git a/examples/1-simple-json-to-df.ipynb b/examples/1-simple-json-to-df.ipynb
index 54414e4..a7bdc8c 100644
--- a/examples/1-simple-json-to-df.ipynb
+++ b/examples/1-simple-json-to-df.ipynb
@@ -9,10 +9,7 @@
     "First, we initialize the needed classes."
    ],
    "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% md\n"
-    }
+    "collapsed": false
    }
   },
   {
@@ -52,10 +49,7 @@
     ")"
    ],
    "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
+    "collapsed": false
    }
   },
   {
@@ -63,15 +57,10 @@
    "source": [
     "We can now start with our actual query. Let's assume, for this very simple case, that we know the\n",
     " Observation ID. Then, we just need to call the following function with the Observation resource\n",
-    " and the request parameters to obtain our result.\n",
-    "\n",
-    "`query_to_dataframe` is a wrapper function that downloads a list of bundles using any function specified in `bundles_function`, and then uses a process function to build a DataFrame with the requested information. The default processing function returns the entire structure of the resource.\n"
+    " and the request parameters to obtain our result."
    ],
    "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% md\n"
-    }
+    "collapsed": false
    }
   },
   {
@@ -112,10 +101,7 @@
     "observation_all"
    ],
    "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
+    "collapsed": false
    }
   },
   {
@@ -129,10 +115,7 @@
     "`fhir_paths` parameter."
    ],
    "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% md\n"
-    }
+    "collapsed": false
    }
   },
   {
@@ -181,10 +164,7 @@
     "observation_values"
    ],
    "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
+    "collapsed": false
    }
   },
   {
@@ -200,10 +180,7 @@
    "cell_type": "markdown",
    "source": [],
    "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% md\n"
-    }
+    "collapsed": false
    }
   }
  ],
diff --git a/fhir_pyrate/pirate.py b/fhir_pyrate/pirate.py
index a9f67c8..71f66b5 100644
--- a/fhir_pyrate/pirate.py
+++ b/fhir_pyrate/pirate.py
@@ -392,38 +392,6 @@ def trade_rows_for_bundles(
             tqdm_df_build=False,
         )
 
-    def trade_rows_for_dataframe_with_ref(
-        self,
-        df: pd.DataFrame,
-        resource_type: str,
-        df_constraints: Dict[
-            str, Union[Union[str, Tuple[str, str]], List[Union[str, Tuple[str, str]]]]
-        ],
-        process_function: Callable[[FHIRObj], Any] = flatten_data,
-        fhir_paths: List[Union[str, Tuple[str, str]]] = None,
-        request_params: Dict[str, Any] = None,
-        num_pages: int = -1,
-        merge_on: str = None,
-    ) -> pd.DataFrame:
-        """
-        Deprecated, use trade_rows_for_dataframe(..., with_ref=True) instead.
-        """
-        warnings.warn(
-            "The trade_rows_for_dataframe_with_ref function is deprecated, please use "
-            "trade_rows_for_dataframe(..., with_ref=True) instead."
-        )
-        return self.trade_rows_for_dataframe(
-            df=df,
-            resource_type=resource_type,
-            df_constraints=df_constraints,
-            process_function=process_function,
-            fhir_paths=fhir_paths,
-            request_params=request_params,
-            num_pages=num_pages,
-            with_ref=True,
-            merge_on=merge_on,
-        )
-
     def trade_rows_for_dataframe(
         self,
         df: pd.DataFrame,
@@ -435,7 +403,7 @@ def trade_rows_for_dataframe(
         fhir_paths: List[Union[str, Tuple[str, str]]] = None,
         request_params: Dict[str, Any] = None,
         num_pages: int = -1,
-        with_ref: bool = False,
+        with_ref: bool = True,
         with_columns: List[Union[str, Tuple[str, str]]] = None,
         merge_on: str = None,
         build_df_after_query: bool = False,
@@ -899,7 +867,12 @@ def _adjust_df_constraints(
                 if isinstance(column_constraint, str)
                 else (
                     column_constraint[0]
-                    + ("%7C" if "http" in column_constraint[0] else ""),
+                    + (
+                        "%7C"
+                        if "http" in column_constraint[0]
+                        and "%7C" not in column_constraint[0]
+                        else ""
+                    ),
                     column_constraint[1],
                 )
                 for column_constraint in list_of_constraints
@@ -1498,3 +1471,87 @@ def wrap(
         )
 
         return wrap
+
+    def query_to_dataframe(
+        self,
+        bundles_function: Callable,
+        process_function: Callable[[FHIRObj], Any] = flatten_data,
+        fhir_paths: List[Union[str, Tuple[str, str]]] = None,
+        build_df_after_query: bool = False,
+        merge_on: str = None,
+        **kwargs: Any,
+    ) -> pd.DataFrame:
+        """
+        Wrapper function that, given any of the functions that return bundles, builds the
+        DataFrame straight away.
+        :param bundles_function: The function that should be used to get the bundles,
+        e.g. self.steal_bundles, self.sail_through_search_space, self.trade_rows_for_bundles
+        :param process_function: The transformation function that goes through the entries
+        and selects the attributes to save
+        :param fhir_paths: A list of FHIR paths (https://hl7.org/fhirpath/) to be used to build the
+        DataFrame; alternatively, a list of tuples can be used to specify the column name of the
+        future column with (column_name, fhir_path). Please refer to the `bundles_to_dataframe`
+        function for notes on how to use the FHIR paths.
+        :param build_df_after_query: Whether the DataFrame should be built after all bundles have
+        been collected, or whether the bundles should be transformed just after retrieving
+        :param merge_on: Whether to merge the results on a certain column after computing. This is
+        useful when using includes: if you store the IDs in the same column, you can use that
+        column to merge all the rows into one, as in the example below
+        :param kwargs: The arguments that will be passed to the `bundles_function` function,
+        please refer to the documentation of the respective methods.
+        :return: A pandas DataFrame containing the queried information
+        The following example will initially return one row for each entry, but using
+        `merge_on="patient_id"` we choose a column to run the merge on. This will merge rows
+        whose values complement each other (a value in one row where the other is empty),
+        leaving one row per patient.
+        ```
+        df = search.query_to_dataframe(
+            bundles_function=search.steal_bundles,
+            resource_type="Patient",
+            request_params={
+                "_sort": "_id",
+                "_count": 10,
+                "birthdate": "ge1990",
+                "_revinclude": "Condition:subject",
+            },
+            fhir_paths=[
+                ("patient_id", "Patient.id"),
+                ("patient_id", "Condition.subject.reference.replace('Patient/', '')"),
+                "Patient.gender",
+                "Condition.code.coding.code",
+            ],
+            num_pages=1,
+            merge_on="patient_id",
+        )
+        ```
+        """
+        if bundles_function == self.steal_bundles:
+            return self.steal_bundles_to_dataframe(
+                **kwargs,
+                process_function=process_function,
+                fhir_paths=fhir_paths,
+                merge_on=merge_on,
+                build_df_after_query=build_df_after_query,
+            )
+        elif bundles_function == self.sail_through_search_space:
+            return self.sail_through_search_space_to_dataframe(
+                **kwargs,
+                process_function=process_function,
+                fhir_paths=fhir_paths,
+                merge_on=merge_on,
+                build_df_after_query=build_df_after_query,
+            )
+        elif bundles_function == self.trade_rows_for_bundles:
+            return self.trade_rows_for_dataframe(
+                **kwargs,
+                process_function=process_function,
+                fhir_paths=fhir_paths,
+                with_ref=False,
+                merge_on=merge_on,
+                build_df_after_query=build_df_after_query,
+            )
+        else:
+            raise ValueError(
+                f"The given function {bundles_function.__name__} "
+                f"cannot be used to obtain a dataframe."
+            )
diff --git a/pyproject.toml b/pyproject.toml
index 1eff5da..1e9cbf7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,9 @@
 [tool.poetry]
 name = "fhir-pyrate"
-version = "0.2.0-beta.6"
+version = "0.2.0-beta.7"
 description = "FHIR-PYrate is a package that provides a high-level API to query FHIR Servers for bundles of resources and return the structured information as pandas DataFrames. It can also be used to filter resources using RegEx and SpaCy and download DICOM studies and series."
 license = "MIT"
-authors = ["Giulia Baldini ", "Rene Hosch "]
+authors = ["Rene Hosch ", "Giulia Baldini "]
 readme = "README.md"
 repository = "https://github.com/UMEssen/FHIR-PYrate"
 keywords = ["python", "fhir", "data-science", "fhirpath", "healthcare"]
diff --git a/tests/test_public.py b/tests/test_public.py
index c0d15be..2fc1fd4 100644
--- a/tests/test_public.py
+++ b/tests/test_public.py
@@ -8,6 +8,7 @@
 import pandas as pd
 from bs4 import BeautifulSoup
+from pandas.testing import assert_frame_equal
 
 from fhir_pyrate import Ahoy, Miner, Pirate
 from fhir_pyrate.util import FHIRObj
@@ -394,12 +395,19 @@ def testStealBundles(self) -> None:
                 with self.subTest(
                     msg=f"build_after_query_{build_after_query}"
                 ):
-                    obs_df = search.steal_bundles_to_dataframe(
+                    obs_df1 = search.steal_bundles_to_dataframe(
                         resource_type="Observation",
                         num_pages=5,
                         build_df_after_query=build_after_query,
                     )
-                    assert len(obs_df) == first_length
+                    assert len(obs_df1) == first_length
+                    obs_df2 = search.query_to_dataframe(
+                        bundles_function=search.steal_bundles,
+                        resource_type="Observation",
+                        num_pages=5,
+                        build_df_after_query=build_after_query,
+                    )
+                    assert obs_df1.equals(obs_df2)
         search.close()
 
     def testSail(self) -> None:
@@ -429,16 +437,39 @@ def testSail(self) -> None:
             first_length = len(obs_df)
             for build_after_query in [True, False]:
                 with self.subTest(
-                    msg=f"build_after_query_{build_after_query}"
+                    msg=f"cache_{cache_val}_req_{d_requests}_build_{d_build}_"
+                    f"build_after_query_{build_after_query}"
                 ):
-                    obs_df = search.sail_through_search_space_to_dataframe(
+                    obs_df1 = search.sail_through_search_space_to_dataframe(
                         resource_type="Observation",
                         time_attribute_name="_lastUpdated",
                         date_init="2021-01-01",
                         date_end="2022-01-01",
                         build_df_after_query=build_after_query,
                     )
-                    assert len(obs_df) == first_length
+                    assert len(obs_df1) == first_length
+                    obs_df2 = search.query_to_dataframe(
+                        bundles_function=search.sail_through_search_space,
+                        resource_type="Observation",
+                        time_attribute_name="_lastUpdated",
+                        date_init="2021-01-01",
+                        date_end="2022-01-01",
+                        build_df_after_query=build_after_query,
+                    )
+                    sorted_obs1 = (
+                        obs_df1.sort_index(axis=1)
+                        .sort_values(by="id")
+                        .reset_index(drop=True)
+                    )
+                    sorted_obs2 = (
+                        obs_df2.sort_index(axis=1)
+                        .sort_values(by="id")
+                        .reset_index(drop=True)
+                    )
+
+                    assert_frame_equal(
+                        sorted_obs1, sorted_obs2, check_dtype=False
+                    )
 
     def testTrade(self) -> None:
         trade_df = pd.DataFrame(["18262-6", "2571-8"], columns=["code"])
@@ -477,16 +508,40 @@ def testTrade(self) -> None:
             assert len(obs_df) == first_length
             for build_after_query in [True, False]:
                 with self.subTest(
-                    msg=f"build_after_query_{build_after_query}"
+                    msg=f"cache_{cache_val}_req_{d_requests}_build_{d_build}_"
+                    f"build_after_query_{build_after_query}"
                 ):
-                    obs_df = search.trade_rows_for_dataframe(
+                    obs_df1 = search.trade_rows_for_dataframe(
                         trade_df,
                         resource_type="Observation",
                         df_constraints={"code": "code"},
                         request_params={"_lastUpdated": "ge2020"},
                         build_df_after_query=build_after_query,
+                        with_ref=False,
+                    )
+                    assert len(obs_df1) == first_length
+                    obs_df2 = search.query_to_dataframe(
+                        df=trade_df,
+                        bundles_function=search.trade_rows_for_bundles,
+                        resource_type="Observation",
+                        df_constraints={"code": "code"},
+                        request_params={"_lastUpdated": "ge2020"},
+                        build_df_after_query=build_after_query,
+                    )
+                    sorted_obs1 = (
+                        obs_df1.sort_index(axis=1)
+                        .sort_values(by="id")
+                        .reset_index(drop=True)
+                    )
+                    sorted_obs2 = (
+                        obs_df2.sort_index(axis=1)
+                        .sort_values(by="id")
+                        .reset_index(drop=True)
+                    )
+
+                    assert_frame_equal(
+                        sorted_obs1, sorted_obs2, check_dtype=False
                     )
-                    assert len(obs_df) == first_length
 
 
 class ContraintsTest(unittest.TestCase):
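Notes on the changes above (not part of the diff):

The new `query_to_dataframe` wrapper only dispatches to the matching `*_to_dataframe` method, which is exactly the equivalence the updated tests assert. Below is a minimal sketch of both call styles, assuming the public HAPI R4 test server and an anonymous `Pirate` session as in the test suite; the server URL and the constructor arguments are assumptions, not taken from this diff.

```python
# Minimal sketch: the wrapper vs. the dedicated method. The base_url and the
# auth=None session are assumptions (a public test server), not part of the diff.
import pandas as pd

from fhir_pyrate import Pirate

search = Pirate(
    auth=None,
    base_url="http://hapi.fhir.org/baseR4",
    print_request_url=False,
)

# query_to_dataframe dispatches on the bundle-producing function ...
df_wrapper = search.query_to_dataframe(
    bundles_function=search.steal_bundles,
    resource_type="Observation",
    num_pages=1,
)
# ... and should match the dedicated *_to_dataframe method.
df_direct = search.steal_bundles_to_dataframe(
    resource_type="Observation",
    num_pages=1,
)
assert df_wrapper.equals(df_direct)

# For trade_rows_for_bundles, the wrapper forwards to trade_rows_for_dataframe
# with with_ref=False: with_ref (which keeps the constraint columns of the input
# DataFrame in the result) now defaults to True, which is why the tests pass
# with_ref=False when comparing against the plain bundle path.
trade_df = pd.DataFrame(["18262-6", "2571-8"], columns=["code"])
df_trade = search.query_to_dataframe(
    bundles_function=search.trade_rows_for_bundles,
    df=trade_df,
    resource_type="Observation",
    df_constraints={"code": "code"},
)
search.close()
```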
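The `_adjust_df_constraints` hunk fixes a double-encoding problem: the old code appended the percent-encoded pipe (`%7C`, the `system|value` separator of FHIR token search parameters) to every constraint value containing `http`, so adjusting an already-adjusted constraint produced `...%7C%7C`. The new condition makes the adjustment idempotent. A standalone sketch of the guard follows; `adjust_constraint` is a hypothetical helper that mirrors the new expression, not the library's private API.

```python
# Standalone sketch of the separator guard added in _adjust_df_constraints:
# a system URL in a (system, column) constraint gets the encoded pipe ("%7C")
# appended exactly once. adjust_constraint is a hypothetical helper, not part
# of the library's public API.
from typing import Tuple, Union


def adjust_constraint(
    column_constraint: Union[str, Tuple[str, str]]
) -> Union[str, Tuple[str, str]]:
    if isinstance(column_constraint, str):
        return column_constraint
    system, column = column_constraint
    # Only append the encoded pipe when the value looks like a system URL and
    # does not already contain "%7C" (the pre-change code appended it
    # unconditionally, so re-adjusting doubled the separator).
    suffix = "%7C" if "http" in system and "%7C" not in system else ""
    return system + suffix, column


assert adjust_constraint(("http://loinc.org", "code"))[0] == "http://loinc.org%7C"
# Idempotent: adjusting again no longer doubles the separator.
assert adjust_constraint(("http://loinc.org%7C", "code"))[0] == "http://loinc.org%7C"
assert adjust_constraint("code") == "code"
```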
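Finally, on the comparison strategy in `testSail` and `testTrade`: the two download paths may return the same data with rows and columns in a different order, so both frames are normalized before `assert_frame_equal`. A self-contained sketch of that normalization; the `normalize` helper name is illustrative, the test suite inlines these steps.

```python
# Sketch of the normalization used before assert_frame_equal in the updated
# tests: sort columns alphabetically, sort rows by the resource id, and
# realign the index, so that only content (not ordering) is compared.
import pandas as pd
from pandas.testing import assert_frame_equal


def normalize(df: pd.DataFrame) -> pd.DataFrame:
    return (
        df.sort_index(axis=1)  # order columns alphabetically
        .sort_values(by="id")  # order rows by the resource id
        .reset_index(drop=True)  # realign the index after sorting
    )


df1 = pd.DataFrame({"id": ["b", "a"], "value": [2, 1]})
df2 = pd.DataFrame({"value": [1, 2], "id": ["a", "b"]})
assert_frame_equal(normalize(df1), normalize(df2), check_dtype=False)
```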