111 make current beta compatible with 010 (#112)
* Bump up version

* Add query_to_dataframe function to ensure compatibility with v0.1.0

* Fix text in examples

* Fix parameter inconsistency in query_to_dataframe

* Update tests with query_to_dataframe

* Adjust tests
giuliabaldini authored Feb 15, 2023
1 parent 0136e9b commit 9c07aaa
Showing 4 changed files with 164 additions and 75 deletions.
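The headline change is the new `query_to_dataframe` wrapper in `fhir_pyrate/pirate.py`, which restores the 0.1.0-style entry point on top of the current `*_to_dataframe` methods. A minimal sketch of the restored call, assuming an already initialised `Pirate` instance named `search` (the parameters mirror the updated tests and are purely illustrative):

```python
# Sketch: the 0.1.0-style wrapper restored by this commit.
# `search` is assumed to be an already initialised Pirate instance.
obs_df = search.query_to_dataframe(
    bundles_function=search.steal_bundles,  # any bundle-returning Pirate method
    resource_type="Observation",
    num_pages=5,
)
```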
39 changes: 8 additions & 31 deletions examples/1-simple-json-to-df.ipynb
@@ -9,10 +9,7 @@
"First, we initialize the needed classes."
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
"collapsed": false
}
},
{
@@ -52,26 +49,18 @@
")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"We can now start with our actual query. Let's assume, for this very simple case, that we know the\n",
" Observation ID. Then, we just need to call the following function with the Observation resource\n",
" and the request parameters to obtain our result.\n",
"\n",
"`query_to_dataframe` is a wrapper function that downloads a list of bundles using any function specified in `bundles_function`, and then uses a process function to build a DataFrame with the requested information. The default processing function returns the entire structure of the resource.\n"
" and the request parameters to obtain our result."
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
"collapsed": false
}
},
{
@@ -112,10 +101,7 @@
"observation_all"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@@ -129,10 +115,7 @@
"`fhir_paths` parameter."
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
"collapsed": false
}
},
{
@@ -181,10 +164,7 @@
"observation_values"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@@ -200,10 +180,7 @@
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
"collapsed": false
}
}
],
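The markdown cell above describes `query_to_dataframe` as a wrapper that first downloads a list of bundles with the function passed as `bundles_function` and then applies a processing function to build the DataFrame (by default flattening the whole resource structure). A minimal sketch of the query the notebook describes, assuming an initialised `Pirate` instance `search` and a placeholder Observation ID (the `_id` request parameter is an illustrative choice, not taken from the notebook):

```python
# Sketch only: fetch one known Observation and flatten it into a DataFrame.
observation_all = search.query_to_dataframe(
    bundles_function=search.steal_bundles,
    resource_type="Observation",
    request_params={"_id": "example-observation-id"},  # placeholder ID
    num_pages=1,
)
```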
125 changes: 91 additions & 34 deletions fhir_pyrate/pirate.py
@@ -392,38 +392,6 @@ def trade_rows_for_bundles(
tqdm_df_build=False,
)

def trade_rows_for_dataframe_with_ref(
self,
df: pd.DataFrame,
resource_type: str,
df_constraints: Dict[
str, Union[Union[str, Tuple[str, str]], List[Union[str, Tuple[str, str]]]]
],
process_function: Callable[[FHIRObj], Any] = flatten_data,
fhir_paths: List[Union[str, Tuple[str, str]]] = None,
request_params: Dict[str, Any] = None,
num_pages: int = -1,
merge_on: str = None,
) -> pd.DataFrame:
"""
Deprecated, use trade_rows_for_dataframe(..., with_ref=True) instead.
"""
warnings.warn(
"The trade_rows_for_dataframe_with_ref function is deprecated, please use "
"trade_rows_for_dataframe(..., with_ref=True) instead."
)
return self.trade_rows_for_dataframe(
df=df,
resource_type=resource_type,
df_constraints=df_constraints,
process_function=process_function,
fhir_paths=fhir_paths,
request_params=request_params,
num_pages=num_pages,
with_ref=True,
merge_on=merge_on,
)

def trade_rows_for_dataframe(
self,
df: pd.DataFrame,
@@ -435,7 +403,7 @@ def trade_rows_for_dataframe(
fhir_paths: List[Union[str, Tuple[str, str]]] = None,
request_params: Dict[str, Any] = None,
num_pages: int = -1,
with_ref: bool = False,
with_ref: bool = True,
with_columns: List[Union[str, Tuple[str, str]]] = None,
merge_on: str = None,
build_df_after_query: bool = False,
@@ -899,7 +867,12 @@ def _adjust_df_constraints(
if isinstance(column_constraint, str)
else (
column_constraint[0]
+ ("%7C" if "http" in column_constraint[0] else ""),
+ (
"%7C"
if "http" in column_constraint[0]
and "%7C" not in column_constraint[0]
else ""
),
column_constraint[1],
)
for column_constraint in list_of_constraints
@@ -1498,3 +1471,87 @@ def wrap(
)

return wrap

def query_to_dataframe(
self,
bundles_function: Callable,
process_function: Callable[[FHIRObj], Any] = flatten_data,
fhir_paths: List[Union[str, Tuple[str, str]]] = None,
build_df_after_query: bool = False,
merge_on: str = None,
**kwargs: Any,
) -> pd.DataFrame:
"""
Wrapper function that, given any of the functions that return bundles, builds the
DataFrame straight away.
:param bundles_function: The function that should be used to get the bundles,
e.g. self.sail_through_search_space, trade_rows_for_bundles
:param process_function: The transformation function that goes through the entries and
stores the values to save
:param fhir_paths: A list of FHIR paths (https://hl7.org/fhirpath/) to be used to build the
DataFrame; alternatively, a list of tuples can be used to specify the name of the
future column with (column_name, fhir_path). Please refer to the `bundles_to_dataframe`
function for notes on how to use the FHIR paths.
:param build_df_after_query: Whether the DataFrame should be built after all bundles have
been collected, or whether the bundles should be transformed right after retrieval
:param merge_on: The column to merge the results on after the computation. This is
useful when using includes: if you store the IDs in the same column, you can use that column
to merge all the rows into one; see the example below
:param kwargs: The arguments that will be passed to the `bundles_function` function;
please refer to the documentation of the respective methods.
:return: A pandas DataFrame containing the queried information
The following example will initially return one row for each entry, but with
`merge_on="patient_id"` we choose a column to run the merge on. Rows whose columns
complement each other (values present in one, empty in the other) are then merged,
so that each patient is represented by a single row.
```
df = search.query_to_dataframe(
bundles_function=search.steal_bundles,
resource_type="Patient",
request_params={
"_sort": "_id",
"_count": 10,
"birthdate": "ge1990",
"_revinclude": "Condition:subject",
},
fhir_paths=[
("patient_id", "Patient.id"),
("patient_id", "Condition.subject.reference.replace('Patient/', '')"),
"Patient.gender",
"Condition.code.coding.code",
],
num_pages=1,
merge_on="patient_id"
)
```
"""
if bundles_function == self.steal_bundles:
return self.steal_bundles_to_dataframe(
**kwargs,
process_function=process_function,
fhir_paths=fhir_paths,
merge_on=merge_on,
build_df_after_query=build_df_after_query,
)
elif bundles_function == self.sail_through_search_space:
return self.sail_through_search_space_to_dataframe(
**kwargs,
process_function=process_function,
fhir_paths=fhir_paths,
merge_on=merge_on,
build_df_after_query=build_df_after_query,
)
elif bundles_function == self.trade_rows_for_bundles:
return self.trade_rows_for_dataframe(
**kwargs,
process_function=process_function,
fhir_paths=fhir_paths,
with_ref=False,
merge_on=merge_on,
build_df_after_query=build_df_after_query,
)
else:
raise ValueError(
f"The given function {bundles_function.__name__} "
f"cannot be used to obtain a dataframe."
)
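
As the dispatch above makes explicit, `query_to_dataframe` simply forwards to the matching `*_to_dataframe` method (and, for `trade_rows_for_bundles`, to `trade_rows_for_dataframe` with `with_ref=False`). A sketch of the equivalence that the updated tests assert, with illustrative parameters:

```python
# Both calls are expected to produce the same DataFrame; the wrapper only dispatches.
df_wrapper = search.query_to_dataframe(
    bundles_function=search.sail_through_search_space,
    resource_type="Observation",
    time_attribute_name="_lastUpdated",
    date_init="2021-01-01",
    date_end="2022-01-01",
)
df_direct = search.sail_through_search_space_to_dataframe(
    resource_type="Observation",
    time_attribute_name="_lastUpdated",
    date_init="2021-01-01",
    date_end="2022-01-01",
)
```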
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,9 +1,9 @@
[tool.poetry]
name = "fhir-pyrate"
version = "0.2.0-beta.6"
version = "0.2.0-beta.7"
description = "FHIR-PYrate is a package that provides a high-level API to query FHIR Servers for bundles of resources and return the structured information as pandas DataFrames. It can also be used to filter resources using RegEx and SpaCy and download DICOM studies and series."
license = "MIT"
authors = ["Giulia Baldini <giulia.baldini@uk-essen.de>", "Rene Hosch <rene.hosch@uk-essen.de>"]
authors = ["Rene Hosch <rene.hosch@uk-essen.de>", "Giulia Baldini <giulia.baldini@uk-essen.de>"]
readme = "README.md"
repository = "https://github.com/UMEssen/FHIR-PYrate"
keywords = ["python", "fhir", "data-science", "fhirpath", "healthcare"]
71 changes: 63 additions & 8 deletions tests/test_public.py
@@ -8,6 +8,7 @@

import pandas as pd
from bs4 import BeautifulSoup
from pandas.testing import assert_frame_equal

from fhir_pyrate import Ahoy, Miner, Pirate
from fhir_pyrate.util import FHIRObj
@@ -394,12 +395,19 @@ def testStealBundles(self) -> None:
with self.subTest(
msg=f"build_after_query_{build_after_query}"
):
obs_df = search.steal_bundles_to_dataframe(
obs_df1 = search.steal_bundles_to_dataframe(
resource_type="Observation",
num_pages=5,
build_df_after_query=build_after_query,
)
assert len(obs_df) == first_length
assert len(obs_df1) == first_length
obs_df2 = search.query_to_dataframe(
bundles_function=search.steal_bundles,
resource_type="Observation",
num_pages=5,
build_df_after_query=build_after_query,
)
assert obs_df1.equals(obs_df2)
search.close()

def testSail(self) -> None:
@@ -429,16 +437,39 @@ def testSail(self) -> None:
first_length = len(obs_df)
for build_after_query in [True, False]:
with self.subTest(
msg=f"build_after_query_{build_after_query}"
msg=f"cache_{cache_val}_req_{d_requests}_build_{d_build}_"
f"build_after_query_{build_after_query}"
):
obs_df = search.sail_through_search_space_to_dataframe(
obs_df1 = search.sail_through_search_space_to_dataframe(
resource_type="Observation",
time_attribute_name="_lastUpdated",
date_init="2021-01-01",
date_end="2022-01-01",
build_df_after_query=build_after_query,
)
assert len(obs_df) == first_length
assert len(obs_df1) == first_length
obs_df2 = search.query_to_dataframe(
bundles_function=search.sail_through_search_space,
resource_type="Observation",
time_attribute_name="_lastUpdated",
date_init="2021-01-01",
date_end="2022-01-01",
build_df_after_query=build_after_query,
)
sorted_obs1 = (
obs_df1.sort_index(axis=1)
.sort_values(by="id")
.reset_index(drop=True)
)
sorted_obs2 = (
obs_df2.sort_index(axis=1)
.sort_values(by="id")
.reset_index(drop=True)
)

assert_frame_equal(
sorted_obs1, sorted_obs2, check_dtype=False
)

def testTrade(self) -> None:
trade_df = pd.DataFrame(["18262-6", "2571-8"], columns=["code"])
@@ -477,16 +508,40 @@ def testTrade(self) -> None:
assert len(obs_df) == first_length
for build_after_query in [True, False]:
with self.subTest(
msg=f"build_after_query_{build_after_query}"
msg=f"cache_{cache_val}_req_{d_requests}_build_{d_build}_"
f"build_after_query_{build_after_query}"
):
obs_df = search.trade_rows_for_dataframe(
obs_df1 = search.trade_rows_for_dataframe(
trade_df,
resource_type="Observation",
df_constraints={"code": "code"},
request_params={"_lastUpdated": "ge2020"},
build_df_after_query=build_after_query,
with_ref=False,
)
assert len(obs_df1) == first_length
obs_df2 = search.query_to_dataframe(
df=trade_df,
bundles_function=search.trade_rows_for_bundles,
resource_type="Observation",
df_constraints={"code": "code"},
request_params={"_lastUpdated": "ge2020"},
build_df_after_query=build_after_query,
)
sorted_obs1 = (
obs_df1.sort_index(axis=1)
.sort_values(by="id")
.reset_index(drop=True)
)
sorted_obs2 = (
obs_df2.sort_index(axis=1)
.sort_values(by="id")
.reset_index(drop=True)
)

assert_frame_equal(
sorted_obs1, sorted_obs2, check_dtype=False
)
assert len(obs_df) == first_length


class ContraintsTest(unittest.TestCase):
