
Commit

created integration tests for generating pretraining data (generate_training_data) and finetuning data (hf_admission)
ChaoPang committed Sep 8, 2024
1 parent 1150a2a commit 48a293a
Showing 61 changed files with 43 additions and 51 deletions.
38 changes: 19 additions & 19 deletions .github/workflows/tests.yml
@@ -18,22 +18,22 @@ jobs:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.10.0
        uses: actions/setup-python@v3
        with:
          python-version: "3.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest
          pip install -e .
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with pytest
        run: |
          PYTHONPATH=./: pytest
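
The last step can be reproduced locally; a minimal Python sketch (an illustration, not part of the repository) that mirrors the workflow's PYTHONPATH=./: pytest step by putting the repository root on sys.path before handing control to pytest:

    # Local equivalent of the "Test with pytest" step above; run from the
    # repository root with pytest installed.
    import sys

    import pytest

    if __name__ == "__main__":
        sys.path.insert(0, ".")    # stands in for PYTHONPATH=./: in the workflow
        sys.exit(pytest.main([]))  # run the whole suite and propagate the exit code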
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@ build/
 .eggs/
 *.egg-info/
 *__pycache__/
+*venv*
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -29,7 +29,7 @@ dependencies = [
     "numpy==1.24.3",
     "packaging==23.2",
     "pandas==2.2.0",
-    "pyspark==3.2.2"
+    "pyspark==3.1.2"
 ]

 [tool.setuptools_scm]
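
Since PySpark is pinned to an exact version here, a quick way to confirm an environment actually picked up the pin is to compare versions at runtime; a small sketch, not part of the repository:

    # Sanity check: the installed PySpark should match the pin in pyproject.toml.
    import pyspark

    expected = "3.1.2"  # version pinned by this commit
    assert pyspark.__version__ == expected, (
        f"expected pyspark {expected}, found {pyspark.__version__}"
    )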
Binary and empty sample-data files added under sample_data/omop_sample/ (including the concept and person tables, e.g. concept/._SUCCESS.crc and person/._SUCCESS.crc); binary contents not shown.
1 change: 1 addition & 0 deletions src/cehrbert_data/utils/spark_utils.py
@@ -856,6 +856,7 @@ def extract_ehr_records(
         patient_ehr_records["person_id"],
         patient_ehr_records["standard_concept_id"],
         patient_ehr_records["date"],
+        patient_ehr_records["datetime"],
         patient_ehr_records["visit_occurrence_id"],
         patient_ehr_records["domain"],
         visit_occurrence["visit_concept_id"],
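
For context, the hunk above adds the event datetime to a column projection over joined patient-event and visit tables. A minimal PySpark sketch of that pattern follows; only the column and table names come from the diff, while the input paths, the join condition, and the surrounding code are assumptions:

    # Sketch of the projection extended above; the real code is in
    # src/cehrbert_data/utils/spark_utils.py around line 856.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("extract_ehr_records_sketch").getOrCreate()
    patient_ehr_records = spark.read.parquet("patient_ehr_records")  # hypothetical input
    visit_occurrence = spark.read.parquet("visit_occurrence")        # hypothetical input

    records = patient_ehr_records.join(
        visit_occurrence,
        patient_ehr_records["visit_occurrence_id"] == visit_occurrence["visit_occurrence_id"],
    ).select(
        patient_ehr_records["person_id"],
        patient_ehr_records["standard_concept_id"],
        patient_ehr_records["date"],
        patient_ehr_records["datetime"],  # the column this commit adds
        patient_ehr_records["visit_occurrence_id"],
        patient_ehr_records["domain"],
        visit_occurrence["visit_concept_id"],
    )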
36 changes: 14 additions & 22 deletions tests/integration_tests/test_generate_training_data.py
@@ -1,32 +1,24 @@
-import sys
 import unittest
 from ..pyspark_test_base import PySparkAbstract
-from cehrbert_data.spark_parse_args import create_spark_args
-from cehrbert_data.prediction_cohorts.hf_readmission import main
+from cehrbert_data.decorators.patient_event_decorator import AttType
+from cehrbert_data.apps.generate_training_data import main


 class HfReadmissionTest(PySparkAbstract):

     def test_run_pyspark_app(self):
-        sys.argv = [
-            "hf_readmission.py",
-            "--cohort_name", "hf_readmission",
-            "--input_folder", self.get_sample_data_folder(),
-            "--output_folder", self.get_output_folder(),
-            "--date_lower_bound", "1985-01-01",
-            "--date_upper_bound", "2023-12-31",
-            "--age_lower_bound", "18",
-            "--age_upper_bound", "100",
-            "--observation_window", "360",
-            "--prediction_start_days", "0",
-            "--prediction_window", "30",
-            "--include_visit_type",
-            "--is_new_patient_representation",
-            "--att_type", "cehr_bert",
-            "--ehr_table_list", "condition_occurrence", "procedure_occurrence", "drug_exposure"
-        ]
-
-        main(create_spark_args())
+        main(
+            input_folder=self.get_sample_data_folder(),
+            output_folder=self.get_output_folder(),
+            domain_table_list=["condition_occurrence", "drug_exposure", "procedure_occurrence"],
+            date_filter="1985-01-01",
+            include_visit_type=True,
+            is_new_patient_representation=True,
+            include_concept_list=False,
+            gpt_patient_sequence=True,
+            apply_age_filter=True,
+            att_type=AttType.DAY
+        )


 if __name__ == "__main__":
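
Presumably the point of the rewrite: the old test drove the unrelated hf_readmission CLI through sys.argv, whereas the new one calls generate_training_data's main() directly with keyword arguments, so this integration test no longer depends on argparse wiring.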
8 changes: 4 additions & 4 deletions tests/integration_tests/test_hf_readmission.py
@@ -1,13 +1,13 @@
 import sys
 import unittest
-from ..pyspark_test import PySparkAbstract
-from cehrbert_data.spark_parse_args import create_spark_args
+from ..pyspark_test_base import PySparkAbstract
+from cehrbert_data.utils.spark_parse_args import create_spark_args
 from cehrbert_data.prediction_cohorts.hf_readmission import main


 class HfReadmissionTest(PySparkAbstract):

-    def run_pyspark_app_test(self):
+    def test_run_pyspark_app(self):
         sys.argv = [
             "hf_readmission.py",
             "--cohort_name", "hf_readmission",
@@ -23,7 +23,7 @@ def run_pyspark_app_test(self):
             "--include_visit_type",
             "--is_new_patient_representation",
             "--att_type", "cehr_bert",
-            "--ehr_table_list", "condition_occurrence procedure_occurrence drug_exposure"
+            "--ehr_table_list", "condition_occurrence", "procedure_occurrence", "drug_exposure"
         ]

         main(create_spark_args())
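
The ehr_table_list change in the second hunk is more than cosmetic. Assuming the option is declared with nargs (the actual declaration in cehrbert_data.utils.spark_parse_args is not shown here), separate argv entries parse into a list of table names, while the old space-joined string would arrive as a single element; a small self-contained illustration:

    # Illustration of why the argv fix matters; the nargs="+" declaration is
    # an assumption about how --ehr_table_list is defined.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--ehr_table_list", nargs="+")

    fixed = parser.parse_args(
        ["--ehr_table_list", "condition_occurrence", "procedure_occurrence", "drug_exposure"]
    )
    broken = parser.parse_args(
        ["--ehr_table_list", "condition_occurrence procedure_occurrence drug_exposure"]
    )
    print(fixed.ehr_table_list)   # ['condition_occurrence', 'procedure_occurrence', 'drug_exposure']
    print(broken.ehr_table_list)  # ['condition_occurrence procedure_occurrence drug_exposure']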
8 changes: 3 additions & 5 deletions tests/pyspark_test_base.py
@@ -3,12 +3,10 @@
 import unittest
 import tempfile
 from pathlib import Path
-from abc import abstractmethod
-from cehrbert_data.spark_parse_args import create_spark_args
-from cehrbert_data.prediction_cohorts.hf_readmission import main
+from abc import abstractmethod, ABC


-class PySparkAbstract(unittest.TestCase):
+class PySparkAbstract(unittest.TestCase, ABC):

     @classmethod
     def setUpClass(cls):
@@ -34,7 +32,7 @@ def setUp(self):

     @abstractmethod
     def test_run_pyspark_app(self):
-        raise NotImplementedError("Not implemented yet")
+        pass

     def get_sample_data_folder(self):
         return self.data_folder
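
With the base class now abstract, each integration test only has to implement test_run_pyspark_app and can rely on the folder helpers shown above. A hypothetical minimal subclass, for illustration only:

    # Hypothetical concrete test; real ones live under tests/integration_tests/.
    # Only the base class and helper names come from the diff above.
    import unittest

    from tests.pyspark_test_base import PySparkAbstract


    class MyAppTest(PySparkAbstract):

        def test_run_pyspark_app(self):
            input_folder = self.get_sample_data_folder()  # provided by the base class
            output_folder = self.get_output_folder()      # provided by the base class
            # ... run the PySpark application under test with these folders ...
            self.assertIsNotNone(output_folder)


    if __name__ == "__main__":
        unittest.main()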
