added time_to_event columns (#2)

* added time_to_event columns * added the python-build.yml
knatarajan-lab · Sep 23, 2024 · 7dfc953 · 7dfc953
1 parent d8b8e35
commit 7dfc953
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 49 deletions.
diff --git a/.github/workflows/python-build.yml b/.github/workflows/python-build.yml
@@ -0,0 +1,93 @@
+name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
+
+on: push
+
+jobs:
+  build:
+    name: Build distribution 📦
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: "3.x"
+    - name: Install pypa/build
+      run: >-
+        python3 -m
+        pip install
+        build
+        --user
+    - name: Build a binary wheel and a source tarball
+      run: python3 -m build
+    - name: Store the distribution packages
+      uses: actions/upload-artifact@v4
+      with:
+        name: python-package-distributions
+        path: dist/
+
+  publish-to-pypi:
+    name: >-
+      Publish Python 🐍 distribution 📦 to PyPI
+    if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes
+    needs:
+    - build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/cehrbert-data  # PyPI project page for this package
+    permissions:
+      id-token: write  # IMPORTANT: mandatory for trusted publishing
+
+    steps:
+    - name: Download all the dists
+      uses: actions/download-artifact@v4
+      with:
+        name: python-package-distributions
+        path: dist/
+    - name: Publish distribution 📦 to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
+
+  github-release:
+    name: >-
+      Sign the Python 🐍 distribution 📦 with Sigstore
+      and upload them to GitHub Release
+    needs:
+    - publish-to-pypi
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: write  # IMPORTANT: mandatory for making GitHub Releases
+      id-token: write  # IMPORTANT: mandatory for sigstore
+
+    steps:
+    - name: Download all the dists
+      uses: actions/download-artifact@v4
+      with:
+        name: python-package-distributions
+        path: dist/
+    - name: Sign the dists with Sigstore
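+      uses: sigstore/gh-action-sigstore-python@v3.0.0  # NOTE(review): version tag was lost to e-mail obfuscation in extraction; confirm against the repository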
+      with:
+        inputs: >-
+          ./dist/*.tar.gz
+          ./dist/*.whl
+    - name: Create GitHub Release
+      env:
+        GITHUB_TOKEN: ${{ github.token }}
+      run: >-
+        gh release create
+        '${{ github.ref_name }}'
+        --repo '${{ github.repository }}'
+        --notes ""
+    - name: Upload artifact signatures to GitHub Release
+      env:
+        GITHUB_TOKEN: ${{ github.token }}
+      # Upload to GitHub Release using the `gh` CLI.
+      # `dist/` contains the built packages, and the
+      # sigstore-produced signatures and certificates.
+      run: >-
+        gh release upload
+        '${{ github.ref_name }}' dist/**
+        --repo '${{ github.repository }}'
diff --git a/src/cehrbert_data/cohorts/spark_app_base.py b/src/cehrbert_data/cohorts/spark_app_base.py
@@ -87,16 +87,16 @@ class BaseCohortBuilder(ABC):
     cohort_required_columns = ["person_id", "index_date", "visit_occurrence_id"]
 
     def __init__(
-        self,
-        query_builder: QueryBuilder,
-        input_folder: str,
-        output_folder: str,
-        date_lower_bound: str,
-        date_upper_bound: str,
-        age_lower_bound: int,
-        age_upper_bound: int,
-        prior_observation_period: int,
-        post_observation_period: int,
+            self,
+            query_builder: QueryBuilder,
+            input_folder: str,
+            output_folder: str,
+            date_lower_bound: str,
+            date_upper_bound: str,
+            age_lower_bound: int,
+            age_upper_bound: int,
+            prior_observation_period: int,
+            post_observation_period: int,
     ):
 
         self._query_builder = query_builder
@@ -256,41 +256,41 @@ def get_logger(cls):
 
 class NestedCohortBuilder:
     def __init__(
-        self,
-        cohort_name: str,
-        input_folder: str,
-        output_folder: str,
-        target_cohort: DataFrame,
-        outcome_cohort: DataFrame,
-        ehr_table_list: List[str],
-        observation_window: int,
-        hold_off_window: int,
-        prediction_start_days: int,
-        prediction_window: int,
-        num_of_visits: int,
-        num_of_concepts: int,
-        patient_splits_folder: str = None,
-        is_window_post_index: bool = False,
-        include_visit_type: bool = True,
-        allow_measurement_only: bool = False,
-        exclude_visit_tokens: bool = False,
-        is_feature_concept_frequency: bool = False,
-        is_roll_up_concept: bool = False,
-        include_concept_list: bool = True,
-        is_new_patient_representation: bool = False,
-        gpt_patient_sequence: bool = False,
-        is_hierarchical_bert: bool = False,
-        classic_bert_seq: bool = False,
-        is_first_time_outcome: bool = False,
-        is_questionable_outcome_existed: bool = False,
-        is_remove_index_prediction_starts: bool = False,
-        is_prediction_window_unbounded: bool = False,
-        is_observation_window_unbounded: bool = False,
-        is_population_estimation: bool = False,
-        att_type: AttType = AttType.CEHR_BERT,
-        exclude_demographic: bool = True,
-        use_age_group: bool = False,
-        single_contribution: bool = False,
+            self,
+            cohort_name: str,
+            input_folder: str,
+            output_folder: str,
+            target_cohort: DataFrame,
+            outcome_cohort: DataFrame,
+            ehr_table_list: List[str],
+            observation_window: int,
+            hold_off_window: int,
+            prediction_start_days: int,
+            prediction_window: int,
+            num_of_visits: int,
+            num_of_concepts: int,
+            patient_splits_folder: str = None,
+            is_window_post_index: bool = False,
+            include_visit_type: bool = True,
+            allow_measurement_only: bool = False,
+            exclude_visit_tokens: bool = False,
+            is_feature_concept_frequency: bool = False,
+            is_roll_up_concept: bool = False,
+            include_concept_list: bool = True,
+            is_new_patient_representation: bool = False,
+            gpt_patient_sequence: bool = False,
+            is_hierarchical_bert: bool = False,
+            classic_bert_seq: bool = False,
+            is_first_time_outcome: bool = False,
+            is_questionable_outcome_existed: bool = False,
+            is_remove_index_prediction_starts: bool = False,
+            is_prediction_window_unbounded: bool = False,
+            is_observation_window_unbounded: bool = False,
+            is_population_estimation: bool = False,
+            att_type: AttType = AttType.CEHR_BERT,
+            exclude_demographic: bool = True,
+            use_age_group: bool = False,
+            single_contribution: bool = False,
     ):
         self._cohort_name = cohort_name
         self._input_folder = input_folder
@@ -494,6 +494,13 @@ def build(self):
             .where(F.col("num_of_concepts") >= self._num_of_concepts)
         )
 
+        # Add time_to_event
+        cohort = cohort.withColumn(
+            "study_end_date",
+            F.coalesce(F.col("outcome_date"), F.date_add(cohort.index_date, self._prediction_window))
+        )
+        cohort = cohort.withColumn("time_to_event", F.datediff("study_end_date", "index_date"))
+
         # if patient_splits is provided, we will
         if self._patient_splits_folder:
             patient_splits = self.spark.read.parquet(self._patient_splits_folder)
@@ -623,10 +630,10 @@ def get_logger(cls):
 
 
 def create_prediction_cohort(
-    spark_args,
-    target_query_builder: QueryBuilder,
-    outcome_query_builder: QueryBuilder,
-    ehr_table_list,
+        spark_args,
+        target_query_builder: QueryBuilder,
+        outcome_query_builder: QueryBuilder,
+        ehr_table_list,
 ):
     """
     TODO.

diff --git a/src/cehrbert_data/const/artificial_tokens.py b/src/cehrbert_data/const/artificial_tokens.py