Skip to content

Commit

Permalink
added time_to_event columns (#2)
Browse files Browse the repository at this point in the history
* added time_to_event columns

* added the python-build.yml
  • Loading branch information
ChaoPang authored Sep 23, 2024
1 parent d8b8e35 commit 7dfc953
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 49 deletions.
93 changes: 93 additions & 0 deletions .github/workflows/python-build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI

on: push

jobs:
build:
name: Build distribution 📦
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.x"
- name: Install pypa/build
run: >-
python3 -m
pip install
build
--user
- name: Build a binary wheel and a source tarball
run: python3 -m build
- name: Store the distribution packages
uses: actions/upload-artifact@v4
with:
name: python-package-distributions
path: dist/

publish-to-pypi:
name: >-
Publish Python 🐍 distribution 📦 to PyPI
if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes
needs:
- build
runs-on: ubuntu-latest
environment:
name: pypi
url: https://pypi.org/p/cehrbert-data # PyPI project page for the cehrbert-data package
permissions:
id-token: write # IMPORTANT: mandatory for trusted publishing

steps:
- name: Download all the dists
uses: actions/download-artifact@v4
with:
name: python-package-distributions
path: dist/
- name: Publish distribution 📦 to PyPI
uses: pypa/gh-action-pypi-publish@release/v1

github-release:
name: >-
Sign the Python 🐍 distribution 📦 with Sigstore
and upload them to GitHub Release
needs:
- publish-to-pypi
runs-on: ubuntu-latest

permissions:
contents: write # IMPORTANT: mandatory for making GitHub Releases
id-token: write # IMPORTANT: mandatory for sigstore

steps:
- name: Download all the dists
uses: actions/download-artifact@v4
with:
name: python-package-distributions
path: dist/
- name: Sign the dists with Sigstore
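uses: sigstore/gh-action-sigstore-python@v2.1.1 # NOTE(review): action ref reconstructed from scraper-obfuscated text; confirm the version tag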
with:
inputs: >-
./dist/*.tar.gz
./dist/*.whl
- name: Create GitHub Release
env:
GITHUB_TOKEN: ${{ github.token }}
run: >-
gh release create
'${{ github.ref_name }}'
--repo '${{ github.repository }}'
--notes ""
- name: Upload artifact signatures to GitHub Release
env:
GITHUB_TOKEN: ${{ github.token }}
# Upload to GitHub Release using the `gh` CLI.
# `dist/` contains the built packages, and the
# sigstore-produced signatures and certificates.
run: >-
gh release upload
'${{ github.ref_name }}' dist/**
--repo '${{ github.repository }}'
105 changes: 56 additions & 49 deletions src/cehrbert_data/cohorts/spark_app_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,16 @@ class BaseCohortBuilder(ABC):
cohort_required_columns = ["person_id", "index_date", "visit_occurrence_id"]

def __init__(
self,
query_builder: QueryBuilder,
input_folder: str,
output_folder: str,
date_lower_bound: str,
date_upper_bound: str,
age_lower_bound: int,
age_upper_bound: int,
prior_observation_period: int,
post_observation_period: int,
self,
query_builder: QueryBuilder,
input_folder: str,
output_folder: str,
date_lower_bound: str,
date_upper_bound: str,
age_lower_bound: int,
age_upper_bound: int,
prior_observation_period: int,
post_observation_period: int,
):

self._query_builder = query_builder
Expand Down Expand Up @@ -256,41 +256,41 @@ def get_logger(cls):

class NestedCohortBuilder:
def __init__(
self,
cohort_name: str,
input_folder: str,
output_folder: str,
target_cohort: DataFrame,
outcome_cohort: DataFrame,
ehr_table_list: List[str],
observation_window: int,
hold_off_window: int,
prediction_start_days: int,
prediction_window: int,
num_of_visits: int,
num_of_concepts: int,
patient_splits_folder: str = None,
is_window_post_index: bool = False,
include_visit_type: bool = True,
allow_measurement_only: bool = False,
exclude_visit_tokens: bool = False,
is_feature_concept_frequency: bool = False,
is_roll_up_concept: bool = False,
include_concept_list: bool = True,
is_new_patient_representation: bool = False,
gpt_patient_sequence: bool = False,
is_hierarchical_bert: bool = False,
classic_bert_seq: bool = False,
is_first_time_outcome: bool = False,
is_questionable_outcome_existed: bool = False,
is_remove_index_prediction_starts: bool = False,
is_prediction_window_unbounded: bool = False,
is_observation_window_unbounded: bool = False,
is_population_estimation: bool = False,
att_type: AttType = AttType.CEHR_BERT,
exclude_demographic: bool = True,
use_age_group: bool = False,
single_contribution: bool = False,
self,
cohort_name: str,
input_folder: str,
output_folder: str,
target_cohort: DataFrame,
outcome_cohort: DataFrame,
ehr_table_list: List[str],
observation_window: int,
hold_off_window: int,
prediction_start_days: int,
prediction_window: int,
num_of_visits: int,
num_of_concepts: int,
patient_splits_folder: str = None,
is_window_post_index: bool = False,
include_visit_type: bool = True,
allow_measurement_only: bool = False,
exclude_visit_tokens: bool = False,
is_feature_concept_frequency: bool = False,
is_roll_up_concept: bool = False,
include_concept_list: bool = True,
is_new_patient_representation: bool = False,
gpt_patient_sequence: bool = False,
is_hierarchical_bert: bool = False,
classic_bert_seq: bool = False,
is_first_time_outcome: bool = False,
is_questionable_outcome_existed: bool = False,
is_remove_index_prediction_starts: bool = False,
is_prediction_window_unbounded: bool = False,
is_observation_window_unbounded: bool = False,
is_population_estimation: bool = False,
att_type: AttType = AttType.CEHR_BERT,
exclude_demographic: bool = True,
use_age_group: bool = False,
single_contribution: bool = False,
):
self._cohort_name = cohort_name
self._input_folder = input_folder
Expand Down Expand Up @@ -494,6 +494,13 @@ def build(self):
.where(F.col("num_of_concepts") >= self._num_of_concepts)
)

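# Add time_to_event: the number of days from index_date to study_end_date, where
# study_end_date is outcome_date when present, otherwise index_date + prediction_window days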
cohort = cohort.withColumn(
"study_end_date",
F.coalesce(F.col("outcome_date"), F.date_add(cohort.index_date, self._prediction_window))
)
cohort = cohort.withColumn("time_to_event", F.datediff("study_end_date", "index_date"))

# If a patient splits folder is provided, load the patient splits from that parquet folder
if self._patient_splits_folder:
patient_splits = self.spark.read.parquet(self._patient_splits_folder)
Expand Down Expand Up @@ -623,10 +630,10 @@ def get_logger(cls):


def create_prediction_cohort(
spark_args,
target_query_builder: QueryBuilder,
outcome_query_builder: QueryBuilder,
ehr_table_list,
spark_args,
target_query_builder: QueryBuilder,
outcome_query_builder: QueryBuilder,
ehr_table_list,
):
"""
TODO.
Expand Down
Empty file.

0 comments on commit 7dfc953

Please sign in to comment.