cumc-dbmi · ChaoPang · Sep 6, 2024 · Sep 5, 2024 · Sep 5, 2024 · Sep 5, 2024
diff --git a/.github/workflows/python-app.yml → .github/workflows/tests.yml b/.github/workflows/python-app.yml → .github/workflows/tests.yml
@@ -1,7 +1,7 @@
 # This workflow will install Python dependencies, run tests and lint with a single version of Python
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 
-name: Python application
+name: Tests
 
 on:
   push:
@@ -36,4 +36,4 @@ jobs:
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest
       run: |
-        PYTHONPATH=./: pytest
+        PYTHONPATH=./: pytest
diff --git a/.gitignore b/.gitignore
@@ -2,7 +2,7 @@
 .idea/
 .vscode/
 venv*
-
+dist/*
 
 *ipynb_checkpoints/
 *h5
@@ -35,4 +35,4 @@ cehr_transformers.egg-info/top_level.txt
 
 test_data
 test_dataset_prepared
-test*results
+test*results
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,83 @@
+# For documentation on pre-commit usage, see https://pre-commit.com/
+# This file should be updated quarterly by a developer running `pre-commit autoupdate`
+# with changes added and committed.
+# This will run all defined formatters prior to adding a commit.
+default_language_version:
+  python: python3  # or python3.10 to set a specific default version
+
+repos:
+  - repo: https://github.com/DanielNoord/pydocstringformatter
+    rev: 'v0.7.3'
+    hooks:
+      - id: pydocstringformatter
+
+  - repo: https://github.com/PyCQA/autoflake
+    rev: v2.2.0
+    hooks:
+      - id: autoflake
+
+  - repo: https://github.com/psf/black
+    rev: '24.1.1'
+    hooks:
+      - id: black
+        # It is recommended to specify the latest version of Python
+        # supported by your project here, or alternatively use
+        # pre-commit's default_language_version, see
+        # https://pre-commit.com/#top_level-default_language_version
+        # Pre-commit hook info from: https://black.readthedocs.io/en/stable/integrations/source_version_control.html
+        # Editor integration here:  https://black.readthedocs.io/en/stable/integrations/editors.html
+
+  - repo: https://github.com/adamchainz/blacken-docs
+    rev: "v1.12.1"  # replace with latest tag on GitHub
+    hooks:
+      - id: blacken-docs
+        additional_dependencies:
+          - black>=22.12.0
+
+  # General-purpose file hygiene and sanity checks. List this repo only once:
+  # a second entry pinned to a different rev would run trailing-whitespace,
+  # end-of-file-fixer and check-yaml twice, from two different versions.
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: 'v4.6.0'
+    hooks:
+      - id: trailing-whitespace
+        exclude: .git/COMMIT_EDITMSG
+      - id: end-of-file-fixer
+        exclude: .git/COMMIT_EDITMSG
+      - id: detect-private-key
+      - id: debug-statements
+      - id: check-json
+      - id: pretty-format-json
+      - id: check-yaml
+      - id: name-tests-test
+      - id: requirements-txt-fixer
+
+  # Pattern-based checks on Python sources (eval, log.warn, type comments).
+  - repo: https://github.com/pre-commit/pygrep-hooks
+    rev: 'v1.10.0'
+    hooks:
+      - id: python-no-eval
+      - id: python-no-log-warn
+      - id: python-use-type-annotations
+
+  # Line-ending (CRLF) and tab normalization.
+  - repo: https://github.com/Lucas-C/pre-commit-hooks
+    rev: v1.5.4
+    hooks:
+      - id: remove-crlf
+      - id: remove-tabs  # defaults to: 4
+        exclude: .git/COMMIT_EDITMSG
+
+  # Import sorting, run with isort's "black" profile.
+  - repo: https://github.com/PyCQA/isort.git
+    rev: 5.13.2
+    hooks:
+      - id: isort
+        args: [ "--profile", "black" ]
+
+  # Security linting; the check IDs passed to --skip are not reported.
+  - repo: https://github.com/PyCQA/bandit
+    rev: '1.7.7'
+    hooks:
+      - id: bandit
+        args: ["--skip", "B101,B106,B107,B301,B311,B105,B608,B403"]
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Department of Biomedical Informatics
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -57,15 +57,15 @@ pip install -e .[dev]
 
 Download [jtds-1.3.1.jar](jtds-1.3.1.jar) into the spark jars folder in the python environment
 ```console
-cp jtds-1.3.1.jar .venv/lib/python3.10/site-packages/pyspark/jars/ 
+cp jtds-1.3.1.jar .venv/lib/python3.10/site-packages/pyspark/jars/
 ```
 
 ## Instructions for Use with [MEDS](https://github.com/Medical-Event-Data-Standard/meds)
 
 ### 1. Convert MEDS to the [meds_reader](https://github.com/som-shahlab/meds_reader) database
 
 If you don't have the MEDS dataset, you could convert the OMOP dataset to the MEDS
-using [meds_etl](https://github.com/Medical-Event-Data-Standard/meds_etl). 
+using [meds_etl](https://github.com/Medical-Event-Data-Standard/meds_etl).
 We have prepared a synthea dataset with 1M patients for you to test, you could download it
 at [omop_synthea.tar.gz](https://drive.google.com/file/d/1k7-cZACaDNw8A1JRI37mfMAhEErxKaQJ/view?usp=share_link)
 ```console
@@ -115,7 +115,7 @@ The sequence can be seen conceptually as [VS] [V1] [VE] [ATT] [VS] [V2] [VE], wh
 concepts associated with those visits.
 
 ```console
-PYTHONPATH=./: spark-submit spark_apps/generate_training_data.py -i ~/Documents/omop_test/ -o ~/Documents/omop_test/cehr-bert -tc condition_occurrence procedure_occurrence drug_exposure -d 1985-01-01 --is_new_patient_representation -iv 
+PYTHONPATH=./: spark-submit spark_apps/generate_training_data.py -i ~/Documents/omop_test/ -o ~/Documents/omop_test/cehr-bert -tc condition_occurrence procedure_occurrence drug_exposure -d 1985-01-01 --is_new_patient_representation -iv
 ```
 
 ### 3. Pre-train CEHR-BERT
@@ -125,7 +125,7 @@ at `sample/patient_sequence` in the repo. CEHR-BERT expects the data folder to b
 ```console
 mkdir test_dataset_prepared;
 mkdir test_results;
-python -m cehrbert.runners.hf_cehrbert_pretrain_runner sample_configs/hf_cehrbert_pretrain_runner_config.yaml 
+python -m cehrbert.runners.hf_cehrbert_pretrain_runner sample_configs/hf_cehrbert_pretrain_runner_config.yaml
 ```
 
 If your dataset is large, you could add ```--use_dask``` in the command above
@@ -157,4 +157,4 @@ Chao Pang, Xinzhuo Jiang, Krishna S. Kalluri, Matthew Spotnitz, RuiJun Chen, Adl
 Perotte, and Karthik Natarajan. "Cehr-bert: Incorporating temporal information from
 structured ehr data to improve prediction tasks." In Proceedings of Machine Learning for
 Health, volume 158 of Proceedings of Machine Learning Research, pages 239–260. PMLR,
-04 Dec 2021.
+04 Dec 2021.
diff --git a/db_properties.ini b/db_properties.ini
@@ -2,4 +2,4 @@
 base_url = jdbc:jtds:sqlserver://servername:1433;useNTLMv2=true;domain=domain_name;databaseName=db
 driver = net.sourceforge.jtds.jdbc.Driver
 user = username
-password = password
+password = password
diff --git a/deepspeed_configs/zero1.json b/deepspeed_configs/zero1.json
@@ -19,4 +19,4 @@
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
   "wall_clock_breakdown": false
-}
+}
diff --git a/deepspeed_configs/zero2.json b/deepspeed_configs/zero2.json
@@ -23,4 +23,4 @@
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
   "wall_clock_breakdown": false
-}
+}
diff --git a/deepspeed_configs/zero3.json b/deepspeed_configs/zero3.json
@@ -27,4 +27,4 @@
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
   "wall_clock_breakdown": false
-}
+}
diff --git a/full_grid_search_config.ini b/full_grid_search_config.ini
@@ -8,4 +8,4 @@ val_4 = 1.2e-4
 val_1 = True
 
 [LSTM_UNIT]
-val_1 = 128
+val_1 = 128
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,7 @@ authors = [
 ]
 description = "CEHR-BERT: Incorporating temporal information from structured EHR data to improve prediction tasks"
 readme = "README.md"
+license = { text = "MIT License" }
 requires-python = ">=3.10.0"
 
 classifiers = [
@@ -47,7 +48,7 @@ dependencies = [
     "scikit-learn==1.4.0",
     "scipy==1.12.0",
     "tensorflow==2.15.0",
-    "tensorflow-metal==1.1.0; sys_platform == 'darwin'",  # macOS only
+    "tensorflow-metal==1.1.0; sys_platform == 'darwin'", # macOS only
     "tensorflow-datasets==4.5.2",
     "tqdm==4.66.1",
     "torch==2.4.0",
@@ -60,11 +61,25 @@ dependencies = [
 
 [tool.setuptools_scm]
 
+[project.urls]
+Homepage = "https://github.com/cumc-dbmi/cehr-bert"
+
 [project.scripts]
-cehrbert-pretraining = "cehrbert.runner.hf_cehrbert_pretrain_runner:main"
-cehrbert-finetuning = "cehrbert.runner.hf_cehrbert_finetuning_runner:main"
+cehrbert-pretraining = "cehrbert.runners.hf_cehrbert_pretrain_runner:main"  # package is "runners" (plural), as invoked in README
+cehrbert-finetuning = "cehrbert.runners.hf_cehrbert_finetuning_runner:main"  # NOTE(review): confirm this module name exists under runners/
 
 [project.optional-dependencies]
 dev = [
-    "pre-commit", "pytest", "pytest-cov", "pytest-subtests", "rootutils", "hypothesis"
+    "pre-commit", "pytest", "pytest-cov", "pytest-subtests", "rootutils", "hypothesis", "black"
 ]
+
+[tool.isort]
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+use_parentheses = true
+ensure_newline_before_comments = true
+line_length = 120
+
+[tool.black]
+line_length = 120
diff --git a/sample_configs/hf_cehrbert_pretrain_runner_meds_config.yaml b/sample_configs/hf_cehrbert_pretrain_runner_meds_config.yaml
@@ -50,4 +50,4 @@ logging_steps: 100
 save_total_limit:
 load_best_model_at_end: true
 metric_for_best_model: "eval_loss"
-greater_is_better: false
+greater_is_better: false
diff --git a/simple_grid_search_config.ini b/simple_grid_search_config.ini
@@ -5,4 +5,4 @@ val_1 = 1.0e-4
 val_1 = True
 
 [LSTM_UNIT]
-val_1 = 128
+val_1 = 128
diff --git a/src/cehrbert/__init__.py b/src/cehrbert/__init__.py
@@ -2,6 +2,7 @@
 
 It contains the main functions and classes needed to extract cohorts.
 """
+
 from importlib.metadata import PackageNotFoundError, version
 
 __package_name__ = "cehrbert"

diff --git a/src/cehrbert/config/grid_search_config.py b/src/cehrbert/config/grid_search_config.py
@@ -1,14 +1,13 @@
-from typing import NamedTuple, List
+from typing import List, NamedTuple
 
-LEARNING_RATE = 'LEARNING_RATE'
-LSTM_DIRECTION = 'LSTM_DIRECTION'
-LSTM_UNIT = 'LSTM_UNIT'
+LEARNING_RATE = "LEARNING_RATE"
+LSTM_DIRECTION = "LSTM_DIRECTION"
+LSTM_UNIT = "LSTM_UNIT"
 
 
 class GridSearchConfig(NamedTuple):
-    """
-    A data class for storing the row from the pandas data frame and the indexes for slicing the
-    """
+    """Hyperparameter value lists for a grid search: learning rates, LSTM directions and LSTM units."""
+
     learning_rates: List[float] = [1.0e-4]
     lstm_directions: List[bool] = [True]
     lstm_units: List[int] = [128]
diff --git a/src/cehrbert/config/output_names.py b/src/cehrbert/config/output_names.py
@@ -1,9 +1,9 @@
-PARQUET_DATA_PATH = 'patient_sequence'
-QUALIFIED_CONCEPT_LIST_PATH = 'qualified_concept_list'
-TIME_ATTENTION_MODEL_PATH = 'time_aware_model.h5'
-BERT_MODEL_VALIDATION_PATH = 'bert_model.h5'
-MORTALITY_DATA_PATH = 'mortality'
-HEART_FAILURE_DATA_PATH = 'heart_failure'
-HOSPITALIZATION_DATA_PATH = 'hospitalization'
-INFORMATION_CONTENT_DATA_PATH = 'information_content'
-CONCEPT_SIMILARITY_PATH = 'concept_similarity'
+PARQUET_DATA_PATH = "patient_sequence"
+QUALIFIED_CONCEPT_LIST_PATH = "qualified_concept_list"
+TIME_ATTENTION_MODEL_PATH = "time_aware_model.h5"
+BERT_MODEL_VALIDATION_PATH = "bert_model.h5"
+MORTALITY_DATA_PATH = "mortality"
+HEART_FAILURE_DATA_PATH = "heart_failure"
+HOSPITALIZATION_DATA_PATH = "hospitalization"
+INFORMATION_CONTENT_DATA_PATH = "information_content"
+CONCEPT_SIMILARITY_PATH = "concept_similarity"
diff --git a/src/cehrbert/const/common.py b/src/cehrbert/const/common.py
@@ -1,18 +1,28 @@
-PERSON = 'person'
-VISIT_OCCURRENCE = 'visit_occurrence'
-CONDITION_OCCURRENCE = 'condition_occurrence'
-PROCEDURE_OCCURRENCE = 'procedure_occurrence'
-DRUG_EXPOSURE = 'drug_exposure'
-DEVICE_EXPOSURE = 'device_exposure'
-OBSERVATION = 'observation'
-MEASUREMENT = 'measurement'
-CATEGORICAL_MEASUREMENT = 'categorical_measurement'
-OBSERVATION_PERIOD = 'observation_period'
-DEATH = 'death'
-CDM_TABLES = [PERSON, VISIT_OCCURRENCE, CONDITION_OCCURRENCE, PROCEDURE_OCCURRENCE, DRUG_EXPOSURE,
-              DEVICE_EXPOSURE, OBSERVATION, MEASUREMENT, CATEGORICAL_MEASUREMENT,
-              OBSERVATION_PERIOD, DEATH]
-REQUIRED_MEASUREMENT = 'required_measurement'
-UNKNOWN_CONCEPT = '[UNKNOWN]'
-CONCEPT = 'concept'
-CONCEPT_ANCESTOR = 'concept_ancestor'
+PERSON = "person"
+VISIT_OCCURRENCE = "visit_occurrence"
+CONDITION_OCCURRENCE = "condition_occurrence"
+PROCEDURE_OCCURRENCE = "procedure_occurrence"
+DRUG_EXPOSURE = "drug_exposure"
+DEVICE_EXPOSURE = "device_exposure"
+OBSERVATION = "observation"
+MEASUREMENT = "measurement"
+CATEGORICAL_MEASUREMENT = "categorical_measurement"
+OBSERVATION_PERIOD = "observation_period"
+DEATH = "death"
+CDM_TABLES = [
+    PERSON,
+    VISIT_OCCURRENCE,
+    CONDITION_OCCURRENCE,
+    PROCEDURE_OCCURRENCE,
+    DRUG_EXPOSURE,
+    DEVICE_EXPOSURE,
+    OBSERVATION,
+    MEASUREMENT,
+    CATEGORICAL_MEASUREMENT,
+    OBSERVATION_PERIOD,
+    DEATH,
+]
+REQUIRED_MEASUREMENT = "required_measurement"
+UNKNOWN_CONCEPT = "[UNKNOWN]"
+CONCEPT = "concept"
+CONCEPT_ANCESTOR = "concept_ancestor"
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,6 +2,7 @@ @@
     It contains the main functions and classes needed to extract cohorts.
     """
     from importlib.metadata import PackageNotFoundError, version
     __package_name__ = "cehrbert"
@@ Expand Down @@