From c5dde44fc750cd4db4a4e3001fae16ec95cc7106 Mon Sep 17 00:00:00 2001
From: Gabriel Gimenez <gabriel.gimenez@loft.com.br>
Date: Thu, 4 Jul 2024 17:39:08 -0300
Subject: [PATCH] fix tests (#73)

* migrate packaging from setup.py to Poetry (pyproject.toml), replace black/flake8 with ruff, update dependencies and fix errors
---
 .github/workflows/auto-publish.yml | 44 +++++++++++++++----------
 .github/workflows/testing.yml      | 30 +++++++++--------
 .pre-commit-config.yaml            | 12 +++----
 Makefile                           | 10 +++---
 README.md                          |  2 +-
 examples/benchmarks/benchmark.py   |  7 ----
 mkdocs.yml                         |  5 ++-
 pyproject.toml                     | 36 +++++++++++++++++++++
 setup.py                           | 52 ------------------------------
 tests/test_feature_extractors.py   | 15 +++++----
 tests/test_metrics.py              |  2 +-
 xgbse/__init__.py                  |  5 ++-
 xgbse/_debiased_bce.py             |  2 +-
 xgbse/_feature_extractors.py       | 21 ++++++++----
 xgbse/_kaplan_neighbors.py         |  4 +--
 xgbse/_meta.py                     |  1 -
 16 files changed, 121 insertions(+), 127 deletions(-)
 create mode 100644 pyproject.toml
 delete mode 100644 setup.py

diff --git a/.github/workflows/auto-publish.yml b/.github/workflows/auto-publish.yml
index 1d6dda6..3e86f7d 100644
--- a/.github/workflows/auto-publish.yml
+++ b/.github/workflows/auto-publish.yml
@@ -6,20 +6,30 @@ jobs:
   release:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.8
-    - name: Install Tools
-      run: |
-        python -m pip install --upgrade pip
-        pip install setuptools wheel twine
-    - name: Package and Upload
-      env:
-        STACKMANAGER_VERSION: ${{ github.event.release.tag_name }}
-        TWINE_USERNAME: __token__
-        TWINE_PASSWORD: ${{ secrets.PYPI_APIKEY }}
-      run: |
-        python setup.py sdist bdist_wheel
-        twine upload dist/*
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.9
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.9"
+
+      - name: Install Poetry
+        run: |
+          curl -sSL https://install.python-poetry.org | python3 -
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+
+      - name: Install dependencies
+        run: |
+          poetry install
+
+      - name: Build package
+        run: |
+          poetry build
+
+      # Poetry reads the PyPI token from POETRY_PYPI_TOKEN_PYPI, so the
+      # secret is never interpolated into a shell command line.
+      - name: Publish package
+        env:
+          POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_APIKEY }}
+        run: |
+          poetry publish
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
index ee1bbc1..8bf7f87 100644
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -3,25 +3,29 @@ name: Code Checks
 on:
   push:
     branches:
-    - main
+      - main
   pull_request:
     branches:
-    - main
+      - main
 
 jobs:
   build:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.7]
-
+        python-version: ["3.9", "3.10", "3.11"]
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v1
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install Testing Dependencies
-      run: make install-dev
-    - name: Automated checks
-      run: make check
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install Poetry
+        uses: snok/install-poetry@v1
+        with:
+          version: 1.5.1  # pinned Poetry release used for CI installs
+          virtualenvs-create: true
+          virtualenvs-in-project: true
+      - name: Install dependencies
+        run: poetry install --no-interaction --no-root
+      - name: Automated checks
+        run: poetry run make check
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2676077..5ee5702 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -9,12 +9,8 @@ repos:
         -   id: check-yaml
         -   id: check-added-large-files
             args: ["--maxkb=2000"]
-    -   repo: https://gitlab.com/pycqa/flake8
-        rev: 8f9b4931b9a28896fb43edccb23016a7540f5b82
+    -   repo: https://github.com/astral-sh/ruff-pre-commit
+        rev: v0.5.0
         hooks:
-        -   id: flake8
-    -   repo: https://github.com/psf/black
-        rev: 20.8b1
-        hooks:
-          - id: black
-            language_version: python3.7
+        -    id: ruff
+        -    id: ruff-format
diff --git a/Makefile b/Makefile
index ec83024..6ad2330 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,13 @@
-black:
-	black xgbse setup.py tests/ --check
+format:
+	ruff format xgbse tests/ --check
 
-flake:
-	flake8 xgbse setup.py tests/
+lint:
+	ruff check xgbse tests/
 
 test:
 	pytest --cov-report term-missing --cov=xgbse tests/
 
-check: black flake test clean
+check: format lint test clean
 
 install:
 	python -m pip install -e .
diff --git a/README.md b/README.md
index 04c1cd8..c2cab96 100644
--- a/README.md
+++ b/README.md
@@ -409,7 +409,7 @@ To cite this repository:
   author = {Davi Vieira and Gabriel Gimenez and Guilherme Marmerola and Vitor Estima},
   title = {XGBoost Survival Embeddings: improving statistical properties of XGBoost survival analysis implementation},
   url = {http://github.com/loft-br/xgboost-survival-embeddings},
-  version = {0.2.3},
+  version = {0.3.1},
   year = {2021},
 }
 ```
diff --git a/examples/benchmarks/benchmark.py b/examples/benchmarks/benchmark.py
index 5b36510..29340d2 100644
--- a/examples/benchmarks/benchmark.py
+++ b/examples/benchmarks/benchmark.py
@@ -11,7 +11,6 @@
 
 
 def dataframe_to_xy(dataf, event_column, time_column):
-
     e = dataf.loc[:, event_column]
     t = dataf.loc[:, time_column]
     return dataf.drop([event_column, time_column], axis=1), convert_to_structured(t, e)
@@ -65,7 +64,6 @@ def predict(self):
         pass
 
     def test(self):
-
         self.predict()
         try:
             c_index = concordance_index(
@@ -172,7 +170,6 @@ def __init__(
         )
 
     def train(self):
-
         start = time.time()
         params = {"objective": self.objective}
 
@@ -226,14 +223,12 @@ def __init__(
         self.objective = objective
 
     def train(self):
-
         start = time.time()
 
         if self.model.__class__.__name__ not in [
             "XGBSEKaplanTree",
             "XGBSEBootstrapEstimator",
         ]:
-
             self.model.fit(
                 self.X_train,
                 self.y_train,
@@ -244,7 +239,6 @@ def train(self):
             )
 
         else:
-
             self.model.fit(self.X_train, self.y_train, time_bins=self.time_bins)
 
         self.training_time = time.time() - start
@@ -284,7 +278,6 @@ def __init__(
         )
 
     def train(self):
-
         T = self.train_dataset[self.time_column]
         E = self.train_dataset[self.event_column]
 
diff --git a/mkdocs.yml b/mkdocs.yml
index efb46b1..4c0a2db 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -19,13 +19,12 @@ nav:
       - converters: modules/converters.md
       - metrics: modules/metrics.md
   - Examples:
+      - Basic usage: examples/basic_usage.md
       - Confidence intervals: examples/confidence_interval.md
       - Extrapolation: examples/extrapolation_example.md
   - Benchmarks: benchmarks/benchmarks.md
 plugins:
-  - mkdocstrings:
-      watch:
-        - xgbse
+  - mkdocstrings
   - search
 copyright:
 theme:
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..4982678
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,36 @@
+[tool.poetry]
+name = "xgbse"
+version = "0.3.1"
+description = "Improving XGBoost survival analysis with embeddings and debiased estimators"
+authors = ["Loft Data Science Team <bandits@loft.com.br>"]
+readme = "README.md"
+packages = [{ include = "xgbse" }]
+repository = "https://github.com/loft-br/xgboost-survival-embeddings"
+
+[tool.poetry.dependencies]
+python = ">=3.9"
+xgboost = "^2.1.0"
+numpy = "^1.26.4"
+scikit-learn = "^1.5.0"
+pandas = "^2.2.0"
+joblib = "^1.4.2"
+lifelines = "^0.29.0"
+
+[tool.poetry.group.docs.dependencies]
+mkdocs = "^1.6.0"
+mkdocs-material = "^9.5.28"
+mkdocstrings = { version = ">=0.18", extras = ["python-legacy"] }
+
+
+[tool.poetry.group.dev.dependencies]
+pre-commit = "^3.7.1"
+pytest = "^8.2.2"
+pytest-cov = "^5.0.0"
+ruff = "^0.5.0"
+
+[tool.poetry.group.benchmark.dependencies]
+pycox = "0.2.1"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 7b7236d..0000000
--- a/setup.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import setuptools
-
-install_requires = [
-    "xgboost>=1.4.0",
-    "numpy>=1.18.4",
-    "scikit-learn>=0.22.2",
-    "pandas>=1.0.*",
-    "joblib>=0.15.1",
-    "lifelines>=0.25.4",
-]
-
-docs_packages = [
-    "mkdocs>=1.1",
-    "mkdocs-material>=4.6.3",
-    "mkdocstrings>=0.8.0",
-]
-
-dev_packages = [
-    "black>=19.10b0",
-    "flake8>=3.7.9",
-    "pre-commit>=2.7.1",
-    "pytest>=6.1.0",
-    "pytest-cov==2.10.1",
-] + docs_packages
-
-benchmark_packages = [
-    "pycox==0.2.1",
-]
-
-all_packages = install_requires + dev_packages + benchmark_packages
-
-with open("docs/index.md", "r", encoding="utf-8") as fh:
-    long_description = fh.read()
-
-setuptools.setup(
-    name="xgbse",
-    version="0.2.3",
-    author="Loft Data Science Team",
-    author_email="bandits@loft.com.br",
-    description="Improving XGBoost survival analysis with embeddings and debiased estimators",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    packages=setuptools.find_packages(),
-    install_requires=install_requires,
-    extras_require={
-        "docs": docs_packages,
-        "dev": dev_packages,
-        "all": all_packages,
-    },
-    python_requires=">=3.7",
-    url="https://github.com/loft-br/xgboost-survival-embeddings",
-)
diff --git a/tests/test_feature_extractors.py b/tests/test_feature_extractors.py
index a14fdd9..1e56733 100644
--- a/tests/test_feature_extractors.py
+++ b/tests/test_feature_extractors.py
@@ -31,18 +31,21 @@ def test_no_objective():
 
 def test_predict_leaves_early_stop():
     xgbse = FeatureExtractor()
+    early_stopping_rounds = 10
     xgbse.fit(
         X_train,
         y_train,
-        num_boost_round=10000,
+        num_boost_round=1000,
         validation_data=(X_valid, y_valid),
-        early_stopping_rounds=10,
+        early_stopping_rounds=early_stopping_rounds,
         verbose_eval=0,
     )
     prediction = xgbse.predict_leaves(X_test)
-    assert prediction.shape == (
-        X_test.shape[0],
-        xgbse.bst.best_iteration + 1,
+    assert prediction.shape[0] == X_test.shape[0]
+    assert (
+        xgbse.bst.best_iteration + 1  # 0-based, so >= this many trees
+        <= prediction.shape[1]
+        <= xgbse.bst.best_iteration + 1 + early_stopping_rounds
     )
 
 
@@ -64,7 +67,7 @@ def test_predict_hazard_early_stop():
     xgbse.fit(
         X_train,
         y_train,
-        num_boost_round=10000,
+        num_boost_round=1000,
         validation_data=(X_valid, y_valid),
         early_stopping_rounds=10,
         verbose_eval=0,
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index ccf685b..3440814 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -68,7 +68,7 @@ def is_dist_cal_return_correct_len():
 
 def is_dist_cal_return_correct_type():
     result = dist_calibration_score(y_train, km_survival, returns="all")
-    return type(result) == dict
+    return isinstance(result, dict)
 
 
 # testing
diff --git a/xgbse/__init__.py b/xgbse/__init__.py
index f085352..67f6571 100644
--- a/xgbse/__init__.py
+++ b/xgbse/__init__.py
@@ -2,11 +2,10 @@
 
 from ._debiased_bce import XGBSEDebiasedBCE
 from ._kaplan_neighbors import XGBSEKaplanNeighbors, XGBSEKaplanTree
-from ._stacked_weibull import XGBSEStackedWeibull
 from ._meta import XGBSEBootstrapEstimator
+from ._stacked_weibull import XGBSEStackedWeibull
 
-
-__version__ = "0.2.3"
+__version__ = "0.3.1"
 
 __all__ = [
     "XGBSEDebiasedBCE",
diff --git a/xgbse/_debiased_bce.py b/xgbse/_debiased_bce.py
index 6f1be31..83e71d9 100644
--- a/xgbse/_debiased_bce.py
+++ b/xgbse/_debiased_bce.py
@@ -292,7 +292,7 @@ def predict(self, X: pd.DataFrame, return_interval_probs: bool = False):
                 XGBoost model.
 
             return_interval_probs (Bool): Boolean indicating if interval probabilities
-             are to be returned. If False the cumulative survival is returned.
+                are to be returned. If False the cumulative survival is returned.
                 Default is False.
 
         Returns:
diff --git a/xgbse/_feature_extractors.py b/xgbse/_feature_extractors.py
index 83cbe80..0263b74 100644
--- a/xgbse/_feature_extractors.py
+++ b/xgbse/_feature_extractors.py
@@ -22,7 +22,7 @@ def __init__(
         """
         if not xgb_params:
             xgb_params = {}
-        xgb_params = check_xgboost_parameters(xgb_params)
+        xgb_params = check_xgboost_parameters(xgb_params, enable_categorical)
 
         self.xgb_params = xgb_params
         self.persist_train = False
@@ -119,21 +119,20 @@ def predict_leaves(self, X):
             raise ValueError("XGBoost model not fitted yet.")
 
         dmatrix = xgb.DMatrix(X, enable_categorical=self.enable_categorical)
-        return self.bst.predict(
-            dmatrix, pred_leaf=True, iteration_range=(0, self.bst.best_iteration + 1)
-        )
+        return self.bst.predict(dmatrix, pred_leaf=True)
 
     def predict_hazard(self, X):
         if not hasattr(self, "bst"):
             raise ValueError("XGBoost model not fitted yet.")
 
         return self.bst.predict(
-            xgb.DMatrix(X, enable_categorical=self.enable_categorical),
-            iteration_range=(0, self.bst.best_iteration + 1),
+            xgb.DMatrix(X, enable_categorical=self.enable_categorical)
         )
 
 
-def check_xgboost_parameters(xgb_params: Dict[str, Any]) -> Dict[str, Any]:
+def check_xgboost_parameters(
+    xgb_params: Dict[str, Any], enable_categorical: bool
+) -> Dict[str, Any]:
     """Check if XGBoost objective parameter is valid.
 
     Args:
@@ -145,6 +144,14 @@ def check_xgboost_parameters(xgb_params: Dict[str, Any]) -> Dict[str, Any]:
     Raises:
         ValueError: If XGBoost parameters are not valid for survival analysis.
     """
+    if enable_categorical:
+        if "tree_method" not in xgb_params:
+            xgb_params["tree_method"] = "hist"
+        if xgb_params["tree_method"] not in ("hist", "gpu_hist"):
+            raise ValueError(
+                "XGBoost tree_method must be either 'hist' or 'gpu_hist' for categorical features"
+            )
+
     if "objective" not in xgb_params:
         xgb_params["objective"] = "survival:aft"
     if xgb_params["objective"] not in ("survival:aft", "survival:cox"):
diff --git a/xgbse/_kaplan_neighbors.py b/xgbse/_kaplan_neighbors.py
index a725d62..1cc17d7 100644
--- a/xgbse/_kaplan_neighbors.py
+++ b/xgbse/_kaplan_neighbors.py
@@ -19,7 +19,7 @@
 DEFAULT_PARAMS_TREE = {
     "objective": "survival:cox",
     "eval_metric": "cox-nloglik",
-    "tree_method": "exact",
+    "tree_method": "hist",
     "max_depth": 100,
     "booster": "dart",
     "subsample": 1.0,
@@ -275,7 +275,7 @@ def __init__(
                 DEFAULT_PARAMS_TREE = {
                     "objective": "survival:cox",
                     "eval_metric": "cox-nloglik",
-                    "tree_method": "exact",
+                    "tree_method": "hist",
                     "max_depth": 100,
                     "booster": "dart",
                     "subsample": 1.0,
diff --git a/xgbse/_meta.py b/xgbse/_meta.py
index 438f69a..b679b94 100644
--- a/xgbse/_meta.py
+++ b/xgbse/_meta.py
@@ -5,7 +5,6 @@
 
 
 class XGBSEBootstrapEstimator(BaseEstimator):
-
     """
     Bootstrap meta-estimator for XGBSE models: