Merge upstream main

scikit-learn-contrib · Jan 5, 2024 · d67d352 · d67d352
1 parent f6ec329
commit d67d352
Show file tree

Hide file tree

Showing 17 changed files with 1,307 additions and 1,408 deletions.
diff --git a/.github/workflows/deploy-pypi.yml b/.github/workflows/deploy-pypi.yml
@@ -12,7 +12,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.7", "3.8", "3.9"]
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
         os: [ubuntu-latest, macOS-latest, windows-latest]
     steps:
     - uses: actions/checkout@v2
@@ -29,7 +29,7 @@ jobs:
         python -m pip install pydocstyle==6.1.1
         python -m pip install pytest-pydocstyle==2.3.0
         python -m pip install pytest-cov==3.0.0
-        python -m pip install ray==1.13.0
+        python -m pip install ray
         python -m pip install 'importlib-metadata<4.3'
         python -m pip install .
     - name: Test with pytest

diff --git a/.github/workflows/test-pr.yml b/.github/workflows/test-pr.yml
@@ -30,9 +30,7 @@ jobs:
         python -m pip install pydocstyle==6.1.1
         python -m pip install pytest-pydocstyle==2.3.0
         python -m pip install pytest-cov==3.0.0
-        python -m pip install ray==1.13.0
-        python -m pip install pandas==1.4.2
-        python -m pip install numpy==1.23.4
+        python -m pip install ray
         python -m pip install .
     - name: Test with pytest
       run: |

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/docs/examples/plot_empty_levels.py b/docs/examples/plot_empty_levels.py
@@ -11,20 +11,24 @@
 .. figure:: ../algorithms/local_classifier_per_node.svg
    :align: center
 """
+import numpy as np
 from sklearn.linear_model import LogisticRegression
 
 from hiclass import LocalClassifierPerNode
 
 # Define data
 X_train = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
 X_test = [[9, 10], [7, 8], [5, 6], [3, 4], [1, 2]]
-Y_train = [
-    ["Bird"],
-    ["Reptile", "Snake"],
-    ["Reptile", "Lizard"],
-    ["Mammal", "Cat"],
-    ["Mammal", "Wolf", "Dog"],
-]
+Y_train = np.array(
+    [
+        ["Bird"],
+        ["Reptile", "Snake"],
+        ["Reptile", "Lizard"],
+        ["Mammal", "Cat"],
+        ["Mammal", "Wolf", "Dog"],
+    ],
+    dtype=object,
+)
 
 # Use random forest classifiers for every node
 rf = LogisticRegression()

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -7,5 +7,5 @@ sphinx-gallery==0.10.1
 matplotlib==3.5.2
 pandas==1.4.2
 ray==1.13.0
-numpy<1.24
+numpy
 git+https://github.com/charles9n/bert-sklearn.git@master
diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py
@@ -1,7 +1,6 @@
 """Shared code for all classifiers."""
 import abc
 import logging
-from typing import List
 
 import networkx as nx
 import numpy as np
@@ -24,19 +23,17 @@ def make_leveled(y):
 
     Parameters
     ----------
-    y : array-like of shape (n_samples, n_levels) or (n_samples, n_labels, n_levels) for multi-label classification
+    y : array-like of shape (n_samples, n_levels)
         The target values, i.e., hierarchical class labels for classification.
 
     Returns
     -------
-    leveled_y : array-like of shape (n_samples, n_levels) or (n_samples, n_labels, n_levels) for multi-label classification
+    leveled_y : array-like of shape (n_samples, n_levels)
         The leveled target values, i.e., hierarchical class labels for classification.
-        Y is returned only to pass sklearn's checks.
 
     Notes
     -----
     If rows are not iterable, returns the current y without modifications.
-    Y is returned only to pass sklearn's checks.
 
     Examples
     --------
@@ -46,28 +43,13 @@ def make_leveled(y):
     array([['a', ''],
        ['b', 'c']])
     """
-    # TODO: refactor this function to make it more readable
-    # TODO: add more tests, e.g., mixing different types and lists or np.ndarrays
-    if not isinstance(y, List):
-        return y
-    if not isinstance(y[0], List):
-        return y
-    elif not isinstance(y[0][0], List):
+    try:
         depth = max([len(row) for row in y])
-        leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y]
-        return np.array(leveled_y)
-    elif not isinstance(y[0][0][0], List):
-        rows = len(y)
-        multi_labels = max([len(row) for row in y])
-        levels = max([len(label) for row in y for label in row])
-        leveled_y = np.full((rows, multi_labels, levels), "", dtype=object)
-        for i, row in enumerate(y):
-            for j, multi_label in enumerate(row):
-                for k, label in enumerate(multi_label):
-                    leveled_y[i, j, k] = y[i][j][k]
-        return np.array(leveled_y)
-    else:
-        raise ValueError("y.ndim must be <= 3")
+    except TypeError:
+        return y
+    y = np.array(y, dtype=object)
+    leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y]
+    return np.array(leveled_y)
 
 
 class HierarchicalClassifier(abc.ABC):
@@ -79,25 +61,21 @@ class HierarchicalClassifier(abc.ABC):
     def __init__(
         self,
         local_classifier: BaseEstimator = None,
-        tolerance: float = None,
         verbose: int = 0,
         edge_list: str = None,
         replace_classifiers: bool = True,
         n_jobs: int = 1,
         bert: bool = False,
         classifier_abbreviation: str = "",
     ):
-        r"""
+        """
         Initialize a local hierarchical classifier.
 
         Parameters
         ----------
         local_classifier : BaseEstimator, default=LogisticRegression
             The local_classifier used to create the collection of local classifiers. Needs to have fit, predict and
             clone methods.
-        tolerance : float, default=None
-            The tolerance used to determine multi-labels. If set to None, only the child class with highest probability is predicted.
-            Otherwise, all child classes with :math:`probability >= max\_prob - tolerance` are predicted.
         verbose : int, default=0
             Controls the verbosity when fitting and predicting.
             See https://verboselogs.readthedocs.io/en/latest/readme.html#overview-of-logging-levels
@@ -116,7 +94,6 @@ def __init__(
             The abbreviation of the local hierarchical classifier to be displayed during logging.
         """
         self.local_classifier = local_classifier
-        self.tolerance = tolerance
         self.verbose = verbose
         self.edge_list = edge_list
         self.replace_classifiers = replace_classifiers
@@ -157,12 +134,6 @@ def _pre_fit(self, X, y, sample_weight):
         # Check that X and y have correct shape
         # and convert them to np.ndarray if need be
 
-        try:
-            if len(y) > 0:
-                y = make_leveled(y)
-        except TypeError:
-            pass
-
         if not self.bert:
             self.X_, self.y_ = self._validate_data(
                 X, y, multi_output=True, accept_sparse="csr", allow_nd=True
@@ -176,6 +147,8 @@ def _pre_fit(self, X, y, sample_weight):
         else:
             self.sample_weight_ = None
 
+        self.y_ = make_leveled(self.y_)
+
         # Create and configure logger
         self._create_logger()
 
@@ -224,7 +197,6 @@ def _create_logger(self):
             self.logger_.addHandler(ch)
 
     def _disambiguate(self):
-        # TODO: refactor this function to make it more readable
         self.separator_ = "::HiClass::Separator::"
         if self.y_.ndim == 2:
             new_y = []
@@ -236,22 +208,6 @@ def _disambiguate(self):
                     row.append(parent + self.separator_ + child)
                 new_y.append(np.asarray(row, dtype=np.str_))
             self.y_ = np.array(new_y)
-        if self.y_.ndim == 3:
-            new_y = []
-            for i in range(self.y_.shape[0]):
-                new_y.append([])
-                for j in range(self.y_.shape[1]):
-                    new_y[i].append([str(self.y_[i, j, 0])])
-                    for k in range(1, self.y_.shape[2]):
-                        new_cell = ""
-                        if new_y[i][j][k - 1] != "":
-                            new_cell = (
-                                new_y[i][j][k - 1]
-                                + self.separator_
-                                + str(self.y_[i, j, k])
-                            )
-                        new_y[i][j].append(new_cell)
-            self.y_ = np.array(new_y)
 
     def _create_digraph(self):
         # Create DiGraph
@@ -264,17 +220,13 @@ def _create_digraph(self):
 
         self._create_digraph_2d()
 
-        self._create_digraph_3d()
-
-        if self.y_.ndim > 3:
+        if self.y_.ndim > 2:
             # Unsuported dimension
             self.logger_.error(f"y with {self.y_.ndim} dimensions detected")
             raise ValueError(
                 f"Creating graph from y with {self.y_.ndim} dimensions is not supported"
             )
 
-        self._save_classes()
-
     def _create_digraph_1d(self):
         # Flatten 1D disguised as 2D
         if self.y_.ndim == 2 and self.y_.shape[1] == 1:
@@ -283,7 +235,6 @@ def _create_digraph_1d(self):
         if self.y_.ndim == 1:
             # Create max_levels_ variable
             self.max_levels_ = 1
-            self.ndim_ = 1
             self.logger_.info(f"Creating digraph from {self.y_.size} 1D labels")
             for label in self.y_:
                 self.hierarchy_.add_node(label)
@@ -292,7 +243,6 @@ def _create_digraph_2d(self):
         if self.y_.ndim == 2:
             # Create max_levels variable
             self.max_levels_ = self.y_.shape[1]
-            self.ndim_ = 2
             rows, columns = self.y_.shape
             self.logger_.info(f"Creating digraph from {rows} 2D labels")
             for row in range(rows):
@@ -307,34 +257,6 @@ def _create_digraph_2d(self):
                     elif parent != "" and column == 0:
                         self.hierarchy_.add_node(parent)
 
-    def _create_digraph_3d(self):
-        if self.y_.ndim == 3:
-            self.max_multi_labels = self.y_.shape[1]
-            self.max_levels_ = self.y_.shape[2]
-            self.ndim_ = 3
-            rows, multi_labels, columns = self.y_.shape
-            self.logger_.info(f"Creating digraph from {rows} 3D labels")
-            for row in range(rows):
-                for multi_label in range(multi_labels):
-                    for column in range(columns - 1):
-                        parent = self.y_[row, multi_label, column].split(
-                            self.separator_
-                        )[-1]
-                        child = self.y_[row, multi_label, column + 1].split(
-                            self.separator_
-                        )[-1]
-                        if parent != "" and child != "":
-                            # Only add edge if both parent and child are not empty
-                            self.hierarchy_.add_edge(
-                                self.y_[row, multi_label, column],
-                                self.y_[row, multi_label, column + 1],
-                            )
-                        elif parent != "" and column == 0:
-                            self.hierarchy_.add_node(parent)
-
-    def _save_classes(self):
-        self.classes_ = np.array(self.hierarchy_.nodes)
-
     def _export_digraph(self):
         # Check if edge_list is set
         if self.edge_list:
@@ -388,18 +310,9 @@ def _convert_to_1d(self, y):
     def _remove_separator(self, y):
         # Remove separator from predictions
         if y.ndim == 2:
-            rows, columns = y.shape
-            for row in range(rows):
-                for column in range(1, columns):
-                    y[row, column] = y[row, column].split(self.separator_)[-1]
-        elif y.ndim == 3:
-            rows, multi_labels, columns = y.shape
-            for row in range(rows):
-                for multi_label in range(multi_labels):
-                    for column in range(1, columns):
-                        y[row, multi_label, column] = y[row, multi_label, column].split(
-                            self.separator_
-                        )[-1]
+            for i in range(y.shape[0]):
+                for j in range(1, y.shape[1]):
+                    y[i, j] = y[i, j].split(self.separator_)[-1]
 
     def _fit_node_classifier(
         self, nodes, local_mode: bool = False, use_joblib: bool = False
@@ -411,9 +324,9 @@ def _fit_node_classifier(
                     local_mode=local_mode,
                     ignore_reinit_error=True,
                 )
-                classifier = ray.put(self)
+                lcppn = ray.put(self)
                 _parallel_fit = ray.remote(self._fit_classifier)
-                results = [_parallel_fit.remote(classifier, node) for node in nodes]
+                results = [_parallel_fit.remote(lcppn, node) for node in nodes]
                 classifiers = ray.get(results)
             else:
                 classifiers = Parallel(n_jobs=self.n_jobs)(

diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py
@@ -140,7 +140,7 @@ def predict(self, X):
 
         # Input validation
         if not self.bert:
-            X = check_array(X, accept_sparse="csr")
+            X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False)
         else:
             X = np.array(X)
 
@@ -250,7 +250,10 @@ def _fit_classifier(self, level, separator):
         if len(unique_y) == 1 and self.replace_classifiers:
             classifier = ConstantClassifier()
         if not self.bert:
-            classifier.fit(X, y, sample_weight)
+            try:
+                classifier.fit(X, y, sample_weight)
+            except TypeError:
+                classifier.fit(X, y)
         else:
             classifier.fit(X, y)
         return classifier