Skip to content

Commit

Permalink
Merge upstream main
Browse files Browse the repository at this point in the history
  • Loading branch information
agemcipe committed Jan 5, 2024
1 parent f6ec329 commit d67d352
Show file tree
Hide file tree
Showing 17 changed files with 1,307 additions and 1,408 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/deploy-pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.7", "3.8", "3.9"]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
os: [ubuntu-latest, macOS-latest, windows-latest]
steps:
- uses: actions/checkout@v2
Expand All @@ -29,7 +29,7 @@ jobs:
python -m pip install pydocstyle==6.1.1
python -m pip install pytest-pydocstyle==2.3.0
python -m pip install pytest-cov==3.0.0
python -m pip install ray==1.13.0
python -m pip install ray
python -m pip install 'importlib-metadata<4.3'
python -m pip install .
- name: Test with pytest
Expand Down
4 changes: 1 addition & 3 deletions .github/workflows/test-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@ jobs:
python -m pip install pydocstyle==6.1.1
python -m pip install pytest-pydocstyle==2.3.0
python -m pip install pytest-cov==3.0.0
python -m pip install ray==1.13.0
python -m pip install pandas==1.4.2
python -m pip install numpy==1.23.4
python -m pip install ray
python -m pip install .
- name: Test with pytest
run: |
Expand Down
1,361 changes: 1,037 additions & 324 deletions Pipfile.lock

Large diffs are not rendered by default.

18 changes: 11 additions & 7 deletions docs/examples/plot_empty_levels.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,24 @@
.. figure:: ../algorithms/local_classifier_per_node.svg
:align: center
"""
import numpy as np
from sklearn.linear_model import LogisticRegression

from hiclass import LocalClassifierPerNode

# Define data
X_train = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
X_test = [[9, 10], [7, 8], [5, 6], [3, 4], [1, 2]]
Y_train = [
["Bird"],
["Reptile", "Snake"],
["Reptile", "Lizard"],
["Mammal", "Cat"],
["Mammal", "Wolf", "Dog"],
]
Y_train = np.array(
[
["Bird"],
["Reptile", "Snake"],
["Reptile", "Lizard"],
["Mammal", "Cat"],
["Mammal", "Wolf", "Dog"],
],
dtype=object,
)

# Use random forest classifiers for every node
rf = LogisticRegression()
Expand Down
2 changes: 1 addition & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ sphinx-gallery==0.10.1
matplotlib==3.5.2
pandas==1.4.2
ray==1.13.0
numpy<1.24
numpy
git+https://github.com/charles9n/bert-sklearn.git@master
121 changes: 17 additions & 104 deletions hiclass/HierarchicalClassifier.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Shared code for all classifiers."""
import abc
import logging
from typing import List

import networkx as nx
import numpy as np
Expand All @@ -24,19 +23,17 @@ def make_leveled(y):
Parameters
----------
y : array-like of shape (n_samples, n_levels) or (n_samples, n_labels, n_levels) for multi-label classification
y : array-like of shape (n_samples, n_levels)
The target values, i.e., hierarchical class labels for classification.
Returns
-------
leveled_y : array-like of shape (n_samples, n_levels) or (n_samples, n_labels, n_levels) for multi-label classification
leveled_y : array-like of shape (n_samples, n_levels)
The leveled target values, i.e., hierarchical class labels for classification.
Y is returned only to pass sklearn's checks.
Notes
-----
If rows are not iterable, returns the current y without modifications.
Y is returned only to pass sklearn's checks.
Examples
--------
Expand All @@ -46,28 +43,13 @@ def make_leveled(y):
array([['a', ''],
['b', 'c']])
"""
# TODO: refactor this function to make it more readable
# TODO: add more tests, e.g., mixing different types and lists or np.ndarrays
if not isinstance(y, List):
return y
if not isinstance(y[0], List):
return y
elif not isinstance(y[0][0], List):
try:
depth = max([len(row) for row in y])
leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y]
return np.array(leveled_y)
elif not isinstance(y[0][0][0], List):
rows = len(y)
multi_labels = max([len(row) for row in y])
levels = max([len(label) for row in y for label in row])
leveled_y = np.full((rows, multi_labels, levels), "", dtype=object)
for i, row in enumerate(y):
for j, multi_label in enumerate(row):
for k, label in enumerate(multi_label):
leveled_y[i, j, k] = y[i][j][k]
return np.array(leveled_y)
else:
raise ValueError("y.ndim must be <= 3")
except TypeError:
return y
y = np.array(y, dtype=object)
leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y]
return np.array(leveled_y)


class HierarchicalClassifier(abc.ABC):
Expand All @@ -79,25 +61,21 @@ class HierarchicalClassifier(abc.ABC):
def __init__(
self,
local_classifier: BaseEstimator = None,
tolerance: float = None,
verbose: int = 0,
edge_list: str = None,
replace_classifiers: bool = True,
n_jobs: int = 1,
bert: bool = False,
classifier_abbreviation: str = "",
):
r"""
"""
Initialize a local hierarchical classifier.
Parameters
----------
local_classifier : BaseEstimator, default=LogisticRegression
The local_classifier used to create the collection of local classifiers. Needs to have fit, predict and
clone methods.
tolerance : float, default=None
The tolerance used to determine multi-labels. If set to None, only the child class with highest probability is predicted.
Otherwise, all child classes with :math:`probability >= max\_prob - tolerance` are predicted.
verbose : int, default=0
Controls the verbosity when fitting and predicting.
See https://verboselogs.readthedocs.io/en/latest/readme.html#overview-of-logging-levels
Expand All @@ -116,7 +94,6 @@ def __init__(
The abbreviation of the local hierarchical classifier to be displayed during logging.
"""
self.local_classifier = local_classifier
self.tolerance = tolerance
self.verbose = verbose
self.edge_list = edge_list
self.replace_classifiers = replace_classifiers
Expand Down Expand Up @@ -157,12 +134,6 @@ def _pre_fit(self, X, y, sample_weight):
# Check that X and y have correct shape
# and convert them to np.ndarray if need be

try:
if len(y) > 0:
y = make_leveled(y)
except TypeError:
pass

if not self.bert:
self.X_, self.y_ = self._validate_data(
X, y, multi_output=True, accept_sparse="csr", allow_nd=True
Expand All @@ -176,6 +147,8 @@ def _pre_fit(self, X, y, sample_weight):
else:
self.sample_weight_ = None

self.y_ = make_leveled(self.y_)

# Create and configure logger
self._create_logger()

Expand Down Expand Up @@ -224,7 +197,6 @@ def _create_logger(self):
self.logger_.addHandler(ch)

def _disambiguate(self):
# TODO: refactor this function to make it more readable
self.separator_ = "::HiClass::Separator::"
if self.y_.ndim == 2:
new_y = []
Expand All @@ -236,22 +208,6 @@ def _disambiguate(self):
row.append(parent + self.separator_ + child)
new_y.append(np.asarray(row, dtype=np.str_))
self.y_ = np.array(new_y)
if self.y_.ndim == 3:
new_y = []
for i in range(self.y_.shape[0]):
new_y.append([])
for j in range(self.y_.shape[1]):
new_y[i].append([str(self.y_[i, j, 0])])
for k in range(1, self.y_.shape[2]):
new_cell = ""
if new_y[i][j][k - 1] != "":
new_cell = (
new_y[i][j][k - 1]
+ self.separator_
+ str(self.y_[i, j, k])
)
new_y[i][j].append(new_cell)
self.y_ = np.array(new_y)

def _create_digraph(self):
# Create DiGraph
Expand All @@ -264,17 +220,13 @@ def _create_digraph(self):

self._create_digraph_2d()

self._create_digraph_3d()

if self.y_.ndim > 3:
if self.y_.ndim > 2:
# Unsuported dimension
self.logger_.error(f"y with {self.y_.ndim} dimensions detected")
raise ValueError(
f"Creating graph from y with {self.y_.ndim} dimensions is not supported"
)

self._save_classes()

def _create_digraph_1d(self):
# Flatten 1D disguised as 2D
if self.y_.ndim == 2 and self.y_.shape[1] == 1:
Expand All @@ -283,7 +235,6 @@ def _create_digraph_1d(self):
if self.y_.ndim == 1:
# Create max_levels_ variable
self.max_levels_ = 1
self.ndim_ = 1
self.logger_.info(f"Creating digraph from {self.y_.size} 1D labels")
for label in self.y_:
self.hierarchy_.add_node(label)
Expand All @@ -292,7 +243,6 @@ def _create_digraph_2d(self):
if self.y_.ndim == 2:
# Create max_levels variable
self.max_levels_ = self.y_.shape[1]
self.ndim_ = 2
rows, columns = self.y_.shape
self.logger_.info(f"Creating digraph from {rows} 2D labels")
for row in range(rows):
Expand All @@ -307,34 +257,6 @@ def _create_digraph_2d(self):
elif parent != "" and column == 0:
self.hierarchy_.add_node(parent)

def _create_digraph_3d(self):
if self.y_.ndim == 3:
self.max_multi_labels = self.y_.shape[1]
self.max_levels_ = self.y_.shape[2]
self.ndim_ = 3
rows, multi_labels, columns = self.y_.shape
self.logger_.info(f"Creating digraph from {rows} 3D labels")
for row in range(rows):
for multi_label in range(multi_labels):
for column in range(columns - 1):
parent = self.y_[row, multi_label, column].split(
self.separator_
)[-1]
child = self.y_[row, multi_label, column + 1].split(
self.separator_
)[-1]
if parent != "" and child != "":
# Only add edge if both parent and child are not empty
self.hierarchy_.add_edge(
self.y_[row, multi_label, column],
self.y_[row, multi_label, column + 1],
)
elif parent != "" and column == 0:
self.hierarchy_.add_node(parent)

def _save_classes(self):
self.classes_ = np.array(self.hierarchy_.nodes)

def _export_digraph(self):
# Check if edge_list is set
if self.edge_list:
Expand Down Expand Up @@ -388,18 +310,9 @@ def _convert_to_1d(self, y):
def _remove_separator(self, y):
# Remove separator from predictions
if y.ndim == 2:
rows, columns = y.shape
for row in range(rows):
for column in range(1, columns):
y[row, column] = y[row, column].split(self.separator_)[-1]
elif y.ndim == 3:
rows, multi_labels, columns = y.shape
for row in range(rows):
for multi_label in range(multi_labels):
for column in range(1, columns):
y[row, multi_label, column] = y[row, multi_label, column].split(
self.separator_
)[-1]
for i in range(y.shape[0]):
for j in range(1, y.shape[1]):
y[i, j] = y[i, j].split(self.separator_)[-1]

def _fit_node_classifier(
self, nodes, local_mode: bool = False, use_joblib: bool = False
Expand All @@ -411,9 +324,9 @@ def _fit_node_classifier(
local_mode=local_mode,
ignore_reinit_error=True,
)
classifier = ray.put(self)
lcppn = ray.put(self)
_parallel_fit = ray.remote(self._fit_classifier)
results = [_parallel_fit.remote(classifier, node) for node in nodes]
results = [_parallel_fit.remote(lcppn, node) for node in nodes]
classifiers = ray.get(results)
else:
classifiers = Parallel(n_jobs=self.n_jobs)(
Expand Down
7 changes: 5 additions & 2 deletions hiclass/LocalClassifierPerLevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def predict(self, X):

# Input validation
if not self.bert:
X = check_array(X, accept_sparse="csr")
X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False)
else:
X = np.array(X)

Expand Down Expand Up @@ -250,7 +250,10 @@ def _fit_classifier(self, level, separator):
if len(unique_y) == 1 and self.replace_classifiers:
classifier = ConstantClassifier()
if not self.bert:
classifier.fit(X, y, sample_weight)
try:
classifier.fit(X, y, sample_weight)
except TypeError:
classifier.fit(X, y)
else:
classifier.fit(X, y)
return classifier
Expand Down
Loading

0 comments on commit d67d352

Please sign in to comment.