From ff41bacd0d2f43961bf4ad290f4816892cedd6ff Mon Sep 17 00:00:00 2001
From: Bryan Lee
Date: Sat, 11 Jan 2025 01:32:14 -0800
Subject: [PATCH 1/4] adding scoring_df docstring

---
 .../compare_classifiers/scoring_df.py         | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 src/compare_classifiers/compare_classifiers/scoring_df.py

diff --git a/src/compare_classifiers/compare_classifiers/scoring_df.py b/src/compare_classifiers/compare_classifiers/scoring_df.py
new file mode 100644
index 0000000..3a3b289
--- /dev/null
+++ b/src/compare_classifiers/compare_classifiers/scoring_df.py
@@ -0,0 +1,24 @@
+def scoring_df(pipeline, X, y):
+    """
+    Evaluates the performance and timing of a scikit-learn pipeline. For each model
+    in the pipeline, the following metrics are calculated:
+    - F1 Score
+    - Accuracy
+    - Precision
+    - Recall
+
+    Parameters
+    ----------
+    - pipeline (Pipeline): A scikit-learn pipeline containing one or more models or transformers.
+    - X (array-like or DataFrame): Feature matrix for training and testing.
+    - y (array-like or DataFrame): Target vector for training and testing.
+
+    Returns:
+    - pd.DataFrame: A DataFrame containing performance metrics (F1 Score, Accuracy, Precision, Recall).
+    >>> pipeline = Pipeline([
+    ...     ('scaler', StandardScaler()),
+    ...     ('svc', SVC(kernel='linear', random_state=42)),
+    ...     ('random_forest', RandomForestClassifier(n_estimators=100))
+    ... ])
+    >>> scoring_df(pipeline, X_train, y_train)
+    """
\ No newline at end of file

From 5d03d32739a2717e510628f01c01de980a3cdaf0 Mon Sep 17 00:00:00 2001
From: Bryan Lee
Date: Sat, 11 Jan 2025 01:51:50 -0800
Subject: [PATCH 2/4] contributing md fixed

---
 CONTRIBUTING.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 91e73be..a6994f1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -46,13 +46,15 @@ Ready to contribute? Here's how to set up `compare_classifiers` for local develo
 2. Install `compare_classifiers` using `poetry`:
 
     ```console
-    $ poetry install
+    $ poetry install compare_classifiers
+    ```
 
 3. Use `git` (or similar) to create a branch for local development and make your changes:
 
    ```console
    $ git checkout -b name-of-your-bugfix-or-feature
+   ```
 
 4. When you're done making changes, check that your changes conform to any code formatting requirements and pass any tests.

From 028904d090c3ce0b67d9fd50775c46bea57bbf75 Mon Sep 17 00:00:00 2001
From: Bryan Lee
Date: Sat, 11 Jan 2025 15:12:32 -0800
Subject: [PATCH 3/4] making changes to docstring as recommended

---
 .../compare_classifiers/compare_f1.py         | 39 +++++++++++++++++++
 .../compare_classifiers/scoring_df.py         | 24 ------------
 2 files changed, 39 insertions(+), 24 deletions(-)
 create mode 100644 src/compare_classifiers/compare_classifiers/compare_f1.py
 delete mode 100644 src/compare_classifiers/compare_classifiers/scoring_df.py

diff --git a/src/compare_classifiers/compare_classifiers/compare_f1.py b/src/compare_classifiers/compare_classifiers/compare_f1.py
new file mode 100644
index 0000000..d8b3a54
--- /dev/null
+++ b/src/compare_classifiers/compare_classifiers/compare_f1.py
@@ -0,0 +1,39 @@
+def compare_f1(estimators, X, y):
+    """
+    Evaluates the performance and timing of a scikit-learn pipeline.
+    For each model in the list, the following metrics are calculated:
+    - F1 Score
+    - Accuracy
+    - Precision
+    - Recall
+
+    Parameters
+    ----------
+    - estimators : list or pandas series
+      A scikit-learn pipeline containing one or more models or transformers.
+
+    - X : Pandas Data frame
+      Feature matrix for training and testing.
+
+    - y : list or pandas series
+      Target vector for training and testing.
+
+    Returns:
+    --------
+    - pandas DataFrame
+      A DataFrame containing performance metrics (F1 Score, Accuracy,
+      Precision, Recall).
+
+    Example:
+    --------
+    >>> models = [('lr', LogisticRegression()), ('rf', RandomForestClassifier())]
+    ... # X_train = ... # feature matrix for training
+    ... # y_train = ... # target vector for training
+    >>> compare_f1(models, X_train, y_train)
+    """
+
+# Example usage:
+# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # generating training and testing split
+# estimators = [('lr', LogisticRegression()), ('rf', RandomForestClassifier())] # generating list of models to score
+# result = compare_f1(estimators, X_train, y_train)
+# print(result)
\ No newline at end of file
diff --git a/src/compare_classifiers/compare_classifiers/scoring_df.py b/src/compare_classifiers/compare_classifiers/scoring_df.py
deleted file mode 100644
index 3a3b289..0000000
--- a/src/compare_classifiers/compare_classifiers/scoring_df.py
+++ /dev/null
@@ -1,24 +0,0 @@
-def scoring_df(pipeline, X, y):
-    """
-    Evaluates the performance and timing of a scikit-learn pipeline. For each model
-    in the pipeline, the following metrics are calculated:
-    - F1 Score
-    - Accuracy
-    - Precision
-    - Recall
-
-    Parameters
-    ----------
-    - pipeline (Pipeline): A scikit-learn pipeline containing one or more models or transformers.
-    - X (array-like or DataFrame): Feature matrix for training and testing.
-    - y (array-like or DataFrame): Target vector for training and testing.
-
-    Returns:
-    - pd.DataFrame: A DataFrame containing performance metrics (F1 Score, Accuracy, Precision, Recall).
-    >>> pipeline = Pipeline([
-    ...     ('scaler', StandardScaler()),
-    ...     ('svc', SVC(kernel='linear', random_state=42)),
-    ...     ('random_forest', RandomForestClassifier(n_estimators=100))
-    ... ])
-    >>> scoring_df(pipeline, X_train, y_train)
-    """
\ No newline at end of file

From 75c6ad455d34c40e0d9ff188090e6a97518ef1c1 Mon Sep 17 00:00:00 2001
From: Susannah Sun
Date: Sat, 11 Jan 2025 16:18:15 -0800
Subject: [PATCH 4/4] make wording consistent across all function docstrings

---
 .../compare_classifiers/compare_f1.py         | 43 ++++++++----------
 .../ensemble_compare_f1.py                    |  2 +-
 2 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/src/compare_classifiers/compare_classifiers/compare_f1.py b/src/compare_classifiers/compare_classifiers/compare_f1.py
index d8b3a54..f4e2e67 100644
--- a/src/compare_classifiers/compare_classifiers/compare_f1.py
+++ b/src/compare_classifiers/compare_classifiers/compare_f1.py
@@ -1,39 +1,30 @@
 def compare_f1(estimators, X, y):
     """
-    Evaluates the performance and timing of a scikit-learn pipeline.
-    For each model in the list, the following metrics are calculated:
-    - F1 Score
-    - Accuracy
-    - Precision
-    - Recall
+    Show cross validation results, including fit time and f1 scores for each estimator.
 
     Parameters
     ----------
-    - estimators : list or pandas series
-      A scikit-learn pipeline containing one or more models or transformers.
+    estimators : list of tuples
+      A list of (name, estimator) tuples, consisting of the individual estimators to be processed through the voting or stacking classifying ensemble. Each tuple contains a string, the name/label of the estimator, and a model, the estimator itself, which implements
+      the scikit-learn API (`fit`, `predict`, etc.).
 
-    - X : Pandas Data frame
-      Feature matrix for training and testing.
+    X : Pandas data frame
+      Data frame containing training data along with n features.
 
-    - y : list or pandas series
-      Target vector for training and testing.
+    y : Pandas series
+      Target class labels for data in X.
 
     Returns:
     --------
-    - pandas DataFrame
-      A DataFrame containing performance metrics (F1 Score, Accuracy,
-      Precision, Recall).
+    Pandas data frame
+      A data frame showing cross validation results on training data, with 3 columns: fit_time, test_score, train_score and 1 row for each estimator.
 
     Example:
     --------
-    >>> models = [('lr', LogisticRegression()), ('rf', RandomForestClassifier())]
-    ... # X_train = ... # feature matrix for training
-    ... # y_train = ... # target vector for training
-    >>> compare_f1(models, X_train, y_train)
+    >>> estimators = [
+    ...     ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
+    ...     ('svm', make_pipeline(StandardScaler(), LinearSVC(random_state=42)))
+    ... ]
+    >>> compare_f1(estimators, X, y)
     """
-
-# Example usage:
-# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # generating training and testing split
-# estimators = [('lr', LogisticRegression()), ('rf', RandomForestClassifier())] # generating list of models to score
-# result = compare_f1(estimators, X_train, y_train)
-# print(result)
\ No newline at end of file
+    pass
\ No newline at end of file
diff --git a/src/compare_classifiers/compare_classifiers/ensemble_compare_f1.py b/src/compare_classifiers/compare_classifiers/ensemble_compare_f1.py
index 761b050..ab1cbea 100644
--- a/src/compare_classifiers/compare_classifiers/ensemble_compare_f1.py
+++ b/src/compare_classifiers/compare_classifiers/ensemble_compare_f1.py
@@ -25,7 +25,7 @@ def ensemble_compare_f1(estimators, X_train, y_train):
 ...     ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
 ...     ('svm', make_pipeline(StandardScaler(), LinearSVC(random_state=42)))
 ... ]
-    ensemble_compare_f1(estimators, X, y)
+    >>> ensemble_compare_f1(estimators, X, y)
     """
 # ...existing code...
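
For reference, the docstring finalized in PATCH 4/4 implies a fairly small body for compare_f1. Below is a minimal sketch of how it could be satisfied, assuming scikit-learn's cross_validate and pandas; the f1_macro scorer and the per-fold averaging are illustrative assumptions, not the package's confirmed implementation.

import pandas as pd
from sklearn.model_selection import cross_validate


def compare_f1(estimators, X, y):
    # One row of averaged cross-validation results per (name, estimator) pair,
    # using the fit_time, test_score and train_score columns named in the docstring.
    rows = {}
    for name, model in estimators:
        cv = cross_validate(
            model,
            X,
            y,
            scoring="f1_macro",  # assumption: macro-averaged F1
            return_train_score=True,
        )
        rows[name] = {
            "fit_time": cv["fit_time"].mean(),
            "test_score": cv["test_score"].mean(),
            "train_score": cv["train_score"].mean(),
        }
    return pd.DataFrame.from_dict(rows, orient="index")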