-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassifier.py
145 lines (97 loc) · 5.49 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#! /usr/bin/env python3
__author__ = "Michael Ripa"
__email__ = "[email protected]"
__version__ = "1.0.0"
__date__ = "August 26th 2022"
from global_variables import *
from similarity import Similarity
from preprocessing import load_df, find_pairs, check_transposition, create_training_labels
import pandas as pd
import numpy as np
from copy import deepcopy
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
class Classifier:
def __init__(self,similarity,n=None,classifier=LogisticRegression(),train_ratio = 0.5,standardize=True,df=GRADUATES_PATH):
'''Classifier - A class for initializing and training a classifier that is coupled to a Similarity instance for transforming non-numerical data into something that can be trained with a classifier model.
Inputs:
similarity : Similarity
Similarity instance used for generating numerical distance vectors used for the classifier.
n : int
Number of duplicates to train on. Defaults to training on all of the duplicates (ideal for use in production)
classifier : Scikit learn classifier instance
Linear classifier to be used by the model with hyperparameters of one's choice. Can incorporate a scikit learn pipeline granted the final estimator in the pipeline is a linear classifier. See https://scikit-learn.org/stable/modules/linear_model.html for examples of linear models that can be used and https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html for more details on scikit learn pipelines.
train_ratio : float
Ratio of duplicates to non-duplicates to train on.
standardize : bool
If True, performs standardization on the data before passing it into the classifier for training. Tends to improve model accuracy, see https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html for more details.
df : str or Pandas DataFrame
Path to dataset or Pandas df used for training the classifier.
Resulting Attributes:
df : Pandas DataFrame
DataFrame to be sampled from for training
n : int
Number of duplicates (to be) used in training
clf : Sklearn Pipeline
Pipeline setup with linear classifier
sim : Similarity()
Similarity instance used for computing distances
train_ratio : float
Ratio of duplicates to non-duplicates
'''
if type(df) == str:
try:
self.df = load_df(df,default_preprocessing=True)
except:
raise ValueError('Could not read ' + df + ' as a csv, please verify that the path is correct.')
else:
self.df = df
self.clf = classifier
if type(classifier) != sklearn.pipeline.Pipeline:
# Place the provided classifier into Scikit pipeline for consistency
if standardize:
self.clf = make_pipeline(StandardScaler(),classifier)
else:
self.clf = make_pipeline(classifier)
self.n = n
self.clf = classifier
self.train_ratio = train_ratio
self.sim = similarity
def init_classifier(self,training=False,test_size=0.4):
'''init_classifier(self,duplicates_df=DUPLICATE_GRADUATES_PATH,training=False,standardize=False)
Trains classifier model on pairs of duplicates and non-duplicates with repect to the hyperparameters passed in at initialization.
Inputs:
training : bool
If True, Creates a train test split of ratio 0.4, storing attributes `X_train`, `X_test`, `y_train` and `y_test` and training the classifier on `X_train`, `y_train`. If False, all of the data is used to train the model but none of it is stored.
test_size : float
Given `training` is True, this decides the ratio of the test set size to the train set size (i.e. ratio of `X_test` to `X_train`).
Resulting attributes:
n : int
n_true : int
Number of duplicates
n_false : int
Number of non-duplicates
'''
pairs = find_pairs(df=self.df,similarity=self.sim,n=self.n,duplicates=True,strict_pair=True,move_to_top=False,return_ratio=False)
#Compute number of non-duplicates with respect to train_ratio. Note that this computation works even if n = None.
n_true = len(pairs)
n_false = int((n_true - self.train_ratio*n_true)/self.train_ratio)
non_pairs = find_pairs(df=self.df,similarity=self.sim,n=n_false,duplicates=False,strict_pair=True,move_to_top=False,return_ratio=False)
#Compute distance vectors
X_true = self.sim.compute_similarities(pairs)
X_false = self.sim.compute_similarities(non_pairs)
X,y = create_training_labels(X_true,X_false)
if training:
#Generates training and test sets and stores internally
X, X_test, y, y_test = train_test_split(X,y,test_size=test_size)
self.X_train = X
self.y_train = y
self.X_test = X_test
self.y_test = y_test
self.clf.fit(X,y)
self.n = n_true + n_false
self.n_true = n_true
self.n_false = n_false