#! /usr/bin/env python3

__author__ = "Michael Ripa"
__email__ = "[email protected]"
__version__ = "1.0.0"
__date__ = "August 26th 2022"

from global_variables import *
from similarity import Similarity

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
class Grad2Vec:

    def __init__(self, similarity, records):
        '''Grad2Vec: A class for converting non-numerical records (traditionally PSRS graduates) into numerical vectors, such that similar records have a lower Euclidean distance with respect to the embedding.

        Inputs:

        similarity : Similarity()
            Similarity instance used to aid in the creation of the embeddings.

        records : df
            Pandas DataFrame containing the records to be embedded as vectors.

        Resulting attributes:

        similarity : Similarity()
        records : df
        '''
        self.records = records
        self._skew_embeddings = False
        self.similarity = similarity
    def generate_embeddings(self, k=10, splits=10, default_basis=None, apply_changes=True, apply_mean=True):
        '''generate_embeddings(self, k=10, splits=10, default_basis=None, apply_changes=True, apply_mean=True)

        Creates a vectorized representation (or embedding) of the records DataFrame with respect to a random sample of the data.

        Inputs:

        k : int
            Size of the sample used for creating the embeddings.

        splits : int
            Number of groups the sample is combined into. Must divide k evenly.

        default_basis : Pandas DataFrame
            Sample to be used for creating the embedding (optional).

        apply_changes : Bool
            If True, updates the model with the new embeddings and meta info and returns `self`; if False, the embeddings array is returned directly.

        apply_mean : Bool
            If k != splits, multiple distance vectors are combined into each slot. Setting `apply_mean` to True takes their mean value, while False combines them via summation.

        Output:

        embeddings : NumPy array
            NumPy array of dimension n_records x (n_cols x n_splits).

        Resulting attributes:

        n_records : int
        k : int
        splits : int
        basis : Pandas DataFrame
        embeddings : NumPy array
            NumPy array of dimension n_records x (n_cols x n_splits).
        '''
        assert k % splits == 0

        n_records = len(self.records.index)
        basis = default_basis
        m = len(self.similarity.cols)

        # Obtain a basis if one was not provided
        if basis is None:
            basis = self._get_basis(k)

        embeddings = np.zeros((n_records, m * splits))
        cur_split = 0

        # Compute the distance to each "basis element" and add it to its slot in the
        # embedding (potentially adding to one or more pre-existing distances)
        for i in range(k):
            vector = basis.iloc[i:i+1, :]
            embeddings[:, cur_split*m : (1 + cur_split)*m] += self.similarity.compute_similarities(self.records, vector)
            cur_split = (cur_split + 1) % splits

        # Take the mean distance rather than the sum of distances for the embedding
        if apply_mean:
            embeddings = np.divide(embeddings, k // splits)

        if apply_changes:
            self.n_records = n_records
            self.k = k
            self.splits = splits
            self.basis = basis
            self.embeddings = embeddings
            return self

        return embeddings
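
    # Worked example (illustrative numbers, not from the source): with k=10 and
    # splits=5, the loop cycles cur_split through 0..4 twice, so each of the 5
    # embedding slots accumulates k/splits = 2 distance vectors of length m and
    # a record's embedding has m*5 entries. apply_mean=True then divides each
    # slot by 2, so entries are mean distances rather than sums.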
    def set_classifier_weights(self, weights):
        '''set_classifier_weights(self, weights)

        To improve accuracy, one can train a linear classifier (such as logistic regression) on pairwise duplicates and use the trained weights to skew the vector space according to how important each dimension is in determining similarity. Note that in order to preserve the raw embeddings, these weights are kept separate from the embeddings; it is left to a parent class to apply them to the embeddings when used in computations. In other words, `embeddings` is never modified at any point.

        Inputs:

        weights : float[]
            NumPy array of weights.

        Resulting attributes:

        weights : float[]
        '''
        self.weights = weights
        self._skew_embeddings = True
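
    # Hedged sketch (an assumption, not part of the source): a parent class
    # might apply the classifier weights elementwise before computing
    # distances, leaving `self.embeddings` untouched. Here `weights` is
    # assumed to hold one entry per embedding dimension (n_cols * n_splits):
    #
    #     weighted = model.embeddings * model.weights          # broadcast over rows
    #     d = np.linalg.norm(weighted[i] - weighted[j])        # skewed distance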
    # This function definitely has room for a more creative approach, such as
    # selecting a sample of very distinct records to use as a "basis" instead
    # of a totally random one.
    def _get_basis(self, k):
        '''_get_basis(self, k)

        Helper function which grabs a "basis", i.e. a sample of k records from `records`, all of which have non-null values. This "basis" gets used in creating the embeddings (every record has its distance taken with each basis element).
        '''
        cols = self.similarity.cols
        non_null = self.records[cols].notna().all(axis=1)
        # Sample without replacement so the basis contains k distinct records
        index = np.random.choice(np.arange(0, np.sum(non_null)), size=k, replace=False)
        return self.records[non_null][cols].iloc[index]
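
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): the sample records below and the
# Similarity constructor call are assumptions; only `cols` and
# `compute_similarities(records, vector)` are known from the code above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    records = pd.DataFrame({
        "first_name": ["Ada", "Ada", "Alan", "Grace"],
        "last_name": ["Lovelace", "Lovelce", "Turing", "Hopper"],  # near-duplicate pair
    })
    similarity = Similarity()  # hypothetical constructor arguments
    model = Grad2Vec(similarity, records)
    model.generate_embeddings(k=2, splits=2)
    print(model.embeddings.shape)  # (n_records, n_cols * splits)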