main.py
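"""
Fine-tunes a BERT-based sentiment classifier on the Amazon food reviews dataset
(AMAZON-DATASET/Reviews.csv) and evaluates it on a held-out validation split.

Usage:
    python3 main.py train   # split the data, fine-tune, and save checkpoints to Models/
    python3 main.py eval    # load the saved checkpoint and score the saved validation split

Eval only works once training has produced at least one model checkpoint.
"""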
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader
import torch.optim as optim
import os
import dataLoader
import classifier
import sys
def read_and_shuffle(file):
    df = pd.read_csv(file, delimiter=',')
    # Random shuffle. sample() returns a new DataFrame, so reassign the result.
    df = df.sample(frac=1)
    return df
def get_train_and_val_split(df, splitRatio=0.8):
    train = df.sample(frac=splitRatio, random_state=201)
    val = df.drop(train.index)
    print("Number of Training Samples: ", len(train))
    print("Number of Validation Samples: ", len(val))
    return (train, val)
def get_max_length(reviews):
    return len(max(reviews, key=len))
def get_accuracy(logits, labels):
    # Get the index of the max value in each row, i.e. the predicted class.
    predictedClass = logits.max(dim=1)[1]
    # Get accuracy by averaging over the entire batch.
    acc = (predictedClass == labels).float().mean()
    return acc
def trainFunc(net, loss_func, opti, train_loader, test_loader, config):
    best_acc = 0
    for ep in range(config["epochs"]):
        # evaluate() switches the network to eval mode, so switch back to train mode every epoch.
        net.train()
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            opti.zero_grad()
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)
            logits = net(seq, attn_masks)
            loss = loss_func(m(logits), labels)
            loss.backward()
            opti.step()
            print("Iteration: ", it + 1)
            if (it + 1) % config["printEvery"] == 0:
                acc = get_accuracy(m(logits), labels)
                if not os.path.exists(config["outputFolder"]):
                    os.makedirs(config["outputFolder"])
                # Since a single epoch can take several hours, the model is saved at every
                # training-accuracy check rather than only at the end of an epoch.
                torch.save(net.state_dict(), os.path.join(config["outputFolder"], config["outputFileName"]))
                print("Iteration {} of epoch {} complete. Loss : {} Accuracy : {}".format(it + 1, ep + 1, loss.item(), acc))
        # Perform validation at the end of each epoch.
        val_acc, val_loss = evaluate(net, loss_func, test_loader, config)
        print("Validation Accuracy : {}, Validation Loss : {}".format(val_acc, val_loss))
        if val_acc > best_acc:
            print("Best validation accuracy improved from {} to {}, saving model...".format(best_acc, val_acc))
            best_acc = val_acc
            torch.save(net.state_dict(), os.path.join(config["outputFolder"], config["outputFileName"] + "_valTested_" + str(best_acc)))
def saveEvalAndTrain(train, val, config):
    # Persist the split so that a later "eval" run scores the same validation set.
    val.to_csv("AMAZON-DATASET/Validations.csv")
    train.to_csv("AMAZON-DATASET/Train.csv")
def evaluate(net, loss_func, dataloader, config):
    net.eval()
    mean_acc, mean_loss = 0, 0
    count = 0
    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)
            logits = net(seq, attn_masks)
            mean_loss += loss_func(m(logits), labels).item()
            mean_acc += get_accuracy(m(logits), labels).item()
            print("Validating reviews ", count * config["batchSize"], " - ", (count + 1) * config["batchSize"])
            count += 1
            # The entire validation set is around 0.1 million entries; the
            # validationFraction parameter controls what fraction of the shuffled
            # validation set the results are validated on.
            if count > config["validationFraction"] * len(dataloader.dataset):
                break
    return mean_acc / count, mean_loss / count
if __name__ == "__main__":
    arguments = sys.argv[1:]
    # Fall through to the usage message below if no mode was given.
    trainOrEval = arguments[0] if arguments else ""
    config = {
        "splitRatio": 0.8,
        "maxLength": 100,
        "printEvery": 5,
        "outputFolder": "Models",
        "outputFileName": "AmazonReviewClassifier.dat",
        "threads": 4,
        "batchSize": 64,
        "validationFraction": 0.0005,
        "epochs": 5,
        "forceCPU": False
    }
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if config["forceCPU"]:
        device = torch.device("cpu")
    config["device"] = device
    print('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
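    # Note: `tokenizer` is not referenced again in this script; presumably the
    # dataLoader module builds its own tokenizer when constructing the dataset
    # (an assumption based on this file alone).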
print("Configuration is: ", config)
# Read and shuffle input data.
df = read_and_shuffle("./AMAZON-DATASET/Reviews.csv")
num_classes = df['Score'].nunique()
config["num_classes"] = num_classes
print("Number of Target Output Classes:", num_classes)
totalDatasetSize = len(df)
# Group by the column Score. This helps you get distribution of the Review Scores.
symbols = df.groupby('Score')
scores_dist = []
for i in range(num_classes):
scores_dist.append(len(symbols.groups[i+1])/totalDatasetSize)
train, val = get_train_and_val_split(df, config["splitRatio"])
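    # The per-class frequencies in scores_dist are reused further below as the
    # weight vector for NLLLoss.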
    if trainOrEval == "eval":
        try:
            val = pd.read_csv("./AMAZON-DATASET/Validations.csv", delimiter=',')
        except FileNotFoundError:
            print("Could not find Validations.csv in AMAZON-DATASET; run training first, then rerun with the eval flag.")
    if trainOrEval == "train":
        saveEvalAndTrain(train, val, config)
    # You can set the length to the true max length from the dataset; it has been
    # reduced here for the sake of memory and quicker training.
    # T = get_max_length(reviews)
    T = config["maxLength"]
    train_set = dataLoader.AmazonReviewsDataset(train, T)
    val_set = dataLoader.AmazonReviewsDataset(val, T)
    '''
    Uncomment this block if you would like to see what is contained in token_ids, attention_masks and labels.
    _1, _2, _3 = train_set.__getitem__(0)
    print(_1, _2, _3)
    '''
    train_loader = DataLoader(train_set, batch_size=config["batchSize"], num_workers=config["threads"])
    val_loader = DataLoader(val_set, batch_size=config["batchSize"], num_workers=config["threads"])
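    # Each batch yielded by these loaders unpacks as (token_ids, attention_masks, labels),
    # matching the unpacking in trainFunc() and evaluate(). How AmazonReviewsDataset
    # tokenizes and pads the reviews to length T is assumed to live in dataLoader.py.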
    # We are unfreezing the BERT layers so as to be able to fine-tune and save a new BERT
    # model that is specific to the Sizeable food reviews dataset.
    net = classifier.SentimentClassifier(num_classes, config["device"], freeze_bert=False)
    net.to(config["device"])
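    # classifier.SentimentClassifier is assumed (based on how it is used here) to wrap BERT
    # with a classification head whose forward(seq, attn_masks) returns raw, unnormalised
    # logits of shape [batch, num_classes]; LogSoftmax is applied to them separately below.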
    weights = torch.tensor(scores_dist).to(config["device"])
    # Setting the loss function and optimizer.
    loss_func = nn.NLLLoss(weight=weights)
    opti = optim.Adam(net.parameters(), lr=2e-5)
    m = nn.LogSoftmax(dim=1)
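    # `m` turns the raw logits into log-probabilities; LogSoftmax followed by NLLLoss is
    # equivalent to applying nn.CrossEntropyLoss directly to the logits. The NLLLoss weights
    # are the raw class frequencies computed in scores_dist above.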
    if trainOrEval == "train":
        trainFunc(net, loss_func, opti, train_loader, val_loader, config)
    elif trainOrEval == "eval":
        model = classifier.SentimentClassifier(num_classes, device, freeze_bert=False)
        model.to(config["device"])
        try:
            model.load_state_dict(torch.load(os.path.join(config["outputFolder"], config["outputFileName"]), map_location=config["device"]))
            print("Loaded model at ", os.path.join(config["outputFolder"], config["outputFileName"]))
        except Exception:
            print("Failed to load the model weights; please check that the file", os.path.join(config["outputFolder"], config["outputFileName"]), "exists and is not corrupt.")
            sys.exit(1)
        val_acc, val_loss = evaluate(model, loss_func, val_loader, config)
        print("Accuracy: ", val_acc, "\nLoss: ", val_loss)
    else:
        print("Please give a command line argument of either 'train' or 'eval', for example: python3 main.py eval\nEval will only work once the first model from training is available.")