-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmain.py
147 lines (117 loc) · 4.95 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python
# coding: utf-8
############################################################################################################################
# This is the main script for training the DNA_bert model on the training data and evaluating on the test data #
# The model is trained and evaluated using the HuggingFace Trainer API #
# The trained model and tokenizer are saved in the "results/classification/model" directory #
# The training run is logged to Weights and Biases (wandb) #
############################################################################################################################
#########################################
### Importing the necessary libraries ###
#########################################
import wandb
import pandas as pd
import numpy as np
from pathlib import Path
from utils.data_utils import return_kmer, val_dataset_generator, HF_dataset
from utils.model_utils import load_model, compute_metrics
from utils.viz_utils import count_plot
from transformers import Trainer, TrainingArguments
############################################
### Reading the training and test data ####
############################################
# Constants and input locations.
KMER = 3  # k-mer length; also selects which pretrained DNA_bert checkpoint is loaded
SEQ_MAX_LEN = 512  # maximum sequence length for BERT
training_data_path = Path("data/TrainingData/Trainingdata.csv")
eval_data_path = Path("data/TestData/Testdata-2.csv")

# Training split: turn every sequence into its k-mer representation and
# shift every class id down by one.
# NOTE(review): the "- 1" presumably maps 1-based CSV classes to 0-based
# labels -- confirm against the data files.
df_training = pd.read_csv(training_data_path)
train_kmers = [return_kmer(sequence, K=KMER) for sequence in df_training["SEQ"]]
labels_train = [class_id - 1 for class_id in df_training["CLASS"]]

# The number of output classes is the number of distinct training labels.
NUM_CLASSES = np.unique(labels_train).size
count_plot(labels_train, "Training Class Distribution")

# Load the pretrained model together with its tokenizer.
model_config = dict(
    model_path=f"zhihan1996/DNA_bert_{KMER}",
    num_classes=NUM_CLASSES,
)
model, tokenizer, device = load_model(model_config, return_model=True)

# Encode the training k-mers as PyTorch tensors.
train_encodings = tokenizer.batch_encode_plus(
    train_kmers,
    # With Hugging Face tokenizers, padding=True pads to the longest
    # sequence in this call (not necessarily to max_length).
    padding=True,
    truncation=True,  # sequences longer than max_length are cut
    max_length=SEQ_MAX_LEN,
    return_attention_mask=True,
    return_tensors="pt",
)
train_input_ids = train_encodings["input_ids"]
train_attention_mask = train_encodings["attention_mask"]
train_dataset = HF_dataset(train_input_ids, train_attention_mask, labels_train)
# Validation split: the file at eval_data_path (Testdata-2) doubles as the
# validation set used during training.
df_val = pd.read_csv(eval_data_path)
val_kmers = [return_kmer(sequence, K=KMER) for sequence in df_val["SEQ"]]
# Same label shift as for the training split.
labels_val = [class_id - 1 for class_id in df_val["CLASS"]]
count_plot(labels_val, "Validation Class Distribution")

# Encode the validation k-mers as PyTorch tensors.
val_encodings = tokenizer.batch_encode_plus(
    val_kmers,
    # With Hugging Face tokenizers, padding=True pads to the longest
    # sequence in this call (not necessarily to max_length).
    padding=True,
    truncation=True,  # sequences longer than max_length are cut
    max_length=SEQ_MAX_LEN,
    return_attention_mask=True,
    return_tensors="pt",
)
val_input_ids = val_encodings["input_ids"]
val_attention_mask = val_encodings["attention_mask"]
val_dataset = HF_dataset(val_input_ids, val_attention_mask, labels_val)
############################################
### Training and evaluating the model #####
############################################
# All artefacts of this run (checkpoints and the final model) live under here.
results_dir = Path("./results/classification/")
results_dir.mkdir(parents=True, exist_ok=True)
EPOCHS = 10
BATCH_SIZE = 8

# Start the Weights & Biases run and record the model configuration with it.
wandb.init(project="DNA_bert", name=model_config["model_path"])
wandb.config.update(model_config)

training_args = TrainingArguments(
    # `output_dir` is documented as a string, so the Path is converted
    # explicitly instead of relying on the library to normalise path-likes.
    output_dir=str(results_dir / "checkpoints"),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,  # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,  # weight decay coefficient
    logging_steps=60,  # log metrics every 60 steps
    # Evaluation and checkpointing both happen once per epoch.
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the pinned version.
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # called on each evaluation, i.e. once per epoch
    # NOTE(review): recent transformers releases prefer `processing_class`
    # over `tokenizer` here -- confirm against the pinned version.
    tokenizer=tokenizer,
)
trainer.train()

# Save the model and tokenizer side by side.
model_path = results_dir / "model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
# Evaluate the trained model on every dataset yielded for data/TestData/.
# A dedicated loop variable is used so that the validation dataset bound to
# `val_dataset` is not overwritten by the loop.
eval_results = []
for test_dataset in val_dataset_generator(
    tokenizer, kmer_size=KMER, val_dir="data/TestData/"
):
    eval_results.append(trainer.evaluate(test_dataset))

if eval_results:
    # Average eval_accuracy and eval_f1 over the per-dataset result dicts.
    avg_acc = np.mean([res["eval_accuracy"] for res in eval_results])
    avg_f1 = np.mean([res["eval_f1"] for res in eval_results])
    print(f"Average accuracy: {avg_acc}")
    print(f"Average F1: {avg_f1}")
    wandb.log({"avg_acc": avg_acc, "avg_f1": avg_f1})
else:
    # np.mean of an empty list is nan (with a RuntimeWarning); do not log
    # meaningless averages when nothing was evaluated.
    print("No test datasets were evaluated; skipping average metrics.")

# Close the Weights & Biases run whether or not averages were logged.
wandb.finish()