# hw8_bidirectionalgru.py

# -*- coding: utf-8 -*-
"""hw8_bidirectionalGRU.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/15zOwcC8WiAndCzp6W5mB9Cv-Irb3sLoG
"""

#Import the libraries necessary
import gzip as gzip
import numpy as np
import torch
import torch.utils.data
import torchvision.transforms as tvt
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn
import copy as copy
import os
import pickle as pkl
import gensim.downloader as GENAPI
from gensim.models import KeyedVectors as kv

# # import os

# data_path = '/content/drive/MyDrive/BME 64600/hw8/data/'
# file_name = 'sentiment_dataset_train_400.tar.gz'
# print(os.listdir(data_path))

# full_path = os.path.join(data_path, file_name)
# print("Checking if file exists:", os.path.exists(full_path))

#Defining namespace function for creating lightweight objects to hold data
class Namespace:
  """Lightweight attribute bag: each keyword argument becomes an instance attribute."""

  def __init__(self, **kwargs):
    # Assign the fields one at a time instead of bulk-updating the instance dict
    for attr_name, attr_value in kwargs.items():
      setattr(self, attr_name, attr_value)

#Defining custom dataset loader class
class Dataset(torch.utils.data.Dataset):
  """Sentiment-review dataset whose reviews are embedded with word2vec vectors.

  Construction runs three steps in order: ``install`` (load or download the
  word vectors), ``load_data`` (read and embed the reviews) and ``preprocess``
  (convert every sample to tensors).

  Args:
    args: configuration object. ``args.path`` is the directory holding both the
      cached ``vectors.kv`` file and the dataset archive; ``args.data`` is the
      archive file name.

  Attributes:
    wordVectors: keyed vectors used to embed each word.
    categories: sorted category names taken from the positive reviews.
    maxLen: largest number of in-vocabulary words found in a single review.
    data: list of dicts with keys ``'review'``, ``'category'``, ``'sentiment'``.
  """

  def __init__(self, args):
    super().__init__()
    self.args = args
    self.install()
    self.load_data()
    self.preprocess()

  def install(self):
    """Load the cached word2vec vectors, downloading and caching them if absent."""
    path = self.args.path + "/vectors.kv"
    if os.path.exists(path):
      self.wordVectors = kv.load(path)
    else:
      self.wordVectors = GENAPI.load('word2vec-google-news-300')
      self.wordVectors.save(path)  # cache next to the data for later runs

  def load_data(self):
    """Read the gzip-compressed pickle and embed every review word by word.

    The archive is expected to unpickle to ``(positive, negative, vocab)``,
    where ``positive`` and ``negative`` map a category name to its reviews.
    Assumes each review is already a sequence of word tokens -- TODO confirm
    against the dataset files.

    Sets ``self.categories``, ``self.maxLen`` and ``self.data``; each entry of
    ``self.data`` is ``[word_vectors, category_name, binary_label]``.
    """
    self.maxLen = 0
    # The context manager closes the archive even if reading fails.
    with gzip.open(self.args.path + "/" + self.args.data, 'rb') as archive:
      raw = archive.read()
    # NOTE(security): unpickling can execute arbitrary code; only load dataset
    # files that come from a trusted source.
    positive, negative, _vocab = pkl.loads(raw, encoding='latin1')
    self.categories = sorted(positive)

    # Label 1 marks a positive review, label 0 a negative one.
    samples = [(review, category, 1) for category in positive for review in positive[category]]
    samples += [(review, category, 0) for category in negative for review in negative[category]]

    known_words = self.wordVectors.key_to_index
    data = []
    for review, category, label in samples:
      # Words missing from the embedding vocabulary are dropped.
      vectors = [self.wordVectors[word] for word in review if word in known_words]
      self.maxLen = max(self.maxLen, len(vectors))
      data.append([vectors, category, label])
    self.data = data

  def data_to_tensor(self, review, sentiment):
    """Convert one embedded review and its binary label to float tensors.

    Args:
      review: list of per-word embedding vectors.
      sentiment: integer class index (0 or 1) of the review.

    Returns:
      ``(review_tensor, one_hot_sentiment)``. The review tensor stacks the word
      vectors row-wise; a review with no vectors yields an empty 1-D tensor.
    """
    review_embeddings = np.array([np.array(word) for word in review])
    sentiment_embeddings = torch.zeros(2, dtype=torch.float32)
    sentiment_embeddings[sentiment] = 1
    # The one-hot vector is already a float tensor, so it is returned as is.
    return torch.FloatTensor(review_embeddings), sentiment_embeddings

  def preprocess(self):
    """Replace ``self.data`` with tensor dicts ready for a DataLoader."""
    processed = []
    for words, category, label in self.data:
      review, sentiment = self.data_to_tensor(words, label)
      processed.append({
          'review': review,
          'category': self.categories.index(category),
          'sentiment': sentiment,
      })
    self.data = processed

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    return self.data[idx]

#GRU bidirectional class
class GRUnet(torch.nn.Module):
  """Single-layer bidirectional GRU with a linear head and log-softmax output.

  Args:
    args: object exposing ``batch_size``, ``input_size``, ``hidden_size`` and
      ``output_size`` attributes.
  """

  def __init__(self, args):
    super().__init__()

    # Dimensions copied from the configuration object
    self.batchSize = args.batch_size
    self.inputSize = args.input_size
    self.hiddenSize = args.hidden_size
    self.outputSize = args.output_size
    self.numLayers = 1

    self.GRU = torch.nn.GRU(
        input_size=self.inputSize,
        hidden_size=self.hiddenSize,
        num_layers=self.numLayers,
        batch_first=True,
        bidirectional=True,
    )
    # Both directions are concatenated, so the head sees twice the hidden size
    self.fc = torch.nn.Linear(2 * self.hiddenSize, self.outputSize)
    self.ReLU = torch.nn.ReLU()
    self.softmax = torch.nn.LogSoftmax(dim=1)

  def forward(self, data, hidden):
    """Run the GRU and classify from the output at the last time step.

    Returns:
      ``(log_probabilities, next_hidden)``.
    """
    sequence_out, next_hidden = self.GRU(data, hidden)
    last_step = sequence_out[:, -1]
    log_probs = self.softmax(self.fc(self.ReLU(last_step)))
    return log_probs, next_hidden

  def init_hidden(self):
    """Return an all-zero hidden state matching the parameters' dtype and device."""
    reference = next(self.parameters())
    # Leading dimension 2: one state per direction of the single layer
    return torch.zeros(2, self.batchSize, self.hiddenSize,
                       dtype=reference.dtype, device=reference.device)

#Define function for training
def run_code_for_training(model, dataloader, args):
  """Train a deep copy of ``model`` and return it with the logged running losses.

  The caller's ``model`` is left untouched; the copy is moved to CUDA when
  available. Only the first sample of every batch is used (``review[0, ...]``),
  so the loop effectively requires a batch size of 1.

  NOTE(review): every model call receives a length-1 sequence, so the GRU's
  reverse direction never sees the tokens in reverse order -- confirm this
  word-by-word feeding is intended for a bidirectional model.

  Args:
    model: network exposing ``init_hidden()`` and ``forward(data, hidden)`` that
      returns ``(log_probabilities, hidden)``.
    dataloader: iterable of dicts with ``'review'`` and one-hot ``'sentiment'``
      tensors.
    args: configuration object. Optional attributes ``epochs`` (default 4),
      ``lr`` (default 0.001) and ``log_interval`` (default 100) override the
      built-in settings.

  Returns:
    ``(trained_model, loss_record)`` where ``loss_record`` holds one average
    loss per ``log_interval`` iterations.
  """
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  model = copy.deepcopy(model).to(device)
  model.train()
  criterion = torch.nn.NLLLoss()  # the model emits log-probabilities
  optimizer = torch.optim.Adam(model.parameters(), lr=getattr(args, 'lr', 0.001))
  epochs = getattr(args, 'epochs', 4)
  log_interval = getattr(args, 'log_interval', 100)
  loss_record = []

  for epoch in range(epochs):
    gruloss = 0.0
    steps_in_window = 0  # samples that actually contributed to gruloss

    for iteration, data in enumerate(dataloader):
      review = data['review'].to(device)

      # A review with no in-vocabulary words has a zero-length time axis; there
      # is nothing to feed the model, so no optimisation step is taken for it.
      if review.shape[1] > 0:
        # NLLLoss needs class indices, so the one-hot label is collapsed.
        target = data['sentiment'].to(device).argmax(dim=1)
        optimizer.zero_grad()
        hidden = model.init_hidden().to(device)

        for step in range(review.shape[1]):
          token = review[:1, step:step + 1]  # shape (1, 1, embedding_dim)
          output, hidden = model(token, hidden)

        loss = criterion(output, target)
        gruloss += loss.item()
        steps_in_window += 1
        loss.backward()
        optimizer.step()

      if (iteration + 1) % log_interval == 0:
        running_loss = gruloss / float(max(steps_in_window, 1))
        loss_record.append(running_loss)
        print("\n[epoch:%d, batch:%5d] loss: %.7f" %(epoch + 1, iteration + 1, running_loss))
        gruloss = 0.0
        steps_in_window = 0

  return model, loss_record


#Define function for testing (accuracy and confusion matrix)
def run_code_for_testing(model, dataloader, args):
  """Load saved weights into ``model`` and measure accuracy and a 2x2 confusion matrix.

  Only the first sample of every batch is evaluated (``review[0, ...]``), so
  the loop effectively requires a batch size of 1. Samples whose review has a
  zero-length time axis are skipped and not counted.

  Args:
    model: network exposing ``init_hidden()`` and ``forward(data, hidden)``; it
      is moved to the evaluation device and switched to eval mode in place.
    dataloader: iterable of dicts with ``'review'`` and one-hot ``'sentiment'``
      tensors.
    args: configuration object; ``args.model_path`` is the checkpoint to load.

  Returns:
    ``(accuracy_percent, confusion_matrix)``. The matrix is indexed
    ``[truth, prediction]``. Accuracy is 0.0 when no sample was evaluated.
  """
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  # map_location lets a checkpoint written on a GPU load on a CPU-only machine.
  model.load_state_dict(torch.load(args.model_path, map_location=device))
  model = model.to(device)
  model.eval()
  correct_count = 0.0
  total_count = 0.0
  confusionmatrix = torch.zeros(2, 2)
  with torch.no_grad():
    for data in dataloader:
      review = data['review'].to(device)
      sentiment = data['sentiment']
      if review.shape[1] == 0:
        # Nothing to feed the model; without this guard the previous sample's
        # output would be reused as the prediction.
        continue
      hidden = model.init_hidden().to(device)
      for step in range(review.shape[1]):
        token = review[:1, step:step + 1]  # shape (1, 1, embedding_dim)
        output, hidden = model(token, hidden)
      pred = torch.argmax(output).item()
      truth = torch.argmax(sentiment).item()

      if pred == truth:
        correct_count += 1
      total_count += 1

      confusionmatrix[truth, pred] += 1
  # Guard against an empty (or fully skipped) dataloader.
  accuracy = correct_count / total_count if total_count else 0.0
  print(accuracy * 100)
  print(confusionmatrix)
  return (accuracy * 100), confusionmatrix


def plotconfusionmatrix(accuracy, confusionMatrix, MODEL, save_dir='/content/drive/MyDrive/BME 64600/hw8'):
  """Draw the confusion matrix as an annotated heatmap, save it as a JPEG and show it.

  Args:
    accuracy: accuracy in percent, shown in the title with two decimals.
    confusionMatrix: 2x2 tensor of counts; converted with ``.int()`` for display.
    MODEL: model name used in the title and in the output file name.
    save_dir: directory receiving ``confusion_matrix_<MODEL>.jpg``. Defaults to
      the Google Drive homework folder used by the rest of the script.
  """
  title = f'Accuracy ({MODEL}): {accuracy:.2f}%'
  class_names = ['negative', 'positive']
  # Heatmap cells are one unit wide, so +0.5 centres each label on its cell.
  tick_positions = [idx + 0.5 for idx in range(len(class_names))]

  plt.rcParams['axes.facecolor'] = 'white'
  plt.figure(figsize = (9, 9))
  seaborn.heatmap(confusionMatrix.int(), annot=True, fmt='d')
  plt.title(title)
  plt.xlabel("Predicted Label")
  plt.xticks(tick_positions, class_names)
  plt.ylabel("True Label")
  plt.yticks(tick_positions, class_names)

  # Save before show(): the file is written while the figure is still current.
  plt.savefig(os.path.join(save_dir, f'confusion_matrix_{MODEL}.jpg'), bbox_inches='tight', dpi=800)
  plt.show()

#Main Script for code
if __name__ == '__main__':
  # Everything below runs only when the file is executed as a script, so that
  # importing the module does not start training or touch undefined names.
  args = Namespace(
                      path = '/content/drive/MyDrive/BME 64600/hw8/data',
                      data = 'sentiment_dataset_train_400.tar.gz',
                      types = 'train',
                      batch_size = 1,
                      input_size = 300,
                      hidden_size = 100,
                      output_size = 2,
                  )
  hw8_path = '/content/drive/MyDrive/BME 64600/hw8'

  # Training data
  dataset = Dataset(args)
  dataLoader = torch.utils.data.DataLoader(dataset=dataset, batch_size=args.batch_size, shuffle=True)

  #Train torch.nn.GRU bidirectional model
  args.model = 'bidirectionalGRU'
  args.model_path = os.path.join(args.path, 'bidirectionalGRU.pth')
  model = GRUnet(args)
  bidirectionalgrumodel, lossrecord_bidirectionalGRU = run_code_for_training(model, dataLoader, args)
  torch.save(bidirectionalgrumodel.state_dict(), args.model_path)  #Save model for bidirectional GRU

  #Plot the figures
  plt.figure()
  plt.title("bidirectionalGRU Training Loss vs. Iterations")
  plt.plot(lossrecord_bidirectionalGRU, label = "Training Loss")
  plt.xlabel("Iterations")
  plt.ylabel("Loss")
  plt.legend()
  # Save training loss plot
  plt.savefig(os.path.join(hw8_path, "bidirectionalGRU_train_loss.jpg"))
  plt.show()

  #Evaluate Model
  args.data = 'sentiment_dataset_test_400.tar.gz'
  args.types = 'test'
  # Build a dataset from the test archive; reusing the training loader here
  # would report accuracy on the data the model was trained on.
  testDataset = Dataset(args)
  testDataLoader = torch.utils.data.DataLoader(dataset=testDataset, batch_size=args.batch_size, shuffle=False)
  model = GRUnet(args)
  accuracy_bidirectionalGRU, confusionMatrix_bidirectionalGRU = run_code_for_testing(model, testDataLoader, args)

  #Plot Confusion Matrix
  plotconfusionmatrix(accuracy_bidirectionalGRU, confusionMatrix_bidirectionalGRU, 'bidirectionalGRU')