Polished Version of GPT2 for Summarization
desktop.ini
trash.ini
# Introduction
GPT-2 fine-tuning for summarization on the ARTeLab dataset. The dataset is first unpacked into tokenized .json files, which are then passed to the training step.

### Built With

The following technologies, frameworks and libraries have been used:

* [Python](https://www.python.org/)
* [Git](https://git-scm.com/)

We strongly suggest creating a virtual environment (e.g. 'GPT-2_Summarizer') and specifying the Python version; otherwise the libraries below may not install correctly:

```bash
conda create -n GPT-2_Summarizer python=3.8.9
conda activate GPT-2_Summarizer
```

If you prefer to set things up without conda, you need Python 3.8.9 (or a later version) configured on your machine.

1. Install all the required libraries using the requirements.txt file found in the repository root:

```bash
pip install -r requirements.txt
```

2. Dataset Creation

The dataset is created by passing a .csv file to the "dataset_creation.py" script, which expects two columns: text and summary, in that order.

```bash
python dataset_creation.py --path_csv "./path_to_csv" --path_directory "./path_to_directory" --model "model_used_for_tokenization"
```

The script will create tokenized .json files that can be fed to the "train.py" script.

3. Training

To run the training process on a GPU, follow the Google Colab notebook provided as

bash_train_GPT2.ipynb

Remember to change the runtime type to GPU!

4. Loading and using the model

Once training has finished and the best model has been saved, you can load it and use it for summarization by running the script below in your terminal.

```bash
python loading_saved_model.py --text "Sarebbe stato molto facile per l'uomo estrarre la freccia dalla carne del malcapitato, eppure questo si rivelò complicato e fatale. La freccia aveva infatti penetrato troppo a fondo nella gamba e aveva provocato una terribile emorragia." --saved_model "./model_O0_trained_after_50_epochs_only_sum_loss_ignr_pad.bin"
```
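
If you prefer to call the summarizer from Python rather than the command line, a minimal sketch using the load_model and GPT2_summarizer functions defined in loading_saved_model.py (shown further below) could look like this; the checkpoint path and the input text are placeholders, and the sampling parameters simply mirror the script's defaults:

```python
# Minimal sketch (not shipped with the repo): programmatic use of loading_saved_model.py.
# The checkpoint path and the input text below are placeholders.
import torch
from loading_saved_model import load_model, GPT2_summarizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, tokenizer = load_model(model_name='GroNLP/gpt2-small-italian',
                              path='./model_O0_trained_after_50_epochs_only_sum_loss_ignr_pad.bin')
summary = GPT2_summarizer(model=model, text="<your Italian article here>", device=device,
                          tokenizer=tokenizer, length=70, temperature=0.2, top_k=20, top_p=0.7)
print(summary)
```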
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "6d1ca6f3"
   },
   "outputs": [],
   "source": [
    "!pip install transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "7lqB5WNxgfyf"
   },
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "OYS439xzBEBc"
   },
   "outputs": [],
   "source": [
    "!git clone https://github.com/data-prestige/GPT-2_Summarizer.git\n",
    "!git pull"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "lz2eFeMTjDuG"
   },
   "outputs": [],
   "source": [
    "'''\n",
    "run this for old dataset\n",
    "'''\n",
    "%cd GPT-2_Summarizer\n",
    "if not os.path.exists(\"./articoli.zip\"):\n",
    " os.system('gdown --id \"1PRI22qGx1v2ppWGl5Ub7Z2ZiXg0EhOpp\" --output \"./articoli.zip\"')\n",
    "!mkdir articoli\n",
    "!mkdir weights\n",
    "!mkdir output\n",
    "\n",
    "!unzip articoli.zip -d articoli\n",
    "!gdown --id 1oElTndZC3SKCYM1tA5WSJ8-IqKUIqwsK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "SBvITYkWeRB7"
   },
   "outputs": [],
   "source": [
    "'''\n",
    "run this for new dataset\n",
    "'''\n",
    "%cd GPT-2_Summarizer\n",
    "if not os.path.exists(\"./articoli_nuovi.zip\"):\n",
    " os.system('gdown --id \"1ywTWq6EJNlDxL3OeKteAP37raW0Pg0eX\" --output \"./articoli_nuovi.zip\"')\n",
    "!mkdir articoli_nuovi\n",
    "!mkdir weights\n",
    "!mkdir output\n",
    "\n",
    "!unzip articoli_nuovi.zip -d articoli_nuovi\n",
    "!gdown --id 1AIShy30wjYznYO6brKQ9JjzHjRByUImu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "RV15sgV-odj9"
   },
   "outputs": [],
   "source": [
    "!python \"./train_new.py\" --model_name 'GroNLP/gpt2-small-italian' --lr 0.00005 --gradient_accumulation_steps 32 --batch_size 1 --num_train_epochs 1 --output_dir ./output --model_dir ./weights --root_dir './articoli_nuovi' --ids_file \"./index_articoli_nuovi.json\""
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "bash_train_GPT2.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
import os
import json
import numpy as np
import torch
from torch.utils.data import Dataset


class GPT21024Dataset(Dataset):
    """Reads tokenized article/abstract .json files and pads each concatenated sample to 1024 tokens."""

    def __init__(self, tokenizer, root_dir, ids_file, mode='train', length=None):
        self.root_dir = root_dir
        self.tokenizer = tokenizer

        # with open(ids_file,'r') as f:
        #     if mode=='train':
        #         self.idxs = np.array(json.load(f)['train_ids'])
        #     elif mode=='valid':
        #         self.idxs = np.array(json.load(f)['valid_ids'])
        #     elif mode=='test':
        #         self.idxs = np.array(json.load(f)['test_ids'])

        #     self.idxs = self.idxs - min(self.idxs)

        self.idxs = os.listdir(root_dir)
        self.mode = mode
        if length is None:
            self.len = len(self.idxs)
        else:
            self.len = length

    def __len__(self):
        return self.len

    def __getitem__(self, idx):

        if self.mode == 'valid':
            idx = self.idxs[-idx]
        elif self.mode == 'test':
            idx = self.idxs[-idx - self.len]  # assuming valid and test sets of the same size
        else:
            idx = self.idxs[idx]
        # file_name = os.path.join(self.root_dir, str(idx) + ".json")
        file_name = os.path.join(self.root_dir, str(idx))
        with open(file_name, 'r') as f:
            data = json.load(f)
        # start from a full 1024-token window of pad tokens, then overwrite the prefix
        # with article <|sep|> abstract
        text = self.tokenizer.encode(self.tokenizer.pad_token) * 1024
        content = data['article'] + self.tokenizer.encode(self.tokenizer.sep_token) + data['abstract']
        text[:len(content)] = content
        text = torch.tensor(text)
        sample = {'article': text, 'sum_idx': len(data['article'])}
        return sample
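
A minimal usage sketch, assuming the class above is in scope, that the tokenized .json files produced by dataset_creation.py live in ./articoli, and that the tokenizer comes from the cleaner_d2v helper included elsewhere in this commit; the directory, ids_file path, and batch size are illustrative only:

```python
# Minimal sketch (not part of the repo): wiring GPT21024Dataset into a DataLoader.
# "./articoli" and "./index_files.json" are placeholders for the dataset_creation.py outputs.
from torch.utils.data import DataLoader
from pre_processing import cleaner_d2v

tokenizer = cleaner_d2v.add_special_tokens('GroNLP/gpt2-small-italian')
train_data = GPT21024Dataset(tokenizer, root_dir='./articoli',
                             ids_file='./index_files.json', mode='train')
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)

batch = next(iter(train_loader))
print(batch['article'].shape)  # torch.Size([1, 1024]): padded token ids
print(batch['sum_idx'])        # length of the article part before <|sep|>
```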
import pandas as pd
from pre_processing import cleaner_d2v
import json
import os
import argparse
import time


def parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--path_csv", type=str, required=True, help="Path to CSV\nExpecting 2 columns where 1st column --> text and 2nd column --> summary")
    parser.add_argument("--path_directory", type=str, required=True, help="Path to directory where to save .json files")
    parser.add_argument("--model", default='GroNLP/gpt2-small-italian', type=str, required=False, help="Model used for tokenizer")

    args = parser.parse_args()
    path_csv = args.path_csv
    path_directory = args.path_directory
    model = args.model

    return path_csv, path_directory, model


path_csv, path_directory, model = parser()

'''
Write tokenized text to .json files
'''

def write_json(text, summary, number, directory=path_directory):
    # saves one tokenized article/abstract pair as a .json file
    file = os.path.join(directory, 'file_' + str(number) + '.json')
    js_example = {'id': number, 'article': text, 'abstract': summary}
    with open(file, 'w') as f:
        json.dump(js_example, f, ensure_ascii=False)


def tokenizer_to_json(dataset, model=model, directory=path_directory):
    tokenizer = cleaner_d2v.add_special_tokens(model)
    train_ids = []
    i = 0
    for index, row in dataset.iterrows():
        article, abstract = tokenizer.encode(row['text']), tokenizer.encode(row['summary'])
        # keep only non-empty pairs that fit in GPT-2's 1024-token window (1023 tokens + separator)
        if len(article) > 0 and len(abstract) > 0 and (len(article) + len(abstract)) <= 1023:
            train_ids.append(i)
            write_json(text=article, summary=abstract, number=i)
            i += 1
            if i % 1000 == 0:
                print(i, " files written")

    file = os.path.join(directory, 'index_files.json')

    # 80/10/10 train/valid/test split of the written file indices
    x, y = int(len(train_ids) * 0.8), int(len(train_ids) * 0.9)
    valid_ids = train_ids[x:y]
    test_ids = train_ids[y:]
    train_ids = train_ids[:x]
    with open(file, 'w') as f:
        js = {}
        js['train_ids'] = train_ids
        js['valid_ids'] = valid_ids
        js['test_ids'] = test_ids
        json.dump(js, f)


data = pd.read_csv(path_csv)
data.drop('Unnamed: 0', inplace=True, axis=1)
data.columns = ['text', 'summary']

print('Creating dataset...')

start = time.time()
tokenizer_to_json(dataset=data)
print('It took {} seconds to tokenize and write all .json files!'.format(time.time() - start))
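
Because the script drops an 'Unnamed: 0' column before renaming, the input CSV is expected to carry a leading pandas index column ahead of the text and summary columns. A minimal sketch of preparing such a file, with made-up column names, contents, and paths:

```python
# Illustrative only: building a CSV that dataset_creation.py accepts.
# The DataFrame contents, column names, and file paths here are hypothetical.
import pandas as pd

articles = pd.DataFrame({
    'full_text': ["Primo articolo di esempio...", "Secondo articolo di esempio..."],
    'riassunto': ["Primo riassunto...", "Secondo riassunto..."],
})
# Writing with index=True produces the leading 'Unnamed: 0' column
# that dataset_creation.py drops before renaming the remaining columns to text/summary.
articles.to_csv("articoli.csv", index=True)
```

Each resulting file_<n>.json then holds an 'id' plus the tokenized 'article' and 'abstract', and index_files.json records the 80/10/10 train/valid/test split of the written indices.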
import torch
from transformers import GPT2LMHeadModel
import argparse
from utils_new import add_special_tokens, sample_seq


def load_model(model_name, path):
    tokenizer = add_special_tokens(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.resize_token_embeddings(len(tokenizer))
    model.load_state_dict(torch.load(path))
    model.eval()
    return model, tokenizer


def GPT2_summarizer(model, text, device, tokenizer, length, temperature, top_k, top_p):
    context = tokenizer(text, return_tensors='np')
    dictionary = {}
    dictionary['article'] = context['input_ids'][0]
    dictionary['sum_idx'] = len(context['input_ids'][0])
    generated_text = sample_seq(model, dictionary['article'], length, device, temperature, top_k, top_p)
    # keep only the tokens generated after the article, i.e. the summary
    generated_text = generated_text[0, dictionary['sum_idx']:].tolist()
    text = tokenizer.convert_ids_to_tokens(generated_text, skip_special_tokens=True)
    text = tokenizer.convert_tokens_to_string(text)
    return text


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", default='GroNLP/gpt2-small-italian', type=str, help="Model name to use")
    parser.add_argument("--text", type=str, required=True, help="Text to pass for summarization")
    parser.add_argument("--saved_model", default='model_O0_trained_after_50_epochs_only_sum_loss_ignr_pad.bin', type=str, required=True, help="Saved model path")
    parser.add_argument("--length", default=70, type=int, required=False, help="Number of tokens to generate for the summary")
    parser.add_argument("--temperature", default=0.2, type=float, required=False,
                        help="Degree of randomness of the predictions: lower values give smoother, more conservative summaries; "
                             "a temperature of 1 uses the model's raw probabilities, values >1 favour less likely, out-of-context words, "
                             "and values <1 favour context-related words closer to the training data")
    parser.add_argument("--top_k", default=20, type=int, required=False, help="Top-k sampling: sort tokens by probability and zero out everything below the k-th token")
    parser.add_argument("--top_p", default=0.7, type=float, required=False, help="Top-p (nucleus) sampling: keep the smallest set of top tokens whose cumulative probability reaches p and drop the rest")

    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model, tokenizer = load_model(model_name=args.model_name, path=args.saved_model)
    text = GPT2_summarizer(model=model, text=args.text, tokenizer=tokenizer, device=device, length=args.length, temperature=args.temperature, top_k=args.top_k, top_p=args.top_p)
    print(text)


if __name__ == '__main__':
    main()


# parameters for my machine
# model_name = 'GroNLP/gpt2-small-italian'
# PATH = r'C:\Users\Mario\Desktop\Summarization_project\weights\config_new\model_O0_trained_after_50_epochs_only_sum_loss_ignr_pad.bin'
# news = "Sarebbe stato molto facile per l'uomo estrarre la freccia dalla carne del malcapitato, eppure questo si rivelò complicato e fatale. La freccia aveva infatti penetrato troppo a fondo nella gamba e aveva provocato una terribile emorragia."
# temperature of 0.15/0.20 and top_k of 20 is the gold standard for summarization tasks
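
add_special_tokens and sample_seq live in utils_new.py, which is part of this commit but not shown above. Purely as an illustration of what the temperature, top_k, and top_p flags control, a typical top-k/top-p filtering step inside such a sampling loop looks roughly like the sketch below; this is a generic example, not necessarily the exact code in utils_new.py:

```python
# Generic sketch of top-k / top-p (nucleus) filtering on one step of next-token logits;
# utils_new.py may implement the sampling loop differently.
import torch
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=20, top_p=0.7, filter_value=-float('inf')):
    # logits: 1-D tensor of vocabulary scores for the next token
    if top_k > 0:
        # mask everything below the k-th highest logit
        kth_value = torch.topk(logits, top_k)[0][-1]
        logits[logits < kth_value] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # drop tokens once the cumulative probability exceeds top_p (always keep the best one)
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
        sorted_indices_to_remove[0] = False
        logits[sorted_indices[sorted_indices_to_remove]] = filter_value
    return logits

# usage inside a generation loop (temperature is applied before filtering):
# next_logits = outputs.logits[0, -1, :] / temperature
# filtered = top_k_top_p_filtering(next_logits, top_k=20, top_p=0.7)
# next_token = torch.multinomial(F.softmax(filtered, dim=-1), num_samples=1)
```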
import nltk
import string
from transformers import GPT2Tokenizer
nltk.download('stopwords')


class cleaner_d2v:
    @staticmethod
    def add_special_tokens(model):
        """ Returns GPT2 tokenizer after adding separator and padding tokens """
        tokenizer = GPT2Tokenizer.from_pretrained(model)
        special_tokens = {'pad_token': '<|pad|>', 'sep_token': '<|sep|>'}
        num_add_toks = tokenizer.add_special_tokens(special_tokens)
        return tokenizer
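
Because add_special_tokens grows the vocabulary by two tokens, any GPT-2 model paired with this tokenizer needs its embedding matrix resized before training or before loading fine-tuned weights, as load_model does in loading_saved_model.py. A minimal sketch, assuming the GroNLP/gpt2-small-italian checkpoint used elsewhere in the repo:

```python
# Minimal sketch: pairing the extended tokenizer with a fresh GPT-2 checkpoint for fine-tuning.
from transformers import GPT2LMHeadModel

tokenizer = cleaner_d2v.add_special_tokens('GroNLP/gpt2-small-italian')
model = GPT2LMHeadModel.from_pretrained('GroNLP/gpt2-small-italian')
# the vocabulary now contains <|pad|> and <|sep|>, so the embedding matrix must match
model.resize_token_embeddings(len(tokenizer))
print(tokenizer.pad_token, tokenizer.sep_token, len(tokenizer))
```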