Commit 463384b: Add files via upload

Polished Version of GPT2 for Summarization

VioletRaven authored Jul 4, 2022
Showing 12 changed files with 1,056 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
desktop.ini
trash.ini
50 changes: 50 additions & 0 deletions README.md
# Introduction
GPT-2 fine-tuning for summarization on the ARTeLab dataset. The dataset is first unpacked into tokenized .json files, which are then passed to the training step.

### Built With

The following technologies, frameworks and libraries have been used:

* [Python](https://www.python.org/)
* [Git](https://git-scm.com/)

We strongly suggest creating a virtual environment (e.g. 'GPT-2_Summarizer') with the Python version pinned; otherwise the required libraries may not install correctly:

```bash
conda create -n GPT-2_Summarizer python=3.8.9
conda activate GPT-2_Summarizer
```

If you prefer to run it without conda, you need Python 3.8.9 (or a later version) configured on your machine.

1. Install all the libraries using the requirements.txt file found in the root of the repository

```bash
pip install -r requirements.txt
```

2. Dataset Creation

The dataset can be created by passing a .csv file to the "dataset_creation.py" script, which expects two columns: the full text and its summary, in that order.

```bash
python dataset_creation.py --path_csv "./path_to_csv" --path_directory "./path_to_directory" --model "model_used_for_tokenization"
```
The script will create tokenized .json files that can be fed to the "train.py" script.
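
For reference, here is a minimal sketch of a compatible input CSV (the file name and rows are only illustrative). Note that `dataset_creation.py` drops a leading unnamed index column, so a default `pandas.to_csv` export works:

```python
# Minimal sketch of a compatible CSV; file name and rows are illustrative.
import pandas as pd

df = pd.DataFrame({
    "text": ["Testo completo dell'articolo da riassumere."],
    "summary": ["Breve riassunto dell'articolo."],
})
# The default index=True produces the 'Unnamed: 0' column that dataset_creation.py drops.
df.to_csv("articles.csv")
```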

3. Training

To run the training process on a GPU, follow the Google Colab notebook provided as

bash_train_GPT2.ipynb

Remember to change the Colab runtime to GPU!


4. Loading and using the model

Once training has finished and the best model has been saved, you can load it and use it for summarization by running the script below in your terminal.

```bash
python loading_saved_model.py --text "Sarebbe stato molto facile per l'uomo estrarre la freccia dalla carne del malcapitato, eppure questo si rivelò complicato e fatale. La freccia aveva infatti penetrato troppo a fondo nella gamba e aveva provocato una terribile emorragia." --saved_model "./model_O0_trained_after_50_epochs_only_sum_loss_ignr_pad.bin"
```
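
Alternatively, the same functions the CLI wraps can be called from Python. A minimal sketch (the checkpoint path and input text are illustrative; the sampling parameters are the script defaults):

```python
# Sketch: programmatic use of loading_saved_model.py (path and text are illustrative).
import torch
from loading_saved_model import load_model, GPT2_summarizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, tokenizer = load_model(
    model_name='GroNLP/gpt2-small-italian',
    path='./model_O0_trained_after_50_epochs_only_sum_loss_ignr_pad.bin',
)
summary = GPT2_summarizer(
    model=model,
    text="Testo dell'articolo da riassumere.",
    device=device,
    tokenizer=tokenizer,
    length=70, temperature=0.2, top_k=20, top_p=0.7,  # CLI defaults
)
print(summary)
```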
110 changes: 110 additions & 0 deletions bash_train_GPT2.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6d1ca6f3"
},
"outputs": [],
"source": [
"!pip install transformers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7lqB5WNxgfyf"
},
"outputs": [],
"source": [
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OYS439xzBEBc"
},
"outputs": [],
"source": [
"!git clone https://github.com/data-prestige/GPT-2_Summarizer.git\n",
"!git pull"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lz2eFeMTjDuG"
},
"outputs": [],
"source": [
"'''\n",
"run this for old dataset\n",
"'''\n",
"%cd GPT-2_Summarizer\n",
"if not os.path.exists(\"./articoli.zip\"):\n",
" os.system('gdown --id \"1PRI22qGx1v2ppWGl5Ub7Z2ZiXg0EhOpp\" --output \"./articoli.zip\"')\n",
"!mkdir articoli\n",
"!mkdir weights\n",
"!mkdir output\n",
"\n",
"!unzip articoli.zip -d articoli\n",
"!gdown --id 1oElTndZC3SKCYM1tA5WSJ8-IqKUIqwsK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SBvITYkWeRB7"
},
"outputs": [],
"source": [
"'''\n",
"run this for new dataset\n",
"'''\n",
"%cd GPT-2_Summarizer\n",
"if not os.path.exists(\"./articoli_nuovi.zip\"):\n",
" os.system('gdown --id \"1ywTWq6EJNlDxL3OeKteAP37raW0Pg0eX\" --output \"./articoli_nuovi.zip\"')\n",
"!mkdir articoli_nuovi\n",
"!mkdir weights\n",
"!mkdir output\n",
"\n",
"!unzip articoli_nuovi.zip -d articoli_nuovi\n",
"!gdown --id 1AIShy30wjYznYO6brKQ9JjzHjRByUImu"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RV15sgV-odj9"
},
"outputs": [],
"source": [
"!python \"./train_new.py\" --model_name 'GroNLP/gpt2-small-italian' --lr 0.00005 --gradient_accumulation_steps 32 --batch_size 1 --num_train_epochs 1 --output_dir ./output --model_dir ./weights --root_dir './articoli_nuovi' --ids_file \"./index_articoli_nuovi.json\""
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "bash_train_GPT2.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
51 changes: 51 additions & 0 deletions dataset.py
import os
import json
import numpy as np
import torch
from torch.utils.data import Dataset


class GPT21024Dataset(Dataset):

    def __init__(self, tokenizer, root_dir, ids_file, mode='train', length=None):
        self.root_dir = root_dir
        self.tokenizer = tokenizer

        # with open(ids_file,'r') as f:
        #     if mode=='train':
        #         self.idxs = np.array(json.load(f)['train_ids'])
        #     elif mode=='valid':
        #         self.idxs = np.array(json.load(f)['valid_ids'])
        #     elif mode=='test':
        #         self.idxs = np.array(json.load(f)['test_ids'])

        #     self.idxs = self.idxs - min(self.idxs)

        self.idxs = os.listdir(root_dir)
        self.mode = mode
        if length is None:
            self.len = len(self.idxs)
        else:
            self.len = length

    def __len__(self):
        return self.len

    def __getitem__(self, idx):

        if self.mode == 'valid':
            idx = self.idxs[-idx]
        elif self.mode == 'test':
            idx = self.idxs[-idx - self.len]  # assuming valid and test sets of the same size
        else:
            idx = self.idxs[idx]
        # file_name = os.path.join(self.root_dir, str(idx) + ".json")
        file_name = os.path.join(self.root_dir, str(idx))
        with open(file_name, 'r') as f:
            data = json.load(f)
        # build a 1024-token pad buffer, then copy article + <sep> + abstract into its head
        text = self.tokenizer.encode(self.tokenizer.pad_token) * 1024
        content = data['article'] + self.tokenizer.encode(self.tokenizer.sep_token) + data['abstract']
        text[:len(content)] = content
        text = torch.tensor(text)
        sample = {'article': text, 'sum_idx': len(data['article'])}
        return sample
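
A minimal sketch of how this dataset is typically wired up (the tokenizer name and paths are illustrative; the special tokens match those added in pre_processing.py):

```python
# Sketch: loading the tokenized .json files through GPT21024Dataset (paths are illustrative).
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer
from dataset import GPT21024Dataset

tokenizer = GPT2Tokenizer.from_pretrained('GroNLP/gpt2-small-italian')
tokenizer.add_special_tokens({'pad_token': '<|pad|>', 'sep_token': '<|sep|>'})

train_set = GPT21024Dataset(tokenizer, root_dir='./articoli_nuovi',
                            ids_file='./index_articoli_nuovi.json', mode='train')
loader = DataLoader(train_set, batch_size=1, shuffle=True)
batch = next(iter(loader))
print(batch['article'].shape)  # torch.Size([1, 1024]): padded article + <|sep|> + abstract
print(batch['sum_idx'])        # number of article tokens, i.e. the index of the <|sep|> token
```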
69 changes: 69 additions & 0 deletions dataset_creation.py
import pandas as pd
from pre_processing import cleaner_d2v
import json
import os
import argparse
import time

def parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--path_csv", type=str, required=True, help="Path to CSV\nExpecting 2 columns where 1st column --> text and 2nd column --> summary")
    parser.add_argument("--path_directory", type=str, required=True, help="Path to directory where to save .json files")
    parser.add_argument("--model", default='GroNLP/gpt2-small-italian', type=str, required=False, help="Model used for tokenizer")

    args = parser.parse_args()
    path_csv = args.path_csv
    path_directory = args.path_directory
    model = args.model

    return path_csv, path_directory, model

path_csv, path_directory, model = parser()

'''
Write tokenized text to .json files
'''

def write_json(text, summary, number, directory=path_directory):
    # saves json files
    file = os.path.join(directory, 'file_' + str(number) + '.json')
    js_example = {'id': number, 'article': text, 'abstract': summary}
    with open(file, 'w') as f:
        json.dump(js_example, f, ensure_ascii=False)


def tokenizer_to_json(dataset, model=model, directory=path_directory):
    tokenizer = cleaner_d2v.add_special_tokens(model)
    train_ids = []
    i = 0
    for index, row in dataset.iterrows():
        article, abstract = tokenizer.encode(row['text']), tokenizer.encode(row['summary'])
        # keep only pairs that fit in GPT-2's 1024-token window (1023 tokens + separator)
        if len(article) > 0 and len(abstract) > 0 and (len(article) + len(abstract)) <= 1023:
            train_ids.append(i)
            write_json(text=article, summary=abstract, number=i)
            i += 1
            if i % 1000 == 0:
                print(i, " files written")

    file = os.path.join(directory, 'index_files.json')

    # 80/10/10 train/valid/test split over the written file ids
    x, y = int(len(train_ids) * 0.8), int(len(train_ids) * 0.9)
    valid_ids = train_ids[x:y]
    test_ids = train_ids[y:]
    train_ids = train_ids[:x]
    with open(file, 'w') as f:
        js = {}
        js['train_ids'] = train_ids
        js['valid_ids'] = valid_ids
        js['test_ids'] = test_ids
        json.dump(js, f)

data = pd.read_csv(path_csv)
data.drop('Unnamed: 0', inplace=True, axis=1)
data.columns = ['text', 'summary']

print('Creating dataset...')

start = time.time()
tokenizer_to_json(dataset=data)
print('It took {} seconds to tokenize and write all .json files!'.format(time.time() - start))
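
To sanity-check the output, a short sketch that reads back the index file and one record (the output directory name is illustrative):

```python
# Sketch: inspect what dataset_creation.py wrote (directory name is illustrative).
import json
import os

out_dir = './tokenized_articles'

with open(os.path.join(out_dir, 'index_files.json')) as f:
    index = json.load(f)
# roughly an 80/10/10 split
print(len(index['train_ids']), len(index['valid_ids']), len(index['test_ids']))

with open(os.path.join(out_dir, 'file_0.json')) as f:
    record = json.load(f)
print(record.keys())           # dict_keys(['id', 'article', 'abstract'])
print(len(record['article']))  # token ids; article + abstract always fit within 1023 tokens
```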
50 changes: 50 additions & 0 deletions loading_saved_model.py
import torch
from transformers import GPT2LMHeadModel
import argparse
from utils_new import add_special_tokens, sample_seq

def load_model(model_name, path):
    tokenizer = add_special_tokens(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.resize_token_embeddings(len(tokenizer))
    model.load_state_dict(torch.load(path))
    model.eval()
    return model, tokenizer

def GPT2_summarizer(model, text, device, tokenizer, length, temperature, top_k, top_p):
    context = tokenizer(text, return_tensors='np')
    dictionary = {}
    dictionary['article'] = context['input_ids'][0]
    dictionary['sum_idx'] = len(context['input_ids'][0])
    generated_text = sample_seq(model, dictionary['article'], length, device, temperature, top_k, top_p)
    generated_text = generated_text[0, dictionary['sum_idx']:].tolist()
    text = tokenizer.convert_ids_to_tokens(generated_text, skip_special_tokens=True)
    text = tokenizer.convert_tokens_to_string(text)
    return text

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", default='GroNLP/gpt2-small-italian', type=str, help="Model name to use")
    parser.add_argument("--text", type=str, required=True, help="Text to pass for summarization")
    parser.add_argument("--saved_model", default='model_O0_trained_after_50_epochs_only_sum_loss_ignr_pad.bin', type=str, required=True, help="Saved model path")
    parser.add_argument("--length", default=70, type=int, required=False, help="Number of tokens to generate for the summary")
    parser.add_argument("--temperature", default=0.2, type=float, required=False, help="Degree of randomness of the predictions: a temperature of 1 samples from the model's unmodified distribution, values >1 favour less likely, out-of-context words, and values <1 keep the output closer to the training data and give smoother summaries")
    parser.add_argument("--top_k", default=20, type=int, required=False, help="Top-k sampling: sort tokens by probability and zero out everything below the k-th token")
    parser.add_argument("--top_p", default=0.7, type=float, required=False, help="Top-p (nucleus) sampling: keep the smallest set of most probable tokens whose cumulative probability reaches p")

    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model, tokenizer = load_model(model_name=args.model_name, path=args.saved_model)
    text = GPT2_summarizer(model=model, text=args.text, tokenizer=tokenizer, device=device, length=args.length, temperature=args.temperature, top_k=args.top_k, top_p=args.top_p)
    print(text)

if __name__ == '__main__':
    main()

#parameters for my machine
#model_name = 'GroNLP/gpt2-small-italian'
#PATH = r'C:\Users\Mario\Desktop\Summarization_project\weights\config_new\model_O0_trained_after_50_epochs_only_sum_loss_ignr_pad.bin'
#news = "Sarebbe stato molto facile per l'uomo estrarre la freccia dalla carne del malcapitato, eppure questo si rivelò complicato e fatale. La freccia aveva infatti penetrato troppo a fondo nella gamba e aveva provocato una terribile emorragia."
#temperature of 0.15/0.20 and top_k of 20 is the gold standard for summarization tasks
13 changes: 13 additions & 0 deletions pre_processing.py
import nltk
import string
from transformers import GPT2Tokenizer
nltk.download('stopwords')

class cleaner_d2v:
    @staticmethod
    def add_special_tokens(model):
        """ Returns GPT2 tokenizer after adding separator and padding tokens """
        tokenizer = GPT2Tokenizer.from_pretrained(model)
        special_tokens = {'pad_token': '<|pad|>', 'sep_token': '<|sep|>'}
        num_add_toks = tokenizer.add_special_tokens(special_tokens)
        return tokenizer
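
A quick sketch of how this helper is used elsewhere in the repository (the model name is the default used by dataset_creation.py):

```python
# Sketch: the tokenizer returned by cleaner_d2v.add_special_tokens carries the two extra tokens.
from pre_processing import cleaner_d2v

tok = cleaner_d2v.add_special_tokens('GroNLP/gpt2-small-italian')
print(tok.pad_token, tok.sep_token)  # <|pad|> <|sep|>
print(len(tok))                      # base GPT-2 vocabulary size + the 2 added special tokens
```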