added files
rashad101 committed Jun 30, 2019
1 parent c98ccad commit 807a8b3
Showing 5,067 changed files with 116,128 additions and 0 deletions.
58 changes: 58 additions & 0 deletions README.md
@@ -0,0 +1,58 @@
# Using a KG-Copy Network for Non-Goal Oriented Dialogues

## Requirements
- python 3.6
- pytorch 0.4.1
- Run ```pip install -r requirements.txt```

**NOTE:** The following pre-processing steps are not required if you just want to train the system on our processed data (all the required pre-processed data are already included in the project directory).


## Pre-processing:

#### Building Knowledge Graph:
Running the following scripts will download information from Wikipedia and create Knowledge Graphs for clubs and national teams, respectively (the names of the selected clubs and national teams are currently hard-coded in the `build_KG_clubs.py` and `build_KG_national_teams.py` files):
```
python build_KG_clubs.py
python build_KG_national_teams.py
```
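
The exact output format of these scripts is not documented here; as a rough illustration only, the general idea is to turn Wikipedia infobox fields into (subject, relation, object) triples, one KG per club or national team. The `infobox_to_triples` helper, the sample values, and the output path below are assumptions for illustration, not the repository's actual code.
```
import json

# Hypothetical, pre-fetched infobox for one club; the real scripts scrape
# Wikipedia themselves and may use a different file format.
infobox = {"ground": "Camp Nou", "league": "La Liga"}

def infobox_to_triples(subject, fields):
    # one (subject, relation, object) triple per infobox field
    return [(subject, relation, obj) for relation, obj in fields.items()]

triples = infobox_to_triples("FC Barcelona", infobox)
with open("kg_clubs_sample.json", "w") as f:  # output path is an assumption
    json.dump(triples, f, indent=2)
```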

#### Building vocabulary:
To build a vocabulary for the system, run the following commands. They create the vocabulary from the KGs built in the previous step:
```
wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec
mv wiki.en.vec vocab/
python create_vocab_kb.py
python build_incar_data.py
```
Running these commands generates the `glove300.npy`, `vocab.npy`, and `w2i.npy` files inside the `vocab/` directory.
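
These files are what `batcher_kb_2.py` later loads (they are pickled dictionaries saved through NumPy), so a quick sanity check after this step might look like the sketch below; the printed numbers depend on your KGs and conversations.
```
import numpy as np

# w2i.npy maps words to indices; glove300.npy maps words to 300-d vectors.
stoi = np.load("vocab/w2i.npy", allow_pickle=True).item()
vectors = np.load("vocab/glove300.npy", allow_pickle=True).item()

print(len(stoi), "words in the vocabulary")
print("index of '<eos>':", stoi.get("<eos>"))
print("embedding size:", len(next(iter(vectors.values()))))  # expected: 300
```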

#### Generating train/test/dev data from AMT data (conversations)
To create and pre-process the train/test/dev data, run the following commands (the train, test, and validation data are already pre-processed and available inside the `preproc_files/` directory).
There is no need to pre-process again if you just want to train or test the model.
```
python -m spacy download en_core_web_sm
python -m spacy download en
python -m spacy download en_core_web_lg
python preprocess_kb_2.py --data_dir conversations
python utils/generate_entities_soccer.py
```
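
Each generated `.npy` file stores one entry per dialogue. Based on how `batcher_kb_2.py` unpacks the data, an entry holds the tokenised queries, their copy labels, the answers, the KB subjects and relations, and the team name. A quick inspection of the soccer training split (paths follow the commands above) could look like this:
```
import numpy as np

# The .npy files are pickled, hence allow_pickle=True.
dialogues = np.load("preproc_files/soccer/train.npy", allow_pickle=True)
print(len(dialogues), "dialogues")

# Same unpacking order as DialogBatcher.get_seq in batcher_kb_2.py.
q, q_c, a, kb_s, kb_r, team = dialogues[0]
print("turns in the first dialogue:", len(q))
print("team:", team)
```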


## Train & Test
Pre-processing is not required if you just want to train or test the model at this point. To train the system, run the following command:

For the soccer domain:
```
python -u ./train_kg_copy.py --batch_size 64 --hidden_size 512 --teacher_forcing 12 --resp_len 20 --lr 0.0001 --num_layer 1 --gpu 1 --epochs 300 --data_dir preproc_files/soccer/
```

For the in-car setting:
```
python -u ./train_kg_copy_incar.py --batch_size 64 --hidden_size 512 --teacher_forcing 12 --resp_len 20 --lr 0.0001 --num_layer 1 --gpu 1 --epochs 300 --data_dir preproc_files/incar/ --stoi vocab/w2i_incar.npy --vocab_glove vocab/glove300_incar.npy
```

After each epoch, the best model so far is saved inside the `models/` directory as `Sentient_model2.bin`. The saved model can later be used for testing on new data.
After training completes, the command also generates a file `test_predicted_kg_attn2.csv`, where you can inspect the predicted output alongside the given test input.
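
To reuse the saved checkpoint outside the training script, something along these lines should work. Whether `models/Sentient_model2.bin` contains the whole module or only a state dict depends on how the training script calls `torch.save`, so treat this as a sketch rather than the project's official loading code.
```
import torch

# Assumption: the checkpoint was written with torch.save(model, path).
# If it is a state_dict instead, construct the model first and use
# model.load_state_dict(torch.load(path, map_location="cpu")).
model = torch.load("models/Sentient_model2.bin", map_location="cpu")
model.eval()  # disable dropout for inference
```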

39 changes: 39 additions & 0 deletions args.py
@@ -0,0 +1,39 @@
from argparse import ArgumentParser

def get_args():
    parser = ArgumentParser(description="Soccer chatbot")
    parser.add_argument('--no_cuda', action='store_false', help='do not use cuda', dest='cuda')
    parser.add_argument('--gpu', type=bool, default=True)
    parser.add_argument('--epochs', type=int, default=60)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--dataset', type=str, default="EntityDetection")
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--seed', type=int, default=3435)
    parser.add_argument('--dev_every', type=int, default=2000)
    parser.add_argument('--log_every', type=int, default=1000)
    parser.add_argument('--patience', type=int, default=10)
    parser.add_argument('--save_path', type=str, default='saved_checkpoints')
    parser.add_argument('--specify_prefix', type=str, default='id1')
    parser.add_argument('--words_dim', type=int, default=300)
    parser.add_argument('--num_layer', type=int, default=2)
    parser.add_argument('--dropout', type=float, default=0.3)
    parser.add_argument('--input_size', type=int, default=300)
    parser.add_argument('--hidden_size', type=int, default=50)
    parser.add_argument('--rnn_dropout', type=float, default=0.3)
    parser.add_argument('--clip_gradient', type=float, default=0.6, help='gradient clipping')
    parser.add_argument('--stoi', type=str, default="vocab/w2i.npy")
    parser.add_argument('--vocab_glove', type=str, default="vocab/glove300.npy")
    parser.add_argument('--weight_decay', type=float, default=0)
    parser.add_argument('--teacher_forcing', type=int, default=4)
    parser.add_argument('--fix_embed', action='store_false', dest='train_embed')
    parser.add_argument('--hits', type=int, default=100)
    parser.add_argument('--no_tqdm', default=False, action='store_true', help='disable tqdm progress bar')
    parser.add_argument('--randseed', type=int, default=666, metavar='', help='random seed (default: 666)')
    parser.add_argument('--trained_model', type=str, default='')
    parser.add_argument('--data_dir', type=str, default='preproc_files/incar/')
    parser.add_argument('--results_path', type=str, default='query_text')
    parser.add_argument('--emb_drop', type=float, default=0.2)
    parser.add_argument('--threshold', type=float, default=0.5)
    parser.add_argument('--resp_len', type=int, default=20)
    args = parser.parse_args()
    return args
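

# Illustrative usage (not part of the original file): parse and echo a few of
# the defaults above; any flag can be overridden on the command line, e.g.
#   python args.py --batch_size 64 --hidden_size 512
if __name__ == '__main__':
    parsed = get_args()
    print(parsed.batch_size, parsed.hidden_size, parsed.lr)
    print(parsed.stoi, parsed.vocab_glove)  # vocab paths consumed by batcher_kb_2.py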
211 changes: 211 additions & 0 deletions batcher_kb_2.py
@@ -0,0 +1,211 @@
# imports
import pickle
import numpy as np
from collections import defaultdict
import os
from collections import OrderedDict
import torch
from args import get_args
import json
import itertools
import re

args = get_args()
print('Loaded all libraries')


class DialogBatcher:
    """
    Wrapper for batching the Soccer Dialogue dataset
    """
    def __init__(self, gpu=True, max_sent_len=100, max_resp_len=50, max_kb_len=198, max_kb_triple_len=6):
        self.batch_size = args.batch_size
        # self.use_mask = use_mask
        self.max_kb_len = max_kb_len
        self.max_kb_sub_len = max_kb_triple_len
        self.gpu = gpu
        self.max_sent_len = max_sent_len
        self.max_resp_len = args.resp_len
        if os.path.isfile(args.stoi):
            self.stoi = np.load(args.stoi, allow_pickle=True).item()

        self.vocab_glove = np.load(args.vocab_glove, allow_pickle=True).item()
        vec_dim = 300

        # self.stoi['EOS'] = len(self.stoi)+1
        # self.stoi['SOS'] = len(self.stoi)+1

        # Get required dictionaries for data
        self.train = self.get_sequences('train')
        self.test = self.get_sequences('test')
        self.valid = self.get_sequences('valid')
        # self.all = self.get_sequences('all')

        self.n_words = len(self.stoi) + 1
        self.n_train = len(self.train['x'])
        self.n_val = len(self.valid['x'])
        self.n_test = len(self.test['x'])
        # self.n_all = len(self.all)

        self.itos = {v: k for k, v in self.stoi.items()}

        # get pretrained vectors
        self.vectors = np.zeros((len(self.itos) + 1, vec_dim))
        for k, v in self.vocab_glove.items():
            # self.vectors[self.stoi[k.encode('utf-8')]] = v
            self.vectors[self.stoi[k]] = v

        self.vectors = torch.from_numpy(self.vectors.astype(np.float32))

    def get_seq(self, dial, dataset):
        """
        get sequence prepared
        :param dial:
        :param dataset:
        :return:
        """
        out = []
        q, q_c, a, kb_s, kb_r, team = dial
        kb = [kb_s[j] + kb_r[j] for j in range(len(kb_s))]
        # dataset['kb'].append(kb)
        # dataset['team'].append(team)
        for l, (q_l, qc, a_l) in enumerate(zip(q, q_c, a)):
            dataset['q_c'].append(qc)
            dataset['kb'].append(kb)
            dataset['team'].append(team)
            dataset['y'].append(a_l + [self.stoi['<eos>']])  # add EOS token at the end

            if l > 0:
                out = self.merge_list([out, a[l - 1], q_l])
                dataset['x'].append(out + [self.stoi['<eos>']])
            else:
                out = self.merge_list([out, q_l])
                dataset['x'].append(out + [self.stoi['<eos>']])

    def get_sequences(self, file_name):
        """
        get dialogue data
        :param file_name:
        :return:
        """
        ds = {}
        ds['x'], ds['q_c'], ds['y'], ds['kb'], ds['team'] = [], [], [], [], []
        dat = np.load(args.data_dir + file_name + '.npy', allow_pickle=True)
        [self.get_seq(d, ds) for d in dat]
        ds['x'], ds['q_c'], ds['y'], ds['kb'] = np.array(ds['x']), np.array(ds['q_c']), np.array(ds['y']), np.array(ds['kb'])
        return ds
        # return dataset

    @staticmethod
    def merge_list(set_l):
        """
        merge previous utterances into current
        :param set_l:
        :return:
        """
        return list(itertools.chain.from_iterable(set_l))

    def geti2w(self, word):
        """
        get id 2 word
        :param word:
        :return:
        """
        if self.gpu:
            word = self.itos[int(word.cpu().numpy())]
            if isinstance(word, str):
                return word
            else:
                return word
        else:
            # word = self.itos[int(word.numpy())].decode('utf-8')
            word = self.itos[int(word.numpy())]
            if isinstance(word, str):
                return word
            else:
                return word

    def get_iter(self, dataset='train'):
        # get iterations.
        # self.batch_size = batch_size
        if dataset == 'train':
            dataset = self.train
        elif dataset == 'valid':
            dataset = self.valid
            # print(dataset['team'])
        elif dataset == 'test':
            dataset = self.test
        else:
            dataset = self.all

        for i in range(0, len(dataset['x']), self.batch_size):
            query = dataset['x'][i:i + self.batch_size]
            query_c = dataset['q_c'][i:i + self.batch_size]
            response = dataset['y'][i:i + self.batch_size]
            kb = dataset['kb'][i:i + self.batch_size]
            team = dataset['team'][i:i + self.batch_size]
            # for dat in dataset:
            #     query, response, kb, team = dat
            x, x_c, y, mx, my, kb, kb_m, s, v_m = self._load_batch(query, query_c, response, kb, self.batch_size)

            yield x, x_c, y, mx, my, kb, kb_m, s, v_m, team

    def _load_batch(self, q, q_c, a, kb_i, b_s):
        b_s = min(b_s, len(q))
        # b_s = len(q)
        max_len_q = np.max([len(sent) for sent in q])
        max_len_q = (max_len_q) if max_len_q < self.max_sent_len else self.max_sent_len
        max_len_a = np.max([len(sent) for sent in a])
        max_len_a = (max_len_a) if max_len_a < self.max_resp_len else self.max_resp_len
        x = np.zeros([max_len_q, b_s], np.int)
        q_c_o = np.zeros([max_len_q, b_s], np.int)
        y = np.zeros([max_len_a, b_s], np.int)
        # sentient_g = np.zeros([max_len_a, b_s], np.int)
        kb = np.zeros([b_s, self.max_kb_len, self.max_kb_sub_len])
        kb_mask = np.zeros([b_s, self.max_kb_len])
        x_mask = np.zeros([max_len_q, b_s], np.int)
        y_mask = np.zeros([max_len_a, b_s], np.int)
        vocab_mask = np.arange(0, len(self.stoi) + 1)

        for j, (row_t, row_qc, row_l, row_kb) in enumerate(zip(q, q_c, a, kb_i)):
            row_t = row_t[-max_len_q:]
            row_qc = row_qc[-max_len_q:]
            row_l = row_l[:max_len_a]
            # print (kb_i)
            # print (row_t, len(row_t))
            x[:len(row_t), j] = row_t
            q_c_o[:len(row_qc), j] = row_qc
            y[:len(row_l), j] = row_l
            x_mask[:len(row_t), j] = 1
            y_mask[:len(row_l), j] = 1
            for l, k in enumerate(row_kb):
                try:
                    kb[j][l][:len(k)] = k
                except Exception:
                    print(k)
            kb_mask[j][:len(row_kb)] = 1

        x_o = torch.from_numpy(x)
        q_c_o = torch.from_numpy(q_c_o).type(torch.FloatTensor)
        y_o = torch.from_numpy(y).type(torch.FloatTensor)
        kb = torch.from_numpy(kb).long()
        sentient_g = (y_o > self.stoi['<eos>'])
        vocab_mask = torch.from_numpy(vocab_mask)
        vocab_mask = (vocab_mask < self.stoi['<eos>']).type(torch.FloatTensor)

        x_mask = torch.from_numpy(x_mask).type(torch.FloatTensor)
        y_mask = torch.from_numpy(y_mask).type(torch.FloatTensor)
        kb_mask = torch.from_numpy(kb_mask).type(torch.FloatTensor)

        if self.gpu:
            x_o, q_c_o, y_o, x_mask, y_mask, kb, kb_mask, sentient_g, vocab_mask = x_o.cuda(), q_c_o.cuda(), y_o.cuda(), x_mask.cuda(), y_mask.cuda(), \
                kb.cuda(), kb_mask.cuda(), sentient_g.cuda(), vocab_mask.cuda()

        return x_o, q_c_o, y_o, x_mask, y_mask, kb, kb_mask, sentient_g.float(), vocab_mask


if __name__ == '__main__':
    batcher = DialogBatcher(gpu=False)
    batches = batcher.get_iter('valid')

    print(batches)
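    # Illustrative extension (not in the original file): pull one batch to check
    # the shapes produced by _load_batch; assumes vocab/ and preproc_files/ exist.
    for x, x_c, y, x_mask, y_mask, kb, kb_mask, sentient, vocab_mask, team in batches:
        print(x.shape, y.shape, kb.shape)  # [max_len_q, batch], [max_len_a, batch], [batch, 198, 6]
        break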
