@@ -0,0 +1,58 @@
# Using a KG-Copy Network for Non-Goal Oriented Dialogues

## Requirements
- python 3.6
- pytorch 0.4.1
- Run `pip install -r requirements.txt`

**NOTE:** The following pre-processing steps are not required if you just want to train the system on our processed data, since all the required pre-processed files are already included in the project directory.

## Pre-processing:

#### Building Knowledge Graph:
Running the following commands will download information from Wikipedia and create knowledge graphs for clubs and national teams, respectively. The names of the selected clubs and national teams are currently hard-coded in `build_KG_clubs.py` and `build_KG_national_teams.py`:
```
python build_KG_clubs.py
python build_KG_national_teams.py
```

#### Building vocabulary:
To build a vocabulary for the system from the KGs created in the previous step, run the following commands:
```
wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec
mv wiki.en.vec vocab/
python create_vocab_kb.py
python build_incar_data.py
```
Running the commands will generate the files `glove300.npy`, `vocab.npy`, and `w2i.npy` inside the `vocab/` directory.
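
For a quick sanity check, the generated files can be inspected the same way the batcher loads them. A minimal sketch (the paths follow the defaults in `args.py`):
```python
import numpy as np

# w2i.npy stores a pickled dict mapping words to integer ids;
# glove300.npy maps words to their 300-d embedding vectors.
stoi = np.load('vocab/w2i.npy', allow_pickle=True).item()
vectors = np.load('vocab/glove300.npy', allow_pickle=True).item()
print(len(stoi), 'words in vocabulary')
```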

#### Generating train/test/dev data from AMT data (conversations)
To create and preprocess the train/test/dev data, run the following commands. (The train, test, and validation data are already pre-processed and available inside the `preproc_files/` directory; there is no need to preprocess again if you just want to train or test the model.)
```
python -m spacy download en_core_web_sm
python -m spacy download en
python -m spacy download en_core_web_lg
python preprocess_kb_2.py --data_dir conversations
python utils/generate_entities_soccer.py
```
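
Each generated split file is a NumPy array of dialogues, and a dialogue unpacks into the same six fields that the `DialogBatcher` class consumes in `get_seq`. A minimal sketch, assuming the soccer split under `preproc_files/soccer/`:
```python
import numpy as np

# Fields per dialogue, as unpacked in DialogBatcher.get_seq:
# questions, question features, answers, KB subjects, KB relations, team name.
dialogues = np.load('preproc_files/soccer/train.npy', allow_pickle=True)
q, q_c, a, kb_s, kb_r, team = dialogues[0]
print(team, len(q), 'turns in the first dialogue')
```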

## Train & Test
Pre-processing is not required if you just want to train or test the model at this point. To train the system, run one of the following commands.

For the soccer domain:
```
python -u ./train_kg_copy.py --batch_size 64 --hidden_size 512 --teacher_forcing 12 --resp_len 20 --lr 0.0001 --num_layer 1 --gpu 1 --epochs 300 --data_dir preproc_files/soccer/
```

For in-car settings:
```
python -u ./train_kg_copy_incar.py --batch_size 64 --hidden_size 512 --teacher_forcing 12 --resp_len 20 --lr 0.0001 --num_layer 1 --gpu 1 --epochs 300 --data_dir preproc_files/incar/ --stoi vocab/w2i_incar.npy --vocab_glove vocab/glove300_incar.npy
```

After each epoch, the best model so far is saved inside the `models/` directory under the file name `Sentient_model2.bin`; the saved model can later be used for testing on new data. After training completes, the command also generates a file `test_predicted_kg_attn2.csv`, where the predicted output can be inspected alongside the given test input.
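
To eyeball the predictions, the CSV can be loaded with pandas. A minimal sketch (pandas is an extra dependency, and the exact column names are whatever the training script writes, so inspect them first):
```python
import pandas as pd

# Quick look at the generated predictions file.
preds = pd.read_csv('test_predicted_kg_attn2.csv')
print(preds.columns.tolist())
print(preds.head())
```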
@@ -0,0 +1,39 @@
from argparse import ArgumentParser


def get_args():
    parser = ArgumentParser(description="Soccer chatbot")
    # hardware / run control
    parser.add_argument('--no_cuda', action='store_false', help='do not use cuda', dest='cuda')
    # argparse's type=bool treats any non-empty string as True, so a 0/1 int is used instead
    parser.add_argument('--gpu', type=int, default=1)
    parser.add_argument('--epochs', type=int, default=60)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--dataset', type=str, default="EntityDetection")
    # optimisation
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--seed', type=int, default=3435)
    parser.add_argument('--dev_every', type=int, default=2000)
    parser.add_argument('--log_every', type=int, default=1000)
    parser.add_argument('--patience', type=int, default=10)
    parser.add_argument('--save_path', type=str, default='saved_checkpoints')
    parser.add_argument('--specify_prefix', type=str, default='id1')
    # model dimensions
    parser.add_argument('--words_dim', type=int, default=300)
    parser.add_argument('--num_layer', type=int, default=2)
    parser.add_argument('--dropout', type=float, default=0.3)
    parser.add_argument('--input_size', type=int, default=300)
    parser.add_argument('--hidden_size', type=int, default=50)
    parser.add_argument('--rnn_dropout', type=float, default=0.3)
    parser.add_argument('--clip_gradient', type=float, default=0.6, help='gradient clipping')
    # vocabulary / embeddings
    parser.add_argument('--stoi', type=str, default="vocab/w2i.npy")
    parser.add_argument('--vocab_glove', type=str, default="vocab/glove300.npy")
    parser.add_argument('--weight_decay', type=float, default=0)
    parser.add_argument('--teacher_forcing', type=int, default=4)
    parser.add_argument('--fix_embed', action='store_false', dest='train_embed')
    parser.add_argument('--hits', type=int, default=100)
    parser.add_argument('--no_tqdm', default=False, action='store_true', help='disable tqdm progress bar')
    parser.add_argument('--randseed', type=int, default=666, metavar='', help='random seed (default: 666)')
    parser.add_argument('--trained_model', type=str, default='')
    parser.add_argument('--data_dir', type=str, default='preproc_files/incar/')
    parser.add_argument('--results_path', type=str, default='query_text')
    parser.add_argument('--emb_drop', type=float, default=0.2)
    parser.add_argument('--threshold', type=float, default=0.5)
    parser.add_argument('--resp_len', type=int, default=20)
    args = parser.parse_args()
    return args
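

# Illustrative usage sketch (not part of the original file): the training scripts
# call get_args() once and read hyper-parameters from the returned namespace,
# overriding defaults on the command line as shown in the README, e.g.
#   python train_kg_copy.py --batch_size 64 --hidden_size 512
if __name__ == '__main__':
    args = get_args()
    print(args.batch_size, args.hidden_size, args.data_dir)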
@@ -0,0 +1,211 @@
# imports
import itertools
import os

import numpy as np
import torch

from args import get_args

args = get_args()
print('Loaded all libraries')


class DialogBatcher:
    """
    Wrapper for batching the Soccer Dialogue dataset
    """
    def __init__(self, gpu=True, max_sent_len=100, max_resp_len=50, max_kb_len=198, max_kb_triple_len=6):
        self.batch_size = args.batch_size
        self.max_kb_len = max_kb_len
        self.max_kb_sub_len = max_kb_triple_len
        self.gpu = gpu
        self.max_sent_len = max_sent_len
        self.max_resp_len = args.resp_len
        # word -> id mapping and pretrained vectors, both stored as pickled dicts
        if os.path.isfile(args.stoi):
            self.stoi = np.load(args.stoi, allow_pickle=True).item()

        self.vocab_glove = np.load(args.vocab_glove, allow_pickle=True).item()
        vec_dim = 300

        # Get required dictionaries for data
        self.train = self.get_sequences('train')
        self.test = self.get_sequences('test')
        self.valid = self.get_sequences('valid')
        # self.all = self.get_sequences('all')  # optional split used by get_iter('all')

        self.n_words = len(self.stoi) + 1
        self.n_train = len(self.train['x'])
        self.n_val = len(self.valid['x'])
        self.n_test = len(self.test['x'])

        # id -> word mapping
        self.itos = {v: k for k, v in self.stoi.items()}

        # build the pretrained embedding matrix, one row per vocabulary id
        self.vectors = np.zeros((len(self.itos) + 1, vec_dim))
        for k, v in self.vocab_glove.items():
            self.vectors[self.stoi[k]] = v

        self.vectors = torch.from_numpy(self.vectors.astype(np.float32))

    def get_seq(self, dial, dataset):
        """
        Prepare one dialogue as flat input/output sequences.
        :param dial: (questions, question features, answers, KB subjects, KB relations, team)
        :param dataset: dict of lists being accumulated for the split
        """
        out = []
        q, q_c, a, kb_s, kb_r, team = dial
        # join each KB subject with its relation to form one triple sequence
        kb = [kb_s[j] + kb_r[j] for j in range(len(kb_s))]
        for l, (q_l, qc, a_l) in enumerate(zip(q, q_c, a)):
            dataset['q_c'].append(qc)
            dataset['kb'].append(kb)
            dataset['team'].append(team)
            dataset['y'].append(a_l + [self.stoi['<eos>']])  # add EOS token at the end

            if l > 0:
                # prepend the previous answer to the running dialogue context
                out = self.merge_list([out, a[l - 1], q_l])
            else:
                out = self.merge_list([out, q_l])
            dataset['x'].append(out + [self.stoi['<eos>']])

    def get_sequences(self, file_name):
        """
        Load dialogue data for one split ('train', 'test' or 'valid').
        :param file_name: split name, resolved to <data_dir>/<file_name>.npy
        :return: dict with keys 'x', 'q_c', 'y', 'kb', 'team'
        """
        ds = {}
        ds['x'], ds['q_c'], ds['y'], ds['kb'], ds['team'] = [], [], [], [], []
        dat = np.load(args.data_dir + file_name + '.npy', allow_pickle=True)
        for d in dat:
            self.get_seq(d, ds)
        ds['x'], ds['q_c'], ds['y'], ds['kb'] = np.array(ds['x']), np.array(ds['q_c']), np.array(ds['y']), np.array(ds['kb'])
        return ds

    @staticmethod
    def merge_list(set_l):
        """
        merge previous utterances into current
        :param set_l: list of token-id lists
        :return: one flat list
        """
        return list(itertools.chain.from_iterable(set_l))

    def geti2w(self, word):
        """
        Map a tensor id back to its word (id 2 word).
        :param word: 0-d tensor holding a vocabulary id
        :return: the corresponding word string
        """
        if self.gpu:
            return self.itos[int(word.cpu().numpy())]
        return self.itos[int(word.numpy())]

    def get_iter(self, dataset='train'):
        """Yield mini-batches for the requested split."""
        if dataset == 'train':
            dataset = self.train
        elif dataset == 'valid':
            dataset = self.valid
        elif dataset == 'test':
            dataset = self.test
        else:
            # only available if the 'all' split is built in __init__
            dataset = self.all

        for i in range(0, len(dataset['x']), self.batch_size):
            query = dataset['x'][i:i + self.batch_size]
            query_c = dataset['q_c'][i:i + self.batch_size]
            response = dataset['y'][i:i + self.batch_size]
            kb = dataset['kb'][i:i + self.batch_size]
            team = dataset['team'][i:i + self.batch_size]
            x, x_c, y, mx, my, kb, kb_m, s, v_m = self._load_batch(query, query_c, response, kb, self.batch_size)
            yield x, x_c, y, mx, my, kb, kb_m, s, v_m, team

    def _load_batch(self, q, q_c, a, kb_i, b_s):
        """Pad one batch of queries, responses and KB triples into fixed-size tensors."""
        b_s = min(b_s, len(q))  # the final batch may be smaller than batch_size
        max_len_q = np.max([len(sent) for sent in q])
        max_len_q = max_len_q if max_len_q < self.max_sent_len else self.max_sent_len
        max_len_a = np.max([len(sent) for sent in a])
        max_len_a = max_len_a if max_len_a < self.max_resp_len else self.max_resp_len
        x = np.zeros([max_len_q, b_s], np.int)
        q_c_o = np.zeros([max_len_q, b_s], np.int)
        y = np.zeros([max_len_a, b_s], np.int)
        kb = np.zeros([b_s, self.max_kb_len, self.max_kb_sub_len])
        kb_mask = np.zeros([b_s, self.max_kb_len])
        x_mask = np.zeros([max_len_q, b_s], np.int)
        y_mask = np.zeros([max_len_a, b_s], np.int)
        vocab_mask = np.arange(0, len(self.stoi) + 1)

        for j, (row_t, row_qc, row_l, row_kb) in enumerate(zip(q, q_c, a, kb_i)):
            # truncate: keep the last max_len_q query tokens, the first max_len_a response tokens
            row_t = row_t[-max_len_q:]
            row_qc = row_qc[-max_len_q:]
            row_l = row_l[:max_len_a]
            x[:len(row_t), j] = row_t
            q_c_o[:len(row_qc), j] = row_qc
            y[:len(row_l), j] = row_l
            x_mask[:len(row_t), j] = 1
            y_mask[:len(row_l), j] = 1
            for l, k in enumerate(row_kb):
                try:
                    kb[j][l][:len(k)] = k
                except Exception:
                    print(k)  # triple longer than max_kb_triple_len; report it and move on
            kb_mask[j][:len(row_kb)] = 1

        x_o = torch.from_numpy(x)
        q_c_o = torch.from_numpy(q_c_o).type(torch.FloatTensor)
        y_o = torch.from_numpy(y).type(torch.FloatTensor)
        kb = torch.from_numpy(kb).long()
        # gate supervision: marks response positions whose ids lie above '<eos>' in the vocabulary
        sentient_g = (y_o > self.stoi['<eos>'])
        vocab_mask = torch.from_numpy(vocab_mask)
        vocab_mask = (vocab_mask < self.stoi['<eos>']).type(torch.FloatTensor)

        x_mask = torch.from_numpy(x_mask).type(torch.FloatTensor)
        y_mask = torch.from_numpy(y_mask).type(torch.FloatTensor)
        kb_mask = torch.from_numpy(kb_mask).type(torch.FloatTensor)

        if self.gpu:
            x_o, q_c_o, y_o, x_mask, y_mask, kb, kb_mask, sentient_g, vocab_mask = \
                x_o.cuda(), q_c_o.cuda(), y_o.cuda(), x_mask.cuda(), y_mask.cuda(), \
                kb.cuda(), kb_mask.cuda(), sentient_g.cuda(), vocab_mask.cuda()

        return x_o, q_c_o, y_o, x_mask, y_mask, kb, kb_mask, sentient_g.float(), vocab_mask


if __name__ == '__main__':
    batcher = DialogBatcher(gpu=False)
    batches = batcher.get_iter('valid')

    print(batches)
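
    # Illustrative check (not in the original file): pull one padded batch from
    # the generator and inspect tensor shapes; the tuple order matches get_iter's yield.
    x, x_c, y, mx, my, kb, kb_m, s, v_m, team = next(batches)
    print(x.shape, y.shape, kb.shape)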