[WIP]: update model, pre-processor, and api to use reference data
Signed-off-by: Salman Maqbool [email protected]
salmanmaq committed Feb 27, 2019
1 parent acc987f commit bfb7906
Showing 4 changed files with 169 additions and 44 deletions.
12 changes: 10 additions & 2 deletions inspire_classifier/api.py
@@ -183,12 +183,20 @@ def train():
train_and_save_classifier()


def predict_coreness(title, abstract):
def predict_coreness(title, abstract, core_references_fraction_first_order, core_references_fraction_second_order,
noncore_references_fraction_first_order, noncore_references_fraction_second_order,
total_first_order_references, total_second_order_references, training_set_means_for_reference_data,
training_set_standard_deviations_for_reference_data):
"""
Predicts class-wise probabilities given the title and abstract.
"""
text = title + ' <ENDTITLE> ' + abstract
categories = ['rejected', 'non_core', 'core']
reference_data = np.array([core_references_fraction_first_order, core_references_fraction_second_order,
noncore_references_fraction_first_order, noncore_references_fraction_second_order,
total_first_order_references, total_second_order_references])
reference_data_normalized = (reference_data - current_app.config['TRAINING_SET_MEANS_FOR_REFERENCE_DATA']) /\
current_app.config['TRAINING_SET_STANDARD_DEVIATIONS_FOR_REFERENCE_DATA']
try:
classifier = Classifier(data_itos_path=path_for('data_itos'),
number_of_classes=3, cuda_device_id=current_app.config['CLASSIFIER_CUDA_DEVICE_ID'])
@@ -200,7 +208,7 @@ def predict_coreness(title, abstract):
except IOError as error:
raise IOError('Could not load the trained classifier weights.') from error

class_probabilities = classifier.predict(text)
class_probabilities = classifier.predict(text, reference_data_normalized)
assert len(class_probabilities) == 3

predicted_class = categories[np.argmax(class_probabilities)]
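For context, a minimal sketch of how the widened predict_coreness signature might be invoked; every argument value below is made up for illustration, and the body shown above normalizes the six reference features against training-set statistics read from current_app.config:

from inspire_classifier.api import predict_coreness

# Hypothetical call; all values are illustrative only.
result = predict_coreness(
    title='Observation of a new resonance',
    abstract='We report evidence for a new resonance ...',
    core_references_fraction_first_order=0.45,
    core_references_fraction_second_order=0.30,
    noncore_references_fraction_first_order=0.40,
    noncore_references_fraction_second_order=0.55,
    total_first_order_references=30,
    total_second_order_references=850,
    training_set_means_for_reference_data=None,
    training_set_standard_deviations_for_reference_data=None,
)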
55 changes: 26 additions & 29 deletions inspire_classifier/domain/models.py
@@ -30,15 +30,12 @@
from fastai.text import (
accuracy,
DataLoader,
get_rnn_classifer,
LanguageModelLoader,
LanguageModelData,
load_model,
ModelData,
RNN_Learner,
T,
TextDataset,
TextModel,
to_gpu,
to_np,
save_model,
@@ -48,7 +45,13 @@
Variable
)
from functools import partial
from inspire_classifier.utils import FastLoadTokenizer
from inspire_classifier.utils import (
FastLoadTokenizer,
get_rnn_classifier,
numpy_softmax,
TextPlusReferencesDataset,
TextPlusReferencesModel
)
import numpy as np
import pickle

@@ -138,21 +141,23 @@ def __init__(self, data_itos_path, cuda_device_id=0, dropout_multiplier=0.5, num
number_of_layers = 3
embedding_size = 400

self.model = get_rnn_classifer(bptt=number_of_back_propagation_through_time_steps,
max_seq=20 * number_of_back_propagation_through_time_steps,
n_class=number_of_classes, n_tok=self.vocabulary_size, emb_sz=embedding_size,
n_hid=number_of_hidden_units, n_layers=number_of_layers, pad_token=1,
layers=[embedding_size * 3, 50, number_of_classes], drops=[dropouts[4], 0.1],
dropouti=dropouts[0], wdrop=dropouts[1], dropoute=dropouts[2],
dropouth=dropouts[3])
self.model = get_rnn_classifier(bptt=number_of_back_propagation_through_time_steps,
max_seq=20 * number_of_back_propagation_through_time_steps,
n_tok=self.vocabulary_size, emb_sz=embedding_size, n_hid=number_of_hidden_units,
n_layers=number_of_layers, pad_token=1,
layers=[embedding_size * 3 + 100, 50, number_of_classes], drops=[dropouts[4], 0.1],
dropouti=dropouts[0], wdrop=dropouts[1], dropoute=dropouts[2],
dropouth=dropouts[3])

self.tokenizer = FastLoadTokenizer()

def load_training_and_validation_data(self, training_data_ids_path, training_data_labels_path,
validation_data_ids_path, validation_data_labels_path, classifier_data_dir,
batch_size=10):
def load_training_and_validation_data(self, training_data_ids_path, training_data_references_path, training_data_labels_path,
validation_data_ids_path, validation_data_references_path, validation_data_labels_path,
classifier_data_dir, batch_size=10):
training_token_ids = np.load(training_data_ids_path)
validation_token_ids = np.load(validation_data_ids_path)
training_references = np.load(training_data_references_path)
validation_references = np.load(validation_data_references_path)
training_labels = np.load(training_data_labels_path)
validation_labels = np.load(validation_data_labels_path)

@@ -161,8 +166,8 @@ def load_training_and_validation_data(self, training_data_ids_path, training_dat
training_labels -= training_labels.min()
validation_labels -= validation_labels.min()

training_dataset = TextDataset(training_token_ids, training_labels)
validation_dataset = TextDataset(validation_token_ids, validation_labels)
training_dataset = TextPlusReferencesDataset(training_token_ids, training_references, training_labels)
validation_dataset = TextPlusReferencesDataset(validation_token_ids, validation_references, validation_labels)
training_data_sampler = SortishSampler(data_source=training_token_ids, key=lambda x: len(training_token_ids[x]),
bs=batch_size // 2)
validation_data_sampler = SortSampler(data_source=validation_token_ids,
@@ -176,14 +181,14 @@ def load_training_and_validation_data(self, training_data_ids_path, training_dat
def initialize_learner(self):
optimization_function = partial(optim.Adam, betas=(0.8, 0.99))

self.learner = RNN_Learner(data=self.model_data, models=TextModel(to_gpu(self.model)),
self.learner = RNN_Learner(data=self.model_data, models=TextPlusReferencesModel(to_gpu(self.model)),
opt_fn=optimization_function)
self.learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
self.learner.clip = 25.
self.learner.metrics = [accuracy]

def load_finetuned_language_model_weights(self, finetuned_language_model_encoder_path):
load_model(self.learner.model[0], finetuned_language_model_encoder_path)
load_model(self.learner.model.text_network, finetuned_language_model_encoder_path)

def train(self, trained_classifier_save_path, learning_rates=np.array([1e-4, 1e-4, 1e-4, 1e-3, 1e-2]),
weight_decay=1e-6, cycle_length=14):
@@ -199,7 +204,7 @@ def train(self, trained_classifier_save_path, learning_rates=np.array([1e-4, 1e-
def load_trained_classifier_weights(self, trained_classifier_path):
self.model.load_state_dict(torch.load(trained_classifier_path, map_location=lambda storage, loc: storage))

def predict(self, text):
def predict(self, text, reference_data):
self.model.reset()
self.model.eval()

@@ -209,15 +214,7 @@ def predict(self, text):
encoded_tokens = [self.inspire_data_stoi[p] for p in tokens[0]]
token_array = np.reshape(np.array(encoded_tokens), (-1, 1))
token_array = Variable(torch.from_numpy(token_array))
prediction_scores = self.model(token_array)
prediction_scores = self.model(token_array, reference_data)
prediction_scores_numpy = prediction_scores[0].data.cpu().numpy()

return numpy_softmax(prediction_scores_numpy[0])[0]


def numpy_softmax(x):
if x.ndim == 1:
x = x.reshape((1, -1))
max_x = np.max(x, axis=1).reshape((-1, 1))
exp_x = np.exp(x - max_x)
return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1))
return numpy_softmax(prediction_scores_numpy[0])[0]
50 changes: 37 additions & 13 deletions inspire_classifier/domain/preprocessor.py
@@ -56,15 +56,23 @@ def split_and_save_data_for_language_model_and_classifier(dataframe_path, langua

# Shuffle the data
inspire_data = inspire_data.sample(frac=1).reset_index(drop=True)
# Swap the columns so that the labels are Column 0 and the text is Column 1 (and remove any additional columns)
inspire_data = inspire_data[['labels', 'text']]
inspire_data = inspire_data[['core_reference_fraction_first_order', 'core_reference_fraction_second_order',
'noncore_reference_fraction_first_order', 'noncore_reference_fraction_second_order',
'total_first_order_references', 'total_second_order_references', 'labels', 'text']]

training_dataframe, validation_dataframe = sklearn.model_selection.train_test_split(
inspire_data, test_size=val_fraction)

training_dataframe = training_dataframe.reset_index(drop=True)
validation_dataframe = validation_dataframe.reset_index(drop=True)

# Standardize the numerical data values
training_data_means = training_dataframe.iloc[:, :-2].mean().values
training_data_standard_deviations = training_dataframe.iloc[:, :-2].std().values

training_dataframe.iloc[:,:-2] = (training_dataframe.iloc[:,:-2] - training_data_means) / training_data_standard_deviations
validation_dataframe.iloc[:, :-2] = (validation_dataframe.iloc[:,:-2] - training_data_means) / training_data_standard_deviations

# Save the data for the classifier
training_dataframe.to_csv(classifier_data_dir / 'training_data.csv', header=False, index=False)
validation_dataframe.to_csv(classifier_data_dir / 'validation_data.csv', header=False, index=False)
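The standardization step above applies a per-column z-score to the six reference features using the training split's mean and standard deviation, then reuses the same statistics for the validation split. A self-contained sketch of that operation on a hypothetical two-column frame:

import pandas as pd

# Hypothetical reference-feature frame; column names and values are illustrative.
frame = pd.DataFrame({
    'core_references_fraction_first_order': [0.20, 0.40, 0.60],
    'total_first_order_references': [10, 30, 50],
})
means = frame.mean()
standard_deviations = frame.std()
standardized = (frame - means) / standard_deviations  # zero mean, unit variance per column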
@@ -131,40 +139,56 @@ def generate_and_save_classifier_tokens(classifier_data_dir)
training_dataframe = pd.read_csv(classifier_data_dir / 'training_data.csv', header=None)
validation_dataframe = pd.read_csv(classifier_data_dir / 'validation_data.csv', header=None)

training_tokens, training_labels = get_texts(training_dataframe)
validation_tokens, validation_labels = get_texts(validation_dataframe)
training_tokens_text, training_references, training_labels = get_texts_classifier(training_dataframe)
validation_tokens_text, validation_references, validation_labels = get_texts_classifier(validation_dataframe)

assert len(training_tokens) == len(training_dataframe)
assert len(training_tokens_text) == len(training_dataframe)

np.save(classifier_data_dir / 'training_tokens.npy', training_tokens)
np.save(classifier_data_dir / 'validation_tokens.npy', validation_tokens)
np.save(classifier_data_dir / 'training_tokens_text.npy', training_tokens_text)
np.save(classifier_data_dir / 'validation_tokens_text.npy', validation_tokens_text)
np.save(classifier_data_dir / 'training_references.npy', training_references)
np.save(classifier_data_dir / 'validation_references.npy', validation_references)
np.save(classifier_data_dir / 'training_labels.npy', training_labels)
np.save(classifier_data_dir / 'validation_labels.npy', validation_labels)


def map_and_save_tokens_to_ids_for_classifier(classifier_data_dir, data_itos_path):
training_tokens = np.load(classifier_data_dir / 'training_tokens.npy')
validation_tokens = np.load(classifier_data_dir / 'validation_tokens.npy')
training_tokens_text = np.load(classifier_data_dir / 'training_tokens_text.npy')
validation_tokens_text = np.load(classifier_data_dir / 'validation_tokens_text.npy')

inspire_data_itos = pickle.load(open(data_itos_path, 'rb'))
inspire_data_stoi = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(inspire_data_itos)})

training_token_ids = np.array([[inspire_data_stoi[o] for o in p] for p in training_tokens])
validation_token_ids = np.array([[inspire_data_stoi[o] for o in p] for p in validation_tokens])
training_token_ids = np.array([[inspire_data_stoi[o] for o in p] for p in training_tokens_text])
validation_token_ids = np.array([[inspire_data_stoi[o] for o in p] for p in validation_tokens_text])

np.save(classifier_data_dir / 'training_token_ids.npy', training_token_ids)
np.save(classifier_data_dir / 'validation_token_ids.npy', validation_token_ids)


def get_texts(df):
labels = df[0].values.astype(np.int64)
texts = f'\n{BOS} {FLD} 1 ' + df[1].astype(str)
labels = df['labels'].values.astype(np.int64)
texts = f'\n{BOS} {FLD} 1 ' + df['text'].astype(str)
texts = list(texts.apply(fixup).values)

tokens = FastLoadTokenizer().proc_all_mp(partition_by_cores(texts))
return tokens, list(labels)


def get_texts_classifier(df):
labels = df['labels'].values.astype(np.int64)
texts = f'\n{BOS} {FLD} 1 ' + df['text'].astype(str)
texts = list(texts.apply(fixup).values)
refs = np.array([df['core_references_fraction_first_order'].values.astype(np.float32),
df['core_references_fraction_second_order'].values.astype(np.float32),
df['noncore_references_fraction_first_order'].values.astype(np.float32),
df['noncore_references_fraction_second_order'].values.astype(np.float32),
df['total_first_order_references'].values.astype(np.float32),
df['total_second_order_references'].values.astype(np.float32)])
tokens = FastLoadTokenizer().proc_all_mp(partition_by_cores(texts))
return tokens, refs.T, list(labels)


def fixup(x):
x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
96 changes: 96 additions & 0 deletions inspire_classifier/utils.py
@@ -23,15 +23,23 @@
# Modified from the fastai library (https://github.com/fastai/fastai).

from fastai.text import (
BasicModel,
Dataset,
LinearBlock,
MultiBatchRNN,
num_cpus,
Tokenizer
)
from flask import current_app
import numpy as np
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
import re
from spacy.lang.en import English
from spacy.symbols import ORTH
import torch
from torch import nn
import torch.nn.functional as F


def path_for(name):
@@ -62,3 +70,91 @@ def proc_all_mp(self, ss, ncpus=None):
ncpus = ncpus or num_cpus() // 2
with ProcessPoolExecutor(ncpus) as executor:
return sum(executor.map(self.proc_all, ss), [])


def numpy_softmax(x):
if x.ndim == 1:
x = x.reshape((1, -1))
max_x = np.max(x, axis=1).reshape((-1, 1))
exp_x = np.exp(x - max_x)
return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1))


class PoolingLinearClassifier(nn.Module):
def __init__(self, layers, drops):
super().__init__()
self.layers = nn.ModuleList([
LinearBlock(layers[i], layers[i + 1], drops[i]) for i in range(len(layers) - 1)
])
self.ref_layers = nn.ModuleList([
LinearBlock(6, 200, 0.0),
LinearBlock(200, 100, 0.2)
])

def pool(self, x, bs, is_max):
f = F.adaptive_max_pool1d if is_max else F.adaptive_avg_pool1d
return f(x.permute(1,2,0), (1,)).view(bs,-1)

def forward(self, text_input, ref_input):
raw_outputs, outputs = text_input
output = outputs[-1]
sl,bs,_ = output.size()
avgpool = self.pool(output, bs, False)
mxpool = self.pool(output, bs, True)
ref_x = ref_input
for l_ref in self.ref_layers:
ref_output = l_ref(ref_x)
ref_x = F.relu(ref_output)
x = torch.cat([output[-1], mxpool, avgpool, ref_output], 1)
for l in self.layers:
l_x = l(x)
x = F.relu(l_x)
return l_x, raw_outputs, outputs


class TextPlusReferencesDataset(Dataset):
def __init__(self, x_text, x_ref, y, backwards=False, sos=None, eos=None):
self.x_text, self.x_ref, self.y, self.backwards, self.sos, self.eos = \
x_text, x_ref, y, backwards, sos, eos

def __getitem__(self, idx):
x_text = self.x_text[idx]
x_ref = self.x_ref[idx]
if self.backwards: x_text = list(reversed(x_text))
if self.eos is not None: x_text = x_text + [self.eos]
if self.sos is not None: x_text = [self.sos] + x_text
return np.array(x_text), x_ref, self.y[idx]

def __len__(self):
return len(self.x_text)


class MultiInputRNN(nn.Module):

def __init__(self, rnn_encoder, final_classifier_layers, final_classifier_dropouts=[0.2, 0.1]):
super(MultiInputRNN, self).__init__()
self.text_network = rnn_encoder
if hasattr(self.text_network, 'reset'):
self.text_network.reset()
self.combined_network = PoolingLinearClassifier(layers=final_classifier_layers, drops=final_classifier_dropouts)

def forward(self, x_text, x_ref):
text_network_output = self.text_network(x_text)
output = self.combined_network(text_network_output, x_ref)

return output


class TextPlusReferencesModel(BasicModel):
def get_layer_groups(self):
m = self.model
return [(m.text_network.encoder, m.text_network.dropouti),
*zip(m.text_network.rnns, m.text_network.dropouths),
(m.combined_network)]


def get_rnn_classifier(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False,
dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, qrnn=False):
rnn_enc = MultiBatchRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir,
dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop, qrnn=qrnn)
return MultiInputRNN(rnn_enc, layers, drops)
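
As a sanity check on the embedding_size * 3 + 100 head input size used in models.py, here is a standalone sketch of the concatenation performed in PoolingLinearClassifier.forward, using dummy tensors and modern torch reductions in place of the fastai plumbing:

import torch

sequence_length, batch_size, embedding_size = 50, 4, 400
output = torch.randn(sequence_length, batch_size, embedding_size)  # last RNN layer outputs
ref_output = torch.randn(batch_size, 100)                          # reference-branch output

# adaptive_avg_pool1d / adaptive_max_pool1d with output size 1 reduce to mean / max
avgpool = output.permute(1, 2, 0).mean(dim=2)       # (batch_size, embedding_size)
mxpool = output.permute(1, 2, 0).max(dim=2).values  # (batch_size, embedding_size)
x = torch.cat([output[-1], mxpool, avgpool, ref_output], 1)
assert x.shape == (batch_size, embedding_size * 3 + 100)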
