[WIP]: update model, pre-processor, and api to use reference data
Signed-off-by: Salman Maqbool [email protected]
salmanmaq committed Feb 27, 2019
1 parent acc987f commit bfb7906
Showing 4 changed files with 169 additions and 44 deletions.
12 changes: 10 additions & 2 deletions inspire_classifier/api.py
@@ -183,12 +183,20 @@ def train():
train_and_save_classifier()


def predict_coreness(title, abstract):
def predict_coreness(title, abstract, core_references_fraction_first_order, core_references_fraction_second_order,
noncore_references_fraction_first_order, noncore_references_fraction_second_order,
total_first_order_references, total_second_order_references, training_set_means_for_reference_data,
training_set_standard_deviations_for_reference_data):
"""
Predicts class-wise probabilities given the title and abstract.
"""
text = title + ' <ENDTITLE> ' + abstract
categories = ['rejected', 'non_core', 'core']
reference_data = np.array([core_references_fraction_first_order, core_references_fraction_second_order,
noncore_references_fraction_first_order, noncore_references_fraction_second_order,
total_first_order_references, total_second_order_references])
reference_data_normalized = (reference_data - current_app.config['TRAINING_SET_MEANS_FOR_REFERENCE_DATA']) /\
current_app.config['TRAINING_SET_STANDARD_DEVIATIONS_FOR_REFERENCE_DATA']
try:
classifier = Classifier(data_itos_path=path_for('data_itos'),
number_of_classes=3, cuda_device_id=current_app.config['CLASSIFIER_CUDA_DEVICE_ID'])
@@ -200,7 +208,7 @@ def predict_coreness(title, abstract):
except IOError as error:
raise IOError('Could not load the trained classifier weights.') from error

class_probabilities = classifier.predict(text)
class_probabilities = classifier.predict(text, reference_data_normalized)
assert len(class_probabilities) == 3

predicted_class = categories[np.argmax(class_probabilities)]
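For context, a minimal sketch of how the widened predict_coreness signature might be invoked; every argument value below is made up for illustration, and the body shown above normalizes the six reference features against training-set statistics read from current_app.config:

from inspire_classifier.api import predict_coreness

# Hypothetical call; all values are illustrative only.
result = predict_coreness(
    title='Observation of a new resonance',
    abstract='We report evidence for a new resonance ...',
    core_references_fraction_first_order=0.45,
    core_references_fraction_second_order=0.30,
    noncore_references_fraction_first_order=0.40,
    noncore_references_fraction_second_order=0.55,
    total_first_order_references=30,
    total_second_order_references=850,
    training_set_means_for_reference_data=None,
    training_set_standard_deviations_for_reference_data=None,
)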
55 changes: 26 additions & 29 deletions inspire_classifier/domain/models.py
@@ -30,15 +30,12 @@
from fastai.text import (
accuracy,
DataLoader,
get_rnn_classifer,
LanguageModelLoader,
LanguageModelData,
load_model,
ModelData,
RNN_Learner,
T,
TextDataset,
TextModel,
to_gpu,
to_np,
save_model,
@@ -48,7 +45,13 @@
Variable
)
from functools import partial
from inspire_classifier.utils import FastLoadTokenizer
from inspire_classifier.utils import (
FastLoadTokenizer,
get_rnn_classifier,
numpy_softmax,
TextPlusReferencesDataset,
TextPlusReferencesModel
)
import numpy as np
import pickle

@@ -138,21 +141,23 @@ def __init__(self, data_itos_path, cuda_device_id=0, dropout_multiplier=0.5, num
number_of_layers = 3
embedding_size = 400

self.model = get_rnn_classifer(bptt=number_of_back_propagation_through_time_steps,
max_seq=20 * number_of_back_propagation_through_time_steps,
n_class=number_of_classes, n_tok=self.vocabulary_size, emb_sz=embedding_size,
n_hid=number_of_hidden_units, n_layers=number_of_layers, pad_token=1,
layers=[embedding_size * 3, 50, number_of_classes], drops=[dropouts[4], 0.1],
dropouti=dropouts[0], wdrop=dropouts[1], dropoute=dropouts[2],
dropouth=dropouts[3])
self.model = get_rnn_classifier(bptt=number_of_back_propagation_through_time_steps,
max_seq=20 * number_of_back_propagation_through_time_steps,
n_tok=self.vocabulary_size, emb_sz=embedding_size, n_hid=number_of_hidden_units,
n_layers=number_of_layers, pad_token=1,
layers=[embedding_size * 3 + 100, 50, number_of_classes], drops=[dropouts[4], 0.1],
dropouti=dropouts[0], wdrop=dropouts[1], dropoute=dropouts[2],
dropouth=dropouts[3])

self.tokenizer = FastLoadTokenizer()

def load_training_and_validation_data(self, training_data_ids_path, training_data_labels_path,
validation_data_ids_path, validation_data_labels_path, classifier_data_dir,
batch_size=10):
def load_training_and_validation_data(self, training_data_ids_path, training_data_references_path, training_data_labels_path,
validation_data_ids_path, validation_data_references_path, validation_data_labels_path,
classifier_data_dir, batch_size=10):
training_token_ids = np.load(training_data_ids_path)
validation_token_ids = np.load(validation_data_ids_path)
training_references = np.load(training_data_references_path)
validation_references = np.load(validation_data_references_path)
training_labels = np.load(training_data_labels_path)
validation_labels = np.load(validation_data_labels_path)

@@ -161,8 +166,8 @@ def load_training_and_validation_data(self, training_data_ids_path, training_dat
training_labels -= training_labels.min()
validation_labels -= validation_labels.min()

training_dataset = TextDataset(training_token_ids, training_labels)
validation_dataset = TextDataset(validation_token_ids, validation_labels)
training_dataset = TextPlusReferencesDataset(training_token_ids, training_references, training_labels)
validation_dataset = TextPlusReferencesDataset(validation_token_ids, validation_references, validation_labels)
training_data_sampler = SortishSampler(data_source=training_token_ids, key=lambda x: len(training_token_ids[x]),
bs=batch_size // 2)
validation_data_sampler = SortSampler(data_source=validation_token_ids,
@@ -176,14 +181,14 @@ def load_training_and_validation_data(self, training_data_ids_path, training_dat
def initialize_learner(self):
optimization_function = partial(optim.Adam, betas=(0.8, 0.99))

self.learner = RNN_Learner(data=self.model_data, models=TextModel(to_gpu(self.model)),
self.learner = RNN_Learner(data=self.model_data, models=TextPlusReferencesModel(to_gpu(self.model)),
opt_fn=optimization_function)
self.learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
self.learner.clip = 25.
self.learner.metrics = [accuracy]

def load_finetuned_language_model_weights(self, finetuned_language_model_encoder_path):
load_model(self.learner.model[0], finetuned_language_model_encoder_path)
load_model(self.learner.model.text_network, finetuned_language_model_encoder_path)

def train(self, trained_classifier_save_path, learning_rates=np.array([1e-4, 1e-4, 1e-4, 1e-3, 1e-2]),
weight_decay=1e-6, cycle_length=14):
@@ -199,7 +204,7 @@ def train(self, trained_classifier_save_path, learning_rates=np.array([1e-4, 1e-
def load_trained_classifier_weights(self, trained_classifier_path):
self.model.load_state_dict(torch.load(trained_classifier_path, map_location=lambda storage, loc: storage))

def predict(self, text):
def predict(self, text, reference_data):
self.model.reset()
self.model.eval()

@@ -209,15 +214,7 @@ def predict(self, text):
encoded_tokens = [self.inspire_data_stoi[p] for p in tokens[0]]
token_array = np.reshape(np.array(encoded_tokens), (-1, 1))
token_array = Variable(torch.from_numpy(token_array))
prediction_scores = self.model(token_array)
prediction_scores = self.model(token_array, reference_data)
prediction_scores_numpy = prediction_scores[0].data.cpu().numpy()

return numpy_softmax(prediction_scores_numpy[0])[0]


def numpy_softmax(x):
if x.ndim == 1:
x = x.reshape((1, -1))
max_x = np.max(x, axis=1).reshape((-1, 1))
exp_x = np.exp(x - max_x)
return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1))
return numpy_softmax(prediction_scores_numpy[0])[0]
50 changes: 37 additions & 13 deletions inspire_classifier/domain/preprocessor.py
@@ -56,15 +56,23 @@ def split_and_save_data_for_language_model_and_classifier(dataframe_path, langua

# Shuffle the data
inspire_data = inspire_data.sample(frac=1).reset_index(drop=True)
# Swap the columns so that the labels are Column 0 and the text is Column 1 (and remove any additional columns)
inspire_data = inspire_data[['labels', 'text']]
inspire_data = inspire_data[['core_reference_fraction_first_order', 'core_reference_fraction_second_order',
'noncore_reference_fraction_first_order', 'noncore_reference_fraction_second_order',
'total_first_order_references', 'total_second_order_references', 'labels', 'text']]

training_dataframe, validation_dataframe = sklearn.model_selection.train_test_split(
inspire_data, test_size=val_fraction)

training_dataframe = training_dataframe.reset_index(drop=True)
validation_dataframe = validation_dataframe.reset_index(drop=True)

# Standardize the numerical data values
training_data_means = training_dataframe.iloc[:, :-2].mean().values
training_data_standard_deviations = training_dataframe.iloc[:, :-2].std().values

training_dataframe.iloc[:,:-2] = (training_dataframe.iloc[:,:-2] - training_data_means) / training_data_standard_deviations
validation_dataframe.iloc[:, :-2] = (validation_dataframe.iloc[:,:-2] - training_data_means) / training_data_standard_deviations

# Save the data for the classifier
training_dataframe.to_csv(classifier_data_dir / 'training_data.csv', header=False, index=False)
validation_dataframe.to_csv(classifier_data_dir / 'validation_data.csv', header=False, index=False)
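The standardization step above applies a per-column z-score to the six reference features using the training split's mean and standard deviation, then reuses the same statistics for the validation split. A self-contained sketch of that operation on a hypothetical two-column frame:

import pandas as pd

# Hypothetical reference-feature frame; column names and values are illustrative.
frame = pd.DataFrame({
    'core_references_fraction_first_order': [0.20, 0.40, 0.60],
    'total_first_order_references': [10, 30, 50],
})
means = frame.mean()
standard_deviations = frame.std()
standardized = (frame - means) / standard_deviations  # zero mean, unit variance per column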
@@ -131,40 +139,56 @@ def generate_and_save_classifier_tokens(classifier_data_dir)
training_dataframe = pd.read_csv(classifier_data_dir / 'training_data.csv', header=None)
validation_dataframe = pd.read_csv(classifier_data_dir / 'validation_data.csv', header=None)

training_tokens, training_labels = get_texts(training_dataframe)
validation_tokens, validation_labels = get_texts(validation_dataframe)
training_tokens_text, training_references, training_labels = get_texts_classifier(training_dataframe)
validation_tokens_text, validation_references, validation_labels = get_texts_classifier(validation_dataframe)

assert len(training_tokens) == len(training_dataframe)
assert len(training_tokens_text) == len(training_dataframe)

np.save(classifier_data_dir / 'training_tokens.npy', training_tokens)
np.save(classifier_data_dir / 'validation_tokens.npy', validation_tokens)
np.save(classifier_data_dir / 'training_tokens_text.npy', training_tokens_text)
np.save(classifier_data_dir / 'validation_tokens_text.npy', validation_tokens_text)
np.save(classifier_data_dir / 'training_references.npy', training_references)
np.save(classifier_data_dir / 'validation_references.npy', validation_references)
np.save(classifier_data_dir / 'training_labels.npy', training_labels)
np.save(classifier_data_dir / 'validation_labels.npy', validation_labels)


def map_and_save_tokens_to_ids_for_classifier(classifier_data_dir, data_itos_path):
training_tokens = np.load(classifier_data_dir / 'training_tokens.npy')
validation_tokens = np.load(classifier_data_dir / 'validation_tokens.npy')
training_tokens_text = np.load(classifier_data_dir / 'training_tokens_text.npy')
validation_tokens_text = np.load(classifier_data_dir / 'validation_tokens_text.npy')

inspire_data_itos = pickle.load(open(data_itos_path, 'rb'))
inspire_data_stoi = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(inspire_data_itos)})

training_token_ids = np.array([[inspire_data_stoi[o] for o in p] for p in training_tokens])
validation_token_ids = np.array([[inspire_data_stoi[o] for o in p] for p in validation_tokens])
training_token_ids = np.array([[inspire_data_stoi[o] for o in p] for p in training_tokens_text])
validation_token_ids = np.array([[inspire_data_stoi[o] for o in p] for p in validation_tokens_text])

np.save(classifier_data_dir / 'training_token_ids.npy', training_token_ids)
np.save(classifier_data_dir / 'validation_token_ids.npy', validation_token_ids)


def get_texts(df):
labels = df[0].values.astype(np.int64)
texts = f'\n{BOS} {FLD} 1 ' + df[1].astype(str)
labels = df['labels'].values.astype(np.int64)
texts = f'\n{BOS} {FLD} 1 ' + df['text'].astype(str)
texts = list(texts.apply(fixup).values)

tokens = FastLoadTokenizer().proc_all_mp(partition_by_cores(texts))
return tokens, list(labels)


def get_texts_classifier(df):
labels = df['labels'].values.astype(np.int64)
texts = f'\n{BOS} {FLD} 1 ' + df['text'].astype(str)
texts = list(texts.apply(fixup).values)
refs = np.array([df['core_references_fraction_first_order'].values.astype(np.float32),
df['core_references_fraction_second_order'].values.astype(np.float32),
df['noncore_references_fraction_first_order'].values.astype(np.float32),
df['noncore_references_fraction_second_order'].values.astype(np.float32),
df['total_first_order_references'].values.astype(np.float32),
df['total_second_order_references'].values.astype(np.float32)])
tokens = FastLoadTokenizer().proc_all_mp(partition_by_cores(texts))
return tokens, refs.T, list(labels)


def fixup(x):
x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
96 changes: 96 additions & 0 deletions inspire_classifier/utils.py
@@ -23,15 +23,23 @@
# Modified from the fastai library (https://github.com/fastai/fastai).

from fastai.text import (
BasicModel,
Dataset,
LinearBlock,
MultiBatchRNN,
num_cpus,
Tokenizer
)
from flask import current_app
import numpy as np
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
import re
from spacy.lang.en import English
from spacy.symbols import ORTH
import torch
from torch import nn
import torch.nn.functional as F


def path_for(name):
@@ -62,3 +70,91 @@ def proc_all_mp(self, ss, ncpus=None):
ncpus = ncpus or num_cpus() // 2
with ProcessPoolExecutor(ncpus) as executor:
return sum(executor.map(self.proc_all, ss), [])


def numpy_softmax(x):
if x.ndim == 1:
x = x.reshape((1, -1))
max_x = np.max(x, axis=1).reshape((-1, 1))
exp_x = np.exp(x - max_x)
return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1))


class PoolingLinearClassifier(nn.Module):
def __init__(self, layers, drops):
super().__init__()
self.layers = nn.ModuleList([
LinearBlock(layers[i], layers[i + 1], drops[i]) for i in range(len(layers) - 1)
])
self.ref_layers = nn.ModuleList([
LinearBlock(6, 200, 0.0),
LinearBlock(200, 100, 0.2)
])

def pool(self, x, bs, is_max):
f = F.adaptive_max_pool1d if is_max else F.adaptive_avg_pool1d
return f(x.permute(1,2,0), (1,)).view(bs,-1)

def forward(self, text_input, ref_input):
raw_outputs, outputs = text_input
output = outputs[-1]
sl,bs,_ = output.size()
avgpool = self.pool(output, bs, False)
mxpool = self.pool(output, bs, True)
ref_x = ref_input
for l_ref in self.ref_layers:
ref_output = l_ref(ref_x)
ref_x = F.relu(ref_output)
x = torch.cat([output[-1], mxpool, avgpool, ref_output], 1)
for l in self.layers:
l_x = l(x)
x = F.relu(l_x)
return l_x, raw_outputs, outputs


class TextPlusReferencesDataset(Dataset):
def __init__(self, x_text, x_ref, y, backwards=False, sos=None, eos=None):
self.x_text, self.x_ref, self.y, self.backwards, self.sos, self.eos = \
x_text, x_ref, y, backwards, sos, eos

def __getitem__(self, idx):
x_text = self.x_text[idx]
x_ref = self.x_ref[idx]
if self.backwards: x_text = list(reversed(x_text))
if self.eos is not None: x_text = x_text + [self.eos]
if self.sos is not None: x_text = [self.sos] + x_text
return np.array(x_text), x_ref, self.y[idx]

def __len__(self):
return len(self.x_text)


class MultiInputRNN(nn.Module):

def __init__(self, rnn_encoder, final_classifier_layers, final_classifier_dropouts=[0.2, 0.1]):
super(MultiInputRNN, self).__init__()
self.text_network = rnn_encoder
if hasattr(self.text_network, 'reset'):
self.text_network.reset()
self.combined_network = PoolingLinearClassifier(layers=final_classifier_layers, drops=final_classifier_dropouts)

def forward(self, x_text, x_ref):
text_network_output = self.text_network(x_text)
output = self.combined_network(text_network_output, x_ref)

return output


class TextPlusReferencesModel(BasicModel):
def get_layer_groups(self):
m = self.model
return [(m.text_network.encoder, m.text_network.dropouti),
*zip(m.text_network.rnns, m.text_network.dropouths),
(m.combined_network)]


def get_rnn_classifier(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False,
dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, qrnn=False):
rnn_enc = MultiBatchRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir,
dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop, qrnn=qrnn)
return MultiInputRNN(rnn_enc, layers, drops)
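
As a sanity check on the embedding_size * 3 + 100 head input size used in models.py, here is a standalone sketch of the concatenation performed in PoolingLinearClassifier.forward, using dummy tensors and modern torch reductions in place of the fastai plumbing:

import torch

sequence_length, batch_size, embedding_size = 50, 4, 400
output = torch.randn(sequence_length, batch_size, embedding_size)  # last RNN layer outputs
ref_output = torch.randn(batch_size, 100)                          # reference-branch output

# adaptive_avg_pool1d / adaptive_max_pool1d with output size 1 reduce to mean / max
avgpool = output.permute(1, 2, 0).mean(dim=2)       # (batch_size, embedding_size)
mxpool = output.permute(1, 2, 0).max(dim=2).values  # (batch_size, embedding_size)
x = torch.cat([output[-1], mxpool, avgpool, ref_output], 1)
assert x.shape == (batch_size, embedding_size * 3 + 100)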
