From acc987f6574031db0e96e9e55d6ab236ad30796b Mon Sep 17 00:00:00 2001 From: salmanmaq Date: Wed, 27 Feb 2019 19:11:29 +0100 Subject: [PATCH 1/2] data: update docs and scripts to generate training data with references data included Signed-off-by: Salman Maqbool salman.maqbool@cern.ch --- docs/generate_training_data.rst | 6 +- .../combine_core_noncore_rejected_data.py | 10 +- .../generate_core_and_noncore_data.py | 101 ++++++++++++++++-- ...get_core_and_non_core_arxiv_identifiers.py | 13 ++- .../get_core_and_non_core_recids.py | 45 ++++++++ generate_training_data_scripts/makejson.py | 93 +++++++++++++++- 6 files changed, 244 insertions(+), 24 deletions(-) create mode 100644 generate_training_data_scripts/get_core_and_non_core_recids.py diff --git a/docs/generate_training_data.rst b/docs/generate_training_data.rst index ec9a886..ddd34e7 100644 --- a/docs/generate_training_data.rst +++ b/docs/generate_training_data.rst @@ -52,7 +52,7 @@ Here, we consider only data from 2016 onwards since before that the curation rul Generate Rejected data ^^^^^^^^^^^^^^^^^^^^^^ -The data for Rejected articles is harvested from the local inspire-next instance in a hackish way. The workflows themselves need to be modified in our local inspire-next setup. First, the file *inspire-next/inspirehep/modules/workflows/workflows/article.py* needs to be modified as specified in ``article.py``. We need to add another file *inspire-next/inspirehep/modules/workflows/tasks/makejson.py* with the contents of ``makejson.py``. +The data for Rejected articles is harvested from the local inspire-next instance in a hackish way. The workflows themselves need to be modified in our local inspire-next setup. First, we need to get the Core and Non-Core record ids. We can get them by running the script in ``get_core_and_non_core_recids.py`` from the inspirehep shell [1]_. This will produce two files: ``inspire_core_recids.txt`` and ``inspire_noncore_recids.txt``. Next, the file *inspire-next/inspirehep/modules/workflows/workflows/article.py* needs to be modified as specified in ``article.py``. We need to add another file *inspire-next/inspirehep/modules/workflows/tasks/makejson.py* with the contents of ``makejson.py``. Once the workflow has been modified, we are ready to start the harvest. First, we need to deploy the harvest spiders. This can be done from the *inspire-next* instance folder: @@ -97,14 +97,14 @@ This will open our favorite text editor (or we'll be required to set it). Add th This will schedule a task to run every 15 minutes which will find and delete all files created before the last 30 minutes. It's recommended to schedule the cronjob after starting the harvests since the first harvests and workflows can take a few minutes to start. We can schedule the command to run more frequently or vice versa depending on our hardware specifications. -The harvest produces a file named *inspire_harvested_data.json*. We can monitor the harvest status in the local holdingpen. However, it doesn't contain information on whether the harvested records were Core, Non-Core, or Rejected. To find this, we need to extract the list of arXiv identifiers of Core and Non-Core records from our local inspire-next instance. From the *inspirehep shell* [1]_, copy the contents of ``get_core_and_noncore_arXiv_identifiers.py`` and execute. This will produce two files, *inspire_core_list.txt* and *inspire_noncore_list.txt*. These files will be used to filter out Core and Non-Core records from the harvested data.
+The harvest produces a file named *inspire_harvested_data.jsonl*. We can monitor the harvest status in the local holdingpen. However, it doesn't contain information on whether the harvested records were Core, Non-Core, or Rejected. To find this, we need to extract the list of arXiv identifiers of Core and Non-Core records from our local inspire-next instance. From the *inspirehep shell* [1]_, copy the contents of ``get_core_and_noncore_arXiv_identifiers.py`` and execute. This will produce two files, *inspire_core_arxiv_ids.txt* and *inspire_noncore_arxiv_ids.txt*. These files will be used to filter out Core and Non-Core records from the harvested data. Combine the Core, Non-Core, and Rejected data ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The Core, Non-Core, and Rejected data can be combined by using the python script found at ``combine_core_noncore_rejected_data.py``. The different required files paths need to be specified in the file before running the script. Finally, this will produce the file *inspire_data.df* which is a Pandas DataFrame and which can be used for training and evaluation of the INSPIRE classifier. This file should be placed at the path specified in *inspire-classifier/inspire_classifier/config.py* in the variable *CLASSIFIER_DATAFRAME_PATH*. -The resulting pandas dataframe will contain 2 columns: *labels* and *text* where *text* is *title* and *abstract* concatenated with a ** token in between. +The resulting pandas dataframe will contain 8 columns: *core_references_fraction_first_order*, *core_references_fraction_second_order*, *noncore_references_fraction_first_order*, *noncore_references_fraction_second_order*, *total_first_order_references*, *total_second_order_references*, *labels*, and *text*, where *text* is *title* and *abstract* concatenated with a ** token in between.
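As a quick sanity check of the combined output, the pickled DataFrame can be inspected from any Python shell. This is a minimal sketch; the file name and column names are taken from the documentation above, and only ``pandas`` (which the scripts already use) is assumed::

    import pandas as pd

    # Load the combined training data written by combine_core_noncore_rejected_data.py
    inspire_data = pd.read_pickle('inspire_data.df')

    # Expect the 8 columns listed above (6 reference features, plus labels and text)
    print(inspire_data.columns.tolist())

    # Class balance across the Rejected, Non-Core, and Core labels
    print(inspire_data['labels'].value_counts())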
diff --git a/generate_training_data_scripts/combine_core_noncore_rejected_data.py b/generate_training_data_scripts/combine_core_noncore_rejected_data.py index a4084e4..52bc6d1 100644 --- a/generate_training_data_scripts/combine_core_noncore_rejected_data.py +++ b/generate_training_data_scripts/combine_core_noncore_rejected_data.py @@ -24,16 +24,16 @@ import numpy as np import pandas as pd -inspire_core_list_path = 'inspire_core_list.txt' -inspire_noncore_list_path = 'inspire_noncore_list.txt' +inspire_core_arxiv_ids_path = 'inspire_core_arxiv_ids.txt' +inspire_noncore_arxiv_ids_path = 'inspire_noncore_arxiv_ids.txt' inspire_harvested_data_path = 'inspire_harvested_data.jsonl' inspire_core_data_path = 'inspire_core_records.jsonl' inspire_noncore_data_path = 'inspire_noncore_records.jsonl' save_path = 'inspire_data.df' -with open(inspire_core_list_path, 'r') as fd: +with open(inspire_core_arxiv_ids_path, 'r') as fd: inspire_core_arxiv_ids = set(arxiv_id.strip() for arxiv_id in fd.readlines()) -with open(inspire_noncore_list_path, 'r') as fd: +with open(inspire_noncore_arxiv_ids_path, 'r') as fd: inspire_noncore_arxiv_ids = set(arxiv_id.strip() for arxiv_id in fd.readlines()) def rejected_data(harvested_data_path): @@ -67,5 +67,5 @@ def noncore_data(): inspire_data = pd.concat([rejected_df, noncore_df, core_df], ignore_index=True) inspire_data['text'] = inspire_data['title'] + ' ' + inspire_data['abstract'] -inspire_data = inspire_data[['labels', 'text']] +inspire_data = inspire_data.drop(['title', 'abstract'], axis=1) inspire_data.to_pickle(save_path) diff --git a/generate_training_data_scripts/generate_core_and_noncore_data.py b/generate_training_data_scripts/generate_core_and_noncore_data.py index e3c216f..f482208 100644 --- a/generate_training_data_scripts/generate_core_and_noncore_data.py +++ b/generate_training_data_scripts/generate_core_and_noncore_data.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of INSPIRE. -# Copyright (C) 2014-2018 CERN. +# Copyright (C) 2014-2019 CERN. # # INSPIRE is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -20,12 +20,14 @@ # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. -""" -Get Core and Non-Core records starting from an earliest date from INSPIRE. -Please run the code in this snippet from within the inspirehep shell. 
-""" +from __future__ import absolute_import, division, print_function import datetime +from inspire_dojson.utils import get_recid_from_ref +from inspirehep.utils.record_getter import ( + get_db_record, + RecordGetterError +) from invenio_db import db from invenio_records.models import RecordMetadata import json @@ -40,8 +42,75 @@ STARTING_DATE = datetime.datetime(2016, 1, 1, 0, 0, 0) +inspire_core_recids_path = 'inspire_core_recids.txt' +inspire_noncore_recids_path = 'inspire_noncore_recids.txt' + + +with open(inspire_core_recids_path, 'r') as fd: + core_recids = set(int(recid.strip()) for recid in fd.readlines()) +with open(inspire_noncore_recids_path, 'r') as fd: + noncore_recids = set(int(recid.strip()) for recid in fd.readlines()) + + +def get_first_order_core_noncore_reference_fractions(references): + num_core_refs = 0 + num_noncore_refs = 0 + if references: + for reference in references: + recid = get_recid_from_ref(reference.get('record')) + if recid in core_recids: + num_core_refs += 1 + elif recid in noncore_recids: + num_noncore_refs += 1 + total_first_order_references = len(references) + core_references_fraction = num_core_refs / total_first_order_references + noncore_references_fraction = num_noncore_refs / total_first_order_references + else: + core_references_fraction, noncore_references_fraction = 0.0, 0.0 + total_first_order_references = 0 + + return core_references_fraction, noncore_references_fraction, total_first_order_references + + +def get_second_order_core_noncore_reference_fractions(references): + num_core_refs = 0 + num_noncore_refs = 0 + total_second_order_references = 0 + first_order_recids = get_references_recids(references) + missing_recids = set() + if first_order_recids: + for f_recid in first_order_recids: + if not f_recid in missing_recids: + try: + second_order_references = get_db_record('lit', f_recid).get('references') + except RecordGetterError: + missing_recids.add(f_recid) + continue + if second_order_references: + total_second_order_references += len(second_order_references) + second_order_recids = get_references_recids(second_order_references) + for s_recid in second_order_recids: + if s_recid in core_recids: + num_core_refs += 1 + elif s_recid in noncore_recids: + num_noncore_refs += 1 + if total_second_order_references > 0: + core_references_fraction = num_core_refs / total_second_order_references + noncore_references_fraction = num_noncore_refs / total_second_order_references + else: + core_references_fraction, noncore_references_fraction = 0.0, 0.0 + + return core_references_fraction, noncore_references_fraction, total_second_order_references + + +def get_references_recids(references): + recids = None + if references: + recids = [get_recid_from_ref(reference.get('record')) for reference in references \ + if reference.get('record')] + return recids -base_query = db.session.query(RecordMetadata).with_entities(RecordMetadata.json['titles'][0]['title'], RecordMetadata.json['abstracts'][0]['value']) +base_query = db.session.query(RecordMetadata).with_entities(RecordMetadata.json['titles'][0]['title'], RecordMetadata.json['abstracts'][0]['value'], RecordMetadata.json['references']) filter_by_date = RecordMetadata.created >= STARTING_DATE has_title_and_abstract = and_(type_coerce(RecordMetadata.json, JSONB).has_key('titles'), type_coerce(RecordMetadata.json, JSONB).has_key('abstracts')) filter_deleted_records = or_(not_(type_coerce(RecordMetadata.json, JSONB).has_key('deleted')), not_(RecordMetadata.json['deleted'] == cast(True, JSONB))) @@ -54,14 
+123,30 @@ noncore_query_results = base_query.filter(filter_by_date, only_noncore_records, has_title_and_abstract, filter_deleted_records, only_literature_collection) with open('inspire_core_records.jsonl', 'w') as fd: - for title, abstract in core_query_results: + for title, abstract, references in core_query_results: + core_references_fraction_first_order, noncore_references_fraction_first_order, total_first_order_references = get_first_order_core_noncore_reference_fractions(references) + core_references_fraction_second_order, noncore_references_fraction_second_order, total_second_order_references = get_second_order_core_noncore_reference_fractions(references) fd.write(json.dumps({ 'title': title, 'abstract': abstract, + 'core_references_fraction_first_order': core_references_fraction_first_order, + 'noncore_references_fraction_first_order': noncore_references_fraction_first_order, + 'core_references_fraction_second_order': core_references_fraction_second_order, + 'noncore_references_fraction_second_order': noncore_references_fraction_second_order, + 'total_first_order_references': total_first_order_references, + 'total_second_order_references': total_second_order_references, }) + '\n') with open('inspire_noncore_records.jsonl', 'w') as fd: - for title, abstract in noncore_query_results: + for title, abstract, references in noncore_query_results: + core_references_fraction_first_order, noncore_references_fraction_first_order, total_first_order_references = get_first_order_core_noncore_reference_fractions(references) + core_references_fraction_second_order, noncore_references_fraction_second_order, total_second_order_references = get_second_order_core_noncore_reference_fractions(references) fd.write(json.dumps({ 'title': title, 'abstract': abstract, + 'core_references_fraction_first_order': core_references_fraction_first_order, + 'noncore_references_fraction_first_order': noncore_references_fraction_first_order, + 'core_references_fraction_second_order': core_references_fraction_second_order, + 'noncore_references_fraction_second_order': noncore_references_fraction_second_order, + 'total_first_order_references': total_first_order_references, + 'total_second_order_references': total_second_order_references, }) + '\n') diff --git a/generate_training_data_scripts/get_core_and_non_core_arxiv_identifiers.py b/generate_training_data_scripts/get_core_and_non_core_arxiv_identifiers.py index 13b4d37..7120793 100644 --- a/generate_training_data_scripts/get_core_and_non_core_arxiv_identifiers.py +++ b/generate_training_data_scripts/get_core_and_non_core_arxiv_identifiers.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of INSPIRE. -# Copyright (C) 2014-2018 CERN. +# Copyright (C) 2014-2019 CERN. # # INSPIRE is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -25,13 +25,16 @@ Please run the code in this snippet from within the inspirehep shell. 
""" +from __future__ import absolute_import, division, print_function + from invenio_search import current_search_client as es from elasticsearch.helpers import scan -import numpy as np + core = [] non_core = [] + for hit in scan(es, query={"query": {"exists": {"field": "arxiv_eprints"}}, "_source": ["core", "arxiv_eprints"]}, index='records-hep', doc_type='hep'): source = hit['_source'] @@ -41,7 +44,7 @@ else: non_core.append(arxiv_eprint) -with open('inspire_core_list.txt', 'w') as fd: +with open('inspire_core_arxiv_ids.txt', 'w') as fd: fd.writelines("{}\n".format(arxiv_id) for arxiv_id in core) -with open('inspire_noncore_list.txt', 'w') as fd: - fd.writelines("{}\n".format(arxiv_id) for arxiv_id in non_core) \ No newline at end of file +with open('inspire_noncore_arxiv_ids.txt', 'w') as fd: + fd.writelines("{}\n".format(arxiv_id) for arxiv_id in non_core) diff --git a/generate_training_data_scripts/get_core_and_non_core_recids.py b/generate_training_data_scripts/get_core_and_non_core_recids.py new file mode 100644 index 0000000..f261a60 --- /dev/null +++ b/generate_training_data_scripts/get_core_and_non_core_recids.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# +# This file is part of INSPIRE. +# Copyright (C) 2014-2019 CERN. +# +# INSPIRE is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# INSPIRE is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with INSPIRE. If not, see . +# +# In applying this license, CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. + +from __future__ import absolute_import, division, print_function + +from invenio_search import current_search_client as es +from elasticsearch.helpers import scan + + +core = [] +non_core = [] + +for hit in scan(es, query={"query": {"exists": {"field": "control_number"}}, "_source": ["core", "control_number"]}, + index='records-hep', doc_type='hep'): + source = hit['_source'] + control_number = source['control_number'] + if source.get('core') == True: + core.append(control_number) + else: + non_core.append(control_number) + +with open('inspire_core_recids.txt', 'w') as fd: + fd.writelines("{}\n".format(recid) for recid in core) +with open('inspire_noncore_recids.txt', 'w') as fd: + fd.writelines("{}\n".format(recid) for recid in non_core) + diff --git a/generate_training_data_scripts/makejson.py b/generate_training_data_scripts/makejson.py index 4e50a66..e6ec310 100644 --- a/generate_training_data_scripts/makejson.py +++ b/generate_training_data_scripts/makejson.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of INSPIRE. -# Copyright (C) 2014-2018 CERN. +# Copyright (C) 2014-2019 CERN. # # INSPIRE is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -21,15 +21,102 @@ # or submit itself to any jurisdiction. 
from __future__ import absolute_import, division, print_function +from inspire_dojson.utils import get_recid_from_ref +from inspirehep.utils.record_getter import ( + get_db_record, + RecordGetterError +) import json +inspire_core_recids_path = 'inspire_core_recids.txt' +inspire_noncore_recids_path = 'inspire_noncore_recids.txt' + +with open(inspire_core_recids_path, 'r') as fd: + core_recids = set(int(recid.strip()) for recid in fd.readlines()) +with open(inspire_noncore_recids_path, 'r') as fd: + noncore_recids = set(int(recid.strip()) for recid in fd.readlines()) + + def makejson(obj, eng): title = obj.extra_data['source_data']['data']['titles'][0]['title'] abstract = obj.extra_data['source_data']['data']['abstracts'][0]['value'] arxiv_identifier = obj.extra_data['source_data']['data']['arxiv_eprints'][0]['value'] - object_data = {"title": title, "abstract": abstract, "arxiv_identifier": arxiv_identifier} + references = obj.data['references'] + core_references_fraction_first_order, noncore_references_fraction_first_order, total_first_order_references = get_first_order_core_noncore_reference_fractions( + references) + core_references_fraction_second_order, noncore_references_fraction_second_order, total_second_order_references = get_second_order_core_noncore_reference_fractions( + references) + object_data = {"title": title, + "abstract": abstract, + "arxiv_identifier": arxiv_identifier, + "core_references_fraction_first_order": core_references_fraction_first_order, + "noncore_references_fraction_first_order": noncore_references_fraction_first_order, + "core_references_fraction_second_order": core_references_fraction_second_order, + "noncore_references_fraction_second_order": noncore_references_fraction_second_order, + "total_first_order_references": total_first_order_references, + "total_second_order_references": total_second_order_references + } - with open('./inspire_harvested_data.jsonl', 'a') as fd: + with open('inspire_harvested_data.jsonl', 'a') as fd: json.dump(object_data, fd) fd.write("\n") + + +def get_first_order_core_noncore_reference_fractions(references): + num_core_refs = 0 + num_noncore_refs = 0 + if references: + for reference in references: + recid = get_recid_from_ref(reference.get('record')) + if recid in core_recids: + num_core_refs += 1 + elif recid in noncore_recids: + num_noncore_refs += 1 + total_first_order_references = len(references) + core_references_fraction = num_core_refs / total_first_order_references + noncore_references_fraction = num_noncore_refs / total_first_order_references + else: + core_references_fraction, noncore_references_fraction = 0.0, 0.0 + total_first_order_references = 0 + + return core_references_fraction, noncore_references_fraction, total_first_order_references + + +def get_second_order_core_noncore_reference_fractions(references): + num_core_refs = 0 + num_noncore_refs = 0 + total_second_order_references = 0 + first_order_recids = get_references_recids(references) + missing_recids = set() + if first_order_recids: + for f_recid in first_order_recids: + if not f_recid in missing_recids: + try: + second_order_references = get_db_record('lit', f_recid).get('references') + except RecordGetterError: + missing_recids.add(f_recid) + continue + if second_order_references: + total_second_order_references += len(second_order_references) + second_order_recids = get_references_recids(second_order_references) + for s_recid in second_order_recids: + if s_recid in core_recids: + num_core_refs += 1 + elif s_recid in noncore_recids: + num_noncore_refs 
+= 1 + if total_second_order_references > 0: + core_references_fraction = num_core_refs / total_second_order_references + noncore_references_fraction = num_noncore_refs / total_second_order_references + else: + core_references_fraction, noncore_references_fraction = 0.0, 0.0 + + return core_references_fraction, noncore_references_fraction, total_second_order_references + + +def get_references_recids(references): + recids = None + if references: + recids = [get_recid_from_ref(reference.get('record')) for reference in references \ + if reference.get('record')] + return recids \ No newline at end of file From bfb7906c69c4665bcf22027f8192f67dd2ca9ffe Mon Sep 17 00:00:00 2001 From: salmanmaq Date: Wed, 27 Feb 2019 19:13:13 +0100 Subject: [PATCH 2/2] [WIP]: update model, pre-processor, and api to use reference data Signed-off-by: Salman Maqbool salman.maqbool@cern.ch --- inspire_classifier/api.py | 12 ++- inspire_classifier/domain/models.py | 55 ++++++------- inspire_classifier/domain/preprocessor.py | 50 +++++++++--- inspire_classifier/utils.py | 96 +++++++++++++++++++++++ 4 files changed, 169 insertions(+), 44 deletions(-) diff --git a/inspire_classifier/api.py b/inspire_classifier/api.py index 6784a5e..0da4295 100644 --- a/inspire_classifier/api.py +++ b/inspire_classifier/api.py @@ -183,12 +183,20 @@ def train(): train_and_save_classifier() -def predict_coreness(title, abstract): +def predict_coreness(title, abstract, core_references_fraction_first_order, core_references_fraction_second_order, + noncore_references_fraction_first_order, noncore_references_fraction_second_order, + total_first_order_references, total_second_order_references, training_set_means_for_reference_data, + training_set_standard_deviations_for_reference_data): """ Predicts class-wise probabilities given the title and abstract. 
""" text = title + ' ' + abstract categories = ['rejected', 'non_core', 'core'] + reference_data = np.array([core_references_fraction_first_order, core_references_fraction_second_order, + noncore_references_fraction_first_order, noncore_references_fraction_second_order, + total_first_order_references, total_second_order_references]) + reference_data_normalized = (reference_data - current_app.config['TRAINING_SET_MEANS_FOR_REFERENCE_DATA']) /\ + current_app.config['TRAINING_SET_STANDARD_DEVIATIONS_FOR_REFERENCE_DATA'] try: classifier = Classifier(data_itos_path=path_for('data_itos'), number_of_classes=3, cuda_device_id=current_app.config['CLASSIFIER_CUDA_DEVICE_ID']) @@ -200,7 +208,7 @@ def predict_coreness(title, abstract): except IOError as error: raise IOError('Could not load the trained classifier weights.') from error - class_probabilities = classifier.predict(text) + class_probabilities = classifier.predict(text, reference_data_normalized) assert len(class_probabilities) == 3 predicted_class = categories[np.argmax(class_probabilities)] diff --git a/inspire_classifier/domain/models.py b/inspire_classifier/domain/models.py index 4a6deec..c7db8c2 100644 --- a/inspire_classifier/domain/models.py +++ b/inspire_classifier/domain/models.py @@ -30,15 +30,12 @@ from fastai.text import ( accuracy, DataLoader, - get_rnn_classifer, LanguageModelLoader, LanguageModelData, load_model, ModelData, RNN_Learner, T, - TextDataset, - TextModel, to_gpu, to_np, save_model, @@ -48,7 +45,13 @@ Variable ) from functools import partial -from inspire_classifier.utils import FastLoadTokenizer +from inspire_classifier.utils import ( + FastLoadTokenizer, + get_rnn_classifier, + numpy_softmax, + TextPlusReferencesDataset, + TextPlusReferencesModel +) import numpy as np import pickle @@ -138,21 +141,23 @@ def __init__(self, data_itos_path, cuda_device_id=0, dropout_multiplier=0.5, num number_of_layers = 3 embedding_size = 400 - self.model = get_rnn_classifer(bptt=number_of_back_propagation_through_time_steps, - max_seq=20 * number_of_back_propagation_through_time_steps, - n_class=number_of_classes, n_tok=self.vocabulary_size, emb_sz=embedding_size, - n_hid=number_of_hidden_units, n_layers=number_of_layers, pad_token=1, - layers=[embedding_size * 3, 50, number_of_classes], drops=[dropouts[4], 0.1], - dropouti=dropouts[0], wdrop=dropouts[1], dropoute=dropouts[2], - dropouth=dropouts[3]) + self.model = get_rnn_classifier(bptt=number_of_back_propagation_through_time_steps, + max_seq=20 * number_of_back_propagation_through_time_steps, + n_tok=self.vocabulary_size, emb_sz=embedding_size, n_hid=number_of_hidden_units, + n_layers=number_of_layers, pad_token=1, + layers=[embedding_size * 3 + 100, 50, number_of_classes], drops=[dropouts[4], 0.1], + dropouti=dropouts[0], wdrop=dropouts[1], dropoute=dropouts[2], + dropouth=dropouts[3]) self.tokenizer = FastLoadTokenizer() - def load_training_and_validation_data(self, training_data_ids_path, training_data_labels_path, - validation_data_ids_path, validation_data_labels_path, classifier_data_dir, - batch_size=10): + def load_training_and_validation_data(self, training_data_ids_path, training_data_references_path, training_data_labels_path, + validation_data_ids_path, validation_data_references_path, validation_data_labels_path, + classifier_data_dir, batch_size=10): training_token_ids = np.load(training_data_ids_path) validation_token_ids = np.load(validation_data_ids_path) + training_references = np.load(training_data_references_path) + validation_references = 
np.load(validation_data_references_path) training_labels = np.load(training_data_labels_path) validation_labels = np.load(validation_data_labels_path) @@ -161,8 +166,8 @@ def load_training_and_validation_data(self, training_data_ids_path, training_dat training_labels -= training_labels.min() validation_labels -= validation_labels.min() - training_dataset = TextDataset(training_token_ids, training_labels) - validation_dataset = TextDataset(validation_token_ids, validation_labels) + training_dataset = TextPlusReferencesDataset(training_token_ids, training_references, training_labels) + validation_dataset = TextPlusReferencesDataset(validation_token_ids, validation_references, validation_labels) training_data_sampler = SortishSampler(data_source=training_token_ids, key=lambda x: len(training_token_ids[x]), bs=batch_size // 2) validation_data_sampler = SortSampler(data_source=validation_token_ids, @@ -176,14 +181,14 @@ def load_training_and_validation_data(self, training_data_ids_path, training_dat def initialize_learner(self): optimization_function = partial(optim.Adam, betas=(0.8, 0.99)) - self.learner = RNN_Learner(data=self.model_data, models=TextModel(to_gpu(self.model)), + self.learner = RNN_Learner(data=self.model_data, models=TextPlusReferencesModel(to_gpu(self.model)), opt_fn=optimization_function) self.learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1) self.learner.clip = 25. self.learner.metrics = [accuracy] def load_finetuned_language_model_weights(self, finetuned_language_model_encoder_path): - load_model(self.learner.model[0], finetuned_language_model_encoder_path) + load_model(self.learner.model.text_network, finetuned_language_model_encoder_path) def train(self, trained_classifier_save_path, learning_rates=np.array([1e-4, 1e-4, 1e-4, 1e-3, 1e-2]), weight_decay=1e-6, cycle_length=14): @@ -199,7 +204,7 @@ def train(self, trained_classifier_save_path, learning_rates=np.array([1e-4, 1e- def load_trained_classifier_weights(self, trained_classifier_path): self.model.load_state_dict(torch.load(trained_classifier_path, map_location=lambda storage, loc: storage)) - def predict(self, text): + def predict(self, text, reference_data): self.model.reset() self.model.eval() @@ -209,15 +214,7 @@ def predict(self, text): encoded_tokens = [self.inspire_data_stoi[p] for p in tokens[0]] token_array = np.reshape(np.array(encoded_tokens), (-1, 1)) token_array = Variable(torch.from_numpy(token_array)) - prediction_scores = self.model(token_array) + prediction_scores = self.model(token_array, reference_data) prediction_scores_numpy = prediction_scores[0].data.cpu().numpy() - return numpy_softmax(prediction_scores_numpy[0])[0] - - -def numpy_softmax(x): - if x.ndim == 1: - x = x.reshape((1, -1)) - max_x = np.max(x, axis=1).reshape((-1, 1)) - exp_x = np.exp(x - max_x) - return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1)) + return numpy_softmax(prediction_scores_numpy[0])[0] \ No newline at end of file diff --git a/inspire_classifier/domain/preprocessor.py b/inspire_classifier/domain/preprocessor.py index fe10971..83e6224 100644 --- a/inspire_classifier/domain/preprocessor.py +++ b/inspire_classifier/domain/preprocessor.py @@ -56,8 +56,9 @@ def split_and_save_data_for_language_model_and_classifier(dataframe_path, langua # Shuffle the data inspire_data = inspire_data.sample(frac=1).reset_index(drop=True) - # Swap the columns so that the labels are Column 0 and the text is Column 1 (and remove any additional columns) - inspire_data = inspire_data[['labels', 'text']] + inspire_data = 
inspire_data[['core_references_fraction_first_order', 'core_references_fraction_second_order', + 'noncore_references_fraction_first_order', 'noncore_references_fraction_second_order', + 'total_first_order_references', 'total_second_order_references', 'labels', 'text']] training_dataframe, validation_dataframe = sklearn.model_selection.train_test_split( inspire_data, test_size=val_fraction) @@ -65,6 +66,13 @@ training_dataframe = training_dataframe.reset_index(drop=True) validation_dataframe = validation_dataframe.reset_index(drop=True) + # Standardize the numerical reference-data columns using the training-set statistics + training_data_means = training_dataframe.iloc[:, :-2].mean().values + training_data_standard_deviations = training_dataframe.iloc[:, :-2].std().values + + training_dataframe.iloc[:,:-2] = (training_dataframe.iloc[:,:-2] - training_data_means) / training_data_standard_deviations + validation_dataframe.iloc[:, :-2] = (validation_dataframe.iloc[:,:-2] - training_data_means) / training_data_standard_deviations + # Save the data for the classifier training_dataframe.to_csv(classifier_data_dir / 'training_data.csv', header=False, index=False) validation_dataframe.to_csv(classifier_data_dir / 'validation_data.csv', header=False, index=False) @@ -131,40 +139,56 @@ def generate_and_save_classifier_tokens(classifier_data_dir): training_dataframe = pd.read_csv(classifier_data_dir / 'training_data.csv', header=None) validation_dataframe = pd.read_csv(classifier_data_dir / 'validation_data.csv', header=None) - training_tokens, training_labels = get_texts(training_dataframe) - validation_tokens, validation_labels = get_texts(validation_dataframe) + training_tokens_text, training_references, training_labels = get_texts_classifier(training_dataframe) + validation_tokens_text, validation_references, validation_labels = get_texts_classifier(validation_dataframe) - assert len(training_tokens) == len(training_dataframe) + assert len(training_tokens_text) == len(training_dataframe) - np.save(classifier_data_dir / 'training_tokens.npy', training_tokens) - np.save(classifier_data_dir / 'validation_tokens.npy', validation_tokens) + np.save(classifier_data_dir / 'training_tokens_text.npy', training_tokens_text) + np.save(classifier_data_dir / 'validation_tokens_text.npy', validation_tokens_text) + np.save(classifier_data_dir / 'training_references.npy', training_references) + np.save(classifier_data_dir / 'validation_references.npy', validation_references) np.save(classifier_data_dir / 'training_labels.npy', training_labels) np.save(classifier_data_dir / 'validation_labels.npy', validation_labels) def map_and_save_tokens_to_ids_for_classifier(classifier_data_dir, data_itos_path): - training_tokens = np.load(classifier_data_dir / 'training_tokens.npy') - validation_tokens = np.load(classifier_data_dir / 'validation_tokens.npy') + training_tokens_text = np.load(classifier_data_dir / 'training_tokens_text.npy') + validation_tokens_text = np.load(classifier_data_dir / 'validation_tokens_text.npy') inspire_data_itos = pickle.load(open(data_itos_path, 'rb')) inspire_data_stoi = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(inspire_data_itos)}) - training_token_ids = np.array([[inspire_data_stoi[o] for o in p] for p in training_tokens]) - validation_token_ids = np.array([[inspire_data_stoi[o] for o in p] for p in validation_tokens]) + training_token_ids = np.array([[inspire_data_stoi[o] for o in p] for p in training_tokens_text]) + validation_token_ids = 
np.array([[inspire_data_stoi[o] for o in p] for p in validation_tokens_text]) np.save(classifier_data_dir / 'training_token_ids.npy', training_token_ids) np.save(classifier_data_dir / 'validation_token_ids.npy', validation_token_ids) def get_texts(df): - labels = df[0].values.astype(np.int64) - texts = f'\n{BOS} {FLD} 1 ' + df[1].astype(str) + labels = df['labels'].values.astype(np.int64) + texts = f'\n{BOS} {FLD} 1 ' + df['text'].astype(str) texts = list(texts.apply(fixup).values) tokens = FastLoadTokenizer().proc_all_mp(partition_by_cores(texts)) return tokens, list(labels) +def get_texts_classifier(df): + labels = df['labels'].values.astype(np.int64) + texts = f'\n{BOS} {FLD} 1 ' + df['text'].astype(str) + texts = list(texts.apply(fixup).values) + refs = np.array([df['core_references_fraction_first_order'].values.astype(np.float32), + df['core_references_fraction_second_order'].values.astype(np.float32), + df['noncore_references_fraction_first_order'].values.astype(np.float32), + df['noncore_references_fraction_second_order'].values.astype(np.float32), + df['total_first_order_references'].values.astype(np.float32), + df['total_second_order_references'].values.astype(np.float32)]) + tokens = FastLoadTokenizer().proc_all_mp(partition_by_cores(texts)) + return tokens, refs.T, list(labels) + + def fixup(x): x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace( 'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace( diff --git a/inspire_classifier/utils.py b/inspire_classifier/utils.py index bc0d699..8b4245d 100644 --- a/inspire_classifier/utils.py +++ b/inspire_classifier/utils.py @@ -23,15 +23,23 @@ # Modified from the fastai library (https://github.com/fastai/fastai). from fastai.text import ( + BasicModel, + Dataset, + LinearBlock, + MultiBatchRNN, num_cpus, Tokenizer ) from flask import current_app +import numpy as np from pathlib import Path from concurrent.futures import ProcessPoolExecutor import re from spacy.lang.en import English from spacy.symbols import ORTH +import torch +from torch import nn +import torch.nn.functional as F def path_for(name): @@ -62,3 +70,91 @@ def proc_all_mp(self, ss, ncpus=None): ncpus = ncpus or num_cpus() // 2 with ProcessPoolExecutor(ncpus) as executor: return sum(executor.map(self.proc_all, ss), []) + + +def numpy_softmax(x): + if x.ndim == 1: + x = x.reshape((1, -1)) + max_x = np.max(x, axis=1).reshape((-1, 1)) + exp_x = np.exp(x - max_x) + return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1)) + + +class PoolingLinearClassifier(nn.Module): + def __init__(self, layers, drops): + super().__init__() + self.layers = nn.ModuleList([ + LinearBlock(layers[i], layers[i + 1], drops[i]) for i in range(len(layers) - 1) + ]) + self.ref_layers = nn.ModuleList([ + LinearBlock(6, 200, 0.0), + LinearBlock(200, 100, 0.2) + ]) + + def pool(self, x, bs, is_max): + f = F.adaptive_max_pool1d if is_max else F.adaptive_avg_pool1d + return f(x.permute(1,2,0), (1,)).view(bs,-1) + + def forward(self, text_input, ref_input): + raw_outputs, outputs = text_input + output = outputs[-1] + sl,bs,_ = output.size() + avgpool = self.pool(output, bs, False) + mxpool = self.pool(output, bs, True) + ref_x = ref_input + for l_ref in self.ref_layers: + ref_output = l_ref(ref_x) + ref_x = F.relu(ref_output) + x = torch.cat([output[-1], mxpool, avgpool, ref_output], 1) + for l in self.layers: + l_x = l(x) + x = F.relu(l_x) + return l_x, raw_outputs, outputs + + +class TextPlusReferencesDataset(Dataset): + def __init__(self, x_text, 
x_ref, y, backwards=False, sos=None, eos=None): + self.x_text, self.x_ref, self.y, self.backwards, self.sos, self.eos = \ + x_text, x_ref, y, backwards, sos, eos + + def __getitem__(self, idx): + x_text = self.x_text[idx] + x_ref = self.x_ref[idx] + if self.backwards: x_text = list(reversed(x_text)) + if self.eos is not None: x_text = x_text + [self.eos] + if self.sos is not None: x_text = [self.sos] + x_text + return np.array(x_text), x_ref, self.y[idx] + + def __len__(self): + return len(self.x_text) + + +class MultiInputRNN(nn.Module): + + def __init__(self, rnn_encoder, final_classifier_layers, final_classifier_dropouts=[0.2, 0.1]): + super(MultiInputRNN, self).__init__() + self.text_network = rnn_encoder + if hasattr(self.text_network, 'reset'): + self.text_network.reset() + self.combined_network = PoolingLinearClassifier(layers=final_classifier_layers, drops=final_classifier_dropouts) + + def forward(self, x_text, x_ref): + text_network_output = self.text_network(x_text) + output = self.combined_network(text_network_output, x_ref) + + return output + + +class TextPlusReferencesModel(BasicModel): + def get_layer_groups(self): + m = self.model + return [(m.text_network.encoder, m.text_network.dropouti), + *zip(m.text_network.rnns, m.text_network.dropouths), + (m.combined_network)] + + +def get_rnn_classifier(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False, + dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, qrnn=False): + rnn_enc = MultiBatchRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir, + dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop, qrnn=qrnn) + return MultiInputRNN(rnn_enc, layers, drops) \ No newline at end of file
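Note on the reference features: ``predict_coreness`` z-scores the six reference values with the training-set statistics stored in the app config (``TRAINING_SET_MEANS_FOR_REFERENCE_DATA`` and ``TRAINING_SET_STANDARD_DEVIATIONS_FOR_REFERENCE_DATA``), mirroring the standardization applied to the training and validation DataFrames in the preprocessor. A minimal sketch of that step, with made-up feature values and made-up statistics purely for illustration:

    import numpy as np

    # Hypothetical values for illustration only; the real ones come from the
    # record's references and from the training-set means/standard deviations.
    reference_data = np.array([0.45, 0.30, 0.20, 0.25, 40.0, 1500.0])  # the 6 reference features
    training_means = np.array([0.30, 0.25, 0.25, 0.30, 30.0, 1000.0])  # placeholder training means
    training_stds = np.array([0.15, 0.12, 0.14, 0.13, 20.0, 800.0])    # placeholder training std devs

    # Element-wise z-scoring, as in predict_coreness and the preprocessor
    reference_data_normalized = (reference_data - training_means) / training_stds
    print(reference_data_normalized)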
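The input width of the combined classifier head follows from ``PoolingLinearClassifier.forward`` above: the text encoder contributes its last time step plus max- and average-pooled outputs (3 x ``emb_sz`` = 1200 with ``emb_sz=400``), and the reference MLP (6 -> 200 -> 100) contributes 100 features, which is why ``models.py`` builds the head with ``layers=[embedding_size * 3 + 100, 50, number_of_classes]``. A standalone shape check with plain tensors (no fastai), just to make the arithmetic concrete:

    import torch

    batch_size, emb_sz, ref_out_features = 4, 400, 100

    last_step = torch.randn(batch_size, emb_sz)          # output[-1] of the final RNN layer
    max_pool = torch.randn(batch_size, emb_sz)           # adaptive max pool over time
    avg_pool = torch.randn(batch_size, emb_sz)           # adaptive average pool over time
    ref_out = torch.randn(batch_size, ref_out_features)  # output of the 6 -> 200 -> 100 reference MLP

    combined = torch.cat([last_step, max_pool, avg_pool, ref_out], dim=1)
    assert combined.shape == (batch_size, emb_sz * 3 + ref_out_features)  # 1300 features into the head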