Skip to content

Commit

Permalink
matcher: refactor code and add tests
Browse files Browse the repository at this point in the history
Signed-off-by: Antonio Cesarano <[email protected]>
  • Loading branch information
ammirate committed Sep 29, 2017
1 parent b944724 commit 775c496
Show file tree
Hide file tree
Showing 3 changed files with 242 additions and 65 deletions.
92 changes: 27 additions & 65 deletions inspirehep/modules/workflows/tasks/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,29 @@

from __future__ import absolute_import, division, print_function

import datetime
import os
import re
import traceback
import backoff
from functools import wraps
import os
import traceback
import re

import requests
import six

from elasticsearch_dsl import Q
from flask import current_app
from simplejson import JSONDecodeError

from invenio_db import db
from invenio_search import RecordsSearch
from invenio_matcher.api import match as _match
from invenio_workflows import workflow_object_class
from invenio_workflows.models import WorkflowObjectModel, ObjectStatus

from inspire_utils.record import get_value
from inspirehep.utils.datefilter import date_older_than
from inspirehep.modules.workflows.utils import with_debug_logging
from inspirehep.utils.record import get_arxiv_categories, get_arxiv_id

from ..utils import with_debug_logging


@with_debug_logging
@backoff.on_exception(backoff.expo, Exception, max_tries=7)
Expand Down Expand Up @@ -121,8 +126,15 @@ def match_with_invenio_matcher(queries=None, index="records-hep", doc_type="hep"
@with_debug_logging
@wraps(match_with_invenio_matcher)
def _match_with_invenio_matcher(obj, eng):
from invenio_matcher.api import match as _match
"""This function queries ES to match records with the same doi or
arxiv eprint of `obj.data`.
If matches are found, their `control_number` and their content are
stored in `obj.extra_data`.
Returns:
bool: True if there are matches, otherwise False
"""
if queries is None:
queries_ = [
{'type': 'exact', 'match': 'dois.value'},
Expand Down Expand Up @@ -151,7 +163,7 @@ def _match_with_invenio_matcher(obj, eng):
index=index,
doc_type=doc_type
):
matched_recid = matched_record.record.get('id')
matched_recid = matched_record.record.get('control_number')
record_matches['recids'].append(matched_recid)
record_matches['records'].append({
"source": matched_record.record.dumps(),
Expand All @@ -165,58 +177,16 @@ def _match_with_invenio_matcher(obj, eng):
return _match_with_invenio_matcher


@with_debug_logging
def is_too_old(record, days_ago=5):
    """Tell whether ``record`` is dated more than ``days_ago`` days back.

    The date is read from the ``earliest_date`` key of ``record``, falling
    back to ``preprint_date`` when the former is missing or empty. Old
    records are probably updates of earlier ones, which are not wanted.

    Args:
        record: mapping exposing ``get``; only the two date keys are read.
        days_ago: age threshold, forwarded as ``days`` to ``date_older_than``.

    Returns:
        bool: ``True`` when no date is available at all, or when
        ``date_older_than`` reports the parsed date as older than the
        threshold relative to the current UTC time; ``False`` otherwise.

    Raises:
        ValueError: if the date string does not follow ``%Y-%m-%d``.
    """
    date_format = "%Y-%m-%d"

    date_string = record.get('earliest_date', '') or record.get('preprint_date', '')
    if not date_string:
        # Without any date the record is reported as too old.
        return True

    try:
        record_date = datetime.datetime.strptime(date_string, date_format)
    except ValueError as err:
        message = (
            'Unrecognized earliest_date format "%s", valid formats is '
            '%s: %s'
        ) % (date_string, date_format, err)
        raise ValueError(message)

    now = datetime.datetime.utcnow()
    if date_older_than(record_date, now, days=days_ago):
        return True
    return False


@with_debug_logging
def article_exists(obj, eng):
"""Check whether the article held by ``obj`` already exists in the system.

Returns ``True`` as soon as one of the checks below succeeds and ``False``
when none of them does.
"""
# NOTE(review): this listing comes from a rendered diff without +/- markers,
# so superseded and replacement lines appear side by side (two consecutive
# flag checks, two matcher calls). Presumably the 'match-found' check and the
# whole PRODUCTION_MODE branch are the removed side and the 'is-update' check
# plus the unconditional matcher call are the added side -- confirm against
# the checked-out file before relying on this listing.
# For efficiency check special mark key.
if obj.extra_data.get('match-found', False):
if obj.extra_data.get('is-update', False):
return True
# Use matcher if not on production
if not current_app.config.get('PRODUCTION_MODE'):
if match_with_invenio_matcher(index="records-hep", doc_type="hep")(obj, eng):
obj.log.info("Record already exists in INSPIRE (using matcher).")
return True
else:
obj.log.warning("Remote match is deprecated.")
# NOTE(review): match_legacy_inspire is not defined in the visible part of
# the file -- verify it still exists if this branch is kept.
if match_legacy_inspire(obj, eng):
obj.log.info("Record already exists in INSPIRE.")
return True

# The matcher factory is called with the hep index/doc type and the resulting
# task is immediately applied to (obj, eng); a truthy result means a match.
if match_with_invenio_matcher(index="records-hep", doc_type="hep")(obj, eng):
obj.log.info("Record already exists in INSPIRE (using matcher).")
return True

# No check reported an existing record.
return False


Expand Down Expand Up @@ -250,11 +220,6 @@ def already_harvested(obj, eng):
@with_debug_logging
def pending_in_holding_pen(obj, eng):
"""Check if a record exists in HP by looking in given KB."""
from elasticsearch_dsl import Q
from invenio_db import db
from invenio_search import RecordsSearch
from invenio_workflows.models import WorkflowObjectModel, ObjectStatus

config = current_app.config['WORKFLOWS_UI_REST_ENDPOINT']
index = config.get('search_index')
doc_type = config.get('search_type')
Expand Down Expand Up @@ -300,7 +265,6 @@ def pending_in_holding_pen(obj, eng):
@with_debug_logging
def delete_self_and_stop_processing(obj, eng):
    """Delete the workflow object's model and make the engine skip the token.

    ``obj.model`` is handed to ``db.session.delete`` and then
    ``eng.skip_token()`` is called; no commit is issued in the visible body.
    """
    from invenio_db import db

    session = db.session
    session.delete(obj.model)
    eng.skip_token()

Expand All @@ -314,8 +278,6 @@ def stop_processing(obj, eng):
@with_debug_logging
def update_existing_workflow_object(obj, eng):
"""Update the data of the old object with the new data."""
from invenio_workflows import workflow_object_class

holdingpen_ids = obj.extra_data.get("holdingpen_ids", [])
for matched_id in holdingpen_ids:
existing_obj = workflow_object_class.get(matched_id)
Expand Down
142 changes: 142 additions & 0 deletions tests/integration/workflows/fixtures/matcher_rec.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
{
"$schema": "http://localhost:5000/schemas/records/hep.json",
"_collections": [
"Literature"
],
"abstracts": [
{
"source": "arXiv",
"value": "We discuss the implications of studies of partition function zeros and equimodular curves for the analytic properties of the Ising model on a square lattice in a magnetic field. In particular we consider the dense set of singularities in the susceptibility of the Ising model at $H=0$ found by Nickel and its relation to the analyticity of the field theory computations of Fonseca and Zamolodchikov. Curator change.\n"
}
],
"accelerator_experiments": [
{
"legacy_name": "CERN-LHC-ALICE"
}
],
"acquisition_source": {
"datetime": "2017-05-11T08:50:25.184741",
"method": "hepcrawl",
"source": "arXiv",
"submission_number": "db9325b2362611e78bfd0242ac12000b"
},
"arxiv_eprints": [
{
"categories": [
"math-ph",
"cond-mat.stat-mech",
"math.MP"
],
"value": "1705.02541"
}
],
"authors": [
{
"affiliations": [
{
"curated_relation": true,
"record": {
"$ref": "http://localhost:5000/api/institutions/902725"
},
"value": "CERN"
}
],
"full_name": "Assis, Mathieu",
"signature_block": "ASm",
"uuid": "e3b0a6e6-5950-41c4-ba8a-76cd597cb0d5"
},
{
"affiliations": [
{
"curated_relation": true,
"record": {
"$ref": "http://localhost:5000/api/institutions/902725"
},
"value": "CERN"
}
],
"full_name": "Jacobsen, J.L.",
"signature_block": "JACABSANj",
"uuid": "3f4cc9b7-ade5-4b9c-885c-f20bba88f5e2"
}
],
"citeable": true,
"collaborations": [
{
"value": "ALICE"
}
],
"control_number": 1,
"core": true,
"document_type": [
"article"
],
"dois": [
{
"value": "10.0001/test_doi"
}
],
"inspire_categories": [
{
"source": "arxiv",
"term": "Math and Math Physics"
},
{
"source": "arxiv",
"term": "General Physics"
},
{
"term": "Instrumentation"
}
],
"keywords": [
{
"source": "magpie",
"value": "Ising model"
},
{
"source": "magpie",
"value": "partition function"
},
{
"source": "magpie",
"value": "lattice field theory"
}
],
"license": [
{
"license": "arXiv-1.0",
"url": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"
}
],
"number_of_pages": 21,
"preprint_date": "2017-05-06",
"public_notes": [
{
"source": "arXiv",
"value": "21 pages, 13 figures"
},
{
"value": "*Temporary entry*"
},
{
"value": "This is a test public note by the curator\n"
}
],
"refereed": true,
"report_numbers": [
{
"source": "hepcrawl",
"value": "LPTENS/17/12"
},
{
"value": "CURATOR-001"
}
],
"titles": [
{
"source": "arXiv",
"title": "Analyticity of the Ising curation: An interpretation"
}
]
}
73 changes: 73 additions & 0 deletions tests/integration/workflows/test_task_matching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

from __future__ import absolute_import, division, print_function

import os
import json
import pytest

from invenio_search import current_search_client as es
from invenio_workflows import workflow_object_class
from inspirehep.modules.migrator.tasks import record_insert_or_replace
from inspirehep.modules.workflows.tasks.matching import (
match_with_invenio_matcher
)


@pytest.fixture(scope="function")
def setup_record(workflow_app):
    """Insert the ``matcher_rec.json`` fixture record and remove it afterwards.

    The JSON file is read from the ``fixtures`` folder next to this module,
    stored through ``record_insert_or_replace`` and the ``records-hep`` index
    is refreshed before the record is handed to the test. After the test the
    record is force-deleted.

    ``workflow_app`` is not used in the body; presumably it is requested only
    so that the application fixture is set up first -- confirm in conftest.
    """
    fixture_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'fixtures',
        'matcher_rec.json',
    )
    with open(fixture_path) as fixture_file:
        record_json = json.load(fixture_file)

    record = record_insert_or_replace(record_json)
    # Refresh so that the freshly inserted record is visible to searches.
    es.indices.refresh('records-hep')

    yield record

    record._delete(force=True)


def _create_hep_wf(to_match):
    """Create and return a ``hep`` workflow object whose data is ``to_match``.

    The object is created through ``workflow_object_class.create`` with
    ``id_user=1``.
    """
    creation_kwargs = {
        'data_type': 'hep',
        'id_user': 1,
        'data': to_match,
    }
    return workflow_object_class.create(**creation_kwargs)


def test_match_with_invenio_matcher_matches_a_record(setup_record):
    """The matcher reports a match for the stored record despite a new title.

    Only the first title is altered; dois and arXiv eprints are left as in
    the record provided by ``setup_record``.
    """
    record = setup_record
    first_title = record['titles'][0]
    first_title['title'] = first_title['title'] + ' - different title'

    workflow_object = _create_hep_wf(record)
    match_task = match_with_invenio_matcher()

    has_match = match_task(obj=workflow_object, eng=None)
    assert has_match is True


def test_match_with_invenio_matcher_does_not_match_any_record(setup_record):
    """The matcher reports no match once doi and arXiv eprint are changed."""
    record = setup_record

    # With both identifiers replaced, the matcher has nothing in common with
    # the stored record and must not match it.
    for identifier_field in ('dois', 'arxiv_eprints'):
        record[identifier_field][0]['value'] = 'test'

    workflow_object = _create_hep_wf(record)
    match_task = match_with_invenio_matcher()

    has_match = match_task(obj=workflow_object, eng=None)
    assert has_match is False

0 comments on commit 775c496

Please sign in to comment.