From ea8229178d3eee5166bcd1f2506f013696bc3ed9 Mon Sep 17 00:00:00 2001 From: PascalEgn Date: Tue, 6 Aug 2024 17:43:24 +0200 Subject: [PATCH] lets see --- .github/workflows/push-master.yml | 2 - inspire_matcher/__init__.py | 4 +- inspire_matcher/api.py | 41 +++++--- inspire_matcher/config.py | 6 +- inspire_matcher/core.py | 107 +++++++++++--------- inspire_matcher/ext.py | 2 +- inspire_matcher/utils.py | 6 +- inspire_matcher/validators.py | 43 +++++--- ruff.toml | 1 + setup.py | 6 +- tests/conftest.py | 5 +- tests/fixtures/harvest_record_2654944.json | 70 ++++++------- tests/fixtures/matching_result_2654944.json | 2 +- tests/fixtures/matching_wrong_2654944.json | 2 +- tests/test_api.py | 13 ++- tests/test_core.py | 102 +++++++++---------- tests/test_utils.py | 3 +- tests/test_validators.py | 89 ++++++++++------ 18 files changed, 285 insertions(+), 219 deletions(-) diff --git a/.github/workflows/push-master.yml b/.github/workflows/push-master.yml index d1a28e6..556d965 100644 --- a/.github/workflows/push-master.yml +++ b/.github/workflows/push-master.yml @@ -16,5 +16,3 @@ jobs: needs: [python2_tests, python3_tests] uses: ./.github/workflows/bump-and-publish.yml secrets: inherit - - diff --git a/inspire_matcher/__init__.py b/inspire_matcher/__init__.py index b54f905..fccd252 100644 --- a/inspire_matcher/__init__.py +++ b/inspire_matcher/__init__.py @@ -22,7 +22,7 @@ from __future__ import absolute_import, division, print_function -from .api import match # noqa: F401 -from .ext import InspireMatcher # noqa: F401 +from inspire_matcher.api import match # noqa: F401 +from inspire_matcher.ext import InspireMatcher # noqa: F401 __version__ = "9.0.29" diff --git a/inspire_matcher/api.py b/inspire_matcher/api.py index 6bbfbba..a8207b5 100644 --- a/inspire_matcher/api.py +++ b/inspire_matcher/api.py @@ -25,24 +25,24 @@ from __future__ import absolute_import, division, print_function from flask import current_app -from six import string_types -from werkzeug.utils import import_string - from invenio_search import current_search_client as es from invenio_search.utils import prefix_index +from six import string_types +from werkzeug.utils import import_string -from .core import compile +from inspire_matcher.core import compile def _get_validator(validator_param): - if callable(validator_param): return validator_param try: validator = import_string(validator_param) except (KeyError, ImportError, AttributeError): - current_app.logger.debug('No validator provided. Falling back to the default validator.') + current_app.logger.debug( + 'No validator provided. Falling back to the default validator.' + ) validator = import_string('inspire_matcher.validators:default_validator') return validator @@ -56,7 +56,9 @@ def match(record, config=None): out which record a reference should be pointing to. """ if config is None: - current_app.logger.debug('No configuration provided. Falling back to the default configuration.') + current_app.logger.debug( + 'No configuration provided. Falling back to the default configuration.' + ) config = current_app.config['MATCHER_DEFAULT_CONFIGURATION'] try: @@ -72,11 +74,17 @@ def match(record, config=None): query_config['_source'] = source match_deleted = config.get('match_deleted', False) collections = config.get('collections') - if not (collections is None or ( - isinstance(collections, (list, tuple)) and - all(isinstance(collection, string_types) for collection in collections) - )): - raise ValueError('Malformed collections. 
Expected a list of strings bug got: %s' % repr(collections)) + if not ( + collections is None + or ( + isinstance(collections, (list, tuple)) + and all(isinstance(collection, string_types) for collection in collections) + ) + ): + raise ValueError( + 'Malformed collections. Expected a list of strings bug got: %s' + % repr(collections) + ) for i, step in enumerate(algorithm): try: @@ -95,9 +103,14 @@ def match(record, config=None): for j, query in enumerate(queries): try: - body = compile(query, record, collections=collections, match_deleted=match_deleted) + body = compile( + query, record, collections=collections, match_deleted=match_deleted + ) except Exception as e: - raise ValueError('Malformed query. Query %d of step %d does not compile: %s.' % (j, i, repr(e))) + raise ValueError( + 'Malformed query. Query %d of step %d does not compile: %s.' + % (j, i, repr(e)) + ) if not body: continue diff --git a/inspire_matcher/config.py b/inspire_matcher/config.py index f236a31..84db92e 100644 --- a/inspire_matcher/config.py +++ b/inspire_matcher/config.py @@ -61,7 +61,11 @@ "validator": "inspire_matcher.validators:cds_identifier_validator", }, ], - "source": ["control_number", "external_system_identifiers", "persistent_identifiers"], + "source": [ + "control_number", + "external_system_identifiers", + "persistent_identifiers", + ], "doc_type": "hep", "index": "records-hep", } diff --git a/inspire_matcher/core.py b/inspire_matcher/core.py index 692b937..63d8632 100644 --- a/inspire_matcher/core.py +++ b/inspire_matcher/core.py @@ -56,7 +56,9 @@ def _compile_filters(query, collections, match_deleted): } if collections: - result['query']['bool']['filter']['bool']['should'] = _compile_collections(collections) + result['query']['bool']['filter']['bool']['should'] = _compile_collections( + collections + ) if not match_deleted: result['query']['bool']['filter']['bool']['must_not'] = { 'match': { @@ -85,21 +87,31 @@ def _compile_inner(query, record): def _compile_collections(collections): - return [{ - 'match': { - '_collections': collection, - }, - } for collection in collections] + return [ + { + 'match': { + '_collections': collection, + }, + } + for collection in collections + ] def _compile_exact(query, record): if 'match' in query: query['path'] = query.get('path', query['match']) - warnings.warn('The "match" key is deprecated. Use "path" instead.', DeprecationWarning) + warnings.warn( + 'The "match" key is deprecated. Use "path" instead.', DeprecationWarning, + stacklevel=1, + ) if 'search' in query: query['search_path'] = query.get('search_path', query['search']) - warnings.warn('The "search" key is deprecated. Use "search_path" instead.', DeprecationWarning) + warnings.warn( + 'The "search" key is deprecated. Use "search_path" instead.', + DeprecationWarning, + stacklevel=1, + ) path, search_path = query['path'], query['search_path'] @@ -116,11 +128,13 @@ def _compile_exact(query, record): } for value in values: - result['query']['bool']['should'].append({ - 'match': { - search_path: value, - }, - }) + result['query']['bool']['should'].append( + { + 'match': { + search_path: value, + }, + } + ) return result @@ -151,21 +165,23 @@ def _compile_fuzzy(query, record): if '.' in path: raise ValueError('the "path" key can\'t contain dots') # TODO: This query should be refined instead of relying on validation to filter out irrelevant results. 
- result['query']['dis_max']['queries'].append({ - 'more_like_this': { - 'boost': boost, - 'like': [ - { - 'doc': { - path: values, + result['query']['dis_max']['queries'].append( + { + 'more_like_this': { + 'boost': boost, + 'like': [ + { + 'doc': { + path: values, + }, }, - }, - ], - 'max_query_terms': 25, - 'min_doc_freq': 1, - 'min_term_freq': 1, - }, - }) + ], + 'max_query_terms': 25, + 'min_doc_freq': 1, + 'min_term_freq': 1, + }, + } + ) if not result['query']['dis_max']['queries']: return @@ -201,19 +217,16 @@ def _create_nested_query(query): def _compile_nested(query, record): query_operator = query.get('operator', 'OR') nested_query, paths, search_paths = _create_nested_query(query) - for path, search_path in zip(paths, search_paths): + for path, search_path in zip(paths, search_paths, strict=False): value = get_value(record, path) if not value: return - nested_query['query']['nested']['query']['bool']['must'].append({ - 'match': { - search_path: { - 'query': value, - 'operator': query_operator - } - }, - }) + nested_query['query']['nested']['query']['bool']['must'].append( + { + 'match': {search_path: {'query': value, 'operator': query_operator}}, + } + ) if "inner_hits" in query: nested_query['query']['nested']['inner_hits'] = query['inner_hits'] @@ -223,22 +236,22 @@ def _compile_nested(query, record): def _compile_nested_prefix(query, record): nested_query, paths, search_paths = _create_nested_query(query) prefix_field = query.get('prefix_search_path', []) - for path, search_path in zip(paths, search_paths): + for path, search_path in zip(paths, search_paths, strict=False): value = get_value(record, path) if not value: return if prefix_field and prefix_field in search_path: - nested_query['query']['nested']['query']['bool']['must'].append({ - 'match_phrase_prefix': { - search_path: value - } - }) + nested_query['query']['nested']['query']['bool']['must'].append( + {'match_phrase_prefix': {search_path: value}} + ) else: - nested_query['query']['nested']['query']['bool']['must'].append({ - 'match': { - search_path: value, - }, - }) + nested_query['query']['nested']['query']['bool']['must'].append( + { + 'match': { + search_path: value, + }, + } + ) if "inner_hits" in query: nested_query['query']['nested']['inner_hits'] = query['inner_hits'] diff --git a/inspire_matcher/ext.py b/inspire_matcher/ext.py index 11dbb0c..4b731e9 100644 --- a/inspire_matcher/ext.py +++ b/inspire_matcher/ext.py @@ -24,7 +24,7 @@ from __future__ import absolute_import, division, print_function -from . import config +from inspire_matcher import config class InspireMatcher(object): diff --git a/inspire_matcher/utils.py b/inspire_matcher/utils.py index 856636d..fd919e6 100644 --- a/inspire_matcher/utils.py +++ b/inspire_matcher/utils.py @@ -3,7 +3,8 @@ # This file is part of INSPIRE. # Copyright (C) 2014-2017 CERN. # -# INSPIRE is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +# INSPIRE is free software: you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # @@ -84,7 +85,8 @@ def compute_jaccard_index(x_set, y_set): def get_tokenized_title(title): """Return the tokenised title. - The title is lowercased and split on the spaces. Then, duplicate tokens are removed by adding the tokens to a set. + The title is lowercased and split on the spaces. 
Then, duplicate + tokens are removed by adding the tokens to a set. Args: title (string): a title. diff --git a/inspire_matcher/validators.py b/inspire_matcher/validators.py index 7e36847..103fecd 100644 --- a/inspire_matcher/validators.py +++ b/inspire_matcher/validators.py @@ -28,10 +28,7 @@ from inspire_utils.record import get_value -from .utils import ( - compute_author_match_score, - compute_title_score, -) +from inspire_matcher.utils import compute_author_match_score, compute_title_score def default_validator(record, result): @@ -41,14 +38,18 @@ def default_validator(record, result): def authors_titles_validator(record, result): """Compute a validation score for the possible match. - The score is based on a similarity score of the authors sets and the maximum Jaccard index found between 2 titles: + The score is based on a similarity score of the authors sets and + the maximum Jaccard index found between 2 titles: one from the record and one from the result title sets. - If the computed score is higher than 0.5, then the match is valid, otherwise it is not. + If the computed score is higher than 0.5, then the match is valid, + otherwise it is not. Args: - record (dict): the given record we are trying to match with similar ones in INSPIRE. - result (dict): possible match returned by the ES query that needs to be validated. + record (dict): the given record we are trying to match + with similar ones in INSPIRE. + result (dict): possible match returned by the ES query + that needs to be validated. Returns: bool: validation decision. @@ -63,7 +64,9 @@ def authors_titles_validator(record, result): result_titles = get_value(result, '_source.titles.title', []) title_score = max( - compute_title_score(record_title, result_title, threshold=0.5, math_threshold=0.3) + compute_title_score( + record_title, result_title, threshold=0.5, math_threshold=0.3 + ) for (record_title, result_title) in product(record_titles, result_titles) ) @@ -79,8 +82,10 @@ def cds_identifier_validator(record, result): ``schema`` different from CDS. Args: - record (dict): the given record we are trying to match with similar ones in INSPIRE. - result (dict): possible match returned by the ES query that needs to be validated. + record (dict): the given record we are trying to match with + similar ones in INSPIRE. + result (dict): possible match returned by the ES query + that needs to be validated. Returns: bool: validation decision. 
@@ -88,10 +93,20 @@ def cds_identifier_validator(record, result): """ record_external_identifiers = get_value(record, 'external_system_identifiers', []) - result_external_identifiers = get_value(result, '_source.external_system_identifiers', []) + result_external_identifiers = get_value( + result, '_source.external_system_identifiers', [] + ) - record_external_identifiers = {external_id["value"] for external_id in record_external_identifiers if external_id["schema"] == 'CDS'} - result_external_identifiers = {external_id["value"] for external_id in result_external_identifiers if external_id["schema"] == 'CDS'} + record_external_identifiers = { + external_id["value"] + for external_id in record_external_identifiers + if external_id["schema"] == 'CDS' + } + result_external_identifiers = { + external_id["value"] + for external_id in result_external_identifiers + if external_id["schema"] == 'CDS' + } return bool(record_external_identifiers & result_external_identifiers) diff --git a/ruff.toml b/ruff.toml index f6f5368..41c11f7 100644 --- a/ruff.toml +++ b/ruff.toml @@ -19,6 +19,7 @@ select = [ # flake8-pytest-style "PT", ] +ignore = ["B904"] [lint.pycodestyle] ignore-overlong-task-comments = true diff --git a/setup.py b/setup.py index 0bb7886..067ca08 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,6 @@ from setuptools import find_packages, setup - URL = 'https://github.com/inspirehep/inspire-matcher' with open("README.rst") as f: @@ -68,10 +67,7 @@ 'tests:python_version=="2.7"': [ 'unicode-string-literal~=1.0,>=1.1', ], - 'opensearch1': [ - 'opensearch-py>=1.0.0,<3.0.0', - 'opensearch-dsl>=1.0.0,<3.0.0' - ], + 'opensearch1': ['opensearch-py>=1.0.0,<3.0.0', 'opensearch-dsl>=1.0.0,<3.0.0'], 'elasticsearch7': [ 'elasticsearch-dsl~=7.0', 'elasticsearch~=7.0', diff --git a/tests/conftest.py b/tests/conftest.py index 4e64860..4f05c1f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,7 +24,6 @@ import pytest from flask import Flask - from invenio_search import InvenioSearch from inspire_matcher import InspireMatcher @@ -36,10 +35,10 @@ def app(): InvenioSearch(app) InspireMatcher(app) - yield app + return app -@pytest.fixture(autouse=True, scope='function') +@pytest.fixture(autouse=True) def app_context(app): with app.app_context(): yield app diff --git a/tests/fixtures/harvest_record_2654944.json b/tests/fixtures/harvest_record_2654944.json index 3ab0bf4..ac8861d 100644 --- a/tests/fixtures/harvest_record_2654944.json +++ b/tests/fixtures/harvest_record_2654944.json @@ -1,99 +1,99 @@ { - "$schema": "https://qa.inspirehep.net/schemas/records/hep.json", + "$schema": "https://qa.inspirehep.net/schemas/records/hep.json", "_collections": [ "Literature" - ], + ], "abstracts": [ { - "source": "submitter", + "source": "submitter", "value": "SYNOPSIS\nThe main goal of ultra relativistic heavy-ion collisons is to investigate the strongly\ninteracting matter, called the Quark Gluon Plasma (QGP), expected to produce in\nsuch collisions. The Large Hadron Collider (LHC) at CERN provides an unique\nopportunity to study such strongly interacting matter at extreme energy densities.\nThe short lived resonances are very useful tool in high energy collisions to study\nthe dynamics and properties of a strongly interacting medium. In particular, the\nK\n\u21e4\n0\n(892) meson is important because its lifetime (4 fm/c) is comparable to the time\nscale of the hot and dense matter produced. 
Owing to short lifetime, the characteris-\ntic properties such as mass, width, yield and transverse momentum spectra of\nK\n\u21e4\n0\nis\nvery sensitive to the dynamics and in-medium e\n\u21b5\nects. Basically the decay products of\nK\n\u21e4\n0\n, the pions and kaons, may undergo in-medium e\n\u21b5\nects. The decay products of high\nmomentum resonances have a larger probability to escape the system and thereby de-\ntected, while that of low momentum resonances can be re-scattered by other hadrons\npresent in the medium. Thus, we can not reconstruct back the resonance and the\nsignal is lost. On the other hand, the pions and kaons in the medium can re-generate\nK\n\u21e4\n0\nvia pseudo-elastic interactions (\nK\n\u21e1\n!\nK\n\u21e4\n0\n!\nK\n\u21e1\n)duringthephasebetween\nthe chemical freeze-out (when inelastic collision ceases) and the kinetic freeze out\n(when elastic collision ceases). This re-generation process could compensate for the\nK \u21e4 0 yield, lost in re-scattering, if the system formed has a long expansion time. It\nwas observed that the pion-pion interaction cross section is five times larger than\nthe kaon-pion interaction cross section. The pion-pion interaction cross-section is responsible for the re-scattering, while the kaon-pion cross-section for the re-generation\nprocesses. Thus, the interplay of the two processes, re-scattering and re-generation,\nwill decide the final resonance yield and a resonance to non-resonance particle ratio\ncan be used to understand these processes." } - ], + ], "accelerator_experiments": [ { "legacy_name": "CERN-LHC-ALICE" } - ], + ], "acquisition_source": { - "datetime": "2019-01-29T13:45:45.439962", - "method": "hepcrawl", - "source": "CDS", + "datetime": "2019-01-29T13:45:45.439962", + "method": "hepcrawl", + "source": "CDS", "submission_number": "946eba3e23c311e9a6e8fa163e8cc40e" - }, + }, "authors": [ { "affiliations": [ { "value": "Bhubaneswar, NISER" } - ], - "full_name": "Singha, Subhash", + ], + "full_name": "Singha, Subhash", "ids": [ { - "schema": "INSPIRE ID", + "schema": "INSPIRE ID", "value": "INSPIRE-00291765" } ] - }, + }, { "affiliations": [ { "value": "Bhubaneswar, NISER" } - ], - "full_name": "Mohanty, Bedangadas", + ], + "full_name": "Mohanty, Bedangadas", "inspire_roles": [ "supervisor" ] } - ], - "core": true, - "curated": true, + ], + "core": true, + "curated": true, "document_type": [ "thesis" - ], + ], "external_system_identifiers": [ { - "schema": "CDS", + "schema": "CDS", "value": "2654944" } - ], + ], "inspire_categories": [ { - "source": "cds", + "source": "cds", "term": "Experiment-HEP" - }, + }, { - "source": "cds", + "source": "cds", "term": "Experiment-Nucl" } - ], - "number_of_pages": 295, + ], + "number_of_pages": 295, "report_numbers": [ { - "source": "CDS", + "source": "CDS", "value": "CERN-THESIS-2014-462" } - ], + ], "thesis_info": { - "date": "2014-09", - "degree_type": "phd", + "date": "2014-09", + "degree_type": "phd", "institutions": [ { "name": "Bhubaneswar, NISER" } ] - }, + }, "titles": [ { - "source": "submitter", + "source": "submitter", "title": "Identified particle production in Pb-Pb and pp collisions at LHC energies" } - ], + ], "urls": [ { - "description": "Homi Bhabha Nat. Inst.", + "description": "Homi Bhabha Nat. 
Inst.", "value": "http://www.hbni.ac.in/students/dsp_ths.html?nm=phys/PHYS07200904008.pdf" } ] -} \ No newline at end of file +} diff --git a/tests/fixtures/matching_result_2654944.json b/tests/fixtures/matching_result_2654944.json index 84792e6..4a743c8 100644 --- a/tests/fixtures/matching_result_2654944.json +++ b/tests/fixtures/matching_result_2654944.json @@ -979,4 +979,4 @@ } ] } -} \ No newline at end of file +} diff --git a/tests/fixtures/matching_wrong_2654944.json b/tests/fixtures/matching_wrong_2654944.json index 7d3143f..d255b77 100644 --- a/tests/fixtures/matching_wrong_2654944.json +++ b/tests/fixtures/matching_wrong_2654944.json @@ -979,4 +979,4 @@ } ] } -} \ No newline at end of file +} diff --git a/tests/test_api.py b/tests/test_api.py index 0ef4ca3..5e3c793 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -105,7 +105,7 @@ def test_match_raises_if_one_query_does_not_have_a_type(): 'index': 'records-hep', } - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError, match="Malformed query. Query 0 of step 0 does not compile:") as excinfo: list(match(None, config)) assert 'Malformed query' in str(excinfo.value) @@ -123,7 +123,7 @@ def test_match_raises_if_one_query_type_is_not_supported(): 'index': 'records-hep', } - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError, match="Malformed query. Query 0 of step 0 does not compile:") as excinfo: list(match(None, config)) assert 'Malformed query' in str(excinfo.value) @@ -144,7 +144,7 @@ def test_match_raises_if_an_exact_query_does_not_have_all_the_keys(): 'index': 'records-hep', } - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError, match="Malformed query. Query 0 of step 0 does not compile:") as excinfo: list(match(None, config)) assert 'Malformed query' in str(excinfo.value) @@ -167,14 +167,13 @@ def test_match_raises_on_invalid_collections(): 'collections': 'Literature', } - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError, match="Malformed collections. Expected a list of strings bug got: 'Literature'") as excinfo: list(match(None, config)) assert 'Malformed collections' in str(excinfo.value) @mock.patch('inspire_matcher.api.es') def test_validator_list(es_mock): - es_mock.search.return_value = { 'hits': { 'hits': { @@ -225,7 +224,7 @@ def test_match_raises_if_inner_hits_param_has_wrong_config(): "paths": ["first_name", "last_name"], "search_paths": ["authors.first_name", "authors.last_name"], "type": "nested", - "inner_hits": {"not_existing_argument": ["authors.record"]} + "inner_hits": {"not_existing_argument": ["authors.record"]}, }, ], }, @@ -234,6 +233,6 @@ def test_match_raises_if_inner_hits_param_has_wrong_config(): 'index': 'records-hep', } - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError, match="Malformed query. 
Query 0 of step 0 does not compile:") as excinfo: list(match(None, config)) assert 'Malformed query' in str(excinfo.value) diff --git a/tests/test_core.py b/tests/test_core.py index 0a5abf7..46b4d3c 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -276,7 +276,7 @@ def test_compile_fuzzy_raises_if_path_contains_a_dot(): ], } - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError, match='the "path" key can\'t contain dots') as excinfo: _compile_fuzzy(query, record) assert 'dots' in str(excinfo.value) @@ -316,7 +316,7 @@ def test_compile_nested(): 'match': { 'publication_info.journal_title': { 'query': 'Phys.Rev.', - 'operator': 'OR' + 'operator': 'OR', } }, }, @@ -324,7 +324,7 @@ def test_compile_nested(): 'match': { 'publication_info.journal_volume': { 'query': 'D94', - 'operator': 'OR' + 'operator': 'OR', } }, }, @@ -332,7 +332,7 @@ def test_compile_nested(): 'match': { 'publication_info.artid': { 'query': '124054', - 'operator': 'OR' + 'operator': 'OR', } }, }, @@ -347,7 +347,7 @@ def test_compile_nested(): assert expected == result -def test_compile_nested_requires_all_paths_to_contain_a_value_in_order_to_generate_a_query(): +def test_compile_nested_requires_all_paths_for_query(): query = { 'paths': [ 'reference.publication_info.journal_title', @@ -387,7 +387,7 @@ def test_compile_nested_raises_when_search_paths_dont_share_a_common_path(): 'type': 'nested', } - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError, match="search_paths must share a common path") as excinfo: _compile_nested(query, None) assert 'common path' in str(excinfo.value) @@ -398,13 +398,11 @@ def test_compile_nested_raises_when_paths_and_search_paths_dont_have_the_same_le 'foo', 'bar', ], - 'search_paths': [ - 'baz' - ], + 'search_paths': ['baz'], 'type': 'nested', } - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError, match="paths and search_paths must be of the same length") as excinfo: _compile_nested(query, None) assert 'same length' in str(excinfo.value) @@ -427,11 +425,13 @@ def test_compile_without_optional_args(): 'bool': { 'must': { 'bool': { - 'should': [{ - 'match': { - 'dummy.search.path': 'foo', - }, - }], + 'should': [ + { + 'match': { + 'dummy.search.path': 'foo', + }, + } + ], }, }, 'filter': { @@ -466,11 +466,13 @@ def test_compile_with_match_deleted(): expected = { 'query': { 'bool': { - 'should': [{ - 'match': { - 'dummy.search.path': 'foo', - }, - }], + 'should': [ + { + 'match': { + 'dummy.search.path': 'foo', + }, + } + ], }, }, } @@ -496,11 +498,13 @@ def test_compile_with_collections(): 'bool': { 'must': { 'bool': { - 'should': [{ - 'match': { - 'dummy.search.path': 'foo', - }, - }], + 'should': [ + { + 'match': { + 'dummy.search.path': 'foo', + }, + } + ], }, }, 'filter': { @@ -515,7 +519,7 @@ def test_compile_with_collections(): 'match': { '_collections': 'HAL Hidden', }, - } + }, ], 'must_not': { 'match': { @@ -554,7 +558,7 @@ def test_compile_prefix(): 'publication_info.journal_volume', 'publication_info.artid', ], - 'prefix_search_path': 'publication_info.journal_title' + 'prefix_search_path': 'publication_info.journal_title', } reference = { 'reference': { @@ -578,19 +582,11 @@ def test_compile_prefix(): "publication_info.journal_title": "Phys.Rev.D." 
} }, - { - "match": { - "publication_info.journal_volume": "94" - } - }, - { - "match": { - "publication_info.artid": "124054" - } - } + {"match": {"publication_info.journal_volume": "94"}}, + {"match": {"publication_info.artid": "124054"}}, ] } - } + }, } } } @@ -609,12 +605,9 @@ def test_compile_nested_with_inner_hits(): 'authors.last_name', ], 'type': 'nested', - 'inner_hits': {"_source": ["authors.full_name"]} - } - author_data = { - "first_name": "Name", - "last_name": "Test" + 'inner_hits': {"_source": ["authors.full_name"]}, } + author_data = {"first_name": "Name", "last_name": "Test"} expected = { 'query': { @@ -627,7 +620,7 @@ def test_compile_nested_with_inner_hits(): 'match': { 'authors.first_name': { 'query': 'Name', - 'operator': 'OR' + 'operator': 'OR', } }, }, @@ -635,18 +628,14 @@ def test_compile_nested_with_inner_hits(): 'match': { 'authors.last_name': { 'query': 'Test', - 'operator': 'OR' + 'operator': 'OR', } }, }, ], }, }, - "inner_hits": { - "_source": [ - "authors.full_name" - ] - } + "inner_hits": {"_source": ["authors.full_name"]}, }, }, } @@ -685,7 +674,9 @@ def test_compile_authors_query(): "match_phrase_prefix": { "authors.first_name": { "query": "Nicholas", - "analyzer": "names_analyzer", + "analyzer": ( + "names_analyzer" + ), } } }, @@ -694,7 +685,8 @@ def test_compile_authors_query(): "authors.first_name": { "query": "Nicholas", "operator": "AND", - "analyzer": "names_initials_analyzer", + "analyzer": + "names_initials_analyzer", } } }, @@ -706,7 +698,9 @@ def test_compile_authors_query(): "authors.first_name.initials": { "query": "A", "operator": "AND", - "analyzer": "names_initials_analyzer", + "analyzer": ( + "names_initials_analyzer" + ), } } }, @@ -763,7 +757,7 @@ def test_nested_query_with_and_operator(): "type": "nested", "paths": ["full_name"], "search_paths": ["authors.full_name"], - "operator": "AND" + "operator": "AND", } author_data = {"full_name": "John Smith"} diff --git a/tests/test_utils.py b/tests/test_utils.py index 03409ca..5c14176 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -23,6 +23,7 @@ from __future__ import absolute_import, division, print_function import pytest + from inspire_matcher.utils import ( compute_author_match_score, compute_jaccard_index, @@ -104,7 +105,7 @@ def test_compute_author_match_score_similar_authors(): @pytest.mark.parametrize( - "x_authors,y_authors,expected", + ("x_authors", "y_authors", "expected"), [ ( [ diff --git a/tests/test_validators.py b/tests/test_validators.py index bbf2872..00531ee 100644 --- a/tests/test_validators.py +++ b/tests/test_validators.py @@ -22,17 +22,18 @@ from __future__ import absolute_import, division, print_function -import pytest import json import os + import pkg_resources +import pytest from inspire_matcher.validators import ( + arxiv_eprints_validator, authors_titles_validator, cds_identifier_validator, default_validator, persistent_identifier_validator, - arxiv_eprints_validator, ) @@ -40,52 +41,82 @@ def test_default_validator_is_not_very_exciting(): assert default_validator(None, None) -def test_authors_titles_validator_does_match_when_authors_are_same_and_titles_contain_perfect_match(): - record = json.loads(pkg_resources.resource_string( - __name__, os.path.join('fixtures', 'harvest_record_1601.02340.json'))) +def test_validator_matches_on_same_authors_and_titles(): + record = json.loads( + pkg_resources.resource_string( + __name__, os.path.join('fixtures', 'harvest_record_1601.02340.json') + ) + ) - result = json.loads(pkg_resources.resource_string( - 
__name__, os.path.join('fixtures', 'matching_result_1601.02340.json'))) + result = json.loads( + pkg_resources.resource_string( + __name__, os.path.join('fixtures', 'matching_result_1601.02340.json') + ) + ) assert authors_titles_validator(record, result) -def test_authors_titles_validator_does_not_match_when_authors_are_similar_but_titles_are_too_different(): - record = json.loads(pkg_resources.resource_string( - __name__, os.path.join('fixtures', 'harvest_record_1804.09082.json'))) +def test_validator_no_match_on_similar_authors_different_titles(): + record = json.loads( + pkg_resources.resource_string( + __name__, os.path.join('fixtures', 'harvest_record_1804.09082.json') + ) + ) - result = json.loads(pkg_resources.resource_string( - __name__, os.path.join('fixtures', 'matching_wrong_result_1211.4028.json'))) + result = json.loads( + pkg_resources.resource_string( + __name__, os.path.join('fixtures', 'matching_wrong_result_1211.4028.json') + ) + ) assert not authors_titles_validator(record, result) -def test_authors_titles_validator_does_not_match_when_authors_are_same_but_titles_are_too_different(): - record = json.loads(pkg_resources.resource_string( - __name__, os.path.join('fixtures', 'harvest_record_1712.05946.json'))) +def test_validator_no_match_on_different_titles(): + record = json.loads( + pkg_resources.resource_string( + __name__, os.path.join('fixtures', 'harvest_record_1712.05946.json') + ) + ) - result = json.loads(pkg_resources.resource_string( - __name__, os.path.join('fixtures', 'matching_wrong_result_10.1103.json'))) + result = json.loads( + pkg_resources.resource_string( + __name__, os.path.join('fixtures', 'matching_wrong_result_10.1103.json') + ) + ) assert not authors_titles_validator(record, result) -def test_cds_identifier_validator_does_match_when_external_system_identifiers_contain_perfect_match(): - record = json.loads(pkg_resources.resource_string( - __name__, os.path.join('fixtures', 'harvest_record_2654944.json'))) +def test_cds_id_validator_matches_perfectlyh(): + record = json.loads( + pkg_resources.resource_string( + __name__, os.path.join('fixtures', 'harvest_record_2654944.json') + ) + ) - result = json.loads(pkg_resources.resource_string( - __name__, os.path.join('fixtures', 'matching_result_2654944.json'))) + result = json.loads( + pkg_resources.resource_string( + __name__, os.path.join('fixtures', 'matching_result_2654944.json') + ) + ) assert cds_identifier_validator(record, result) -def test_cds_identifier_validator_does_not_match_when_system_identifiers_are_from_different_sources(): - record = json.loads(pkg_resources.resource_string( - __name__, os.path.join('fixtures', 'harvest_record_2654944.json'))) +def test_cds_identifier_mismatch_different_sources(): + record = json.loads( + pkg_resources.resource_string( + __name__, os.path.join('fixtures', 'harvest_record_2654944.json') + ) + ) - result = json.loads(pkg_resources.resource_string( - __name__, os.path.join('fixtures', 'matching_wrong_2654944.json'))) + result = json.loads( + pkg_resources.resource_string( + __name__, os.path.join('fixtures', 'matching_wrong_2654944.json') + ) + ) assert not cds_identifier_validator(record, result) @@ -118,7 +149,7 @@ def test_persistent_identifier_validator_doesnt_validate_when_pid_entry_not_equa @pytest.mark.parametrize( - 'expected,record,result', + ('expected', 'record', 'result'), [ ( False, @@ -145,7 +176,7 @@ def test_persistent_identifier_validator_doesnt_validate_when_pid_entry_not_equa {}, {'_source': {}}, ), - ] + ], ) def 
test_arxiv_eprints_validator(expected, record, result): assert expected == arxiv_eprints_validator(record, result)
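
Usage note (not part of the patch): a minimal sketch of calling the public match() API that this patch reformats, mirroring the application setup in tests/conftest.py. It assumes a reachable Elasticsearch/OpenSearch cluster configured through invenio-search and an existing records-hep index; the record contents and the arxiv_eprints field paths are illustrative only and are not taken from the package's default configuration.

    from flask import Flask
    from invenio_search import InvenioSearch

    from inspire_matcher import InspireMatcher, match

    # Application setup mirrors tests/conftest.py: invenio-search provides the
    # search client, InspireMatcher registers the default matcher configuration.
    app = Flask(__name__)
    InvenioSearch(app)
    InspireMatcher(app)

    # Illustrative configuration: a single step with one exact query and the
    # default validator, restricted to the Literature collection.
    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'type': 'exact',
                        'path': 'arxiv_eprints.value',
                        'search_path': 'arxiv_eprints.value',
                    },
                ],
                'validator': 'inspire_matcher.validators:default_validator',
            },
        ],
        'collections': ['Literature'],
        'doc_type': 'hep',
        'index': 'records-hep',
        'source': ['control_number'],
    }

    # Hypothetical harvested record to match against INSPIRE.
    record = {'arxiv_eprints': [{'value': '1601.02340'}]}

    with app.app_context():
        # match() is a generator of Elasticsearch hits; each hit has already
        # passed the step's validator.
        matches = list(match(record, config))

If config is omitted, match() falls back to MATCHER_DEFAULT_CONFIGURATION, and a step without a validator uses inspire_matcher.validators:default_validator, as shown in inspire_matcher/api.py above.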