Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "citation count: index new citation's records - INSPIR-1127" #3681

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 1 addition & 44 deletions inspirehep/modules/records/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@

from inspirehep.modules.pidstore.minters import inspire_recid_minter
from inspirehep.modules.pidstore.utils import get_pid_type_from_schema, get_endpoint_from_pid_type
from inspirehep.modules.records.utils import get_pid_from_record_uri, populate_earliest_date
from inspirehep.modules.records.utils import populate_earliest_date
from inspirehep.utils.record_getter import (
RecordGetterError,
get_es_record_by_uuid
Expand Down Expand Up @@ -648,49 +648,6 @@ def to_dict(self):
"""Gets a deep copy of the record's json."""
return deepcopy(dict(self))

def get_modified_references(self):
    """Return the pids of the references that changed since the previous version.

    The latest version of the record is compared against the one right
    before it (``self.model.versions[-2]``). A reference counts as changed
    when it is present in exactly one of the two versions, i.e. it was
    added or removed. Edits to the content of a reference are not detected.

    When the ``deleted`` flag differs between the two versions (record
    deleted or un-deleted), every linked reference of the current version
    is returned instead of the difference.

    References without a ``record`` key (not linked to any record) are
    ignored.

    Note: the record should be committed to the DB so that the previous
    version can be retrieved correctly. When there is no previous version,
    an empty one is assumed.

    Returns:
        set: pids of the changed references, as produced by
        ``get_pid_from_record_uri`` for each reference's ``$ref``.
    """
    def _linked_pids(record_json):
        # Only references that point at a record carry a ``$ref`` to resolve.
        return {
            get_pid_from_record_uri(reference['record']['$ref'])
            for reference in record_json.get('references', [])
            if 'record' in reference
        }

    try:
        previous = self.model.versions[-2].json
    except IndexError:
        # No earlier version available: compare against an empty record.
        previous = {}

    deleted_now = self.get('deleted', False)
    deleted_before = previous.get('deleted', False)
    deletion_toggled = deleted_now ^ deleted_before

    current_pids = _linked_pids(self)
    if deletion_toggled:
        return current_pids

    previous_pids = _linked_pids(previous)
    return current_pids ^ previous_pids


class ESRecord(InspireRecord):
"""Record class that fetches records from ElasticSearch."""
Expand Down
33 changes: 1 addition & 32 deletions inspirehep/modules/records/receivers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,10 @@
from flask import current_app
from flask_sqlalchemy import models_committed
from elasticsearch import NotFoundError
from sqlalchemy import tuple_


from invenio_db import db
from invenio_indexer.signals import before_record_index
from invenio_pidstore.models import PersistentIdentifier
from invenio_records.models import RecordMetadata
from invenio_records.signals import (
after_record_insert,
after_record_update,
before_record_insert,
before_record_update,
Expand All @@ -53,8 +48,6 @@
get_orcids_for_push,
)
from inspirehep.modules.records.api import InspireRecord
from inspirehep.modules.records.indexer import InspireRecordIndexer
from inspirehep.modules.records.tasks import batch_reindex
from inspirehep.modules.records.utils import (
is_author,
is_book,
Expand All @@ -78,7 +71,7 @@
populate_recid_from_ref,
populate_title_suggest,
)

from inspirehep.modules.records.indexer import InspireRecordIndexer

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -160,7 +153,6 @@ def index_after_commit(sender, changes):
has been really committed to the DB.
"""
indexer = InspireRecordIndexer()

for model_instance, change in changes:
if isinstance(model_instance, RecordMetadata):
if change in ('insert', 'update') and not model_instance.json.get("deleted"):
Expand Down Expand Up @@ -215,26 +207,3 @@ def enhance_after_index(sender, json, record, *args, **kwargs):
elif is_data(json):
populate_citations_count(record=record, json=json)


@after_record_insert.connect
@after_record_update.connect
def index_new_cited_records_after_record_update(sender, record, *args, **kwargs):
    """Schedule a reindex of the records whose citing reference was added or removed.

    Connected to both ``after_record_insert`` and ``after_record_update``.
    The pids reported by ``record.get_modified_references()`` are resolved
    to record UUIDs through ``PersistentIdentifier`` and handed over to the
    ``batch_reindex`` task on the ``bulk_index`` queue.

    Returns:
        The result of ``batch_reindex.apply_async`` when there is something
        to reindex, ``None`` otherwise.
    """
    changed_pids = record.get_modified_references()
    if not changed_pids:
        return

    # Match on the (pid_type, pid_value) pair, restricted to record objects.
    pid_key = tuple_(PersistentIdentifier.pid_type, PersistentIdentifier.pid_value)
    matching_rows = db.session.query(PersistentIdentifier.object_uuid).filter(
        PersistentIdentifier.object_type == 'rec',
        pid_key.in_(changed_pids)
    )
    uuids = [str(row.object_uuid) for row in matching_rows]

    if not uuids:
        return

    return batch_reindex.apply_async(
        kwargs={'uuids': uuids},
        queue='bulk_index',
    )
7 changes: 2 additions & 5 deletions inspirehep/modules/records/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
from invenio_db import db
from invenio_pidstore.models import PersistentIdentifier
from invenio_search import current_search_client as es
from inspire_dojson.utils import get_recid_from_ref

from inspire_dojson.utils import get_recid_from_ref
from inspirehep.modules.records.api import InspireRecord
from inspirehep.modules.records.utils import get_endpoint_from_record
from inspirehep.modules.pidstore.utils import get_pid_type_from_schema
Expand Down Expand Up @@ -182,7 +182,7 @@ def _get_uuids_to_merge():


@shared_task(ignore_result=False, max_retries=0)
def batch_reindex(uuids, request_timeout=None):
def batch_reindex(uuids, request_timeout):
"""Task for bulk reindexing records."""
def actions():
for uuid in uuids:
Expand All @@ -192,9 +192,6 @@ def actions():
except NoResultFound as e:
logger.warn('Record %s failed to load: %s', uuid, e)

if not request_timeout:
request_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']

success, failures = bulk(
es,
actions(),
Expand Down
4 changes: 1 addition & 3 deletions inspirehep/utils/record_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,6 @@ def get_db_records(pids):
The order in which records are returned is different from the order of
the input.
"""
from inspirehep.modules.records.api import InspireRecord

pids = [(pid_type, str(pid_value)) for (pid_type, pid_value) in pids]

if not pids:
Expand All @@ -137,4 +135,4 @@ def get_db_records(pids):
)

for record in query.yield_per(100):
yield InspireRecord(record.json, model=record)
yield record.json
9 changes: 4 additions & 5 deletions tests/integration/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,13 @@

import sys
import os
import mock

import pytest
import sqlalchemy

from flask_alembic import Alembic

from functools import partial

from click.testing import CliRunner
from flask import current_app
from flask.cli import ScriptInfo
Expand Down Expand Up @@ -101,9 +102,7 @@ def app():
init_all_storage_paths()
init_users_and_permissions()

with mock.patch('inspirehep.modules.records.receivers.batch_reindex.apply_async'):
migrate_from_file('./inspirehep/demosite/data/demo-records.xml.gz', wait_for_results=True)

migrate_from_file('./inspirehep/demosite/data/demo-records.xml.gz', wait_for_results=True)
es.indices.refresh('records-hep') # Makes sure that all HEP records were migrated.

yield app
Expand Down
Loading