diff --git a/inspirehep/modules/workflows/models.py b/inspirehep/modules/workflows/models.py index fb04eb1b1a..d61530bf23 100644 --- a/inspirehep/modules/workflows/models.py +++ b/inspirehep/modules/workflows/models.py @@ -26,6 +26,9 @@ from datetime import datetime +from sqlalchemy.dialects import postgresql +from sqlalchemy_utils.types import JSONType, UUIDType + from invenio_db import db @@ -74,3 +77,26 @@ class WorkflowsPendingRecord(db.Model): nullable=False, ) record_id = db.Column(db.Integer, nullable=False) + + +class WorkflowsRecordSources(db.Model): + + __tablename__ = "workflows_record_sources" + __table_args__ = ( + db.PrimaryKeyConstraint('record_id', 'source'), + ) + source = db.Column(db.Text, default="", nullable=False) + record_id = db.Column( + UUIDType, + db.ForeignKey("records_metadata.id", ondelete='CASCADE'), + primary_key=True, + nullable=False, + ) + json = db.Column( + JSONType().with_variant( + postgresql.JSON(none_as_null=True), + 'postgresql', + ), + default=lambda: dict(), + nullable=True + ) diff --git a/inspirehep/modules/workflows/utils.py b/inspirehep/modules/workflows/utils.py index fc039763da..4d8b4a7e19 100644 --- a/inspirehep/modules/workflows/utils.py +++ b/inspirehep/modules/workflows/utils.py @@ -34,8 +34,9 @@ import requests import urllib3 from flask import current_app +from invenio_db import db -from .models import WorkflowsAudit +from .models import WorkflowsAudit, WorkflowsRecordSources LOGGER = logging.getLogger(__name__) @@ -176,3 +177,40 @@ def download_file_to_workflow(workflow, name, url): if req.status_code == 200: workflow.files[name] = req.raw return workflow.files[name] + + +def store_root_json(record_uuid, source, json): + """Store the root json for a given source. + + Given the ``record_uuid``, the ``source`` information (e.g. `arXiv` or + `Elsevier`) and the actual record in ``data`` store this information so + that it can be retrieved later. + """ + with db.session.begin_nested(): + record_source = WorkflowsRecordSources.query.filter( + WorkflowsRecordSources.record_id == record_uuid, + WorkflowsRecordSources.source == source + ).one_or_none() + if record_source is None: + record_source = WorkflowsRecordSources( + source=source, + json=json, + record_id=record_uuid + ) + else: + record_source.json = json + db.session.add(record_source) + + +def retrieve_root_json(record_uuid, source): + """Retrieve the root json for a given source. + + Given a previously matched ``record_uuid``, the ``source`` information + (e.g. `arXiv` or `Elsevier`) returns the original record if existing or + empty json object otherwise. + """ + entry = WorkflowsRecordSources.query.filter( + WorkflowsRecordSources.record_id == record_uuid, + WorkflowsRecordSources.source == source + ).one_or_none() + return entry.json if entry else {}