diff --git a/inspirehep/modules/workflows/models.py b/inspirehep/modules/workflows/models.py
index fb04eb1b1a..d61530bf23 100644
--- a/inspirehep/modules/workflows/models.py
+++ b/inspirehep/modules/workflows/models.py
@@ -26,6 +26,9 @@
 
 from datetime import datetime
 
+from sqlalchemy.dialects import postgresql
+from sqlalchemy_utils.types import JSONType, UUIDType
+
 from invenio_db import db
 
 
@@ -74,3 +77,26 @@ class WorkflowsPendingRecord(db.Model):
         nullable=False,
     )
     record_id = db.Column(db.Integer, nullable=False)
+
+
+class WorkflowsRecordSources(db.Model):
+
+    __tablename__ = "workflows_record_sources"
+    __table_args__ = (
+        db.PrimaryKeyConstraint('record_id', 'source'),
+    )
+    source = db.Column(db.Text, default="", nullable=False)
+    record_id = db.Column(
+        UUIDType,
+        db.ForeignKey("records_metadata.id", ondelete='CASCADE'),
+        primary_key=True,
+        nullable=False,
+    )
+    json = db.Column(
+        JSONType().with_variant(
+            postgresql.JSON(none_as_null=True),
+            'postgresql',
+        ),
+        default=lambda: dict(),
+        nullable=True
+    )
diff --git a/inspirehep/modules/workflows/utils.py b/inspirehep/modules/workflows/utils.py
index fc039763da..4d8b4a7e19 100644
--- a/inspirehep/modules/workflows/utils.py
+++ b/inspirehep/modules/workflows/utils.py
@@ -34,8 +34,9 @@
 import requests
 import urllib3
 from flask import current_app
+from invenio_db import db
 
-from .models import WorkflowsAudit
+from .models import WorkflowsAudit, WorkflowsRecordSources
 
 
 LOGGER = logging.getLogger(__name__)
@@ -176,3 +177,40 @@ def download_file_to_workflow(workflow, name, url):
         if req.status_code == 200:
             workflow.files[name] = req.raw
             return workflow.files[name]
+
+
+def store_root_json(record_uuid, source, json):
+    """Store the root json for a given source.
+
+    Given the ``record_uuid``, the ``source`` information (e.g. `arXiv` or
+    `Elsevier`) and the actual record in ``data`` store this information so
+    that it can be retrieved later.
+    """
+    with db.session.begin_nested():
+        record_source = WorkflowsRecordSources.query.filter(
+            WorkflowsRecordSources.record_id == record_uuid,
+            WorkflowsRecordSources.source == source
+        ).one_or_none()
+        if record_source is None:
+            record_source = WorkflowsRecordSources(
+                source=source,
+                json=json,
+                record_id=record_uuid
+            )
+        else:
+            record_source.json = json
+        db.session.add(record_source)
+
+
+def retrieve_root_json(record_uuid, source):
+    """Retrieve the root json for a given source.
+
+    Given a previously matched ``record_uuid``, the ``source`` information
+    (e.g. `arXiv` or `Elsevier`) returns the original record if existing or
+    empty json object otherwise.
+    """
+    entry = WorkflowsRecordSources.query.filter(
+        WorkflowsRecordSources.record_id == record_uuid,
+        WorkflowsRecordSources.source == source
+    ).one_or_none()
+    return entry.json if entry else {}