From 4f8e1ce87255d7ac41af44f75351f30e14332fed Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 4 Feb 2025 12:01:12 +0200 Subject: [PATCH] feat: validate datasets --- oc4ids_datastore_pipeline/pipeline.py | 17 +++- pyproject.toml | 7 +- requirements_dev.txt | 136 +++++++++++++++++++++++++- tests/test_pipeline.py | 34 ++++++- 4 files changed, 187 insertions(+), 7 deletions(-) diff --git a/oc4ids_datastore_pipeline/pipeline.py b/oc4ids_datastore_pipeline/pipeline.py index 1ce6189..4d87911 100644 --- a/oc4ids_datastore_pipeline/pipeline.py +++ b/oc4ids_datastore_pipeline/pipeline.py @@ -2,6 +2,7 @@ from typing import Any import requests +from libcoveoc4ids.api import oc4ids_json_output logger = logging.getLogger(__name__) @@ -28,10 +29,24 @@ def download_json(url: str) -> Any: raise Exception("Download failed", e) +def validate_json(dataset_name: str, json_data: Any) -> None: + logger.info(f"Validating dataset {dataset_name}") + try: + validation_result = oc4ids_json_output(json_data=json_data) + validation_errors_count = validation_result["validation_errors_count"] + if validation_errors_count > 0: + raise Exception(f"Dataset has {validation_errors_count} validation errors") + logger.info(f"Dataset {dataset_name} is valid") + except Exception as e: + raise Exception("Validation failed", e) + + def process_dataset(dataset_name: str, dataset_url: str) -> None: logger.info(f"Processing dataset {dataset_name}") try: - download_json(dataset_url) + json_data = download_json(dataset_url) + validate_json(dataset_name, json_data) + logger.info(f"Processed dataset {dataset_name}") except Exception as e: logger.warning(f"Failed to process dataset {dataset_name} with error {e}") diff --git a/pyproject.toml b/pyproject.toml index 96ebb3d..a12b620 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,8 @@ description = "OC4IDS Datastore Pipeline" version = "0.1.0" readme = "README.md" dependencies = [ - "requests" + "libcoveoc4ids", + "requests", ] [project.optional-dependencies] @@ -35,6 +36,10 @@ max-line-length = 88 [tool.mypy] strict = true +[[tool.mypy.overrides]] +module = ["libcoveoc4ids.*"] +follow_untyped_imports = true + [tool.pytest.ini_options] log_cli = true log_cli_level = "INFO" diff --git a/requirements_dev.txt b/requirements_dev.txt index 1102839..c9d8620 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -4,26 +4,75 @@ # # pip-compile --extra=dev --output-file=requirements_dev.txt pyproject.toml # +attrs==25.1.0 + # via + # cattrs + # jsonschema + # referencing + # requests-cache +backports-datetime-fromisoformat==2.0.3 + # via flattentool black==25.1.0 # via oc4ids-datastore-pipeline (pyproject.toml) +btrees==6.1 + # via zodb +cattrs==24.1.2 + # via requests-cache certifi==2025.1.31 # via requests +cffi==1.17.1 + # via persistent charset-normalizer==3.4.1 # via requests click==8.1.8 - # via black + # via + # black + # libcoveoc4ids + # libcoveocds +defusedxml==0.7.1 + # via odfpy +et-xmlfile==2.0.0 + # via openpyxl flake8==7.1.1 # via # flake8-pyproject # oc4ids-datastore-pipeline (pyproject.toml) flake8-pyproject==1.2.3 # via oc4ids-datastore-pipeline (pyproject.toml) +flattentool==0.27.0 + # via libcove idna==3.10 # via requests +ijson==3.3.0 + # via flattentool iniconfig==2.0.0 # via pytest isort==6.0.0 # via oc4ids-datastore-pipeline (pyproject.toml) +json-merge-patch==0.2 + # via ocdsextensionregistry +jsonref==1.1.0 + # via + # flattentool + # libcove + # libcoveocds + # ocdsextensionregistry +jsonschema==4.23.0 + # via + # libcove + # libcoveocds +jsonschema-specifications==2024.10.1 + # via jsonschema +libcove==0.32.1 + # via + # libcoveoc4ids + # libcoveocds +libcoveoc4ids==0.9.0 + # via oc4ids-datastore-pipeline (pyproject.toml) +libcoveocds==0.16.4 + # via libcoveoc4ids +lxml==5.3.0 + # via flattentool mccabe==0.7.0 # via flake8 mypy==1.14.1 @@ -32,18 +81,32 @@ mypy-extensions==1.0.0 # via # black # mypy +ocdsextensionregistry==0.6.9 + # via libcoveocds +odfpy==1.4.1 + # via flattentool +openpyxl==3.1.5 + # via flattentool packaging==24.2 # via # black # pytest pathspec==0.12.1 # via black +persistent==6.1 + # via + # btrees + # zodb platformdirs==4.3.6 - # via black + # via + # black + # requests-cache pluggy==1.5.0 # via pytest pycodestyle==2.12.1 # via flake8 +pycparser==2.22 + # via cffi pyflakes==3.2.0 # via flake8 pytest==8.3.4 @@ -52,13 +115,78 @@ pytest==8.3.4 # pytest-mock pytest-mock==3.14.0 # via oc4ids-datastore-pipeline (pyproject.toml) +pytz==2025.1 + # via flattentool +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications + # libcove + # libcoveocds requests==2.32.3 - # via oc4ids-datastore-pipeline (pyproject.toml) + # via + # libcove + # libcoveocds + # oc4ids-datastore-pipeline (pyproject.toml) + # ocdsextensionregistry + # requests-cache +requests-cache==1.2.1 + # via ocdsextensionregistry +rfc3339-validator==0.1.4 + # via libcove +rfc3987==1.3.8 + # via libcove +rpds-py==0.22.3 + # via + # jsonschema + # referencing +schema==0.7.7 + # via flattentool +six==1.17.0 + # via + # rfc3339-validator + # url-normalize +transaction==5.0 + # via zodb types-requests==2.32.0.20241016 # via oc4ids-datastore-pipeline (pyproject.toml) typing-extensions==4.12.2 - # via mypy + # via + # mypy + # referencing +url-normalize==1.4.3 + # via requests-cache urllib3==2.3.0 # via # requests + # requests-cache # types-requests +xmltodict==0.14.2 + # via flattentool +zc-lockfile==3.0.post1 + # via zodb +zc-zlibstorage==1.2.0 + # via flattentool +zconfig==4.2 + # via zodb +zodb==6.0 + # via + # flattentool + # zc-zlibstorage +zodbpickle==4.1.1 + # via zodb +zope-deferredimport==5.0 + # via persistent +zope-interface==7.2 + # via + # btrees + # persistent + # transaction + # zc-zlibstorage + # zodb + # zope-proxy +zope-proxy==6.1 + # via zope-deferredimport + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 955f6cc..20d8575 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,7 +1,11 @@ import pytest from pytest_mock import MockerFixture -from oc4ids_datastore_pipeline.pipeline import download_json, process_dataset +from oc4ids_datastore_pipeline.pipeline import ( + download_json, + process_dataset, + validate_json, +) def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None: @@ -15,6 +19,34 @@ def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None: assert "Mocked exception" in str(exc_info.value) +def test_validate_json_raises_failure_exception(mocker: MockerFixture) -> None: + patch_oc4ids_json_output = mocker.patch( + "oc4ids_datastore_pipeline.pipeline.oc4ids_json_output" + ) + patch_oc4ids_json_output.side_effect = Exception("Mocked exception") + + with pytest.raises(Exception) as exc_info: + validate_json(dataset_name="test_dataset", json_data={}) + + assert "Validation failed" in str(exc_info.value) + assert "Mocked exception" in str(exc_info.value) + + +def test_validate_json_raises_validation_errors_exception( + mocker: MockerFixture, +) -> None: + patch_oc4ids_json_output = mocker.patch( + "oc4ids_datastore_pipeline.pipeline.oc4ids_json_output" + ) + patch_oc4ids_json_output.return_value = {"validation_errors_count": 2} + + with pytest.raises(Exception) as exc_info: + validate_json(dataset_name="test_dataset", json_data={}) + + assert "Validation failed" in str(exc_info.value) + assert "Dataset has 2 validation errors" in str(exc_info.value) + + def test_process_dataset_catches_exception(mocker: MockerFixture) -> None: patch_download_json = mocker.patch( "oc4ids_datastore_pipeline.pipeline.download_json"