Skip to content

Commit

Permalink
feat: validate datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
tillywoodfield committed Feb 4, 2025
1 parent a358860 commit 4f8e1ce
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 7 deletions.
17 changes: 16 additions & 1 deletion oc4ids_datastore_pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Any

import requests
from libcoveoc4ids.api import oc4ids_json_output

logger = logging.getLogger(__name__)

Expand All @@ -28,10 +29,24 @@ def download_json(url: str) -> Any:
raise Exception("Download failed", e)


def validate_json(dataset_name: str, json_data: Any) -> None:
    """Validate a dataset with ``oc4ids_json_output`` and raise if it is invalid.

    Args:
        dataset_name: Name of the dataset; used only in log messages.
        json_data: Parsed JSON document handed to ``oc4ids_json_output``.

    Raises:
        Exception: With args ``("Validation failed", cause)`` when the validator
            itself raises, when its result has no ``validation_errors_count``
            entry, or when that count is greater than zero.
    """
    logger.info(f"Validating dataset {dataset_name}")
    try:
        validation_result = oc4ids_json_output(json_data=json_data)
        validation_errors_count = validation_result["validation_errors_count"]
        if validation_errors_count > 0:
            # Raised inside the ``try`` on purpose: the handler below wraps it so
            # callers always see a single "Validation failed" exception.
            raise Exception(f"Dataset has {validation_errors_count} validation errors")
        logger.info(f"Dataset {dataset_name} is valid")
    except Exception as e:
        # Keep ``e`` in the args (callers read it through ``str()``), and chain it
        # explicitly so the traceback reports it as the direct cause.
        raise Exception("Validation failed", e) from e


def process_dataset(dataset_name: str, dataset_url: str) -> None:
    """Download and validate a single dataset, logging any failure.

    Any exception raised by the download or validation step is caught and
    reported as a warning, so this function does not propagate errors.

    Args:
        dataset_name: Name of the dataset; used in log messages and validation.
        dataset_url: URL passed to ``download_json``.
    """
    logger.info(f"Processing dataset {dataset_name}")
    try:
        # Download exactly once and keep the result for validation.
        json_data = download_json(dataset_url)
        validate_json(dataset_name, json_data)
        logger.info(f"Processed dataset {dataset_name}")
    except Exception as e:
        logger.warning(f"Failed to process dataset {dataset_name} with error {e}")

Expand Down
7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ description = "OC4IDS Datastore Pipeline"
version = "0.1.0"
readme = "README.md"
dependencies = [
"requests"
"libcoveoc4ids",
"requests",
]

[project.optional-dependencies]
Expand All @@ -35,6 +36,10 @@ max-line-length = 88
[tool.mypy]
strict = true

[[tool.mypy.overrides]]
module = ["libcoveoc4ids.*"]
follow_untyped_imports = true

[tool.pytest.ini_options]
log_cli = true
log_cli_level = "INFO"
136 changes: 132 additions & 4 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,75 @@
#
# pip-compile --extra=dev --output-file=requirements_dev.txt pyproject.toml
#
attrs==25.1.0
# via
# cattrs
# jsonschema
# referencing
# requests-cache
backports-datetime-fromisoformat==2.0.3
# via flattentool
black==25.1.0
# via oc4ids-datastore-pipeline (pyproject.toml)
btrees==6.1
# via zodb
cattrs==24.1.2
# via requests-cache
certifi==2025.1.31
# via requests
cffi==1.17.1
# via persistent
charset-normalizer==3.4.1
# via requests
click==8.1.8
# via black
# via
# black
# libcoveoc4ids
# libcoveocds
defusedxml==0.7.1
# via odfpy
et-xmlfile==2.0.0
# via openpyxl
flake8==7.1.1
# via
# flake8-pyproject
# oc4ids-datastore-pipeline (pyproject.toml)
flake8-pyproject==1.2.3
# via oc4ids-datastore-pipeline (pyproject.toml)
flattentool==0.27.0
# via libcove
idna==3.10
# via requests
ijson==3.3.0
# via flattentool
iniconfig==2.0.0
# via pytest
isort==6.0.0
# via oc4ids-datastore-pipeline (pyproject.toml)
json-merge-patch==0.2
# via ocdsextensionregistry
jsonref==1.1.0
# via
# flattentool
# libcove
# libcoveocds
# ocdsextensionregistry
jsonschema==4.23.0
# via
# libcove
# libcoveocds
jsonschema-specifications==2024.10.1
# via jsonschema
libcove==0.32.1
# via
# libcoveoc4ids
# libcoveocds
libcoveoc4ids==0.9.0
# via oc4ids-datastore-pipeline (pyproject.toml)
libcoveocds==0.16.4
# via libcoveoc4ids
lxml==5.3.0
# via flattentool
mccabe==0.7.0
# via flake8
mypy==1.14.1
Expand All @@ -32,18 +81,32 @@ mypy-extensions==1.0.0
# via
# black
# mypy
ocdsextensionregistry==0.6.9
# via libcoveocds
odfpy==1.4.1
# via flattentool
openpyxl==3.1.5
# via flattentool
packaging==24.2
# via
# black
# pytest
pathspec==0.12.1
# via black
persistent==6.1
# via
# btrees
# zodb
platformdirs==4.3.6
# via black
# via
# black
# requests-cache
pluggy==1.5.0
# via pytest
pycodestyle==2.12.1
# via flake8
pycparser==2.22
# via cffi
pyflakes==3.2.0
# via flake8
pytest==8.3.4
Expand All @@ -52,13 +115,78 @@ pytest==8.3.4
# pytest-mock
pytest-mock==3.14.0
# via oc4ids-datastore-pipeline (pyproject.toml)
pytz==2025.1
# via flattentool
referencing==0.36.2
# via
# jsonschema
# jsonschema-specifications
# libcove
# libcoveocds
requests==2.32.3
# via oc4ids-datastore-pipeline (pyproject.toml)
# via
# libcove
# libcoveocds
# oc4ids-datastore-pipeline (pyproject.toml)
# ocdsextensionregistry
# requests-cache
requests-cache==1.2.1
# via ocdsextensionregistry
rfc3339-validator==0.1.4
# via libcove
rfc3987==1.3.8
# via libcove
rpds-py==0.22.3
# via
# jsonschema
# referencing
schema==0.7.7
# via flattentool
six==1.17.0
# via
# rfc3339-validator
# url-normalize
transaction==5.0
# via zodb
types-requests==2.32.0.20241016
# via oc4ids-datastore-pipeline (pyproject.toml)
typing-extensions==4.12.2
# via mypy
# via
# mypy
# referencing
url-normalize==1.4.3
# via requests-cache
urllib3==2.3.0
# via
# requests
# requests-cache
# types-requests
xmltodict==0.14.2
# via flattentool
zc-lockfile==3.0.post1
# via zodb
zc-zlibstorage==1.2.0
# via flattentool
zconfig==4.2
# via zodb
zodb==6.0
# via
# flattentool
# zc-zlibstorage
zodbpickle==4.1.1
# via zodb
zope-deferredimport==5.0
# via persistent
zope-interface==7.2
# via
# btrees
# persistent
# transaction
# zc-zlibstorage
# zodb
# zope-proxy
zope-proxy==6.1
# via zope-deferredimport

# The following packages are considered to be unsafe in a requirements file:
# setuptools
34 changes: 33 additions & 1 deletion tests/test_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import pytest
from pytest_mock import MockerFixture

from oc4ids_datastore_pipeline.pipeline import download_json, process_dataset
from oc4ids_datastore_pipeline.pipeline import (
download_json,
process_dataset,
validate_json,
)


def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None:
Expand All @@ -15,6 +19,34 @@ def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None:
assert "Mocked exception" in str(exc_info.value)


def test_validate_json_raises_failure_exception(mocker: MockerFixture) -> None:
    """An error raised by the validator is wrapped as "Validation failed"."""
    mocker.patch(
        "oc4ids_datastore_pipeline.pipeline.oc4ids_json_output",
        side_effect=Exception("Mocked exception"),
    )

    with pytest.raises(Exception) as raised:
        validate_json(dataset_name="test_dataset", json_data={})

    message = str(raised.value)
    assert "Validation failed" in message
    assert "Mocked exception" in message


def test_validate_json_raises_validation_errors_exception(
    mocker: MockerFixture,
) -> None:
    """A non-zero validation error count is reported as "Validation failed"."""
    mocker.patch(
        "oc4ids_datastore_pipeline.pipeline.oc4ids_json_output",
        return_value={"validation_errors_count": 2},
    )

    with pytest.raises(Exception) as raised:
        validate_json(dataset_name="test_dataset", json_data={})

    message = str(raised.value)
    assert "Validation failed" in message
    assert "Dataset has 2 validation errors" in message


def test_process_dataset_catches_exception(mocker: MockerFixture) -> None:
patch_download_json = mocker.patch(
"oc4ids_datastore_pipeline.pipeline.download_json"
Expand Down

0 comments on commit 4f8e1ce

Please sign in to comment.