Skip to content

Commit

Permalink
[SUPDESQ-131] (#60)
Browse files Browse the repository at this point in the history
- Added validated datasets feature
  • Loading branch information
MarkCalvert authored Oct 26, 2023
1 parent 19e5a38 commit 384e0bf
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 3 deletions.
15 changes: 13 additions & 2 deletions ckanext/qdes/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,18 @@ def ckan_worker_job_monitor():
click.secho(u"CKAN job worker monitor added to worker queue", fg=u"green")
except Exception as e:
log.error(e)


@click.command(u"validate-datasets")
@click.pass_context
def validate_datasets(ctx):
    """Validate all datasets and email the validation errors that are found.

    Runs ``jobs.validate_datasets()`` inside a test request context created
    from the Flask app stored in ``ctx.meta['flask_app']``. Any failure is
    logged instead of being raised, so the command always prints its
    "Finished" message.
    """
    click.secho(u"Starting validating datasets", fg=u"green")
    try:
        flask_app = ctx.meta['flask_app']
        # NOTE(review): presumably the job needs a request context because it
        # renders email templates and builds URLs - confirm.
        with flask_app.test_request_context():
            jobs.validate_datasets()
    except Exception:
        # Best-effort command: keep the CLI alive but record the full
        # traceback so the failure can be diagnosed from the logs.
        log.exception(u"Error validating datasets")
    click.secho(u"Finished validating datasets", fg=u"green")

def get_commands():
    """Return the click commands exposed by this module.

    ``validate_datasets`` is part of the list; a leftover earlier ``return``
    without it previously made the complete list unreachable.
    """
    return [
        generate_audit_reports,
        review_datasets,
        send_email_notifications,
        ckan_worker_job_monitor,
        validate_datasets,
    ]
100 changes: 99 additions & 1 deletion ckanext/qdes/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,14 @@
from ckanext.qdes import helpers, constants
from datetime import datetime
from dateutil.relativedelta import relativedelta
from ckanext.scheming.plugins import SchemingDatasetsPlugin
from ckan.logic.schema import default_update_package_schema

get_action = toolkit.get_action
render = toolkit.render
config = toolkit.config
enqueue_job = toolkit.enqueue_job
mail_recipient = toolkit.mail_recipient
log = logging.getLogger(__name__)


Expand Down Expand Up @@ -48,7 +53,7 @@ def review_datasets(data_dict={}):
body_html = render('emails/body/review_datasets.html', {'datasets': datasets})
# Improvements for job worker visibility when troubleshooting via logs
job_title = f'Review datasets: Sending email to {recipient_name}'
toolkit.enqueue_job(toolkit.mail_recipient, [recipient_name, recipient_email, subject, body, body_html], title=job_title)
enqueue_job(mail_recipient, [recipient_name, recipient_email, subject, body, body_html], title=job_title)


def generate_reports():
Expand Down Expand Up @@ -160,3 +165,96 @@ def ckan_worker_job_monitor():
except requests.RequestException as e:
log.error(f'Failed to send ckan worker job monitor notification to {monitor_url}')
log.error(str(e))

def _without_excluded(errors, excluded):
    """Return a copy of *errors* without the keys listed in *excluded*."""
    return {key: error for key, error in errors.items() if key not in excluded}


def _collect_resource_errors(plugin, context, data_dict, schema, excluded):
    """Re-validate *data_dict* one resource at a time and report the failures.

    Validating the whole dataset gives no indication of which resource is
    throwing an error, so the validation is re-run with a single resource
    attached each time.

    Returns a list with one ``{'resource_id', 'resource_name', 'errors'}``
    dict for every resource that still has errors once the *excluded* fields
    are removed. The list is empty when no resource fails.
    """
    resources = data_dict.get('resources') or []
    failures = []
    try:
        for res in resources:
            data_dict['resources'] = [res]
            _, errors = plugin.validate(context, data_dict, schema, 'package_update')
            per_resource = errors.get('resources') or []
            if len(per_resource) != 1:
                continue
            resource_errors = _without_excluded(per_resource[0], excluded)
            if resource_errors:
                # Accumulate: assigning a fresh one-item list per resource
                # would keep only the last failing resource.
                failures.append({
                    'resource_id': res.get('id'),
                    'resource_name': res.get('name'),
                    'errors': resource_errors
                })
    finally:
        # Hand the dataset back with its complete resource list instead of
        # leaving only the last resource that was validated.
        if resources:
            data_dict['resources'] = resources
    return failures


def _send_validation_email(recipient_name, recipient_email, datasets):
    """Render the validation error email for *datasets* and send it."""
    subject = render('emails/subject/validate_datasets.txt')
    body = render('emails/body/validate_datasets.txt', {'datasets': datasets})
    body_html = render('emails/body/validate_datasets.html', {'datasets': datasets})
    mail_recipient(recipient_name, recipient_email, subject, body, body_html)


def validate_datasets():
    """Validate every dataset and email the validation errors that are found.

    Steps:

    1. Each dataset returned by ``package_list`` is loaded and validated with
       the scheming plugin as a ``package_update``. Fields listed in the
       ``ckanext.qdes.validation_error_exclude_metadata`` config option are
       removed from the reported errors, for datasets and resources alike.
    2. Datasets with remaining errors are grouped by their ``contact_point``
       value and each contact point found in the ``point-of-contact`` secure
       vocabulary is emailed its own datasets.
    3. The complete list is emailed to the recipient configured through
       ``ckanext.qdes.validation_error_recipient_name`` and
       ``ckanext.qdes.validation_error_recipient_email``.

    Failures for a single dataset or a single email are logged and do not
    stop the remaining work. Nothing is returned.
    """
    # Get site default user
    user = get_action('get_site_user')({'ignore_auth': True})
    context = {
        'user': user.get('name'),
        'auth_user_obj': user,
        'ignore_auth': True,
    }
    plugin = SchemingDatasetsPlugin.instance
    # Get all public datasets
    dataset_list = get_action('package_list')(context, {})
    exclude_from_validation = toolkit.aslist(config.get('ckanext.qdes.validation_error_exclude_metadata'))

    # Loop through all datasets and validate each one
    validation_errors = []
    for dataset in dataset_list:
        try:
            data_dict = get_action('package_show')(context, {"id": dataset})
            # NOTE(review): a fresh schema is built per dataset, as before;
            # presumably validate() may modify the schema it is given.
            schema = default_update_package_schema()
            _, pkg_errors = plugin.validate(context, data_dict, schema, 'package_update')
            if not pkg_errors:
                continue
            log.error("Validation errors for dataset: {}".format(dataset))
            # Exclude certain fields from dataset validation errors
            pkg_errors = _without_excluded(pkg_errors, exclude_from_validation)
            if pkg_errors.get('resources'):
                # Replace the anonymous resource errors with entries that
                # identify the failing resources.
                pkg_errors.pop('resources')
                resource_failures = _collect_resource_errors(
                    plugin, context, data_dict, schema, exclude_from_validation)
                if resource_failures:
                    pkg_errors['resources'] = resource_failures
            if pkg_errors:
                validation_errors.append({'dataset': data_dict, 'errors': pkg_errors})
        except Exception as e:
            log.error(f"Error validating dataset: {dataset}")
            log.error(f"Error: {e}")

    # Aggregate dataset validation errors by contact point
    contact_points = {}
    for validation_error in validation_errors:
        contact_point = validation_error.get('dataset').get('contact_point')
        datasets = contact_points.setdefault(contact_point, [])
        # Only add dataset if it does not already exist in datasets list
        if validation_error not in datasets:
            datasets.append(validation_error)

    # Send email to contact points if there are validation errors
    for contact_point, datasets in contact_points.items():
        try:
            contact_point_data = get_action('get_secure_vocabulary_record')(
                context, {'vocabulary_name': 'point-of-contact', 'query': contact_point})
            if not contact_point_data:
                # No contact point found for email notification
                log.error(f'No contact point found for {contact_point}')
                continue
            _send_validation_email(
                contact_point_data.get('Name', ''),
                contact_point_data.get('Email', ''),
                datasets)
        except Exception:
            # Keep going for the other contact points, but record why this
            # one failed (the exception used to be dropped silently).
            log.exception(f"Error sending email to {contact_point}")

    # Send email to admin if there are validation errors
    if validation_errors:
        recipient_name = config.get('ckanext.qdes.validation_error_recipient_name')
        recipient_email = config.get('ckanext.qdes.validation_error_recipient_email')
        if not recipient_email:
            log.error('validation_error_recipient_email is not set')
        else:
            try:
                _send_validation_email(recipient_name, recipient_email, validation_errors)
            except Exception:
                log.exception(f"Error sending email to {recipient_name}:{recipient_email}")
44 changes: 44 additions & 0 deletions ckanext/qdes/templates/emails/body/validate_datasets.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{# HTML body of the dataset validation error email.
   Expects `datasets`: a list of mappings, each with a 'dataset' entry (the
   dataset dict) and an 'errors' entry keyed by field name. The 'resources'
   error entry is a list of items exposing resource_id, resource_name and
   their own errors mapping. Nested lists are kept inside their parent <li>
   so the markup is valid HTML. #}
<p>Queensland Environmental Science Data (QESD) catalogue automatically checks for validation errors of datasets and their resources.</p>
<p>The following dataset(s) have validation errors:</p>
<ul>
{% for validation_error in datasets %}
    {%- set dataset = validation_error.get('dataset') -%}
    {%- set errors = validation_error.get('errors') -%}
    {%- set schema = h.scheming_get_dataset_schema(dataset.type) -%}
    <li>
        <a href="{{ h.url_for(dataset.type + '.read', id=dataset.name, _external=True) }}">{{dataset.title}}</a>
        <ul>
        {% for key, error in errors.items() %}
            {% if key == 'resources' %}
            <li>Resources:
                <ul>
                {% for resource in error %}
                    <li>
                        <a href="{{ h.url_for(dataset.type + '_resource.read', package_type=dataset.type, id=dataset.name, resource_id=resource.resource_id, _external=True) }}">
                            {{resource.resource_name}}
                        </a>
                        <ul>
                        {% for key, error in resource.errors.items() %}
                            <li>
                                <a target="_blank" href="{{ h.url_for('resource.edit', package_type=dataset.type, id=dataset.name, resource_id=resource.resource_id, _external=True) }}#field-{{key}}">
                                    {{ h.qdes_get_field_label(key, schema, 'resource_fields') }}
                                </a> : {{ error|join(', ') }}
                            </li>
                        {% endfor %}
                        </ul>
                    </li>
                {% endfor %}
                </ul>
            </li>
            {% else %}
            <li>
                <a target="_blank" href="{{ h.url_for(dataset.type + '.edit', id=dataset.name, _external=True) }}#field-{{key}}">
                    {{ h.qdes_get_field_label(key, schema) }}
                </a> : {{ error|join(', ') }}
            </li>
            {% endif %}
        {% endfor %}
        </ul>
        <br>
    </li>
{% endfor %}
</ul>
<p>Please review the dataset metadata.</p>
25 changes: 25 additions & 0 deletions ckanext/qdes/templates/emails/body/validate_datasets.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{#- Plain-text body of the dataset validation error email.
    Expects `datasets`: a list of mappings, each with a 'dataset' entry (the
    dataset dict) and an 'errors' entry keyed by field name; the 'resources'
    entry is a list of items exposing resource_id, resource_name and errors.
    Whitespace control markers are significant here; this comment strips its
    own surrounding whitespace so the rendered text is unchanged. -#}
Queensland Environmental Science Data (QESD) catalogue automatically checks for validation errors of datasets and their resources.

The following dataset(s) have validation errors:

{%- for validation_error in datasets -%}
{%- set dataset = validation_error.get('dataset') -%}
{%- set errors = validation_error.get('errors') -%}
{%- set schema = h.scheming_get_dataset_schema(dataset.type) -%}
{%- set dataset_type_action = dataset.type + '.edit' %}
- {{dataset.title}}: {{ h.url_for(dataset_type_action, id=dataset.name, _external=True) }}
{%- for key, error in errors.items() %}
{% if key == 'resources' -%}
- Resources:
{%- for resource in error %}
- {{ resource.resource_name }}: {{ h.url_for('resource.edit', package_type=dataset.type, id=dataset.name, resource_id=resource.resource_id, _external=True) }}
{%- for key, error in resource.errors.items() %}
- {{ h.qdes_get_field_label(key, schema, 'resource_fields') }}: {{ error|join(', ')|safe }}
{%- endfor %}
{%- endfor %}
{%- else -%}
- {{ h.qdes_get_field_label(key, schema) }}: {{ error|join(', ')|safe }}
{%- endif -%}
{% endfor %}
{% endfor %}
Please review the dataset metadata.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
QESD catalogue – Dataset validation errors

0 comments on commit 384e0bf

Please sign in to comment.