diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 79826d45..de9097f1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,13 +1,15 @@ name: Tests +env: + COLUMNS: 120 on: [push, pull_request] jobs: lint: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: Install requirements run: pip install flake8 pycodestyle - name: Check syntax @@ -16,16 +18,22 @@ jobs: test: strategy: matrix: - ckan-version: ["2.10", 2.9, 2.8, 2.7] + include: + - ckan-version: "2.11" + ckan-image: "ckan/ckan-dev:2.11-py3.10" + - ckan-version: "2.10" + ckan-image: "ckan/ckan-dev:2.10-py3.10" + - ckan-version: "2.9" + ckan-image: "ckan/ckan-dev:2.9-py3.9" fail-fast: false name: CKAN ${{ matrix.ckan-version }} runs-on: ubuntu-20.04 container: - image: openknowledge/ckan-dev:${{ matrix.ckan-version }} + image: ${{ matrix.ckan-image }} services: solr: - image: ckan/ckan-solr:${{ matrix.ckan-version }} + image: ckan/ckan-solr:${{ matrix.ckan-version }}-solr9 postgres: image: ckan/ckan-postgres-dev:${{ matrix.ckan-version }} env: @@ -43,43 +51,28 @@ jobs: CKAN_REDIS_URL: redis://redis:6379/1 steps: - - uses: actions/checkout@v3 - - name: Install requirements (Python 3) - if: ${{ matrix.ckan-version != '2.7' && matrix.ckan-version != '2.8' && matrix.ckan-version != '2.9-py2'}} + - uses: actions/checkout@v4 + - name: Install requirements (common) run: | pip install -r requirements.txt pip install -r dev-requirements.txt - - name: Install requirements (Python 2) - if: ${{ matrix.ckan-version == '2.7' || matrix.ckan-version == '2.8' || matrix.ckan-version == '2.9-py2'}} - run: | - pip install -r requirements-py2.txt - pip install -r dev-requirements-py2.txt - - name: Install requirements (common) - run: | pip install -e . 
# Replace default path to CKAN core config file with the one on the container sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini - - name: Setup extension (CKAN >= 2.9) - if: ${{ matrix.ckan-version != '2.7' && matrix.ckan-version != '2.8' }} + - name: Install requirements (2.9) run: | - # Install ckanext-harvest - git clone https://github.com/OpenGov-OpenData/ckanext-harvest.git - pip install -e ckanext-harvest - pip install -r ckanext-harvest/pip-requirements.txt - ckan -c test.ini db init - ckan -c test.ini harvester initdb - - name: Setup extension (CKAN < 2.9) - if: ${{ matrix.ckan-version == '2.7' || matrix.ckan-version == '2.8' }} + pip install -U pytest-rerunfailures + if: ${{ matrix.ckan-version == '2.9' }} + - name: Setup other extensions run: | - # Install ckanext-harvest version that supports 2.7 - git clone https://github.com/OpenGov-OpenData/ckanext-harvest.git + git clone https://github.com/OpenGov-OpenData/ckanext-harvest pip install -e ckanext-harvest - pip install -r ckanext-harvest/pip-requirements.txt - paster --plugin=ckan db init -c test.ini - paster --plugin=ckanext-harvest harvester initdb -c test.ini + pip install -r ckanext-harvest/requirements.txt + git clone https://github.com/OpenGov-OpenData/ckanext-scheming + pip install -e ckanext-scheming + - name: Setup extension + run: | + ckan -c test.ini db init + ckan -c test.ini db pending-migrations --apply - name: Run tests - run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=xml --cov-append --disable-warnings ckanext/dcat/tests - - name: Upload coverage report to codecov - uses: codecov/codecov-action@v1 - with: - file: ./coverage.xml + run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a9c788b..69aea84f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,66 @@ # Changelog - -## 
[Unreleased](https://github.com/ckan/ckanext-dcat/compare/v1.3.0...HEAD) +## [Unreleased](https://github.com/ckan/ckanext-dcat/compare/v1.7.0...HEAD) + +* Support for standard CKAN [ckanext-scheming](https://github.com/ckan/ckanext-scheming) schemas. + The DCAT profiles now seamlessly integrate with fields defined via the YAML or JSON scheming files. + Sites willing to migrate to a scheming based metadata schema can do + so by adding the `euro_dcat_ap_scheming` profile at the end of their profile chain (e.g. + `ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming`), which will modify the existing profile + outputs to the format expected by the scheming validators. Sample schemas are provided + in the `ckanext/dcat/schemas` folder. See the [documentation](https://github.com/ckan/ckanext-dcat?tab=readme-ov-file#schemas) + for all details. Some highlights of the new scheming based profiles: + + * Actual list support in the API output for list properties like `dct:language` + * Multiple objects now allowed for properties like `dcat:ContactPoint`, `dct:spatial` or `dct:temporal` + * Custom validators for date values that allow `xsd:gYear`, `xsd:gYearMonth`, `xsd:date` and `xsd:dateTime` + + (#281) +* [SHACL validation](https://github.com/SEMICeu/DCAT-AP/tree/master/releases/2.1.1) for DCAT-AP 2.1.1 profile (scheming and legacy). + SHACL validation surfaced the following issues in the existing profiles, which are now fixed: + * Cast `dcat:byteSize` and `dcat:spatialResolutionInMeters` as Decimal, not float + * Allow only one value of `dcat:spatialResolutionInMeters` and `dcat:temporalResolution` + * Only output the WKT version of geometries in `locn:geometry`, `dcat:bbox` and `dcat:centroid`. 
Sites that for some reason + require GeoJSON (or both) can use the `ckanext.dcat.output_spatial_format` config option + to choose which format to use + * When using the `euro_dcat_ap_2` profile, don't output temporal extent namespaced + both with `schema` and `dcat`, just with the latter (`dcat:startDate` and `dcat:endDate`) + (#288) +* New `ckan dcat consume` and `ckan dcat produce` CLI commands (#279) +* Parse dcat:spatialResolutionInMeters as float (#285) +* Split profile classes into their own separate files (#282) +* Catch Not Authorized in View (#280) +* CKAN 2.11 support and requirements updates (#270) + + +## [v1.7.0](https://github.com/ckan/ckanext-dcat/compare/v1.6.0...v1.7.0) - 2024-04-04 + +* Adds support for the latest Hydra vocabulary. For backward compatibility, the old properties are still supported but marked as deprecated. (#267) + +## [v1.6.0](https://github.com/ckan/ckanext-dcat/compare/v1.5.1...v1.6.0) - 2024-02-29 + +* Add support for `DCATAP.applicableLegislation` and `DCATAP.hvdCategory` to the `euro_dcat_ap_2` profile (#262) +* Improve access service tests (#258) +* Fix missing access service items when parsing dataset (#256) + +## [v1.5.1](https://github.com/ckan/ckanext-dcat/compare/v1.5.0...v1.5.1) - 2023-06-20 + +* Fix tests to work with `ckanext-harvest >= 1.5.4`. 
(#250) +* Add references for dcat:accessService to the `euro_dcat_ap_2` profile (#251) + +## [v1.5.0](https://github.com/ckan/ckanext-dcat/compare/v1.4.0...v1.5.0) - 2023-05-02 + +* Remove support for old CKAN versions prior 2.9 and Python 2 (#244) +* Update hooks to support CKAN 2.10 (#241) +* Fix description for RDF endpoints in README (#246) +* Fix media type for links to the Turtle representation in HTML templates (#242) +* Ignore already deleted packages when deleting (#238) +* Add support for dcat:accessService in dcat:Distribution (#235) + +## [v1.4.0](https://github.com/ckan/ckanext-dcat/compare/v1.3.0...v1.4.0) - 2022-12-05 + +* RDF serialization: Add fallback values for resource dates (#233) +* Add option for fallback distribution license if missing (#231) ## [v1.3.0](https://github.com/ckan/ckanext-dcat/compare/v1.2.0...v1.3.0) - 2022-08-01 diff --git a/README.md b/README.md index f03ecfd8..21ced668 100644 --- a/README.md +++ b/README.md @@ -5,50 +5,66 @@ [![Code Coverage](http://codecov.io/github/ckan/ckanext-dcat/coverage.svg?branch=master)](http://codecov.io/github/ckan/ckanext-dcat?branch=master) -This extension provides plugins that allow CKAN to expose and consume metadata from other catalogs using RDF documents serialized using DCAT. The Data Catalog Vocabulary (DCAT) is "an RDF vocabulary designed to facilitate interoperability between data catalogs published on the Web". More information can be found on the following W3C page: +This extension provides plugins that allow CKAN to expose its metadata and consume metadata from other catalogs using RDF documents serialized using DCAT. The Data Catalog Vocabulary (DCAT) is "an RDF vocabulary designed to facilitate interoperability between data catalogs published on the Web". 
More information can be found on the following W3C page: [http://www.w3.org/TR/vocab-dcat](http://www.w3.org/TR/vocab-dcat) It also offers other features related to Semantic Data like exposing the necessary markup to get your datasets indexed in [Google Dataset Search](https://toolbox.google.com/datasetsearch). +Check the [overview](#overview) section for a summary of the available features. + ## Contents + + - [Overview](#overview) - [Installation](#installation) +- [Schemas](#schemas) + * [Compatibility with existing profiles](#compatibility-with-existing-profiles) - [RDF DCAT endpoints](#rdf-dcat-endpoints) - - [Dataset endpoints](#dataset-endpoints) - - [Catalog endpoint](#catalog-endpoint) - - [URIs](#uris) - - [Content negotiation](#content-negotiation) + * [Dataset endpoints](#dataset-endpoints) + * [Catalog endpoint](#catalog-endpoint) + * [URIs](#uris) + * [Content negotiation](#content-negotiation) - [RDF DCAT harvester](#rdf-dcat-harvester) - - [Maximum file size](#maximum-file-size) - - [Transitive harvesting](#transitive-harvesting) - - [Extending the RDF harvester](#extending-the-rdf-harvester) + * [Maximum file size](#maximum-file-size) + * [Transitive harvesting](#transitive-harvesting) + * [Extending the RDF harvester](#extending-the-rdf-harvester) - [JSON DCAT harvester](#json-dcat-harvester) - [RDF DCAT to CKAN dataset mapping](#rdf-dcat-to-ckan-dataset-mapping) + * [Custom fields](#custom-fields) + * [URIs](#uris-1) + * [Lists](#lists) + * [Contact points and Publisher](#contact-points-and-publisher) + * [Spatial coverage](#spatial-coverage) + * [Licenses](#licenses) - [RDF DCAT Parser](#rdf-dcat-parser) - [RDF DCAT Serializer](#rdf-dcat-serializer) + * [Inherit license from the dataset as fallback in distributions](#inherit-license-from-the-dataset-as-fallback-in-distributions) - [Profiles](#profiles) - - [Writing custom profiles](#writing-custom-profiles) - - [Command line interface](#command-line-interface) - - [Compatibility 
mode](#compatibility-mode) + * [Writing custom profiles](#writing-custom-profiles) + * [Command line interface](#command-line-interface) + * [Compatibility mode](#compatibility-mode) - [XML DCAT harvester (deprecated)](#xml-dcat-harvester-deprecated) - [Translation of fields](#translation-of-fields) -- [Structured Data and Google Dataset Search indexing](#structured-data-and-google-dataset-search-indexing) +- [Structured data and Google Dataset Search indexing](#structured-data-and-google-dataset-search-indexing) +- [CLI](#cli) - [Running the Tests](#running-the-tests) - [Releases](#releases) - [Acknowledgements](#acknowledgements) - [Copying and License](#copying-and-license) -## Overview + -With the emergence of Open Data initiatives around the world, the need to share metadata across different catalogs has became more evident. Sites like [the EU Open Data Portal](https://data.europa.eu/euodp/en/data/) aggregate datasets from different portals, and there has been a growing demand to provide a clear and standard interface to allow incorporating metadata into them automatically. +## Overview -There is growing consensus around [DCAT](http://www.w3.org/TR/vocab-dcat) being the right way forward, but actual implementations are needed. This extension aims to provide tools and guidance to allow publishers to publish and share DCAT based metadata easily. +[DCAT](http://www.w3.org/TR/vocab-dcat) has become the basis for many metadata sharing standards, like DCAT-AP and DCAT-US for data portals in Europe and the USA respectively. This extension aims to provide tools and guidance to allow publishers to publish and share DCAT based metadata easily. In terms of CKAN features, this extension offers: +* [Pre-built CKAN schemas](#schemas) for common Application Profiles that can be adapted to each site requirement to provide out-of-the -box DCAT support in data portals. 
+ * [RDF DCAT Endpoints](#rdf-dcat-endpoints) that expose the catalog's datasets in different RDF serializations (`dcat` plugin). * An [RDF Harvester](#rdf-dcat-harvester) that allows importing RDF serializations from other catalogs to create CKAN datasets (`dcat_rdf_harvester` plugin). @@ -68,24 +84,66 @@ These are implemented internally using: ## Installation -1. Install ckanext-harvest ([https://github.com/ckan/ckanext-harvest#installation](https://github.com/ckan/ckanext-harvest#installation)) (Only if you want to use the RDF harvester) -2. Install the extension on your virtualenv: +1. Install the extension on your virtualenv: (pyenv) $ pip install -e git+https://github.com/ckan/ckanext-dcat.git#egg=ckanext-dcat -3. Install the extension requirements: +2. Install the extension requirements: (pyenv) $ pip install -r ckanext-dcat/requirements.txt - > **Note** - > - > If you are running on Python 2.7 or 3.6 please use `requirements-py2-py36.txt` instead - -4. Enable the required plugins in your ini file: +3. Enable the required plugins in your ini file: ckan.plugins = dcat dcat_rdf_harvester dcat_json_harvester dcat_json_interface structured_data +4. To use the pre-built schemas, install [ckanext-scheming](https://github.com/ckan/ckanext-scheming): + + pip install -e "git+https://github.com/ckan/ckanext-scheming.git#egg=ckanext-scheming" + +Check the [Schemas](#schemas) section for extra configuration needed. + +Optionally, if you want to use the RDF harvester, install ckanext-harvest as well ([https://github.com/ckan/ckanext-harvest#installation](https://github.com/ckan/ckanext-harvest#installation)). + +## Schemas + +The extension includes ready to use [ckanext-scheming](https://github.com/ckan/ckanext-scheming) schemas that enable DCAT support. 
These include a schema definition file (located in `ckanext/dcat/schemas`) plus extra validators and other custom logic that integrates the metadata modifications with the RDF DCAT [Parsers](#rdf-dcat-parser) and [Serializers](#rdf-dcat-serializer) and other CKAN features and extensions. + +The following schemas are currently included with the extension: + +* *dcat_ap_2.1_recommended.yaml*: Includes the recommended properties for `dcat:Dataset` and `dcat:Distribution` according to the [DCAT-AP 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. +* *dcat_ap_2.1_full.yaml*: Includes most of the properties defined for `dcat:Dataset` and `dcat:Distribution` in the [DCAT-AP 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. + +Most sites will want to use these as a base to create their own custom schema to address their own requirements, perhaps alongside a [custom profile](#writing-custom-profiles). Of course site maintainers can add or remove schema fields, as well as change the existing validators. + +In any case, the schema file used should be defined in the configuration file, alongside these configuration options: + + # Make sure to add scheming_datasets after the dcat plugin + ckan.plugins = activity dcat [...] 
scheming_datasets + + # Point to one of the defaults or your own version of the schema file + scheming.dataset_schemas = ckanext.dcat.schemas:dcat_ap_2.1_recommended.yaml + + # Include the dcat presets as well as the standard scheming ones + scheming.presets = ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml + + # Sites using the euro_dcat_ap and euro_dcat_ap_2 profiles must add the + # euro_dcat_ap_scheming profile if they want to use ckanext-scheming schemas (see next section) + ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming + +### Compatibility with existing profiles + +Sites using the existing `euro_dcat_ap` and `euro_dcat_ap_2` profiles should not see any change in their +current parsing and serialization functionalities and these profiles will not change their outputs going +forward (unless a bug is being fixed). Sites willing to migrate to a scheming based metadata schema can do +so by adding the `euro_dcat_ap_scheming` profile at the end of their profile chain (e.g. +`ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming`), which will modify the existing profile +outputs to the format expected by the scheming validators. + +Note that the scheming profile will only affect fields defined in the schema definition file, so sites can migrate different metadata fields gradually. + + + ## RDF DCAT endpoints By default when the `dcat` plugin is enabled, the following RDF endpoints are available on your CKAN instance. The schema used on the serializations can be customized using [profiles](#profiles). @@ -125,7 +183,7 @@ RDF representations will be advertised using `` tags on th
- + @@ -152,7 +210,7 @@ This endpoint can be customized if necessary using the `ckanext.dcat.catalog_end ckanext.dcat.catalog_endpoint = /dcat/catalog/{_format} -The custom endpoint **must** start with a backslash (`/`) and contain the `{_format}` placeholder. +The custom endpoint **must** start with a forward slash (`/`) and contain the `{_format}` placeholder. As described previously, the extension will determine the RDF serialization format returned. @@ -167,7 +225,7 @@ RDF representations will be advertised using `` tags on th - + @@ -177,10 +235,9 @@ The number of datasets returned is limited. The response will include paging inf @prefix hydra: