diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 79826d45..de9097f1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,13 +1,15 @@ name: Tests +env: + COLUMNS: 120 on: [push, pull_request] jobs: lint: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: Install requirements run: pip install flake8 pycodestyle - name: Check syntax @@ -16,16 +18,22 @@ jobs: test: strategy: matrix: - ckan-version: ["2.10", 2.9, 2.8, 2.7] + include: + - ckan-version: "2.11" + ckan-image: "ckan/ckan-dev:2.11-py3.10" + - ckan-version: "2.10" + ckan-image: "ckan/ckan-dev:2.10-py3.10" + - ckan-version: "2.9" + ckan-image: "ckan/ckan-dev:2.9-py3.9" fail-fast: false name: CKAN ${{ matrix.ckan-version }} runs-on: ubuntu-20.04 container: - image: openknowledge/ckan-dev:${{ matrix.ckan-version }} + image: ${{ matrix.ckan-image }} services: solr: - image: ckan/ckan-solr:${{ matrix.ckan-version }} + image: ckan/ckan-solr:${{ matrix.ckan-version }}-solr9 postgres: image: ckan/ckan-postgres-dev:${{ matrix.ckan-version }} env: @@ -43,43 +51,28 @@ jobs: CKAN_REDIS_URL: redis://redis:6379/1 steps: - - uses: actions/checkout@v3 - - name: Install requirements (Python 3) - if: ${{ matrix.ckan-version != '2.7' && matrix.ckan-version != '2.8' && matrix.ckan-version != '2.9-py2'}} + - uses: actions/checkout@v4 + - name: Install requirements (common) run: | pip install -r requirements.txt pip install -r dev-requirements.txt - - name: Install requirements (Python 2) - if: ${{ matrix.ckan-version == '2.7' || matrix.ckan-version == '2.8' || matrix.ckan-version == '2.9-py2'}} - run: | - pip install -r requirements-py2.txt - pip install -r dev-requirements-py2.txt - - name: Install requirements (common) - run: | pip install -e . 
# Replace default path to CKAN core config file with the one on the container sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini - - name: Setup extension (CKAN >= 2.9) - if: ${{ matrix.ckan-version != '2.7' && matrix.ckan-version != '2.8' }} + - name: Install requirements (2.9) run: | - # Install ckanext-harvest - git clone https://github.com/OpenGov-OpenData/ckanext-harvest.git - pip install -e ckanext-harvest - pip install -r ckanext-harvest/pip-requirements.txt - ckan -c test.ini db init - ckan -c test.ini harvester initdb - - name: Setup extension (CKAN < 2.9) - if: ${{ matrix.ckan-version == '2.7' || matrix.ckan-version == '2.8' }} + pip install -U pytest-rerunfailures + if: ${{ matrix.ckan-version == '2.9' }} + - name: Setup other extensions run: | - # Install ckanext-harvest version that supports 2.7 - git clone https://github.com/OpenGov-OpenData/ckanext-harvest.git + git clone https://github.com/OpenGov-OpenData/ckanext-harvest pip install -e ckanext-harvest - pip install -r ckanext-harvest/pip-requirements.txt - paster --plugin=ckan db init -c test.ini - paster --plugin=ckanext-harvest harvester initdb -c test.ini + pip install -r ckanext-harvest/requirements.txt + git clone https://github.com/OpenGov-OpenData/ckanext-scheming + pip install -e ckanext-scheming + - name: Setup extension + run: | + ckan -c test.ini db init + ckan -c test.ini db pending-migrations --apply - name: Run tests - run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=xml --cov-append --disable-warnings ckanext/dcat/tests - - name: Upload coverage report to codecov - uses: codecov/codecov-action@v1 - with: - file: ./coverage.xml + run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a9c788b..69aea84f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,66 @@ # Changelog - -## [Unreleased](https://github.com/ckan/ckanext-dcat/compare/v1.3.0...HEAD) +## [Unreleased](https://github.com/ckan/ckanext-dcat/compare/v1.7.0...HEAD) + +* Support for standard CKAN [ckanext-scheming](https://github.com/ckan/ckanext-scheming) schemas. + The DCAT profiles now seamlessly integrate with fields defined via the YAML or JSON scheming files. + Sites willing to migrate to a scheming based metadata schema can do + so by adding the `euro_dcat_ap_scheming` profile at the end of their profile chain (e.g. + `ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming`), which will modify the existing profile + outputs to the expected format by the scheming validators. Sample schemas are provided + in the `ckanext/dcat/schemas` folder. See the [documentation](https://github.com/ckan/ckanext-dcat?tab=readme-ov-file#schemas) + for all details. Some highlights of the new scheming based profiles: + + * Actual list support in the API output for list properties like `dct:language` + * Multiple objects now allowed for properties like `dcat:ContactPoint`, `dct:spatial` or `dct:temporal` + * Custom validators for date values that allow `xsd:gYear`, `xsd:gYearMonth`, `xsd:date` and `xsd:dateTime` + + (#281) +* [SHACL validation](https://github.com/SEMICeu/DCAT-AP/tree/master/releases/2.1.1) for DCAT-AP 2.1.1 profile (scheming and legacy). 
+ SHACL validation made surface the following issues in the existing profiles, which are now fixed: + * Cast `dcat:byteSize` and `dcat:spatialResolutionInMeters` as Decimal, not float + * Allow only one value of `dcat:spatialResolutionInMeters` and `dcat:temporalResolution` + * Only output the WKT version of geometries in `locn:geometry`, `dcat:bbox` and `dcat:centroid`. Sites that for some reason + require GeoJSON (or both) can use the `ckanext.dcat.output_spatial_format` config option + to choose which format to use + * When using the `euro_dcat_ap_2` profile, don't output temporal extent namespaced + both with `schema` and `dcat`, just with the latter (`dcat:startDate` and `dcat:endDate`) + (#288) +* New `ckan dcat consume` and `ckan dcat produce` CLI commands (#279) +* Parse dcat:spatialResolutionInMeters as float (#285) +* Split profile classes into their own separate files (#282) +* Catch Not Authorized in View (#280) +* CKAN 2.11 support and requirements updates (#270) + + +## [v1.7.0](https://github.com/ckan/ckanext-dcat/compare/v1.6.0...v1.7.0) - 2024-04-04 + +* Adds support for the latest Hydra vocabulary. For backward compatibility, the old properties are still supported but marked as deprecated. (#267) + +## [v1.6.0](https://github.com/ckan/ckanext-dcat/compare/v1.5.1...v1.6.0) - 2024-02-29 + +* Add support for `DCATAP.applicableLegislation` and `DCATAP.hvdCategory` to the `euro_dcat_ap_2` profile (#262) +* Improve access service tests (#258) +* Fix missing access service items when parsing dataset (#256) + +## [v1.5.1](https://github.com/ckan/ckanext-dcat/compare/v1.5.0...v1.5.1) - 2023-06-20 + +* Fix tests to work with `ckanext-harvest >= 1.5.4`. (#250) +* Add references for dcat:accessService to the `euro_dcat_ap_2` profile (#251) + +## [v1.5.0](https://github.com/ckan/ckanext-dcat/compare/v1.4.0...v1.5.0) - 2023-05-02 + +* Remove support for old CKAN versions prior 2.9 and Python 2 (#244) +* Update hooks to support CKAN 2.10 (#241) +* Fix description for RDF endpoints in README (#246) +* Fix media type for links to the Turtle representation in HTML templates (#242) +* Ignore already deleted packages when deleting (#238) +* Add support for dcat:accessService in dcat:Distribution (#235) + +## [v1.4.0](https://github.com/ckan/ckanext-dcat/compare/v1.3.0...v1.4.0) - 2022-12-05 + +* RDF serialization: Add fallback values for resource dates (#233) +* Add option for fallback distribution license if missing (#231) ## [v1.3.0](https://github.com/ckan/ckanext-dcat/compare/v1.2.0...v1.3.0) - 2022-08-01 diff --git a/README.md b/README.md index f03ecfd8..21ced668 100644 --- a/README.md +++ b/README.md @@ -5,50 +5,66 @@ [![Code Coverage](http://codecov.io/github/ckan/ckanext-dcat/coverage.svg?branch=master)](http://codecov.io/github/ckan/ckanext-dcat?branch=master) -This extension provides plugins that allow CKAN to expose and consume metadata from other catalogs using RDF documents serialized using DCAT. The Data Catalog Vocabulary (DCAT) is "an RDF vocabulary designed to facilitate interoperability between data catalogs published on the Web". More information can be found on the following W3C page: +This extension provides plugins that allow CKAN to expose its metadata and consume metadata from other catalogs using RDF documents serialized using DCAT. The Data Catalog Vocabulary (DCAT) is "an RDF vocabulary designed to facilitate interoperability between data catalogs published on the Web". 
More information can be found on the following W3C page: [http://www.w3.org/TR/vocab-dcat](http://www.w3.org/TR/vocab-dcat) It also offers other features related to Semantic Data like exposing the necessary markup to get your datasets indexed in [Google Dataset Search](https://toolbox.google.com/datasetsearch). +Check the [overview](#overview) section for a summary of the available features. + ## Contents + + - [Overview](#overview) - [Installation](#installation) +- [Schemas](#schemas) + * [Compatibility with existing profiles](#compatibility-with-existing-profiles) - [RDF DCAT endpoints](#rdf-dcat-endpoints) - - [Dataset endpoints](#dataset-endpoints) - - [Catalog endpoint](#catalog-endpoint) - - [URIs](#uris) - - [Content negotiation](#content-negotiation) + * [Dataset endpoints](#dataset-endpoints) + * [Catalog endpoint](#catalog-endpoint) + * [URIs](#uris) + * [Content negotiation](#content-negotiation) - [RDF DCAT harvester](#rdf-dcat-harvester) - - [Maximum file size](#maximum-file-size) - - [Transitive harvesting](#transitive-harvesting) - - [Extending the RDF harvester](#extending-the-rdf-harvester) + * [Maximum file size](#maximum-file-size) + * [Transitive harvesting](#transitive-harvesting) + * [Extending the RDF harvester](#extending-the-rdf-harvester) - [JSON DCAT harvester](#json-dcat-harvester) - [RDF DCAT to CKAN dataset mapping](#rdf-dcat-to-ckan-dataset-mapping) + * [Custom fields](#custom-fields) + * [URIs](#uris-1) + * [Lists](#lists) + * [Contact points and Publisher](#contact-points-and-publisher) + * [Spatial coverage](#spatial-coverage) + * [Licenses](#licenses) - [RDF DCAT Parser](#rdf-dcat-parser) - [RDF DCAT Serializer](#rdf-dcat-serializer) + * [Inherit license from the dataset as fallback in distributions](#inherit-license-from-the-dataset-as-fallback-in-distributions) - [Profiles](#profiles) - - [Writing custom profiles](#writing-custom-profiles) - - [Command line interface](#command-line-interface) - - [Compatibility mode](#compatibility-mode) + * [Writing custom profiles](#writing-custom-profiles) + * [Command line interface](#command-line-interface) + * [Compatibility mode](#compatibility-mode) - [XML DCAT harvester (deprecated)](#xml-dcat-harvester-deprecated) - [Translation of fields](#translation-of-fields) -- [Structured Data and Google Dataset Search indexing](#structured-data-and-google-dataset-search-indexing) +- [Structured data and Google Dataset Search indexing](#structured-data-and-google-dataset-search-indexing) +- [CLI](#cli) - [Running the Tests](#running-the-tests) - [Releases](#releases) - [Acknowledgements](#acknowledgements) - [Copying and License](#copying-and-license) -## Overview + -With the emergence of Open Data initiatives around the world, the need to share metadata across different catalogs has became more evident. Sites like [the EU Open Data Portal](https://data.europa.eu/euodp/en/data/) aggregate datasets from different portals, and there has been a growing demand to provide a clear and standard interface to allow incorporating metadata into them automatically. +## Overview -There is growing consensus around [DCAT](http://www.w3.org/TR/vocab-dcat) being the right way forward, but actual implementations are needed. This extension aims to provide tools and guidance to allow publishers to publish and share DCAT based metadata easily. +[DCAT](http://www.w3.org/TR/vocab-dcat) has become the basis for many metadata sharing standards, like DCAT-AP and DCAT-US for data portals in Europe and the USA respectively. 
This extension aims to provide tools and guidance to allow publishers to publish and share DCAT-based metadata easily. In terms of CKAN features, this extension offers: +* [Pre-built CKAN schemas](#schemas) for common Application Profiles that can be adapted to each site's requirements to provide out-of-the-box DCAT support in data portals. + * [RDF DCAT Endpoints](#rdf-dcat-endpoints) that expose the catalog's datasets in different RDF serializations (`dcat` plugin). * An [RDF Harvester](#rdf-dcat-harvester) that allows importing RDF serializations from other catalogs to create CKAN datasets (`dcat_rdf_harvester` plugin). @@ -68,24 +84,66 @@ These are implemented internally using: ## Installation -1. Install ckanext-harvest ([https://github.com/ckan/ckanext-harvest#installation](https://github.com/ckan/ckanext-harvest#installation)) (Only if you want to use the RDF harvester) -2. Install the extension on your virtualenv: +1. Install the extension on your virtualenv: (pyenv) $ pip install -e git+https://github.com/ckan/ckanext-dcat.git#egg=ckanext-dcat -3. Install the extension requirements: +2. Install the extension requirements: (pyenv) $ pip install -r ckanext-dcat/requirements.txt - > **Note** - > - > If you are running on Python 2.7 or 3.6 please use `requirements-py2-py36.txt` instead - -4. Enable the required plugins in your ini file: +3. Enable the required plugins in your ini file: ckan.plugins = dcat dcat_rdf_harvester dcat_json_harvester dcat_json_interface structured_data +4. To use the pre-built schemas, install [ckanext-scheming](https://github.com/ckan/ckanext-scheming): + + pip install -e "git+https://github.com/ckan/ckanext-scheming.git#egg=ckanext-scheming" + +Check the [Schemas](#schemas) section for extra configuration needed. + +Optionally, if you want to use the RDF harvester, install ckanext-harvest as well ([https://github.com/ckan/ckanext-harvest#installation](https://github.com/ckan/ckanext-harvest#installation)). + +## Schemas + +The extension includes ready-to-use [ckanext-scheming](https://github.com/ckan/ckanext-scheming) schemas that enable DCAT support. These include a schema definition file (located in `ckanext/dcat/schemas`) plus extra validators and other custom logic that integrates the metadata modifications with the RDF DCAT [Parsers](#rdf-dcat-parser) and [Serializers](#rdf-dcat-serializer) and other CKAN features and extensions. + +The following schemas are currently included with the extension: + +* *dcat_ap_2.1_recommended.yaml*: Includes the recommended properties for `dcat:Dataset` and `dcat:Distribution` according to the [DCAT-AP 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. +* *dcat_ap_2.1_full.yaml*: Includes most of the properties defined for `dcat:Dataset` and `dcat:Distribution` in the [DCAT-AP 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. + +Most sites will want to use these as a base to create their own custom schema to address their own requirements, perhaps alongside a [custom profile](#writing-custom-profiles). Of course site maintainers can add or remove schema fields, as well as change the existing validators. + +In any case, the schema file used should be defined in the configuration file, alongside these configuration options: + + # Make sure to add scheming_datasets after the dcat plugin + ckan.plugins = activity dcat [...] 
scheming_datasets + + # Point to one of the defaults or your own version of the schema file + scheming.dataset_schemas = ckanext.dcat.schemas:dcat_ap_2.1_recommended.yaml + + # Include the dcat presets as well as the standard scheming ones + scheming.presets = ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml + + # Sites using the euro_dcat_ap and euro_dcat_ap_2 profiles must add the + # euro_dcat_ap_scheming profile if they want to use ckanext-scheming schemas (see next section) + ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming + +### Compatibility with existing profiles + +Sites using the existing `euro_dcat_ap` and `euro_dcat_ap_2` profiles should not see any change in their +current parsing and serialization functionalities and these profiles will not change their outputs going +forward (unless a bug is being fixed). Sites willing to migrate to a scheming based metadata schema can do +so by adding the `euro_dcat_ap_scheming` profile at the end of their profile chain (e.g. +`ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming`), which will modify the existing profile +outputs to the expected format by the scheming validators. + +Note that the scheming profile will only affect fields defined in the schema definition file, so sites can start migrating gradually different metadata fields. + + + ## RDF DCAT endpoints By default when the `dcat` plugin is enabled, the following RDF endpoints are available on your CKAN instance. The schema used on the serializations can be customized using [profiles](#profiles). @@ -125,7 +183,7 @@ RDF representations will be advertised using `` tags on th - + @@ -152,7 +210,7 @@ This endpoint can be customized if necessary using the `ckanext.dcat.catalog_end ckanext.dcat.catalog_endpoint = /dcat/catalog/{_format} -The custom endpoint **must** start with a backslash (`/`) and contain the `{_format}` placeholder. +The custom endpoint **must** start with a forward slash (`/`) and contain the `{_format}` placeholder. As described previously, the extension will determine the RDF serialization format returned. @@ -167,7 +225,7 @@ RDF representations will be advertised using `` tags on th - + @@ -177,10 +235,9 @@ The number of datasets returned is limited. The response will include paging inf @prefix hydra: . a hydra:PagedCollection ; - hydra:firstPage "http://example.com/catalog.ttl?page=1" ; - hydra:itemsPerPage 100 ; - hydra:lastPage "http://example.com/catalog.ttl?page=3" ; - hydra:nextPage "http://example.com/catalog.ttl?page=2" ; + hydra:first "http://example.com/catalog.ttl?page=1" ; + hydra:last "http://example.com/catalog.ttl?page=3" ; + hydra:next "http://example.com/catalog.ttl?page=2" ; hydra:totalItems 283 . The default number of datasets returned (100) can be modified by CKAN site maintainers using the following configuration option on your ini file: @@ -277,12 +334,12 @@ The information contained in the harvested `dcat:Catalog` node will be stored as When serializing, your Catalog will expose the harvested Catalog using the `dct:hasPart` relation. This means that your catalog will have this structure: - `dcat:Catalog` (represents your current catalog) - `dcat:dataset` (1..n, the dataset created withing your catalog) - - `dct:hasPart` + - `dct:hasPart` - `dcat:Catalog` (info of one of the harvested catalogs) - `dcat:dataset` (dataset in the harvested catalog) - - `dct:hasPart` + - `dct:hasPart` - `dcat:Catalog` (info of one of another harvester catalog) - ... + ... 
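To make the nested structure above concrete, here is a minimal Turtle sketch of a catalog that exposes one harvested sub-catalog via `dct:hasPart` (all URIs are hypothetical):

```turtle
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dct:  <http://purl.org/dc/terms/> .

# The serializing catalog and a dataset created within it
<https://my.portal/catalog> a dcat:Catalog ;
    dcat:dataset <https://my.portal/dataset/local-dataset> ;
    dct:hasPart <https://harvested.portal/catalog> .

# One of the harvested catalogs and a dataset belonging to it
<https://harvested.portal/catalog> a dcat:Catalog ;
    dcat:dataset <https://harvested.portal/dataset/remote-dataset> .
```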
### Extending the RDF harvester @@ -312,85 +369,121 @@ To enable the JSON harvester, add the `dcat_json_harvester` plugin to your CKAN ## RDF DCAT to CKAN dataset mapping The following table provides a generic mapping between the fields of the `dcat:Dataset` and `dcat:Distribution` classes and -their equivalents on the CKAN model. In most cases this mapping is deliberately a loose one. For instance, it does not try to link +their equivalents in the CKAN model. In most cases this mapping is deliberately a loose one. For instance, it does not try to link the DCAT publisher property with a CKAN dataset author, maintainer or organization, as the link between them is not straight-forward and may depend on a particular instance needs. When mapping from CKAN metadata to DCAT though, there are in some cases fallback fields that are used if the default field is not present (see [RDF Serializer](#rdf-dcat-serializer) for more details on this. This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11) and [DCAT-AP v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210). It depends on the active profile(s) (see [Profiles](#profiles)) which DCAT properties are mapped. +Sites are encouraged to use ckanext-scheming to manage their metadata schema (see [Schemas](#schemas) for all details). This changes in +some cases the way metadata is stored internally and presented at the CKAN API level, but should not affect the RDF DCAT output. | DCAT class | DCAT property | CKAN dataset field | CKAN fallback fields | Stored as | | |-------------------|------------------------|-------------------------------------------|--------------------------------|-----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| dcat:Dataset | - | extra:uri | | text | See note about URIs | +| dcat:Dataset | - | extra:uri | | text | See [URIs](#uris-1) | | dcat:Dataset | dct:title | title | | text | | | dcat:Dataset | dct:description | notes | | text | | | dcat:Dataset | dcat:keyword | tags | | text | | -| dcat:Dataset | dcat:theme | extra:theme | | list | See note about lists | +| dcat:Dataset | dcat:theme | extra:theme | | list | See [Lists](#lists) | | dcat:Dataset | dct:identifier | extra:identifier | extra:guid, id | text | | | dcat:Dataset | adms:identifier | extra:alternate_identifier | | text | | | dcat:Dataset | dct:issued | extra:issued | metadata_created | text | | | dcat:Dataset | dct:modified | extra:modified | metadata_modified | text | | | dcat:Dataset | owl:versionInfo | version | extra:dcat_version | text | | | dcat:Dataset | adms:versionNotes | extra:version_notes | | text | | -| dcat:Dataset | dct:language | extra:language | | list | See note about lists | +| dcat:Dataset | dct:language | extra:language | | list | See [Lists](#lists) | | dcat:Dataset | dcat:landingPage | url | | text | | | dcat:Dataset | dct:accrualPeriodicity | extra:frequency | | text | | -| dcat:Dataset | dct:conformsTo | extra:conforms_to | | list | See note about lists | +| dcat:Dataset | dct:conformsTo | extra:conforms_to | | list | See [Lists](#lists) | | dcat:Dataset | dct:accessRights | extra:access_rights | | text | | -| dcat:Dataset | foaf:page | extra:documentation | | list | See note about lists | +| dcat:Dataset | foaf:page | extra:documentation | | 
list | See [Lists](#lists) | | dcat:Dataset | dct:provenance | extra:provenance | | text | | | dcat:Dataset | dct:type | extra:dcat_type | | text | As of DCAT-AP v1.1 there's no controlled vocabulary for this field | -| dcat:Dataset | dct:hasVersion | extra:has_version | | list | See note about lists. It is assumed that these are one or more URIs referring to another dcat:Dataset | -| dcat:Dataset | dct:isVersionOf | extra:is_version_of | | list | See note about lists. It is assumed that these are one or more URIs referring to another dcat:Dataset | -| dcat:Dataset | dct:source | extra:source | | list | See note about lists. It is assumed that these are one or more URIs referring to another dcat:Dataset | -| dcat:Dataset | adms:sample | extra:sample | | list | See note about lists. It is assumed that these are one or more URIs referring to dcat:Distribution instances | -| dcat:Dataset | dct:spatial | extra:spatial_uri | | text | If the RDF provides them, profiles should store the textual and geometric representation of the location in extra:spatial_text, extra:spatial, extra:spatial_bbox and extra:spatial_centroid respectively | +| dcat:Dataset | dct:hasVersion | extra:has_version | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | dct:isVersionOf | extra:is_version_of | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | dct:source | extra:source | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | adms:sample | extra:sample | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to dcat:Distribution instances | +| dcat:Dataset | dct:spatial | extra:spatial_uri | | text | See [Spatial coverage](#spatial-coverage) | | dcat:Dataset | dct:temporal | extra:temporal_start + extra:temporal_end | | text | None, one or both extras can be present | | dcat:Dataset | dcat:temporalResolution| extra:temporal_resolution | | list | | | dcat:Dataset | dcat:spatialResolutionInMeters| extra:spatial_resolution_in_meters | | list | | | dcat:Dataset | dct:isReferencedBy | extra:is_referenced_by | | list | | -| dcat:Dataset | dct:publisher | extra:publisher_uri | | text | See note about URIs | +| dcat:Dataset | dct:publisher | extra:publisher_uri | | text | See [URIs](#uris-1) and [Publisher](#contact-points-and-publisher) | | foaf:Agent | foaf:name | extra:publisher_name | | text | | | foaf:Agent | foaf:mbox | extra:publisher_email | organization:title | text | | | foaf:Agent | foaf:homepage | extra:publisher_url | | text | | | foaf:Agent | dct:type | extra:publisher_type | | text | | -| dcat:Dataset | dcat:contactPoint | extra:contact_uri | | text | See note about URIs | +| dcat:Dataset | dcat:contactPoint | extra:contact_uri | | text | See [URIs](#uris-1) and [Contact points](#contact-points-and-publisher) | | vcard:Kind | vcard:fn | extra:contact_name | maintainer, author | text | | | vcard:Kind | vcard:hasEmail | extra:contact_email | maintainer_email, author_email | text | | | dcat:Dataset | dcat:distribution | resources | | text | | -| dcat:Distribution | - | resource:uri | | text | See note about URIs | +| dcat:Distribution | - | resource:uri | | text | See [URIs](#uris-1) | | dcat:Distribution | dct:title | resource:name | | text | | | dcat:Distribution | dcat:accessURL | resource:access_url | resource:url | text | If downloadURL is not 
present, accessURL will be used as resource url | | dcat:Distribution | dcat:downloadURL | resource:download_url | | text | If present, downloadURL will be used as resource url | | dcat:Distribution | dct:description | resource:description | | text | | | dcat:Distribution | dcat:mediaType | resource:mimetype | | text | | -| dcat:Distribution | dct:format | resource:format | | text | This is likely to require extra logic to accommodate how CKAN deals with formats (eg ckan/ckanext-dcat#18) | -| dcat:Distribution | dct:license | resource:license | | text | See note about dataset license | +| dcat:Distribution | dct:format | resource:format | | text | | +| dcat:Distribution | dct:license | resource:license | | text | See [Licenses](#licenses) | | dcat:Distribution | adms:status | resource:status | | text | | | dcat:Distribution | dcat:byteSize | resource:size | | number | | -| dcat:Distribution | dct:issued | resource:issued | | text | | -| dcat:Distribution | dct:modified | resource:modified | | text | | +| dcat:Distribution | dct:issued | resource:issued | created | text | | +| dcat:Distribution | dct:modified | resource:modified | metadata_modified | text | | | dcat:Distribution | dct:rights | resource:rights | | text | | -| dcat:Distribution | foaf:page | resource:documentation | | list | See note about lists | -| dcat:Distribution | dct:language | resource:language | | list | See note about lists | -| dcat:Distribution | dct:conformsTo | resource:conforms_to | | list | See note about lists | +| dcat:Distribution | foaf:page | resource:documentation | | list | See [Lists](#lists) | +| dcat:Distribution | dct:language | resource:language | | list | See [Lists](#lists) | +| dcat:Distribution | dct:conformsTo | resource:conforms_to | | list | See [Lists](#lists) | | dcat:Distribution | dcatap:availability | resource:availability | | text | | | dcat:Distribution | dcat:compressFormat | resource:compress_format | | text | | | dcat:Distribution | dcat:packageFormat | resource:package_format | | text | | +| dcat:Distribution | dcat:accessService | resource:access_services | | text | | +| dcat:DataService | dct:title | access_service:title | | text | | +| dcat:DataService | dcat:endpointURL | access_service:endpoint_url | | list | | +| dcat:DataService | dcat:endpointDescription| access_service:endpoint_description | | text | | +| dcat:DataService | dcatap:availability | access_service:availability | | text | | +| dcat:DataService | dcat:servesDataset | access_service:serves_dataset | | list | | +| dcat:DataService | dct:description | access_service:description | | text | | +| dcat:DataService | dct:license | access_service:license | | text | | +| dcat:DataService | dct:accessRights | access_service:access_rights | | text | | | spdx:Checksum | spdx:checksumValue | resource:hash | | text | | | spdx:Checksum | spdx:algorithm | resource:hash_algorithm | | text | | *Notes* -* Whenever possible, URIs are extracted and stored so there is a clear reference to the original RDF resource. 
- For instance: +### Custom fields + +Fields marked as `extra:` are stored as free form extras in the `euro_dcat_ap` and `euro_dcat_ap_2` profiles, +but stored as first level custom fields when using the scheming based profile (`euro_dcat_ap_scheming`), i.e: + + ```json + { + "name": "test_dataset_dcat", + "extras": [ + {"key": "version_notes", "value": "Some version notes"} + ] + } + ``` + + vs: + + ```json + { + "name": "test_dataset_dcat", + "version_notes": "Some version notes" + } + ``` + +### URIs + +Whenever possible, URIs are extracted and stored so there is a clear reference to the original RDF resource. +For instance: ```xml @@ -451,7 +544,9 @@ This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/a } ``` -* Lists are stored as a JSON string, eg: +### Lists + +On the legacy profiles, lists are stored as a JSON string, eg: ``` @prefix dcat: . @@ -476,7 +571,58 @@ This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/a } ``` -* The following formats for `dct:spatial` are supported by the default [parser](#rdf-dcat-parser). Note that the default [serializer](#rdf-dcat-serializer) will return the single `dct:spatial` instance form by default. +On the scheming-based ones, these are shown as actual lists: + + ```json + { + "title": "Dataset 1", + "uri": "http://data.some.org/catalog/datasets/1"}, + "language": ["ca", "en", "es"] + "theme": ["Earth Sciences", "http://eurovoc.europa.eu/209065", "http://eurovoc.europa.eu/100142"] + } + ``` +### Contact points and Publisher + +Properties for `dcat:contactPoint` and `dct:publisher` are stored as namespaced extras in the legacy profiles. When using +a scheming-based profile, these are stored as proper objects (and multiple instances are allowed for contact point): + +```json +{ + "name": "test_dataset_dcat", + "title": "Test dataset DCAT", + "extras": [ + {"key":"contact_name","value":"PointofContact"}, + {"key":"contact_email","value":"contact@some.org"} + ], +} +``` + +vs: + +```json +{ + "name": "test_dataset_dcat", + "title": "Test dataset DCAT", + "contact": [ + { + "name": "Point of Contact 1", + "email": "contact1@some.org" + }, + { + "name": "Point of Contact 2", + "email": "contact2@some.org" + }, + ] +} +``` + +If no `publisher` or `publisher_*` fields are found, the serializers will fall back to getting the publisher properties from the organization the CKAN dataset belongs to. The organization schema can be customized with the schema located in `ckanext/dcat/schemas/publisher_organization.yaml` to provide the extra properties supported (this will additionally require loading the `scheming_organizations` plugin in `ckan.plugins`). + + +### Spatial coverage + + +The following formats for `dct:spatial` are supported by the default [parser](#rdf-dcat-parser). Note that the default [serializer](#rdf-dcat-serializer) will return the single `dct:spatial` instance form by default. 
- One `dct:spatial` instance, URI only @@ -526,8 +672,45 @@ This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/a ``` +If the RDF provides them, profiles should store the textual and geometric representation of the location in: + +* For legacy profiles in `spatial_text`, `spatial_bbox`, `spatial_centroid` or `spatial` (for any other geometries) extra fields +* For scheming-based profiles in objects in the `spatial_coverage` field, for instance: + +```json +{ + "name": "test_dataset_dcat", + "title": "Test dataset DCAT", + "spatial_coverage": [ + { + "geom": { + "type": "Polygon", + "coordinates": [...] + }, + "text": "Tarragona", + "uri": "https://sws.geonames.org/6361390/", + "bbox": { + "type": "Polygon", + "coordinates": [ + [ + [-2.1604, 42.7611], + [-2.0938, 42.7611], + [-2.0938, 42.7931], + [-2.1604, 42.7931], + [-2.1604, 42.7611] + ] + ] + }, + "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]} + } + ] +} +``` -* On the CKAN model, license is at the dataset level whereas in DCAT model it + + +### Licenses + +On the CKAN model, license is at the dataset level whereas in DCAT model it is at distributions level. By default the RDF parser will try to find a distribution with a license that matches one of those registered in CKAN and attach this license to the dataset. The first matching distribution's @@ -940,12 +1123,31 @@ Example output of structured data in JSON-LD: +## CLI + +The `ckan dcat` command offers utilities to transform between DCAT RDF serializations and CKAN datasets (`ckan dcat consume`) and +vice versa (`ckan dcat produce`). In both cases the input can be provided as a path to a file: + + ckan dcat consume -f ttl examples/dcat/dataset.ttl + + ckan dcat produce -f jsonld examples/ckan/ckan_datasets.json + +or be read from stdin: + + ckan dcat consume - + +The latter form allows chaining commands for more complex metadata processing, e.g.: + + curl https://demo.ckan.org/api/action/package_search | jq .result.results | ckan dcat produce -f jsonld - + +For the full list of options check `ckan dcat consume --help` and `ckan dcat produce --help`. + ## Running the Tests To run the tests do: pytest --ckan-ini=test.ini ckanext/dcat/tests - + ## Releases To create a new release, follow these steps: @@ -965,7 +1167,7 @@ Work on ckanext-dcat has been made possible by: If you can fund new developments or contribute please get in touch. -## Copying and License +## Copying and License This material is copyright (c) Open Knowledge. 
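The `ckan dcat consume` and `ckan dcat produce` commands documented above drive the parser and serializer through their public API (see the `ckanext/dcat/cli.py` changes below). As a rough programmatic sketch of the same calls, assuming a configured CKAN environment and a hypothetical `catalog.ttl` on disk:

```python
# Minimal sketch; profile names and file paths are placeholders.
from ckanext.dcat.processors import RDFParser, RDFSerializer

parser = RDFParser(profiles=["euro_dcat_ap_2"])
with open("catalog.ttl") as f:
    parser.parse(f.read(), _format="ttl")

# CKAN-style dataset dicts produced by the active profiles
dataset_dicts = [d for d in parser.datasets()]

# Serialize them back to RDF (serialize_datasets is added in this changeset)
serializer = RDFSerializer(profiles=["euro_dcat_ap_2"])
ttl = serializer.serialize_datasets(dataset_dicts, _format="ttl")
print(ttl)
```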
diff --git a/ckanext/dcat/blueprints.py b/ckanext/dcat/blueprints.py index 44ab05d2..e7b14b19 100644 --- a/ckanext/dcat/blueprints.py +++ b/ckanext/dcat/blueprints.py @@ -22,7 +22,8 @@ def read_catalog(_format=None, package_type=None): def read_dataset(_id, _format=None, package_type=None): return utils.read_dataset_page(_id, _format) -if toolkit.asbool(config.get(utils.ENABLE_RDF_ENDPOINTS_CONFIG, True)): + +if utils.endpoints_enabled(): # requirements={'_format': 'xml|rdf|n3|ttl|jsonld'} dcat.add_url_rule(config.get('ckanext.dcat.catalog_endpoint', diff --git a/ckanext/dcat/cli.py b/ckanext/dcat/cli.py index ade76959..52075360 100644 --- a/ckanext/dcat/cli.py +++ b/ckanext/dcat/cli.py @@ -1,25 +1,145 @@ # -*- coding: utf-8 -*- +import json import click + import ckan.plugins.toolkit as tk + import ckanext.dcat.utils as utils +from ckanext.dcat.processors import ( + RDFParser, + RDFSerializer, + DEFAULT_RDF_PROFILES, + RDF_PROFILES_CONFIG_OPTION, +) -@click.group() -def generate_static(): - """Generates static files containing all datasets. - """ +@click.group() +def dcat(): + """DCAT utilities for CKAN""" pass -@generate_static.command() -@click.argument('output', type=click.File(mode="w")) -def json(output): - """The generate command will generate a static file containing all of - the datasets in the catalog in JSON format. +@dcat.command() +@click.argument("output", type=click.File(mode="w")) +def generate_static(output): + """[Deprecated] Generate a static datasets file in JSON format + (requires the dcat_json_interface plugin). """ utils.generate_static_json(output) +def _get_profiles(profiles): + if profiles: + profiles = profiles.split() + elif tk.config.get(RDF_PROFILES_CONFIG_OPTION): + profiles = tk.aslist(tk.config[RDF_PROFILES_CONFIG_OPTION]) + else: + profiles = None + + return profiles + + +@dcat.command(context_settings={"show_default": True}) +@click.argument("input", type=click.File(mode="r")) +@click.option( + "-o", + "--output", + type=click.File(mode="w"), + default="-", + help="By default the command will output the result to stdout, " + "alternatively you can provide a file path with this option", +) +@click.option( + "-f", "--format", default="xml", help="Serialization format (eg ttl, jsonld)" +) +@click.option( + "-p", + "--profiles", + help="RDF profiles to use. If not provided will be read from config, " + f"if not present there, the default will be used: {DEFAULT_RDF_PROFILES}", +) +@click.option( + "-P", "--pretty", is_flag=True, help="Make the output more human readable" +) +@click.option( + "-m", "--compat_mode", is_flag=True, help="Compatibility mode (deprecated)" +) +def consume(input, output, format, profiles, pretty, compat_mode): + """ + Parses DCAT RDF graphs into CKAN dataset JSON objects. 
+ + The input serializations can be provided as a path to a file, e.g.: + + ckan dcat consume examples/dcat/dataset.ttl + + Or be read from stdin: + + ckan dcat consume - + """ + contents = input.read() + + profiles = _get_profiles(profiles) + + parser = RDFParser(profiles=profiles, compatibility_mode=compat_mode) + parser.parse(contents, _format=format) + + ckan_datasets = [d for d in parser.datasets()] + + indent = 4 if pretty else None + out = json.dumps(ckan_datasets, indent=indent) + + output.write(out) + + +@dcat.command(context_settings={"show_default": True}) +@click.argument("input", type=click.File(mode="r")) +@click.option( + "-o", + "--output", + type=click.File(mode="w"), + default="-", + help="By default the command will output the result to stdout, " + "alternatively you can provide a file path with this option", +) +@click.option( + "-f", "--format", default="xml", help="Serialization format (eg ttl, jsonld)" +) +@click.option( + "-p", + "--profiles", + help="RDF profiles to use. If not provided will be read from config, " + f"if not present there, the default will be used: {DEFAULT_RDF_PROFILES}", +) +@click.option( + "-m", "--compat_mode", is_flag=True, help="Compatibility mode (deprecated)" +) +def produce(input, output, format, profiles, compat_mode): + """ + Transforms CKAN dataset JSON objects into DCAT RDF serializations. + + The input datasets can be provided as a path to a file, e.g.: + + ckan dcat produce examples/ckan/ckan_dataset.json + + Or be read from stdin: + + ckan dcat produce - + """ + contents = input.read() + + profiles = _get_profiles(profiles) + + serializer = RDFSerializer(profiles=profiles, compatibility_mode=compat_mode) + + dataset = json.loads(contents) + if isinstance(dataset, list): + out = serializer.serialize_datasets(dataset, _format=format) + else: + out = serializer.serialize_dataset(dataset, _format=format) + + output.write(out) + + def get_commands(): - return [generate_static] + return [dcat] diff --git a/ckanext/dcat/commands.py b/ckanext/dcat/commands.py deleted file mode 100644 index 39f07b93..00000000 --- a/ckanext/dcat/commands.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- - -import logging -from ckan import plugins as p - -import ckanext.dcat.utils as utils - - -class GenerateStaticDCATCommand(p.toolkit.CkanCommand): - """ - Generates static JSON files containing all datasets. - - The generate command will generate a static file containing all of the - datasets in the catalog in JSON format. 
- - paster generate_static json -c - """ - summary = __doc__.split('\n')[0] - usage = __doc__ - max_args = 2 - min_args = 2 - - def __init__(self, name): - super(GenerateStaticDCATCommand, self).__init__(name) - - def command(self): - self._load_config() - self.log = logging.getLogger(__name__) - - if len(self.args) != 2: - self.log.error("You must specify the command and the output file") - return - - cmd, output = self.args - - if cmd == 'json': - self.generate(output) - else: - self.log.error("Unknown command {0}".format(cmd)) - - def generate(self, output): - """ - Keep reading and converting datasets until we get an empty list back - from dcat_datasets_list - """ - with open(output, 'w') as f: - utils.generate_static_json(f) diff --git a/ckanext/dcat/configuration_processors.py b/ckanext/dcat/configuration_processors.py index 6bf95166..6d0b87b0 100644 --- a/ckanext/dcat/configuration_processors.py +++ b/ckanext/dcat/configuration_processors.py @@ -312,7 +312,8 @@ def modify_package_dict(package_dict, config, dcat_dict): value_dict = {} for subfield in list(composite_map.get(field_name)): mapped_field = composite_map.get(field_name).get(subfield) - value_dict[subfield] = dcat_dict.get(mapped_field) + if dcat_dict.get(mapped_field) and dcat_dict.get(mapped_field) not in ['none', 'null']: + value_dict[subfield] = dcat_dict.get(mapped_field) package_dict[field_name] = json.dumps(value_dict, ensure_ascii=False) diff --git a/ckanext/dcat/controllers.py b/ckanext/dcat/controllers.py index f8dc8daa..e58377a4 100644 --- a/ckanext/dcat/controllers.py +++ b/ckanext/dcat/controllers.py @@ -4,13 +4,7 @@ import ckanext.dcat.utils as utils -if toolkit.check_ckan_version(min_version='2.1'): - BaseController = toolkit.BaseController -else: - from ckan.lib.base import BaseController - - -class DCATController(BaseController): +class DCATController(toolkit.BaseController): def read_catalog(self, _format=None): return utils.read_catalog_page(_format) diff --git a/ckanext/dcat/harvesters/base.py b/ckanext/dcat/harvesters/base.py index a8e237ed..0aaa9130 100644 --- a/ckanext/dcat/harvesters/base.py +++ b/ckanext/dcat/harvesters/base.py @@ -1,7 +1,6 @@ import os import logging -import six import requests import rdflib @@ -115,7 +114,7 @@ def _get_content_and_type(self, url, harvest_job, page=1, r = session.get(url, stream=True, timeout=30) length = 0 - content = '' if six.PY2 else b'' + content = b'' for chunk in r.iter_content(chunk_size=self.CHUNK_SIZE): content = content + chunk @@ -126,8 +125,7 @@ def _get_content_and_type(self, url, harvest_job, page=1, harvest_job) return None, None - if not six.PY2: - content = content.decode('utf-8') + content = content.decode('utf-8') if content_type is None and r.headers.get('content-type'): content_type = r.headers.get('content-type').split(";", 1)[0] diff --git a/ckanext/dcat/harvesters/rdf.py b/ckanext/dcat/harvesters/rdf.py index 05026f4f..564ba1af 100644 --- a/ckanext/dcat/harvesters/rdf.py +++ b/ckanext/dcat/harvesters/rdf.py @@ -6,8 +6,6 @@ import hashlib import traceback -import six - import ckan.plugins as p import ckan.model as model @@ -19,7 +17,6 @@ from ckanext.dcat.processors import RDFParserException, RDFParser from ckanext.dcat.interfaces import IDCATRDFHarvester - log = logging.getLogger(__name__) @@ -175,10 +172,7 @@ def gather_stage(self, harvest_job): content_hash = hashlib.md5() if content: - if six.PY2: - content_hash.update(content) - else: - content_hash.update(content.encode('utf8')) + content_hash.update(content.encode('utf8')) if 
last_content_hash: if content_hash.digest() == last_content_hash.digest(): @@ -278,9 +272,13 @@ def import_stage(self, harvest_object): context = {'model': model, 'session': model.Session, 'user': self._get_user_name(), 'ignore_auth': True} - p.toolkit.get_action('package_delete')(context, {'id': harvest_object.package_id}) - log.info('Deleted package {0} with guid {1}'.format(harvest_object.package_id, - harvest_object.guid)) + try: + p.toolkit.get_action('package_delete')(context, {'id': harvest_object.package_id}) + log.info('Deleted package {0} with guid {1}'.format(harvest_object.package_id, + harvest_object.guid)) + except p.toolkit.ObjectNotFound: + log.info('Package {0} already deleted.'.format(harvest_object.package_id)) + return True if harvest_object.content is None: diff --git a/ckanext/dcat/logic.py b/ckanext/dcat/logic.py index 20e09d24..61827a88 100644 --- a/ckanext/dcat/logic.py +++ b/ckanext/dcat/logic.py @@ -1,7 +1,6 @@ from __future__ import division import math -import six from ckantoolkit import config from dateutil.parser import parse as dateutil_parse @@ -160,7 +159,7 @@ def _page_url(page): qs = '&'.join( ['{0}={1}'.format( p[0], - p[1].encode('utf8') if six.PY2 else p[1] + p[1] ) for p in params ] ) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index 8e92b87e..32b80a17 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -2,18 +2,16 @@ from builtins import object import os - -import six +import json from ckantoolkit import config from ckan import plugins as p -try: - from ckan.lib.plugins import DefaultTranslation -except ImportError: - class DefaultTranslation(object): - pass +from ckan.lib.plugins import DefaultTranslation + +import ckanext.dcat.blueprints as blueprints +import ckanext.dcat.cli as cli from ckanext.dcat.logic import (dcat_dataset_show, dcat_catalog_show, @@ -22,15 +20,7 @@ class DefaultTranslation(object): dcat_auth, ) from ckanext.dcat import utils - -if p.toolkit.check_ckan_version('2.9'): - from ckanext.dcat.plugins.flask_plugin import ( - MixinDCATPlugin, MixinDCATJSONInterface - ) -else: - from ckanext.dcat.plugins.pylons_plugin import ( - MixinDCATPlugin, MixinDCATJSONInterface - ) +from ckanext.dcat.validators import dcat_validators CUSTOM_ENDPOINT_CONFIG = 'ckanext.dcat.catalog_endpoint' @@ -40,15 +30,40 @@ class DefaultTranslation(object): I18N_DIR = os.path.join(HERE, u"../i18n") -class DCATPlugin(MixinDCATPlugin, p.SingletonPlugin, DefaultTranslation): +def _get_dataset_schema(dataset_type="dataset"): + schema = None + try: + schema_show = p.toolkit.get_action("scheming_dataset_schema_show") + try: + schema = schema_show({}, {"type": dataset_type}) + except p.toolkit.ObjectNotFound: + pass + except KeyError: + pass + return schema + + +class DCATPlugin(p.SingletonPlugin, DefaultTranslation): p.implements(p.IConfigurer, inherit=True) p.implements(p.ITemplateHelpers, inherit=True) p.implements(p.IActions, inherit=True) p.implements(p.IAuthFunctions, inherit=True) p.implements(p.IPackageController, inherit=True) - if p.toolkit.check_ckan_version(min_version='2.5.0'): - p.implements(p.ITranslation, inherit=True) + p.implements(p.ITranslation, inherit=True) + p.implements(p.IClick) + p.implements(p.IBlueprint) + p.implements(p.IValidators) + + # IClick + + def get_commands(self): + return cli.get_commands() + + # IBlueprint + + def get_blueprint(self): + return [blueprints.dcat] # ITranslation @@ -81,6 +96,7 @@ def get_helpers(self): return { 'helper_available': 
utils.helper_available, 'dcat_get_endpoint': utils.get_endpoint, + 'dcat_endpoints_enabled': utils.endpoints_enabled, } # IActions @@ -101,12 +117,31 @@ def get_auth_functions(self): 'dcat_catalog_search': dcat_auth, } + # IValidators + def get_validators(self): + return dcat_validators + # IPackageController + # CKAN < 2.10 hooks def after_show(self, context, data_dict): + return self.after_dataset_show(context, data_dict) + + def before_index(self, dataset_dict): + return self.before_dataset_index(dataset_dict) + # CKAN >= 2.10 hooks + def after_dataset_show(self, context, data_dict): + + schema = _get_dataset_schema(data_dict["type"]) # check if config is enabled to translate keys (default: True) - if not p.toolkit.asbool(config.get(TRANSLATE_KEYS_CONFIG, True)): + # skip if scheming is enabled, as this will be handled there + translate_keys = ( + p.toolkit.asbool(config.get(TRANSLATE_KEYS_CONFIG, True)) + and not schema + ) + + if not translate_keys: return data_dict if context.get('for_view'): @@ -114,7 +149,7 @@ def after_show(self, context, data_dict): def set_titles(object_dict): try: - for key, value in six.iteritems(object_dict.copy()): + for key, value in object_dict.copy().items(): if key in field_labels: object_dict[field_labels[key]] = object_dict[key] del object_dict[key] @@ -130,25 +165,75 @@ def set_titles(object_dict): return data_dict - def before_index(self, pkg_dict): - dcat_modified = utils.parse_date_iso_format(pkg_dict.get('extras_dcat_modified')) + def before_dataset_index(self, dataset_dict): + schema = _get_dataset_schema(dataset_dict["type"]) + spatial = None + if schema: + for field in schema['dataset_fields']: + if field['field_name'] in dataset_dict and 'repeating_subfields' in field: + for item in dataset_dict[field['field_name']]: + for key in item: + value = item[key] + if not isinstance(value, dict): + # Index a flattened version + new_key = f'extras_{field["field_name"]}__{key}' + if not dataset_dict.get(new_key): + dataset_dict[new_key] = value + else: + dataset_dict[new_key] += ' ' + value + + subfields = dataset_dict.pop(field['field_name'], None) + if field['field_name'] == 'spatial_coverage': + spatial = subfields + + # Store the first geometry found so ckanext-spatial can pick it up for indexing + def _check_for_a_geom(spatial_dict): + value = None + + for field in ('geom', 'bbox', 'centroid'): + if spatial_dict.get(field): + value = spatial_dict[field] + if isinstance(value, dict): + try: + value = json.dumps(value) + break + except ValueError: + pass + return value + + if spatial and not dataset_dict.get('spatial'): + for item in spatial: + value = _check_for_a_geom(item) + if value: + dataset_dict['spatial'] = value + dataset_dict['extras_spatial'] = value + break + + # Index the dcat_modified and dcat_issued extras as metadata_modified and metadata_created + dcat_modified = utils.parse_date_iso_format(dataset_dict.get('extras_dcat_modified')) if dcat_modified: if not dcat_modified.endswith('Z'): dcat_modified += 'Z' - pkg_dict['metadata_modified'] = dcat_modified + dataset_dict['metadata_modified'] = dcat_modified - dcat_issued = utils.parse_date_iso_format(pkg_dict.get('extras_dcat_issued')) + dcat_issued = utils.parse_date_iso_format(dataset_dict.get('extras_dcat_issued')) if dcat_issued: if not dcat_issued.endswith('Z'): dcat_issued += 'Z' - pkg_dict['metadata_created'] = dcat_issued + dataset_dict['metadata_created'] = dcat_issued - return pkg_dict + return dataset_dict -class DCATJSONInterface(MixinDCATJSONInterface, p.SingletonPlugin): 
+class DCATJSONInterface(p.SingletonPlugin): p.implements(p.IActions) p.implements(p.IAuthFunctions, inherit=True) + p.implements(p.IBlueprint) + + # IBlueprint + + def get_blueprint(self): + return [blueprints.dcat_json_interface] # IActions diff --git a/ckanext/dcat/plugins/flask_plugin.py b/ckanext/dcat/plugins/flask_plugin.py deleted file mode 100644 index 7b1b6130..00000000 --- a/ckanext/dcat/plugins/flask_plugin.py +++ /dev/null @@ -1,30 +0,0 @@ -# -*- coding: utf-8 -*- - -import ckan.plugins as p - -import ckanext.dcat.cli as cli -import ckanext.dcat.blueprints as blueprints - - -class MixinDCATPlugin(p.SingletonPlugin): - p.implements(p.IClick) - p.implements(p.IBlueprint) - - # IClick - - def get_commands(self): - return cli.get_commands() - - # IBlueprint - - def get_blueprint(self): - return [blueprints.dcat] - - -class MixinDCATJSONInterface(p.SingletonPlugin): - p.implements(p.IBlueprint) - - # IBlueprint - - def get_blueprint(self): - return [blueprints.dcat_json_interface] diff --git a/ckanext/dcat/plugins/pylons_plugin.py b/ckanext/dcat/plugins/pylons_plugin.py deleted file mode 100644 index 93848341..00000000 --- a/ckanext/dcat/plugins/pylons_plugin.py +++ /dev/null @@ -1,52 +0,0 @@ -# -*- coding: utf-8 -*- - -from ckantoolkit import config -import ckan.plugins as p - -import ckanext.dcat.utils as utils - -class MixinDCATPlugin(p.SingletonPlugin): - p.implements(p.IRoutes, inherit=True) - - # IRoutes - - def before_map(self, _map): - - controller = 'ckanext.dcat.controllers:DCATController' - - if p.toolkit.asbool(config.get(utils.ENABLE_RDF_ENDPOINTS_CONFIG, True)): - - _map.connect('dcat_catalog', - config.get('ckanext.dcat.catalog_endpoint', - utils.DEFAULT_CATALOG_ENDPOINT), - controller=controller, action='read_catalog', - requirements={'_format': 'xml|rdf|n3|ttl|jsonld'}) - - _map.connect('dcat_dataset', '/dataset/{_id}.{_format}', - controller=controller, action='read_dataset', - requirements={'_format': 'xml|rdf|n3|ttl|jsonld'}) - - if p.toolkit.asbool(config.get(utils.ENABLE_CONTENT_NEGOTIATION_CONFIG)): - - _map.connect('home', '/', controller=controller, - action='read_catalog') - - _map.connect('add dataset', '/dataset/new', controller='package', action='new') - _map.connect('dataset_read', '/dataset/{_id}', - controller=controller, action='read_dataset', - ckan_icon='sitemap') - - return _map - -class MixinDCATJSONInterface(p.SingletonPlugin): - p.implements(p.IRoutes, inherit=True) - - # IRoutes - - def after_map(self, map): - - controller = 'ckanext.dcat.controllers:DCATController' - route = config.get('ckanext.dcat.json_endpoint', '/dcat.json') - map.connect(route, controller=controller, action='dcat_json') - - return map diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index 2a93d279..92b15c4a 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -33,12 +33,15 @@ class RDFProcessor(object): - def __init__(self, profiles=None, compatibility_mode=False): + def __init__(self, profiles=None, dataset_type='dataset', compatibility_mode=False): ''' Creates a parser or serializer instance You can optionally pass a list of profiles to be used. + A scheming dataset type can be provided, in which case the scheming schema + will be loaded by the base profile so it can be used by other profiles. 
+ In compatibility mode, some fields are modified to maintain compatibility with previous versions of the ckanext-dcat parsers (eg adding the `dcat_` prefix or storing comma separated lists instead @@ -56,6 +59,8 @@ def __init__(self, profiles=None, compatibility_mode=False): raise RDFProfileException( 'No suitable RDF profiles could be loaded') + self.dataset_type = dataset_type + if not compatibility_mode: compatibility_mode = p.toolkit.asbool( config.get(COMPAT_MODE_CONFIG_OPTION, False)) @@ -116,11 +121,15 @@ def next_page(self): Returns the URL of the next page or None if there is no next page ''' for pagination_node in self.g.subjects(RDF.type, HYDRA.PagedCollection): + # Try to find HYDRA.next first + for o in self.g.objects(pagination_node, HYDRA.next): + return str(o) + + # If HYDRA.next is not found, try HYDRA.nextPage (deprecated) for o in self.g.objects(pagination_node, HYDRA.nextPage): return str(o) return None - def parse(self, data, _format=None): ''' Parses and RDF graph serialization and into the class graph @@ -173,7 +182,11 @@ def datasets(self): for dataset_ref in self._datasets(): dataset_dict = {} for profile_class in self._profiles: - profile = profile_class(self.g, self.compatibility_mode) + profile = profile_class( + self.g, + dataset_type=self.dataset_type, + compatibility_mode=self.compatibility_mode + ) profile.parse_dataset(dataset_dict, dataset_ref) yield dataset_dict @@ -209,19 +222,23 @@ def _add_pagination_triples(self, paging_info): pagination_ref = BNode() self.g.add((pagination_ref, RDF.type, HYDRA.PagedCollection)) + # The predicates `nextPage`, `previousPage`, `firstPage`, `lastPage` + # and `itemsPerPage` are deprecated and will be removed in the future items = [ - ('next', HYDRA.nextPage), - ('previous', HYDRA.previousPage), - ('first', HYDRA.firstPage), - ('last', HYDRA.lastPage), - ('count', HYDRA.totalItems), - ('items_per_page', HYDRA.itemsPerPage), + ('next', [HYDRA.nextPage, HYDRA.next]), + ('previous', [HYDRA.previousPage, HYDRA.previous]), + ('first', [HYDRA.firstPage, HYDRA.first]), + ('last', [HYDRA.lastPage, HYDRA.last]), + ('count', [HYDRA.totalItems]), + ('items_per_page', [HYDRA.itemsPerPage]), ] + for item in items: - key, predicate = item + key, predicates = item if paging_info.get(key): - self.g.add((pagination_ref, predicate, - Literal(paging_info[key]))) + for predicate in predicates: + self.g.add((pagination_ref, predicate, + Literal(paging_info[key]))) return pagination_ref @@ -238,7 +255,7 @@ def graph_from_dataset(self, dataset_dict): dataset_ref = URIRef(dataset_uri(dataset_dict)) for profile_class in self._profiles: - profile = profile_class(self.g, self.compatibility_mode) + profile = profile_class(self.g, compatibility_mode=self.compatibility_mode) profile.graph_from_dataset(dataset_dict, dataset_ref) return dataset_ref @@ -256,7 +273,7 @@ def graph_from_catalog(self, catalog_dict=None): catalog_ref = URIRef(catalog_uri()) for profile_class in self._profiles: - profile = profile_class(self.g, self.compatibility_mode) + profile = profile_class(self.g, compatibility_mode=self.compatibility_mode) profile.graph_from_catalog(catalog_dict, catalog_ref) return catalog_ref @@ -284,6 +301,22 @@ def serialize_dataset(self, dataset_dict, _format='xml'): return output + def serialize_datasets(self, dataset_dicts, _format='xml'): + ''' + Given a list of CKAN dataset dicts, returns an RDF serialization + + The serialization format can be defined using the `_format` parameter. 
+ It must be one of the ones supported by RDFLib, defaults to `xml`. + + Returns a string with the serialized datasets + ''' + out = [] + for dataset_dict in dataset_dicts: + out.append(self.serialize_dataset(dataset_dict, _format)) + return '\n'.join(out) + + + def serialize_catalog(self, catalog_dict=None, dataset_dicts=None, _format='xml', pagination_info=None): ''' @@ -387,67 +420,3 @@ def _get_from_extra(key): g.add((agent, predicate, _type(val))) return catalog_ref - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser( - description='DCAT RDF - CKAN operations') - parser.add_argument('mode', - default='consume', - help=''' -Operation mode. -`consume` parses DCAT RDF graphs to CKAN dataset JSON objects. -`produce` serializes CKAN dataset JSON objects into DCAT RDF. - ''') - parser.add_argument('file', nargs='?', type=argparse.FileType('r'), - default=sys.stdin, - help='Input file. If omitted will read from stdin') - parser.add_argument('-f', '--format', - default='xml', - help='''Serialization format (as understood by rdflib) - eg: xml, n3 ... Defaults to \'xml\'.''') - parser.add_argument('-P', '--pretty', - action='store_true', - help='Make the output more human readable') - parser.add_argument('-p', '--profile', nargs='*', - action='store', - help='RDF Profiles to use, defaults to euro_dcat_ap_2') - parser.add_argument('-m', '--compat-mode', - action='store_true', - help='Enable compatibility mode') - - parser.add_argument('-s', '--subcatalogs', action='store_true', dest='subcatalogs', - default=False, - help="Enable subcatalogs handling (dct:hasPart support)") - args = parser.parse_args() - - contents = args.file.read() - - config.update({DCAT_EXPOSE_SUBCATALOGS: args.subcatalogs}) - - # Workaround until the core translation function defaults to the Flask one - from paste.registry import Registry - from ckan.lib.cli import MockTranslator - registry = Registry() - registry.prepare() - from pylons import translator - registry.register(translator, MockTranslator()) - - if args.mode == 'produce': - serializer = RDFSerializer(profiles=args.profile, - compatibility_mode=args.compat_mode) - - dataset = json.loads(contents) - out = serializer.serialize_dataset(dataset, _format=args.format) - print(out) - else: - parser = RDFParser(profiles=args.profile, - compatibility_mode=args.compat_mode) - - parser.parse(contents, _format=args.format) - - ckan_datasets = [d for d in parser.datasets()] - - indent = 4 if args.pretty else None - print(json.dumps(ckan_datasets, indent=indent)) diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py deleted file mode 100644 index a1b86832..00000000 --- a/ckanext/dcat/profiles.py +++ /dev/null @@ -1,1910 +0,0 @@ -from builtins import str -from past.builtins import basestring -from builtins import object -import datetime -import json - -import six -from six.moves.urllib.parse import quote - -from dateutil.parser import parse as parse_date - -from ckantoolkit import config - -import rdflib -from rdflib import URIRef, BNode, Literal -from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS - -from geomet import wkt, InvalidGeoJSONException - -from ckan.model.license import LicenseRegister -from ckan.plugins import toolkit -from ckan.lib.munge import munge_tag -from ckanext.dcat.urls import url_for -from ckanext.dcat.utils import resource_uri, publisher_uri_organization_fallback, DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS - -DCT = Namespace("http://purl.org/dc/terms/") -DCAT = Namespace("http://www.w3.org/ns/dcat#") -DCATAP = 
Namespace("http://data.europa.eu/r5r/") -ADMS = Namespace("http://www.w3.org/ns/adms#") -VCARD = Namespace("http://www.w3.org/2006/vcard/ns#") -FOAF = Namespace("http://xmlns.com/foaf/0.1/") -SCHEMA = Namespace('http://schema.org/') -TIME = Namespace('http://www.w3.org/2006/time') -LOCN = Namespace('http://www.w3.org/ns/locn#') -GSP = Namespace('http://www.opengis.net/ont/geosparql#') -OWL = Namespace('http://www.w3.org/2002/07/owl#') -SPDX = Namespace('http://spdx.org/rdf/terms#') - -GEOJSON_IMT = 'https://www.iana.org/assignments/media-types/application/vnd.geo+json' - -namespaces = { - 'dct': DCT, - 'dcat': DCAT, - 'dcatap': DCATAP, - 'adms': ADMS, - 'vcard': VCARD, - 'foaf': FOAF, - 'schema': SCHEMA, - 'time': TIME, - 'skos': SKOS, - 'locn': LOCN, - 'gsp': GSP, - 'owl': OWL, - 'spdx': SPDX, -} - -PREFIX_MAILTO = u'mailto:' - -DISTRIBUTION_LICENSE_FALLBACK_CONFIG = 'ckanext.dcat.resource.inherit.license' - - -class URIRefOrLiteral(object): - '''Helper which creates an URIRef if the value appears to be an http URL, - or a Literal otherwise. URIRefs are also cleaned using CleanedURIRef. - - Like CleanedURIRef, this is a factory class. - ''' - def __new__(cls, value): - try: - stripped_value = value.strip() - if (isinstance(value, basestring) and (stripped_value.startswith("http://") - or stripped_value.startswith("https://"))): - uri_obj = CleanedURIRef(value) - # although all invalid chars checked by rdflib should have been quoted, try to serialize - # the object. If it breaks, use Literal instead. - uri_obj.n3() - # URI is fine, return the object - return uri_obj - else: - return Literal(value) - except Exception: - # In case something goes wrong: use Literal - return Literal(value) - - -class CleanedURIRef(object): - '''Performs some basic URL encoding on value before creating an URIRef object. - - This is a factory for URIRef objects, which allows usage as type in graph.add() - without affecting the resulting node types. That is, - g.add(..., URIRef) and g.add(..., CleanedURIRef) will result in the exact same node type. - ''' - @staticmethod - def _careful_quote(value): - # only encode this limited subset of characters to avoid more complex URL parsing - # (e.g. valid ? in query string vs. ? as value). - # can be applied multiple times, as encoded %xy is left untouched. Therefore, no - # unquote is necessary beforehand. - quotechars = ' !"$\'()*,;<>[]{|}\\^`' - for c in quotechars: - value = value.replace(c, quote(c)) - return value - - def __new__(cls, value): - if isinstance(value, basestring): - value = CleanedURIRef._careful_quote(value.strip()) - return URIRef(value) - - -class RDFProfile(object): - '''Base class with helper methods for implementing RDF parsing profiles - - This class should not be used directly, but rather extended to create - custom profiles - ''' - - def __init__(self, graph, compatibility_mode=False): - '''Class constructor - - Graph is an rdflib.Graph instance. - - In compatibility mode, some fields are modified to maintain - compatibility with previous versions of the ckanext-dcat parsers - (eg adding the `dcat_` prefix or storing comma separated lists instead - of JSON dumps). - ''' - - self.g = graph - - self.compatibility_mode = compatibility_mode - - # Cache for mappings of licenses URL/title to ID built when needed in - # _license(). 
- self._licenceregister_cache = None - - def _datasets(self): - ''' - Generator that returns all DCAT datasets on the graph - - Yields rdflib.term.URIRef objects that can be used on graph lookups - and queries - ''' - for dataset in self.g.subjects(RDF.type, DCAT.Dataset): - yield dataset - - def _distributions(self, dataset): - ''' - Generator that returns all DCAT distributions on a particular dataset - - Yields rdflib.term.URIRef objects that can be used on graph lookups - and queries - ''' - for distribution in self.g.objects(dataset, DCAT.distribution): - yield distribution - - def _keywords(self, dataset_ref): - ''' - Returns all DCAT keywords on a particular dataset - ''' - keywords = self._object_value_list(dataset_ref, DCAT.keyword) or [] - # Split keywords with commas - keywords_with_commas = [k for k in keywords if ',' in k] - for keyword in keywords_with_commas: - keywords.remove(keyword) - keywords.extend([k.strip() for k in keyword.split(',')]) - return keywords - - def _object(self, subject, predicate): - ''' - Helper for returning the first object for this subject and predicate - - Both subject and predicate must be rdflib URIRef or BNode objects - - Returns an rdflib reference (URIRef or BNode) or None if not found - ''' - for _object in self.g.objects(subject, predicate): - return _object - return None - - def _object_value(self, subject, predicate): - ''' - Given a subject and a predicate, returns the value of the object - - Both subject and predicate must be rdflib URIRef or BNode objects - - If found, the string representation is returned, else an empty string - ''' - default_lang = config.get('ckan.locale_default', 'en') - fallback = '' - for o in self.g.objects(subject, predicate): - if isinstance(o, Literal): - if o.language and o.language == default_lang: - return str(o) - # Use first object as fallback if no object with the default language is available - elif fallback == '': - fallback = str(o) - else: - return str(o) - return fallback - - def _object_value_multiple_predicate(self, subject, predicates): - ''' - Given a subject and a list of predicates, returns the value of the object - according to the order in which it was specified. 
- - Both subject and predicates must be rdflib URIRef or BNode objects - - If found, the string representation is returned, else an empty string - ''' - object_value = '' - for predicate in predicates: - object_value = self._object_value(subject, predicate) - if object_value: - break - - return object_value - - def _object_value_int(self, subject, predicate): - ''' - Given a subject and a predicate, returns the value of the object as an - integer - - Both subject and predicate must be rdflib URIRef or BNode objects - - If the value can not be parsed as intger, returns None - ''' - object_value = self._object_value(subject, predicate) - if object_value: - try: - return int(float(object_value)) - except ValueError: - pass - return None - - def _object_value_int_list(self, subject, predicate): - ''' - Given a subject and a predicate, returns the value of the object as a - list of integers - - Both subject and predicate must be rdflib URIRef or BNode objects - - If the value can not be parsed as intger, returns an empty list - ''' - object_values = [] - for object in self.g.objects(subject, predicate): - if object: - try: - object_values.append(int(float(object))) - except ValueError: - pass - return object_values - - def _object_value_list(self, subject, predicate): - ''' - Given a subject and a predicate, returns a list with all the values of - the objects - - Both subject and predicate must be rdflib URIRef or BNode objects - - If no values found, returns an empty string - ''' - return [str(o) for o in self.g.objects(subject, predicate)] - - def _get_vcard_property_value(self, subject, predicate, predicate_string_property=None): - ''' - Given a subject, a predicate and a predicate for the simple string property (optional), - returns the value of the object. Trying to read the value in the following order - * predicate_string_property - * predicate - - All subject, predicate and predicate_string_property must be rdflib URIRef or BNode objects - - If no value is found, returns an empty string - ''' - - result = '' - if predicate_string_property: - result = self._object_value(subject, predicate_string_property) - - if not result: - obj = self._object(subject, predicate) - if isinstance(obj, BNode): - result = self._object_value(obj, VCARD.hasValue) - else: - result = self._object_value(subject, predicate) - - return result - - def _time_interval(self, subject, predicate, dcat_ap_version=1): - ''' - Returns the start and end date for a time interval object - - Both subject and predicate must be rdflib URIRef or BNode objects - - It checks for time intervals defined with DCAT, W3C Time hasBeginning & hasEnd - and schema.org startDate & endDate. - - Note that partial dates will be expanded to the first month / day - value, eg '1904' -> '1904-01-01'. 
- - Returns a tuple with the start and end date values, both of which - can be None if not found - ''' - - start_date = end_date = None - - if dcat_ap_version == 1: - start_date, end_date = self._read_time_interval_schema_org(subject, predicate) - if start_date or end_date: - return start_date, end_date - return self._read_time_interval_time(subject, predicate) - elif dcat_ap_version == 2: - start_date, end_date = self._read_time_interval_dcat(subject, predicate) - if start_date or end_date: - return start_date, end_date - start_date, end_date = self._read_time_interval_time(subject, predicate) - if start_date or end_date: - return start_date, end_date - return self._read_time_interval_schema_org(subject, predicate) - - def _read_time_interval_schema_org(self, subject, predicate): - start_date = end_date = None - - for interval in self.g.objects(subject, predicate): - start_date = self._object_value(interval, SCHEMA.startDate) - end_date = self._object_value(interval, SCHEMA.endDate) - - if start_date or end_date: - return start_date, end_date - - return start_date, end_date - - def _read_time_interval_dcat(self, subject, predicate): - start_date = end_date = None - - for interval in self.g.objects(subject, predicate): - start_date = self._object_value(interval, DCAT.startDate) - end_date = self._object_value(interval, DCAT.endDate) - - if start_date or end_date: - return start_date, end_date - - return start_date, end_date - - def _read_time_interval_time(self, subject, predicate): - start_date = end_date = None - - for interval in self.g.objects(subject, predicate): - start_nodes = [t for t in self.g.objects(interval, - TIME.hasBeginning)] - end_nodes = [t for t in self.g.objects(interval, - TIME.hasEnd)] - if start_nodes: - start_date = self._object_value_multiple_predicate(start_nodes[0], - [TIME.inXSDDateTimeStamp, TIME.inXSDDateTime, TIME.inXSDDate]) - if end_nodes: - end_date = self._object_value_multiple_predicate(end_nodes[0], - [TIME.inXSDDateTimeStamp, TIME.inXSDDateTime, TIME.inXSDDate]) - - if start_date or end_date: - return start_date, end_date - - return start_date, end_date - - def _insert_or_update_temporal(self, dataset_dict, key, value): - temporal = next((item for item in dataset_dict['extras'] if(item['key'] == key)), None) - if temporal: - temporal['value'] = value - else: - dataset_dict['extras'].append({'key': key , 'value': value}) - - def _publisher(self, subject, predicate): - ''' - Returns a dict with details about a dct:publisher entity, a foaf:Agent - - Both subject and predicate must be rdflib URIRef or BNode objects - - Examples: - - - - Publishing Organization for dataset 1 - contact@some.org - http://some.org - - - - - { - 'uri': 'http://orgs.vocab.org/some-org', - 'name': 'Publishing Organization for dataset 1', - 'email': 'contact@some.org', - 'url': 'http://some.org', - 'type': 'http://purl.org/adms/publishertype/NonProfitOrganisation', - } - - - - { - 'uri': 'http://publications.europa.eu/resource/authority/corporate-body/EURCOU' - } - - Returns keys for uri, name, email, url and type with the values set to - an empty string if they could not be found - ''' - - publisher = {} - - for agent in self.g.objects(subject, predicate): - - publisher['uri'] = (str(agent) if isinstance(agent, - rdflib.term.URIRef) else '') - - publisher['name'] = self._object_value(agent, FOAF.name) - - publisher['email'] = self._object_value(agent, FOAF.mbox) - - publisher['url'] = self._object_value(agent, FOAF.homepage) - - publisher['type'] = self._object_value(agent, 
DCT.type) - - return publisher - - def _contact_details(self, subject, predicate): - ''' - Returns a dict with details about a vcard expression - - Both subject and predicate must be rdflib URIRef or BNode objects - - Returns keys for uri, name and email with the values set to - an empty string if they could not be found - ''' - - contact = {} - - for agent in self.g.objects(subject, predicate): - - contact['uri'] = (str(agent) if isinstance(agent, - rdflib.term.URIRef) else '') - - contact['name'] = self._get_vcard_property_value(agent, VCARD.hasFN, VCARD.fn) - - contact['email'] = self._without_mailto(self._get_vcard_property_value(agent, VCARD.hasEmail)) - - return contact - - def _parse_geodata(self, spatial, datatype, cur_value): - ''' - Extract geodata with the given datatype from the spatial data and check if it contains a valid GeoJSON - or WKT geometry. - - Returns the String or None if the value is no valid GeoJSON or WKT geometry. - ''' - for geometry in self.g.objects(spatial, datatype): - if (geometry.datatype == URIRef(GEOJSON_IMT) or - not geometry.datatype): - try: - json.loads(str(geometry)) - cur_value = str(geometry) - except (ValueError, TypeError): - pass - if not cur_value and geometry.datatype == GSP.wktLiteral: - try: - cur_value = json.dumps(wkt.loads(str(geometry))) - except (ValueError, TypeError): - pass - return cur_value - - - def _spatial(self, subject, predicate): - ''' - Returns a dict with details about the spatial location - - Both subject and predicate must be rdflib URIRef or BNode objects - - Returns keys for uri, text or geom with the values set to - None if they could not be found. - - Geometries are always returned in GeoJSON. If only WKT is provided, - it will be transformed to GeoJSON. - - Check the notes on the README for the supported formats: - - https://github.com/ckan/ckanext-dcat/#rdf-dcat-to-ckan-dataset-mapping - ''' - - uri = None - text = None - geom = None - bbox = None - cent = None - - for spatial in self.g.objects(subject, predicate): - - if isinstance(spatial, URIRef): - uri = str(spatial) - - if isinstance(spatial, Literal): - text = str(spatial) - - if (spatial, RDF.type, DCT.Location) in self.g: - geom = self._parse_geodata(spatial, LOCN.geometry, geom) - bbox = self._parse_geodata(spatial, DCAT.bbox, bbox) - cent = self._parse_geodata(spatial, DCAT.centroid, cent) - for label in self.g.objects(spatial, SKOS.prefLabel): - text = str(label) - for label in self.g.objects(spatial, RDFS.label): - text = str(label) - - return { - 'uri': uri, - 'text': text, - 'geom': geom, - 'bbox': bbox, - 'centroid': cent, - } - - def _license(self, dataset_ref): - ''' - Returns a license identifier if one of the distributions license is - found in CKAN license registry. If no distribution's license matches, - an empty string is returned. - - The first distribution with a license found in the registry is used so - that if distributions have different licenses we'll only get the first - one. 
- ''' - if self._licenceregister_cache is not None: - license_uri2id, license_title2id = self._licenceregister_cache - else: - license_uri2id = {} - license_title2id = {} - for license_id, license in list(LicenseRegister().items()): - license_uri2id[license.url] = license_id - license_title2id[license.title] = license_id - self._licenceregister_cache = license_uri2id, license_title2id - - for distribution in self._distributions(dataset_ref): - # If distribution has a license, attach it to the dataset - license = self._object(distribution, DCT.license) - if license: - # Try to find a matching license comparing URIs, then titles - license_id = license_uri2id.get(license.toPython()) - if not license_id: - license_id = license_title2id.get( - self._object_value(license, DCT.title)) - if license_id: - return license_id - return '' - - def _access_rights(self, subject, predicate): - ''' - Returns the rights statement or an empty string if no one is found. - ''' - - result = '' - obj = self._object(subject, predicate) - if obj: - if isinstance(obj, BNode) and self._object(obj, RDF.type) == DCT.RightsStatement: - result = self._object_value(obj, RDFS.label) - elif isinstance(obj, Literal) or isinstance(obj, URIRef): - # unicode_safe not include Literal or URIRef - result = six.text_type(obj) - return result - - def _distribution_format(self, distribution, normalize_ckan_format=True): - ''' - Returns the Internet Media Type and format label for a distribution - - Given a reference (URIRef or BNode) to a dcat:Distribution, it will - try to extract the media type (previously knowm as MIME type), eg - `text/csv`, and the format label, eg `CSV` - - Values for the media type will be checked in the following order: - - 1. literal value of dcat:mediaType - 2. literal value of dct:format if it contains a '/' character - 3. value of dct:format if it is an instance of dct:IMT, eg: - - - - - 4. value of dct:format if it is an URIRef and appears to be an IANA type - - Values for the label will be checked in the following order: - - 1. literal value of dct:format if it not contains a '/' character - 2. label of dct:format if it is an instance of dct:IMT (see above) - 3. value of dct:format if it is an URIRef and doesn't look like an IANA type - - If `normalize_ckan_format` is True and using CKAN>=2.3, the label will - be tried to match against the standard list of formats that is included - with CKAN core - (https://github.com/ckan/ckan/blob/master/ckan/config/resource_formats.json) - This allows for instance to populate the CKAN resource format field - with a format that view plugins, etc will understand (`csv`, `xml`, - etc.) - - Return a tuple with the media type and the label, both set to None if - they couldn't be found. - ''' - - imt = None - label = None - - imt = self._object_value(distribution, DCAT.mediaType) - - _format = self._object(distribution, DCT['format']) - if isinstance(_format, Literal): - if not imt and '/' in _format: - imt = str(_format) - else: - label = str(_format) - elif isinstance(_format, (BNode, URIRef)): - if self._object(_format, RDF.type) == DCT.IMT: - if not imt: - imt = str(self.g.value(_format, default=None)) - label = str(self.g.label(_format, default=None)) - elif isinstance(_format, URIRef): - # If the URIRef does not reference a BNode, it could reference an IANA type. - # Otherwise, use it as label. 
- format_uri = str(_format) - if 'iana.org/assignments/media-types' in format_uri and not imt: - imt = format_uri - else: - label = format_uri - - if ((imt or label) and normalize_ckan_format and - toolkit.check_ckan_version(min_version='2.3')): - import ckan.config - from ckan.lib import helpers - - format_registry = helpers.resource_formats() - - if imt in format_registry: - label = format_registry[imt][1] - elif label in format_registry: - label = format_registry[label][1] - - return imt, label - - def _get_dict_value(self, _dict, key, default=None): - ''' - Returns the value for the given key on a CKAN dict - - By default a key on the root level is checked. If not found, extras - are checked, both with the key provided and with `dcat_` prepended to - support legacy fields. - - If not found, returns the default value, which defaults to None - ''' - - if key in _dict: - return _dict[key] - - for extra in _dict.get('extras', []): - if extra['key'] == key or extra['key'] == 'dcat_' + key: - return extra['value'] - - return default - - def _read_list_value(self, value): - items = [] - # List of values - if isinstance(value, list): - items = value - elif isinstance(value, basestring): - try: - items = json.loads(value) - if isinstance(items, ((int, float, complex))): - items = [items] # JSON list - except ValueError: - if ',' in value: - # Comma-separated list - items = value.split(',') - else: - items = [value] # Normal text value - return items - - def _add_spatial_value_to_graph(self, spatial_ref, predicate, value): - ''' - Adds spatial triples to the graph. - ''' - # GeoJSON - self.g.add((spatial_ref, - predicate, - Literal(value, datatype=GEOJSON_IMT))) - # WKT, because GeoDCAT-AP says so - try: - self.g.add((spatial_ref, - predicate, - Literal(wkt.dumps(json.loads(value), - decimals=4), - datatype=GSP.wktLiteral))) - except (TypeError, ValueError, InvalidGeoJSONException): - pass - - def _add_spatial_to_dict(self, dataset_dict, key, spatial): - if spatial.get(key): - dataset_dict['extras'].append( - {'key': 'spatial_{0}'.format(key) if key != 'geom' else 'spatial', - 'value': spatial.get(key)}) - - def _get_dataset_value(self, dataset_dict, key, default=None): - ''' - Returns the value for the given key on a CKAN dict - - Check `_get_dict_value` for details - ''' - return self._get_dict_value(dataset_dict, key, default) - - def _get_resource_value(self, resource_dict, key, default=None): - ''' - Returns the value for the given key on a CKAN dict - - Check `_get_dict_value` for details - ''' - return self._get_dict_value(resource_dict, key, default) - - def _add_date_triples_from_dict(self, _dict, subject, items): - self._add_triples_from_dict(_dict, subject, items, - date_value=True) - - def _add_list_triples_from_dict(self, _dict, subject, items): - self._add_triples_from_dict(_dict, subject, items, - list_value=True) - - def _add_triples_from_dict(self, _dict, subject, items, - list_value=False, - date_value=False): - for item in items: - key, predicate, fallbacks, _type = item - self._add_triple_from_dict(_dict, subject, predicate, key, - fallbacks=fallbacks, - list_value=list_value, - date_value=date_value, - _type=_type) - - def _add_triple_from_dict(self, _dict, subject, predicate, key, - fallbacks=None, - list_value=False, - date_value=False, - _type=Literal, - _datatype=None, - value_modifier=None): - ''' - Adds a new triple to the graph with the provided parameters - - The subject and predicate of the triple are passed as the relevant - RDFLib objects (URIRef or BNode). 
As default, the object is a - literal value, which is extracted from the dict using the provided key - (see `_get_dict_value`). If the value for the key is not found, then - additional fallback keys are checked. - Using `value_modifier`, a function taking the extracted value and - returning a modified value can be passed. - If a value was found, the modifier is applied before adding the value. - - If `list_value` or `date_value` are True, then the value is treated as - a list or a date respectively (see `_add_list_triple` and - `_add_date_triple` for details. - ''' - value = self._get_dict_value(_dict, key) - if not value and fallbacks: - for fallback in fallbacks: - value = self._get_dict_value(_dict, fallback) - if value: - break - - # if a modifying function was given, apply it to the value - if value and callable(value_modifier): - value = value_modifier(value) - - if value and list_value: - self._add_list_triple(subject, predicate, value, _type, _datatype) - elif value and date_value: - self._add_date_triple(subject, predicate, value, _type) - elif value: - # Normal text value - # ensure URIRef items are preprocessed (space removal/url encoding) - if _type == URIRef: - _type = CleanedURIRef - if _datatype: - object = _type(value, datatype=_datatype) - else: - object = _type(value) - self.g.add((subject, predicate, object)) - - def _add_list_triple(self, subject, predicate, value, _type=Literal, _datatype=None): - ''' - Adds as many triples to the graph as values - - Values are literal strings, if `value` is a list, one for each - item. If `value` is a string there is an attempt to split it using - commas, to support legacy fields. - ''' - items = self._read_list_value(value) - - for item in items: - # ensure URIRef items are preprocessed (space removal/url encoding) - if _type == URIRef: - _type = CleanedURIRef - if _datatype: - object = _type(item, datatype=_datatype) - else: - object = _type(item) - self.g.add((subject, predicate, object)) - - def _add_date_triple(self, subject, predicate, value, _type=Literal): - ''' - Adds a new triple with a date object - - Dates are parsed using dateutil, and if the date obtained is correct, - added to the graph as an XSD.dateTime value. - - If there are parsing errors, the literal string value is added. - ''' - if not value: - return - try: - default_datetime = datetime.datetime(1, 1, 1, 0, 0, 0) - _date = parse_date(value, default=default_datetime) - - self.g.add((subject, predicate, _type(_date.isoformat(), - datatype=XSD.dateTime))) - except ValueError: - self.g.add((subject, predicate, _type(value))) - - def _last_catalog_modification(self): - ''' - Returns the date and time the catalog was last modified - - To be more precise, the most recent value for `metadata_modified` on a - dataset. - - Returns a dateTime string in ISO format, or None if it could not be - found. - ''' - context = { - 'ignore_auth': True - } - result = toolkit.get_action('package_search')(context, { - 'sort': 'metadata_modified desc', - 'rows': 1, - }) - if result and result.get('results'): - return result['results'][0]['metadata_modified'] - return None - - def _add_mailto(self, mail_addr): - ''' - Ensures that the mail address has an URIRef-compatible mailto: prefix. - Can be used as modifier function for `_add_triple_from_dict`. - ''' - if mail_addr: - return PREFIX_MAILTO + self._without_mailto(mail_addr) - else: - return mail_addr - - def _without_mailto(self, mail_addr): - ''' - Ensures that the mail address string has no mailto: prefix. 
- ''' - if mail_addr: - return str(mail_addr).replace(PREFIX_MAILTO, u'') - else: - return mail_addr - - def _get_source_catalog(self, dataset_ref): - ''' - Returns Catalog reference that is source for this dataset. - - Catalog referenced in dct:hasPart is returned, - if dataset is linked there, otherwise main catalog - will be returned. - - This will not be used if ckanext.dcat.expose_subcatalogs - configuration option is set to False. - ''' - if not toolkit.asbool(config.get(DCAT_EXPOSE_SUBCATALOGS, False)): - return - catalogs = set(self.g.subjects(DCAT.dataset, dataset_ref)) - root = self._get_root_catalog_ref() - try: - catalogs.remove(root) - except KeyError: - pass - assert len(catalogs) in (0, 1,), "len %s" %catalogs - if catalogs: - return catalogs.pop() - return root - - def _get_root_catalog_ref(self): - roots = list(self.g.subjects(DCT.hasPart)) - if not roots: - roots = list(self.g.subjects(RDF.type, DCAT.Catalog)) - return roots[0] - - def _get_or_create_spatial_ref(self, dataset_dict, dataset_ref): - for spatial_ref in self.g.objects(dataset_ref, DCT.spatial): - if spatial_ref: - return spatial_ref - - # Create new spatial_ref - spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri') - if spatial_uri: - spatial_ref = CleanedURIRef(spatial_uri) - else: - spatial_ref = BNode() - self.g.add((spatial_ref, RDF.type, DCT.Location)) - self.g.add((dataset_ref, DCT.spatial, spatial_ref)) - return spatial_ref - - # Public methods for profiles to implement - - def parse_dataset(self, dataset_dict, dataset_ref): - ''' - Creates a CKAN dataset dict from the RDF graph - - The `dataset_dict` is passed to all the loaded profiles before being - yielded, so it can be further modified by each one of them. - `dataset_ref` is an rdflib URIRef object - that can be used to reference the dataset when querying the graph. - - Returns a dataset dict that can be passed to eg `package_create` - or `package_update` - ''' - return dataset_dict - - def _extract_catalog_dict(self, catalog_ref): - ''' - Returns list of key/value dictionaries with catalog - ''' - - out = [] - sources = (('source_catalog_title', DCT.title,), - ('source_catalog_description', DCT.description,), - ('source_catalog_homepage', FOAF.homepage,), - ('source_catalog_language', DCT.language,), - ('source_catalog_modified', DCT.modified,),) - - for key, predicate in sources: - val = self._object_value(catalog_ref, predicate) - if val: - out.append({'key': key, 'value': val}) - - out.append({'key': 'source_catalog_publisher', 'value': json.dumps(self._publisher(catalog_ref, DCT.publisher))}) - return out - - def graph_from_catalog(self, catalog_dict, catalog_ref): - ''' - Creates an RDF graph for the whole catalog (site) - - The class RDFLib graph (accessible via `self.g`) should be updated on - this method - - `catalog_dict` is a dict that can contain literal values for the - dcat:Catalog class like `title`, `homepage`, etc. `catalog_ref` is an - rdflib URIRef object that must be used to reference the catalog when - working with the graph. - ''' - pass - - def graph_from_dataset(self, dataset_dict, dataset_ref): - ''' - Given a CKAN dataset dict, creates an RDF graph - - The class RDFLib graph (accessible via `self.g`) should be updated on - this method - - `dataset_dict` is a dict with the dataset metadata like the one - returned by `package_show`. `dataset_ref` is an rdflib URIRef object - that must be used to reference the dataset when working with the graph. 
- ''' - pass - - -class EuropeanDCATAPProfile(RDFProfile): - ''' - An RDF profile based on the DCAT-AP for data portals in Europe - - More information and specification: - - https://joinup.ec.europa.eu/asset/dcat_application_profile - - ''' - - def parse_dataset(self, dataset_dict, dataset_ref): - - dataset_dict['extras'] = [] - dataset_dict['resources'] = [] - - # Basic fields - for key, predicate in ( - ('title', DCT.title), - ('notes', DCT.description), - ('url', DCAT.landingPage), - ('version', OWL.versionInfo), - ): - value = self._object_value(dataset_ref, predicate) - if value: - dataset_dict[key] = value - - if not dataset_dict.get('version'): - # adms:version was supported on the first version of the DCAT-AP - value = self._object_value(dataset_ref, ADMS.version) - if value: - dataset_dict['version'] = value - - # Tags - # replace munge_tag to noop if there's no need to clean tags - do_clean = toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False)) - tags_val = [munge_tag(tag) if do_clean else tag for tag in self._keywords(dataset_ref)] - tags = [{'name': tag} for tag in tags_val] - dataset_dict['tags'] = tags - - # Extras - - # Simple values - for key, predicate in ( - ('issued', DCT.issued), - ('modified', DCT.modified), - ('identifier', DCT.identifier), - ('version_notes', ADMS.versionNotes), - ('frequency', DCT.accrualPeriodicity), - ('provenance', DCT.provenance), - ('dcat_type', DCT.type), - ): - value = self._object_value(dataset_ref, predicate) - if value: - dataset_dict['extras'].append({'key': key, 'value': value}) - - # Lists - for key, predicate, in ( - ('language', DCT.language), - ('theme', DCAT.theme), - ('alternate_identifier', ADMS.identifier), - ('conforms_to', DCT.conformsTo), - ('documentation', FOAF.page), - ('related_resource', DCT.relation), - ('has_version', DCT.hasVersion), - ('is_version_of', DCT.isVersionOf), - ('source', DCT.source), - ('sample', ADMS.sample), - ): - values = self._object_value_list(dataset_ref, predicate) - if values: - dataset_dict['extras'].append({'key': key, - 'value': json.dumps(values)}) - - # Contact details - contact = self._contact_details(dataset_ref, DCAT.contactPoint) - if not contact: - # adms:contactPoint was supported on the first version of DCAT-AP - contact = self._contact_details(dataset_ref, ADMS.contactPoint) - - if contact: - for key in ('uri', 'name', 'email'): - if contact.get(key): - dataset_dict['extras'].append( - {'key': 'contact_{0}'.format(key), - 'value': contact.get(key)}) - - # Publisher - publisher = self._publisher(dataset_ref, DCT.publisher) - for key in ('uri', 'name', 'email', 'url', 'type'): - if publisher.get(key): - dataset_dict['extras'].append( - {'key': 'publisher_{0}'.format(key), - 'value': publisher.get(key)}) - - # Temporal - start, end = self._time_interval(dataset_ref, DCT.temporal) - if start: - dataset_dict['extras'].append( - {'key': 'temporal_start', 'value': start}) - if end: - dataset_dict['extras'].append( - {'key': 'temporal_end', 'value': end}) - - # Spatial - spatial = self._spatial(dataset_ref, DCT.spatial) - for key in ('uri', 'text', 'geom'): - self._add_spatial_to_dict(dataset_dict, key, spatial) - - # Dataset URI (explicitly show the missing ones) - dataset_uri = (str(dataset_ref) - if isinstance(dataset_ref, rdflib.term.URIRef) - else '') - dataset_dict['extras'].append({'key': 'uri', 'value': dataset_uri}) - - # access_rights - access_rights = self._access_rights(dataset_ref, DCT.accessRights) - if access_rights: - dataset_dict['extras'].append({'key': 'access_rights', 
'value': access_rights}) - - # License - if 'license_id' not in dataset_dict: - dataset_dict['license_id'] = self._license(dataset_ref) - - # Source Catalog - if toolkit.asbool(config.get(DCAT_EXPOSE_SUBCATALOGS, False)): - catalog_src = self._get_source_catalog(dataset_ref) - if catalog_src is not None: - src_data = self._extract_catalog_dict(catalog_src) - dataset_dict['extras'].extend(src_data) - - # Resources - for distribution in self._distributions(dataset_ref): - - resource_dict = {} - - # Simple values - for key, predicate in ( - ('name', DCT.title), - ('description', DCT.description), - ('access_url', DCAT.accessURL), - ('download_url', DCAT.downloadURL), - ('issued', DCT.issued), - ('modified', DCT.modified), - ('status', ADMS.status), - ('license', DCT.license), - ): - value = self._object_value(distribution, predicate) - if value: - resource_dict[key] = value - - resource_dict['url'] = (self._object_value(distribution, - DCAT.downloadURL) or - self._object_value(distribution, - DCAT.accessURL)) - # Lists - for key, predicate in ( - ('language', DCT.language), - ('documentation', FOAF.page), - ('conforms_to', DCT.conformsTo), - ): - values = self._object_value_list(distribution, predicate) - if values: - resource_dict[key] = json.dumps(values) - - # rights - rights = self._access_rights(distribution, DCT.rights) - if rights: - resource_dict['rights'] = rights - - # Format and media type - normalize_ckan_format = toolkit.asbool(config.get( - 'ckanext.dcat.normalize_ckan_format', True)) - imt, label = self._distribution_format(distribution, - normalize_ckan_format) - - if imt: - resource_dict['mimetype'] = imt - - if label: - resource_dict['format'] = label - elif imt: - resource_dict['format'] = imt - - # Size - size = self._object_value_int(distribution, DCAT.byteSize) - if size is not None: - resource_dict['size'] = size - - # Checksum - for checksum in self.g.objects(distribution, SPDX.checksum): - algorithm = self._object_value(checksum, SPDX.algorithm) - checksum_value = self._object_value(checksum, SPDX.checksumValue) - if algorithm: - resource_dict['hash_algorithm'] = algorithm - if checksum_value: - resource_dict['hash'] = checksum_value - - # Distribution URI (explicitly show the missing ones) - resource_dict['uri'] = (str(distribution) - if isinstance(distribution, - rdflib.term.URIRef) - else '') - - # Remember the (internal) distribution reference for referencing in - # further profiles, e.g. 
for adding more properties - resource_dict['distribution_ref'] = str(distribution) - - dataset_dict['resources'].append(resource_dict) - - if self.compatibility_mode: - # Tweak the resulting dict to make it compatible with previous - # versions of the ckanext-dcat parsers - for extra in dataset_dict['extras']: - if extra['key'] in ('issued', 'modified', 'publisher_name', - 'publisher_email',): - - extra['key'] = 'dcat_' + extra['key'] - - if extra['key'] == 'language': - extra['value'] = ','.join( - sorted(json.loads(extra['value']))) - - return dataset_dict - - def graph_from_dataset(self, dataset_dict, dataset_ref): - - g = self.g - - for prefix, namespace in namespaces.items(): - g.bind(prefix, namespace) - - g.add((dataset_ref, RDF.type, DCAT.Dataset)) - - # Basic fields - items = [ - ('title', DCT.title, None, Literal), - ('notes', DCT.description, None, Literal), - ('url', DCAT.landingPage, None, URIRef), - ('identifier', DCT.identifier, ['guid', 'id'], URIRefOrLiteral), - ('version', OWL.versionInfo, ['dcat_version'], Literal), - ('version_notes', ADMS.versionNotes, None, Literal), - ('frequency', DCT.accrualPeriodicity, None, URIRefOrLiteral), - ('access_rights', DCT.accessRights, None, URIRefOrLiteral), - ('dcat_type', DCT.type, None, Literal), - ('provenance', DCT.provenance, None, Literal), - ] - self._add_triples_from_dict(dataset_dict, dataset_ref, items) - - # Tags - for tag in dataset_dict.get('tags', []): - g.add((dataset_ref, DCAT.keyword, Literal(tag['name']))) - - # Dates - items = [ - ('issued', DCT.issued, ['metadata_created'], Literal), - ('modified', DCT.modified, ['metadata_modified'], Literal), - ] - self._add_date_triples_from_dict(dataset_dict, dataset_ref, items) - - # Lists - items = [ - ('language', DCT.language, None, URIRefOrLiteral), - ('theme', DCAT.theme, None, URIRef), - ('conforms_to', DCT.conformsTo, None, Literal), - ('alternate_identifier', ADMS.identifier, None, URIRefOrLiteral), - ('documentation', FOAF.page, None, URIRefOrLiteral), - ('related_resource', DCT.relation, None, URIRefOrLiteral), - ('has_version', DCT.hasVersion, None, URIRefOrLiteral), - ('is_version_of', DCT.isVersionOf, None, URIRefOrLiteral), - ('source', DCT.source, None, URIRefOrLiteral), - ('sample', ADMS.sample, None, URIRefOrLiteral), - ] - self._add_list_triples_from_dict(dataset_dict, dataset_ref, items) - - # Contact details - if any([ - self._get_dataset_value(dataset_dict, 'contact_uri'), - self._get_dataset_value(dataset_dict, 'contact_name'), - self._get_dataset_value(dataset_dict, 'contact_email'), - self._get_dataset_value(dataset_dict, 'maintainer'), - self._get_dataset_value(dataset_dict, 'maintainer_email'), - self._get_dataset_value(dataset_dict, 'author'), - self._get_dataset_value(dataset_dict, 'author_email'), - ]): - - contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri') - if contact_uri: - contact_details = CleanedURIRef(contact_uri) - else: - contact_details = BNode() - - g.add((contact_details, RDF.type, VCARD.Organization)) - g.add((dataset_ref, DCAT.contactPoint, contact_details)) - - self._add_triple_from_dict( - dataset_dict, contact_details, - VCARD.fn, 'contact_name', ['maintainer', 'author'] - ) - # Add mail address as URIRef, and ensure it has a mailto: prefix - self._add_triple_from_dict( - dataset_dict, contact_details, - VCARD.hasEmail, 'contact_email', ['maintainer_email', - 'author_email'], - _type=URIRef, value_modifier=self._add_mailto - ) - - # Publisher - if any([ - self._get_dataset_value(dataset_dict, 'publisher_uri'), - 
self._get_dataset_value(dataset_dict, 'publisher_name'), - dataset_dict.get('organization'), - ]): - - publisher_uri = self._get_dataset_value(dataset_dict, 'publisher_uri') - publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict) - publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name') - if publisher_uri: - publisher_details = CleanedURIRef(publisher_uri) - elif not publisher_name and publisher_uri_fallback: - # neither URI nor name are available, use organization as fallback - publisher_details = CleanedURIRef(publisher_uri_fallback) - else: - # No publisher_uri - publisher_details = BNode() - - g.add((publisher_details, RDF.type, FOAF.Organization)) - g.add((dataset_ref, DCT.publisher, publisher_details)) - - # In case no name and URI are available, again fall back to organization. - # If no name but an URI is available, the name literal remains empty to - # avoid mixing organization and dataset values. - if not publisher_name and not publisher_uri and dataset_dict.get('organization'): - publisher_name = dataset_dict['organization']['title'] - - g.add((publisher_details, FOAF.name, Literal(publisher_name))) - # TODO: It would make sense to fallback these to organization - # fields but they are not in the default schema and the - # `organization` object in the dataset_dict does not include - # custom fields - items = [ - ('publisher_email', FOAF.mbox, None, Literal), - ('publisher_url', FOAF.homepage, None, URIRef), - ('publisher_type', DCT.type, None, URIRefOrLiteral), - ] - - self._add_triples_from_dict(dataset_dict, publisher_details, items) - - # Temporal - start = self._get_dataset_value(dataset_dict, 'temporal_start') - end = self._get_dataset_value(dataset_dict, 'temporal_end') - if start or end: - temporal_extent = BNode() - - g.add((temporal_extent, RDF.type, DCT.PeriodOfTime)) - if start: - self._add_date_triple(temporal_extent, SCHEMA.startDate, start) - if end: - self._add_date_triple(temporal_extent, SCHEMA.endDate, end) - g.add((dataset_ref, DCT.temporal, temporal_extent)) - - # Spatial - spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text') - spatial_geom = self._get_dataset_value(dataset_dict, 'spatial') - - if spatial_text or spatial_geom: - spatial_ref = self._get_or_create_spatial_ref(dataset_dict, dataset_ref) - - if spatial_text: - g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text))) - - if spatial_geom: - self._add_spatial_value_to_graph(spatial_ref, LOCN.geometry, spatial_geom) - - # Use fallback license if set in config - resource_license_fallback = None - if toolkit.asbool(config.get(DISTRIBUTION_LICENSE_FALLBACK_CONFIG, False)): - if 'license_id' in dataset_dict and isinstance(URIRefOrLiteral(dataset_dict['license_id']), URIRef): - resource_license_fallback = dataset_dict['license_id'] - elif 'license_url' in dataset_dict and isinstance(URIRefOrLiteral(dataset_dict['license_url']), URIRef): - resource_license_fallback = dataset_dict['license_url'] - - # Resources - for resource_dict in dataset_dict.get('resources', []): - - distribution = CleanedURIRef(resource_uri(resource_dict)) - - g.add((dataset_ref, DCAT.distribution, distribution)) - - g.add((distribution, RDF.type, DCAT.Distribution)) - - # Simple values - items = [ - ('name', DCT.title, None, Literal), - ('description', DCT.description, None, Literal), - ('status', ADMS.status, None, URIRefOrLiteral), - ('rights', DCT.rights, None, URIRefOrLiteral), - ('license', DCT.license, None, URIRefOrLiteral), - ('access_url', DCAT.accessURL, None, 
URIRef), - ('download_url', DCAT.downloadURL, None, URIRef), - ] - - self._add_triples_from_dict(resource_dict, distribution, items) - - # Lists - items = [ - ('documentation', FOAF.page, None, URIRefOrLiteral), - ('language', DCT.language, None, URIRefOrLiteral), - ('conforms_to', DCT.conformsTo, None, Literal), - ] - self._add_list_triples_from_dict(resource_dict, distribution, items) - - # Set default license for distribution if needed and available - if resource_license_fallback and not (distribution, DCT.license, None) in g: - g.add((distribution, DCT.license, URIRefOrLiteral(resource_license_fallback))) - - # Format - mimetype = resource_dict.get('mimetype') - fmt = resource_dict.get('format') - - # IANA media types (either URI or Literal) should be mapped as mediaType. - # In case format is available and mimetype is not set or identical to format, - # check which type is appropriate. - if fmt and (not mimetype or mimetype == fmt): - if ('iana.org/assignments/media-types' in fmt - or not fmt.startswith('http') and '/' in fmt): - # output format value as dcat:mediaType instead of dct:format - mimetype = fmt - fmt = None - else: - # Use dct:format - mimetype = None - - if mimetype: - g.add((distribution, DCAT.mediaType, - URIRefOrLiteral(mimetype))) - - if fmt: - g.add((distribution, DCT['format'], - URIRefOrLiteral(fmt))) - - - # URL fallback and old behavior - url = resource_dict.get('url') - download_url = resource_dict.get('download_url') - access_url = resource_dict.get('access_url') - # Use url as fallback for access_url if access_url is not set and download_url is not equal - if url and not access_url: - if (not download_url) or (download_url and url != download_url): - self._add_triple_from_dict(resource_dict, distribution, DCAT.accessURL, 'url', _type=URIRef) - - # Dates - items = [ - ('issued', DCT.issued, None, Literal), - ('modified', DCT.modified, None, Literal), - ] - - self._add_date_triples_from_dict(resource_dict, distribution, items) - - # Numbers - if resource_dict.get('size'): - try: - g.add((distribution, DCAT.byteSize, - Literal(float(resource_dict['size']), - datatype=XSD.decimal))) - except (ValueError, TypeError): - g.add((distribution, DCAT.byteSize, - Literal(resource_dict['size']))) - # Checksum - if resource_dict.get('hash'): - checksum = BNode() - g.add((checksum, RDF.type, SPDX.Checksum)) - g.add((checksum, SPDX.checksumValue, - Literal(resource_dict['hash'], - datatype=XSD.hexBinary))) - - if resource_dict.get('hash_algorithm'): - g.add((checksum, SPDX.algorithm, - URIRefOrLiteral(resource_dict['hash_algorithm']))) - - g.add((distribution, SPDX.checksum, checksum)) - - def graph_from_catalog(self, catalog_dict, catalog_ref): - - g = self.g - - for prefix, namespace in namespaces.items(): - g.bind(prefix, namespace) - - g.add((catalog_ref, RDF.type, DCAT.Catalog)) - - # Basic fields - items = [ - ('title', DCT.title, config.get('ckan.site_title'), Literal), - ('description', DCT.description, config.get('ckan.site_description'), Literal), - ('homepage', FOAF.homepage, config.get('ckan.site_url'), URIRef), - ('language', DCT.language, config.get('ckan.locale_default', 'en'), URIRefOrLiteral), - ] - for item in items: - key, predicate, fallback, _type = item - if catalog_dict: - value = catalog_dict.get(key, fallback) - else: - value = fallback - if value: - g.add((catalog_ref, predicate, _type(value))) - - # Dates - modified = self._last_catalog_modification() - if modified: - self._add_date_triple(catalog_ref, DCT.modified, modified) - - -class 
EuropeanDCATAP2Profile(EuropeanDCATAPProfile): - ''' - An RDF profile based on the DCAT-AP 2 for data portals in Europe - - More information and specification: - - https://joinup.ec.europa.eu/asset/dcat_application_profile - - ''' - - def parse_dataset(self, dataset_dict, dataset_ref): - - # call super method - super(EuropeanDCATAP2Profile, self).parse_dataset(dataset_dict, dataset_ref) - - # Lists - for key, predicate in ( - ('temporal_resolution', DCAT.temporalResolution), - ('is_referenced_by', DCT.isReferencedBy), - ): - values = self._object_value_list(dataset_ref, predicate) - if values: - dataset_dict['extras'].append({'key': key, - 'value': json.dumps(values)}) - # Temporal - start, end = self._time_interval(dataset_ref, DCT.temporal, dcat_ap_version=2) - if start: - self._insert_or_update_temporal(dataset_dict, 'temporal_start', start) - if end: - self._insert_or_update_temporal(dataset_dict, 'temporal_end', end) - - # Spatial - spatial = self._spatial(dataset_ref, DCT.spatial) - for key in ('bbox', 'centroid'): - self._add_spatial_to_dict(dataset_dict, key, spatial) - - # Spatial resolution in meters - spatial_resolution_in_meters = self._object_value_int_list( - dataset_ref, DCAT.spatialResolutionInMeters) - if spatial_resolution_in_meters: - dataset_dict['extras'].append({'key': 'spatial_resolution_in_meters', - 'value': json.dumps(spatial_resolution_in_meters)}) - - # Resources - for distribution in self._distributions(dataset_ref): - distribution_ref = str(distribution) - for resource_dict in dataset_dict.get('resources', []): - # Match distribution in graph and distribution in resource dict - if resource_dict and distribution_ref == resource_dict.get('distribution_ref'): - # Simple values - for key, predicate in ( - ('availability', DCATAP.availability), - ('compress_format', DCAT.compressFormat), - ('package_format', DCAT.packageFormat), - ): - value = self._object_value(distribution, predicate) - if value: - resource_dict[key] = value - - return dataset_dict - - def graph_from_dataset(self, dataset_dict, dataset_ref): - - # call super method - super(EuropeanDCATAP2Profile, self).graph_from_dataset(dataset_dict, dataset_ref) - - # Lists - for key, predicate, fallbacks, type, datatype in ( - ('temporal_resolution', DCAT.temporalResolution, None, Literal, XSD.duration), - ('is_referenced_by', DCT.isReferencedBy, None, URIRefOrLiteral, None) - ): - self._add_triple_from_dict(dataset_dict, dataset_ref, predicate, key, list_value=True, - fallbacks=fallbacks, _type=type, _datatype=datatype) - - # Temporal - start = self._get_dataset_value(dataset_dict, 'temporal_start') - end = self._get_dataset_value(dataset_dict, 'temporal_end') - if start or end: - temporal_extent_dcat = BNode() - - self.g.add((temporal_extent_dcat, RDF.type, DCT.PeriodOfTime)) - if start: - self._add_date_triple(temporal_extent_dcat, DCAT.startDate, start) - if end: - self._add_date_triple(temporal_extent_dcat, DCAT.endDate, end) - self.g.add((dataset_ref, DCT.temporal, temporal_extent_dcat)) - - # spatial - spatial_bbox = self._get_dataset_value(dataset_dict, 'spatial_bbox') - spatial_cent = self._get_dataset_value(dataset_dict, 'spatial_centroid') - - if spatial_bbox or spatial_cent: - spatial_ref = self._get_or_create_spatial_ref(dataset_dict, dataset_ref) - - if spatial_bbox: - self._add_spatial_value_to_graph(spatial_ref, DCAT.bbox, spatial_bbox) - - if spatial_cent: - self._add_spatial_value_to_graph(spatial_ref, DCAT.centroid, spatial_cent) - - # Spatial resolution in meters - 
spatial_resolution_in_meters = self._read_list_value( - self._get_dataset_value(dataset_dict, 'spatial_resolution_in_meters')) - if spatial_resolution_in_meters: - for value in spatial_resolution_in_meters: - try: - self.g.add((dataset_ref, DCAT.spatialResolutionInMeters, - Literal(float(value), datatype=XSD.decimal))) - except (ValueError, TypeError): - self.g.add((dataset_ref, DCAT.spatialResolutionInMeters, Literal(value))) - - # Resources - for resource_dict in dataset_dict.get('resources', []): - - distribution = CleanedURIRef(resource_uri(resource_dict)) - - # Simple values - items = [ - ('availability', DCATAP.availability, None, URIRefOrLiteral), - ('compress_format', DCAT.compressFormat, None, URIRefOrLiteral), - ('package_format', DCAT.packageFormat, None, URIRefOrLiteral) - ] - - self._add_triples_from_dict(resource_dict, distribution, items) - - def graph_from_catalog(self, catalog_dict, catalog_ref): - - # call super method - super(EuropeanDCATAP2Profile, self).graph_from_catalog(catalog_dict, catalog_ref) - - -class SchemaOrgProfile(RDFProfile): - ''' - An RDF profile based on the schema.org Dataset - - More information and specification: - - http://schema.org/Dataset - - Mapping between schema.org Dataset and DCAT: - - https://www.w3.org/wiki/WebSchemas/Datasets - ''' - def graph_from_dataset(self, dataset_dict, dataset_ref): - - g = self.g - - # Namespaces - self._bind_namespaces() - - g.add((dataset_ref, RDF.type, SCHEMA.Dataset)) - - # Basic fields - self._basic_fields_graph(dataset_ref, dataset_dict) - - # Catalog - self._catalog_graph(dataset_ref, dataset_dict) - - # Groups - self._groups_graph(dataset_ref, dataset_dict) - - # Tags - self._tags_graph(dataset_ref, dataset_dict) - - # Lists - self._list_fields_graph(dataset_ref, dataset_dict) - - # Publisher - self._publisher_graph(dataset_ref, dataset_dict) - - # Temporal - self._temporal_graph(dataset_ref, dataset_dict) - - # Spatial - self._spatial_graph(dataset_ref, dataset_dict) - - # Resources - self._resources_graph(dataset_ref, dataset_dict) - - # Additional fields - self.additional_fields(dataset_ref, dataset_dict) - - def additional_fields(self, dataset_ref, dataset_dict): - ''' - Adds any additional fields. - - For a custom schema you should extend this class and - implement this method. - ''' - pass - - def _add_date_triple(self, subject, predicate, value, _type=Literal): - ''' - Adds a new triple with a date object - - Dates are parsed using dateutil, and if the date obtained is correct, - added to the graph as an SCHEMA.DateTime value. - - If there are parsing errors, the literal string value is added. 
- ''' - if not value: - return - try: - default_datetime = datetime.datetime(1, 1, 1, 0, 0, 0) - _date = parse_date(value, default=default_datetime) - - self.g.add((subject, predicate, _type(_date.isoformat()))) - except ValueError: - self.g.add((subject, predicate, _type(value))) - - def _bind_namespaces(self): - self.g.namespace_manager.bind('schema', namespaces['schema'], replace=True) - - def _basic_fields_graph(self, dataset_ref, dataset_dict): - items = [ - ('identifier', SCHEMA.identifier, None, Literal), - ('title', SCHEMA.name, None, Literal), - ('notes', SCHEMA.description, None, Literal), - ('version', SCHEMA.version, ['dcat_version'], Literal), - ('issued', SCHEMA.datePublished, ['metadata_created'], Literal), - ('modified', SCHEMA.dateModified, ['metadata_modified'], Literal), - ('license', SCHEMA.license, ['license_url', 'license_title'], Literal), - ] - self._add_triples_from_dict(dataset_dict, dataset_ref, items) - - items = [ - ('issued', SCHEMA.datePublished, ['metadata_created'], Literal), - ('modified', SCHEMA.dateModified, ['metadata_modified'], Literal), - ] - - self._add_date_triples_from_dict(dataset_dict, dataset_ref, items) - - # Dataset URL - dataset_url = url_for('dataset.read', - id=dataset_dict['name'], - _external=True) - self.g.add((dataset_ref, SCHEMA.url, Literal(dataset_url))) - - def _catalog_graph(self, dataset_ref, dataset_dict): - data_catalog = BNode() - self.g.add((dataset_ref, SCHEMA.includedInDataCatalog, data_catalog)) - self.g.add((data_catalog, RDF.type, SCHEMA.DataCatalog)) - self.g.add((data_catalog, SCHEMA.name, Literal(config.get('ckan.site_title')))) - self.g.add((data_catalog, SCHEMA.description, Literal(config.get('ckan.site_description')))) - self.g.add((data_catalog, SCHEMA.url, Literal(config.get('ckan.site_url')))) - - def _groups_graph(self, dataset_ref, dataset_dict): - for group in dataset_dict.get('groups', []): - group_url = url_for(controller='group', - action='read', - id=group.get('id'), - _external=True) - about = BNode() - - self.g.add((about, RDF.type, SCHEMA.Thing)) - - self.g.add((about, SCHEMA.name, Literal(group['name']))) - self.g.add((about, SCHEMA.url, Literal(group_url))) - - self.g.add((dataset_ref, SCHEMA.about, about)) - - def _tags_graph(self, dataset_ref, dataset_dict): - for tag in dataset_dict.get('tags', []): - self.g.add((dataset_ref, SCHEMA.keywords, Literal(tag['name']))) - - def _list_fields_graph(self, dataset_ref, dataset_dict): - items = [ - ('language', SCHEMA.inLanguage, None, Literal), - ] - self._add_list_triples_from_dict(dataset_dict, dataset_ref, items) - - def _publisher_graph(self, dataset_ref, dataset_dict): - if any([ - self._get_dataset_value(dataset_dict, 'publisher_uri'), - self._get_dataset_value(dataset_dict, 'publisher_name'), - dataset_dict.get('organization'), - ]): - - publisher_uri = self._get_dataset_value(dataset_dict, 'publisher_uri') - publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict) - publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name') - if publisher_uri: - publisher_details = CleanedURIRef(publisher_uri) - elif not publisher_name and publisher_uri_fallback: - # neither URI nor name are available, use organization as fallback - publisher_details = CleanedURIRef(publisher_uri_fallback) - else: - # No publisher_uri - publisher_details = BNode() - - self.g.add((publisher_details, RDF.type, SCHEMA.Organization)) - self.g.add((dataset_ref, SCHEMA.publisher, publisher_details)) - - # In case no name and URI are available, again fall 
back to organization. - # If no name but an URI is available, the name literal remains empty to - # avoid mixing organization and dataset values. - if not publisher_name and not publisher_uri and dataset_dict.get('organization'): - publisher_name = dataset_dict['organization']['title'] - self.g.add((publisher_details, SCHEMA.name, Literal(publisher_name))) - - contact_point = BNode() - self.g.add((contact_point, RDF.type, SCHEMA.ContactPoint)) - self.g.add((publisher_details, SCHEMA.contactPoint, contact_point)) - - self.g.add((contact_point, SCHEMA.contactType, Literal('customer service'))) - - publisher_url = self._get_dataset_value(dataset_dict, 'publisher_url') - if not publisher_url and dataset_dict.get('organization'): - publisher_url = dataset_dict['organization'].get('url') or config.get('ckan.site_url') - - self.g.add((contact_point, SCHEMA.url, Literal(publisher_url))) - items = [ - ('publisher_email', SCHEMA.email, ['contact_email', 'maintainer_email', 'author_email'], Literal), - ('publisher_name', SCHEMA.name, ['contact_name', 'maintainer', 'author'], Literal), - ] - - self._add_triples_from_dict(dataset_dict, contact_point, items) - - def _temporal_graph(self, dataset_ref, dataset_dict): - start = self._get_dataset_value(dataset_dict, 'temporal_start') - end = self._get_dataset_value(dataset_dict, 'temporal_end') - if start or end: - if start and end: - self.g.add((dataset_ref, SCHEMA.temporalCoverage, Literal('%s/%s' % (start, end)))) - elif start: - self._add_date_triple(dataset_ref, SCHEMA.temporalCoverage, start) - elif end: - self._add_date_triple(dataset_ref, SCHEMA.temporalCoverage, end) - - def _spatial_graph(self, dataset_ref, dataset_dict): - spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri') - spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text') - spatial_geom = self._get_dataset_value(dataset_dict, 'spatial') - - if spatial_uri or spatial_text or spatial_geom: - if spatial_uri: - spatial_ref = URIRef(spatial_uri) - else: - spatial_ref = BNode() - - self.g.add((spatial_ref, RDF.type, SCHEMA.Place)) - self.g.add((dataset_ref, SCHEMA.spatialCoverage, spatial_ref)) - - if spatial_text: - self.g.add((spatial_ref, SCHEMA.description, Literal(spatial_text))) - - if spatial_geom: - geo_shape = BNode() - self.g.add((geo_shape, RDF.type, SCHEMA.GeoShape)) - self.g.add((spatial_ref, SCHEMA.geo, geo_shape)) - - # the spatial_geom typically contains GeoJSON - self.g.add((geo_shape, - SCHEMA.polygon, - Literal(spatial_geom))) - - def _resources_graph(self, dataset_ref, dataset_dict): - g = self.g - for resource_dict in dataset_dict.get('resources', []): - distribution = URIRef(resource_uri(resource_dict)) - g.add((dataset_ref, SCHEMA.distribution, distribution)) - g.add((distribution, RDF.type, SCHEMA.DataDownload)) - - self._distribution_graph(distribution, resource_dict) - - def _distribution_graph(self, distribution, resource_dict): - # Simple values - self._distribution_basic_fields_graph(distribution, resource_dict) - - # Lists - self._distribution_list_fields_graph(distribution, resource_dict) - - # Format - self._distribution_format_graph(distribution, resource_dict) - - # URL - self._distribution_url_graph(distribution, resource_dict) - - # Numbers - self._distribution_numbers_graph(distribution, resource_dict) - - def _distribution_basic_fields_graph(self, distribution, resource_dict): - items = [ - ('name', SCHEMA.name, None, Literal), - ('description', SCHEMA.description, None, Literal), - ('license', SCHEMA.license, ['rights'], 
Literal), - ] - - self._add_triples_from_dict(resource_dict, distribution, items) - - items = [ - ('issued', SCHEMA.datePublished, None, Literal), - ('modified', SCHEMA.dateModified, None, Literal), - ] - - self._add_date_triples_from_dict(resource_dict, distribution, items) - - def _distribution_list_fields_graph(self, distribution, resource_dict): - items = [ - ('language', SCHEMA.inLanguage, None, Literal), - ] - self._add_list_triples_from_dict(resource_dict, distribution, items) - - def _distribution_format_graph(self, distribution, resource_dict): - if resource_dict.get('format'): - self.g.add((distribution, SCHEMA.encodingFormat, - Literal(resource_dict['format']))) - elif resource_dict.get('mimetype'): - self.g.add((distribution, SCHEMA.encodingFormat, - Literal(resource_dict['mimetype']))) - - def _distribution_url_graph(self, distribution, resource_dict): - url = resource_dict.get('url') - download_url = resource_dict.get('download_url') - if download_url: - self.g.add((distribution, SCHEMA.contentUrl, Literal(download_url))) - if (url and not download_url) or (url and url != download_url): - self.g.add((distribution, SCHEMA.url, Literal(url))) - - def _distribution_numbers_graph(self, distribution, resource_dict): - if resource_dict.get('size'): - self.g.add((distribution, SCHEMA.contentSize, Literal(resource_dict['size']))) diff --git a/ckanext/dcat/profiles/__init__.py b/ckanext/dcat/profiles/__init__.py new file mode 100644 index 00000000..a80a48c6 --- /dev/null +++ b/ckanext/dcat/profiles/__init__.py @@ -0,0 +1,24 @@ +from .base import RDFProfile, CleanedURIRef +from .base import ( + RDF, + XSD, + SKOS, + RDFS, + DCAT, + DCATAP, + DCT, + ADMS, + VCARD, + FOAF, + SCHEMA, + LOCN, + GSP, + OWL, + SPDX, + GEOJSON_IMT, +) + +from .euro_dcat_ap import EuropeanDCATAPProfile +from .euro_dcat_ap_2 import EuropeanDCATAP2Profile +from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile +from .schemaorg import SchemaOrgProfile diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py new file mode 100644 index 00000000..0b307bdd --- /dev/null +++ b/ckanext/dcat/profiles/base.py @@ -0,0 +1,1152 @@ +import datetime +import json +from urllib.parse import quote + +from dateutil.parser import parse as parse_date +from rdflib import term, URIRef, BNode, Literal +from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS +from geomet import wkt, InvalidGeoJSONException + +from ckantoolkit import config, url_for, asbool, aslist, get_action, ObjectNotFound +from ckan.model.license import LicenseRegister +from ckan.lib.helpers import resource_formats +from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS +from ckanext.dcat.validators import is_year, is_year_month, is_date + +DCT = Namespace("http://purl.org/dc/terms/") +DCAT = Namespace("http://www.w3.org/ns/dcat#") +DCATAP = Namespace("http://data.europa.eu/r5r/") +ADMS = Namespace("http://www.w3.org/ns/adms#") +VCARD = Namespace("http://www.w3.org/2006/vcard/ns#") +FOAF = Namespace("http://xmlns.com/foaf/0.1/") +SCHEMA = Namespace("http://schema.org/") +TIME = Namespace("http://www.w3.org/2006/time") +LOCN = Namespace("http://www.w3.org/ns/locn#") +GSP = Namespace("http://www.opengis.net/ont/geosparql#") +OWL = Namespace("http://www.w3.org/2002/07/owl#") +SPDX = Namespace("http://spdx.org/rdf/terms#") + +namespaces = { + "dct": DCT, + "dcat": DCAT, + "dcatap": DCATAP, + "adms": ADMS, + "vcard": VCARD, + "foaf": FOAF, + "schema": SCHEMA, + "time": TIME, + "skos": SKOS, + "locn": LOCN, + "gsp": GSP, + "owl": OWL, + 
"spdx": SPDX, +} + +PREFIX_MAILTO = "mailto:" + +GEOJSON_IMT = "https://www.iana.org/assignments/media-types/application/vnd.geo+json" + +DEFAULT_SPATIAL_FORMATS = ["wkt"] + +ROOT_DATASET_FIELDS = [ + 'name', + 'title', + 'url', + 'version', + 'tags', + 'license_id', + 'maintainer', + 'maintainer_email', + 'author', + 'author_email', +] + + +class URIRefOrLiteral(object): + """Helper which creates an URIRef if the value appears to be an http URL, + or a Literal otherwise. URIRefs are also cleaned using CleanedURIRef. + + Like CleanedURIRef, this is a factory class. + """ + + def __new__(cls, value): + try: + stripped_value = value.strip() + if isinstance(value, str) and ( + stripped_value.startswith("http://") + or stripped_value.startswith("https://") + ): + uri_obj = CleanedURIRef(value) + # although all invalid chars checked by rdflib should have been quoted, try to serialize + # the object. If it breaks, use Literal instead. + uri_obj.n3() + # URI is fine, return the object + return uri_obj + else: + return Literal(value) + except Exception: + # In case something goes wrong: use Literal + return Literal(value) + + +class CleanedURIRef(object): + """Performs some basic URL encoding on value before creating an URIRef object. + + This is a factory for URIRef objects, which allows usage as type in graph.add() + without affecting the resulting node types. That is, + g.add(..., URIRef) and g.add(..., CleanedURIRef) will result in the exact same node type. + """ + + @staticmethod + def _careful_quote(value): + # only encode this limited subset of characters to avoid more complex URL parsing + # (e.g. valid ? in query string vs. ? as value). + # can be applied multiple times, as encoded %xy is left untouched. Therefore, no + # unquote is necessary beforehand. + quotechars = " !\"$'()*,;<>[]{|}\\^`" + for c in quotechars: + value = value.replace(c, quote(c)) + return value + + def __new__(cls, value): + if isinstance(value, str): + value = CleanedURIRef._careful_quote(value.strip()) + return URIRef(value) + + +class RDFProfile(object): + """Base class with helper methods for implementing RDF parsing profiles + + This class should not be used directly, but rather extended to create + custom profiles + """ + + _dataset_schema = None + + # Cache for mappings of licenses URL/title to ID built when needed in + # _license(). + _licenceregister_cache = None + + # Cache for organization_show details (used for publisher fallback) + _org_cache: dict = {} + + def __init__(self, graph, dataset_type="dataset", compatibility_mode=False): + """Class constructor + Graph is an rdflib.Graph instance. + A scheming dataset type can be provided, in which case the scheming schema + will be loaded so it can be used by profiles. + In compatibility mode, some fields are modified to maintain + compatibility with previous versions of the ckanext-dcat parsers + (eg adding the `dcat_` prefix or storing comma separated lists instead + of JSON dumps). 
+ """ + + self.g = graph + + self.compatibility_mode = compatibility_mode + + try: + schema_show = get_action("scheming_dataset_schema_show") + try: + schema = schema_show({}, {"type": dataset_type}) + except ObjectNotFound: + raise ObjectNotFound(f"Unknown dataset schema: {dataset_type}") + + self._dataset_schema = schema + + except KeyError: + pass + + def _datasets(self): + """ + Generator that returns all DCAT datasets on the graph + + Yields term.URIRef objects that can be used on graph lookups + and queries + """ + for dataset in self.g.subjects(RDF.type, DCAT.Dataset): + yield dataset + + def _distributions(self, dataset): + """ + Generator that returns all DCAT distributions on a particular dataset + + Yields term.URIRef objects that can be used on graph lookups + and queries + """ + for distribution in self.g.objects(dataset, DCAT.distribution): + yield distribution + + def _keywords(self, dataset_ref): + """ + Returns all DCAT keywords on a particular dataset + """ + keywords = self._object_value_list(dataset_ref, DCAT.keyword) or [] + # Split keywords with commas + keywords_with_commas = [k for k in keywords if "," in k] + for keyword in keywords_with_commas: + keywords.remove(keyword) + keywords.extend([k.strip() for k in keyword.split(",")]) + return keywords + + def _object(self, subject, predicate): + """ + Helper for returning the first object for this subject and predicate + + Both subject and predicate must be rdflib URIRef or BNode objects + + Returns an rdflib reference (URIRef or BNode) or None if not found + """ + for _object in self.g.objects(subject, predicate): + return _object + return None + + def _object_value(self, subject, predicate): + """ + Given a subject and a predicate, returns the value of the object + + Both subject and predicate must be rdflib URIRef or BNode objects + + If found, the string representation is returned, else an empty string + """ + default_lang = config.get("ckan.locale_default", "en") + fallback = "" + for o in self.g.objects(subject, predicate): + if isinstance(o, Literal): + if o.language and o.language == default_lang: + return str(o) + # Use first object as fallback if no object with the default language is available + elif fallback == "": + fallback = str(o) + else: + return str(o) + return fallback + + def _object_value_multiple_predicate(self, subject, predicates): + """ + Given a subject and a list of predicates, returns the value of the object + according to the order in which it was specified. 
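The URIRefOrLiteral and CleanedURIRef factories defined earlier in this file decide between URI and literal nodes and percent-encode unsafe characters before building a URIRef. A minimal illustrative sketch of their behaviour (not part of the patch; assumes the patched extension is importable):

from rdflib import URIRef, Literal
from ckanext.dcat.profiles.base import URIRefOrLiteral, CleanedURIRef

# http(s) strings become URIRef nodes, anything else becomes a Literal
assert isinstance(URIRefOrLiteral("https://example.org/dataset/1"), URIRef)
assert isinstance(URIRefOrLiteral("Just a plain title"), Literal)
# Spaces and other unsafe characters are percent-encoded before the node is built
assert str(CleanedURIRef("http://example.org/a dataset")) == "http://example.org/a%20dataset"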
+ + Both subject and predicates must be rdflib URIRef or BNode objects + + If found, the string representation is returned, else an empty string + """ + object_value = "" + for predicate in predicates: + object_value = self._object_value(subject, predicate) + if object_value: + break + + return object_value + + def _object_value_int(self, subject, predicate): + """ + Given a subject and a predicate, returns the value of the object as an + integer + + Both subject and predicate must be rdflib URIRef or BNode objects + + If the value can not be parsed as integer, returns None + """ + object_value = self._object_value(subject, predicate) + if object_value: + try: + return int(float(object_value)) + except ValueError: + pass + return None + + def _object_value_int_list(self, subject, predicate): + """ + Given a subject and a predicate, returns the value of the object as a + list of integers + + Both subject and predicate must be rdflib URIRef or BNode objects + + If the value can not be parsed as integer, returns an empty list + """ + object_values = [] + for object in self.g.objects(subject, predicate): + if object: + try: + object_values.append(int(float(object))) + except ValueError: + pass + return object_values + + def _object_value_float_list(self, subject, predicate): + """ + Given a subject and a predicate, returns the value of the object as a + list of floats + + Both subject and predicate must be rdflib URIRef or BNode objects + + If the value can not be parsed as a float, returns an empty list + """ + object_values = [] + for object in self.g.objects(subject, predicate): + if object: + try: + object_values.append(float(object)) + except ValueError: + pass + return object_values + + def _object_value_list(self, subject, predicate): + """ + Given a subject and a predicate, returns a list with all the values of + the objects + + Both subject and predicate must be rdflib URIRef or BNode objects + + If no values are found, returns an empty list + """ + return [str(o) for o in self.g.objects(subject, predicate)] + + def _get_vcard_property_value( + self, subject, predicate, predicate_string_property=None + ): + """ + Given a subject, a predicate and a predicate for the simple string property (optional), + returns the value of the object. Trying to read the value in the following order + * predicate_string_property + * predicate + + All subject, predicate and predicate_string_property must be rdflib URIRef or BNode objects + + If no value is found, returns an empty string + """ + + result = "" + if predicate_string_property: + result = self._object_value(subject, predicate_string_property) + + if not result: + obj = self._object(subject, predicate) + if isinstance(obj, BNode): + result = self._object_value(obj, VCARD.hasValue) + else: + result = self._object_value(subject, predicate) + + return result + + def _time_interval(self, subject, predicate, dcat_ap_version=1): + """ + Returns the start and end date for a time interval object + + Both subject and predicate must be rdflib URIRef or BNode objects + + It checks for time intervals defined with DCAT, W3C Time hasBeginning & hasEnd + and schema.org startDate & endDate. + + Note that partial dates will be expanded to the first month / day + value, eg '1904' -> '1904-01-01'.
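One of the interval forms `_time_interval` understands, expressed as Turtle for reference (illustrative values only, not part of the patch; the DCAT-AP 2 branch reads `dcat:startDate` / `dcat:endDate` on the `dct:temporal` node):

from rdflib import Graph

TEMPORAL_TTL = """
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dct:  <http://purl.org/dc/terms/> .
@prefix xsd:  <http://www.w3.org/2001/XMLSchema#> .

<https://example.org/dataset/1> dct:temporal [
    a dct:PeriodOfTime ;
    dcat:startDate "1904-01-01"^^xsd:date ;
    dcat:endDate   "1905-12-31"^^xsd:date
] .
"""

g = Graph().parse(data=TEMPORAL_TTL, format="turtle")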
+ + Returns a tuple with the start and end date values, both of which + can be None if not found + """ + + start_date = end_date = None + + if dcat_ap_version == 1: + start_date, end_date = self._read_time_interval_schema_org( + subject, predicate + ) + if start_date or end_date: + return start_date, end_date + return self._read_time_interval_time(subject, predicate) + elif dcat_ap_version == 2: + start_date, end_date = self._read_time_interval_dcat(subject, predicate) + if start_date or end_date: + return start_date, end_date + start_date, end_date = self._read_time_interval_time(subject, predicate) + if start_date or end_date: + return start_date, end_date + return self._read_time_interval_schema_org(subject, predicate) + + def _read_time_interval_schema_org(self, subject, predicate): + start_date = end_date = None + + for interval in self.g.objects(subject, predicate): + start_date = self._object_value(interval, SCHEMA.startDate) + end_date = self._object_value(interval, SCHEMA.endDate) + + if start_date or end_date: + return start_date, end_date + + return start_date, end_date + + def _read_time_interval_dcat(self, subject, predicate): + start_date = end_date = None + + for interval in self.g.objects(subject, predicate): + start_date = self._object_value(interval, DCAT.startDate) + end_date = self._object_value(interval, DCAT.endDate) + + if start_date or end_date: + return start_date, end_date + + return start_date, end_date + + def _read_time_interval_time(self, subject, predicate): + start_date = end_date = None + + for interval in self.g.objects(subject, predicate): + start_nodes = [t for t in self.g.objects(interval, TIME.hasBeginning)] + end_nodes = [t for t in self.g.objects(interval, TIME.hasEnd)] + if start_nodes: + start_date = self._object_value_multiple_predicate( + start_nodes[0], + [TIME.inXSDDateTimeStamp, TIME.inXSDDateTime, TIME.inXSDDate], + ) + if end_nodes: + end_date = self._object_value_multiple_predicate( + end_nodes[0], + [TIME.inXSDDateTimeStamp, TIME.inXSDDateTime, TIME.inXSDDate], + ) + + if start_date or end_date: + return start_date, end_date + + return start_date, end_date + + def _insert_or_update_temporal(self, dataset_dict, key, value): + temporal = next( + (item for item in dataset_dict["extras"] if (item["key"] == key)), None + ) + if temporal: + temporal["value"] = value + else: + dataset_dict["extras"].append({"key": key, "value": value}) + + def _publisher(self, subject, predicate): + """ + Returns a dict with details about a dct:publisher entity, a foaf:Agent + + Both subject and predicate must be rdflib URIRef or BNode objects + + Examples: + + + + Publishing Organization for dataset 1 + contact@some.org + http://some.org + + + + + { + 'uri': 'http://orgs.vocab.org/some-org', + 'name': 'Publishing Organization for dataset 1', + 'email': 'contact@some.org', + 'url': 'http://some.org', + 'type': 'http://purl.org/adms/publishertype/NonProfitOrganisation', + } + + + + { + 'uri': 'http://publications.europa.eu/resource/authority/corporate-body/EURCOU' + } + + Returns keys for uri, name, email, url and type with the values set to + an empty string if they could not be found + """ + + publisher = {} + + for agent in self.g.objects(subject, predicate): + + publisher["uri"] = str(agent) if isinstance(agent, term.URIRef) else "" + + publisher["name"] = self._object_value(agent, FOAF.name) + + publisher["email"] = self._object_value(agent, FOAF.mbox) + + publisher["url"] = self._object_value(agent, FOAF.homepage) + + publisher["type"] = 
self._object_value(agent, DCT.type) + + return publisher + + def _contact_details(self, subject, predicate): + """ + Returns a dict with details about a vcard expression + + Both subject and predicate must be rdflib URIRef or BNode objects + + Returns keys for uri, name and email with the values set to + an empty string if they could not be found + """ + + contact = {} + + for agent in self.g.objects(subject, predicate): + + contact["uri"] = str(agent) if isinstance(agent, term.URIRef) else "" + + contact["name"] = self._get_vcard_property_value( + agent, VCARD.hasFN, VCARD.fn + ) + + contact["email"] = self._without_mailto( + self._get_vcard_property_value(agent, VCARD.hasEmail) + ) + + return contact + + def _parse_geodata(self, spatial, datatype, cur_value): + """ + Extract geodata with the given datatype from the spatial data and check if it contains a valid GeoJSON + or WKT geometry. + + Returns the String or None if the value is no valid GeoJSON or WKT geometry. + """ + for geometry in self.g.objects(spatial, datatype): + if geometry.datatype == URIRef(GEOJSON_IMT) or not geometry.datatype: + try: + json.loads(str(geometry)) + cur_value = str(geometry) + except (ValueError, TypeError): + pass + if not cur_value and geometry.datatype == GSP.wktLiteral: + try: + cur_value = json.dumps(wkt.loads(str(geometry))) + except (ValueError, TypeError): + pass + return cur_value + + def _spatial(self, subject, predicate): + """ + Returns a dict with details about the spatial location + + Both subject and predicate must be rdflib URIRef or BNode objects + + Returns keys for uri, text or geom with the values set to + None if they could not be found. + + Geometries are always returned in GeoJSON. If only WKT is provided, + it will be transformed to GeoJSON. + + Check the notes on the README for the supported formats: + + https://github.com/ckan/ckanext-dcat/#rdf-dcat-to-ckan-dataset-mapping + """ + + uri = None + text = None + geom = None + bbox = None + cent = None + + for spatial in self.g.objects(subject, predicate): + + if isinstance(spatial, URIRef): + uri = str(spatial) + + if isinstance(spatial, Literal): + text = str(spatial) + + if (spatial, RDF.type, DCT.Location) in self.g: + geom = self._parse_geodata(spatial, LOCN.geometry, geom) + bbox = self._parse_geodata(spatial, DCAT.bbox, bbox) + cent = self._parse_geodata(spatial, DCAT.centroid, cent) + for label in self.g.objects(spatial, SKOS.prefLabel): + text = str(label) + for label in self.g.objects(spatial, RDFS.label): + text = str(label) + + return { + "uri": uri, + "text": text, + "geom": geom, + "bbox": bbox, + "centroid": cent, + } + + def _license(self, dataset_ref): + """ + Returns a license identifier if one of the distributions license is + found in CKAN license registry. If no distribution's license matches, + an empty string is returned. + + The first distribution with a license found in the registry is used so + that if distributions have different licenses we'll only get the first + one. 
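When a `dct:Location` only carries a `gsp:wktLiteral`, `_parse_geodata` falls back to converting the WKT geometry to GeoJSON with geomet (already a dependency of this module). That conversion in isolation, as an illustrative snippet outside the patch:

import json
from geomet import wkt

# WKT in, GeoJSON out, as _parse_geodata does before storing the value
geom = wkt.loads("POINT (10.0 53.5)")
print(json.dumps(geom))  # {"type": "Point", "coordinates": [10.0, 53.5]}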
+ """ + if self._licenceregister_cache is not None: + license_uri2id, license_title2id = self._licenceregister_cache + else: + license_uri2id = {} + license_title2id = {} + for license_id, license in list(LicenseRegister().items()): + license_uri2id[license.url] = license_id + license_title2id[license.title] = license_id + self._licenceregister_cache = license_uri2id, license_title2id + + for distribution in self._distributions(dataset_ref): + # If distribution has a license, attach it to the dataset + license = self._object(distribution, DCT.license) + if license: + # Try to find a matching license comparing URIs, then titles + license_id = license_uri2id.get(license.toPython()) + if not license_id: + license_id = license_title2id.get( + self._object_value(license, DCT.title) + ) + if license_id: + return license_id + return "" + + def _access_rights(self, subject, predicate): + """ + Returns the rights statement or an empty string if no one is found. + """ + + result = "" + obj = self._object(subject, predicate) + if obj: + if ( + isinstance(obj, BNode) + and self._object(obj, RDF.type) == DCT.RightsStatement + ): + result = self._object_value(obj, RDFS.label) + elif isinstance(obj, Literal) or isinstance(obj, URIRef): + # unicode_safe not include Literal or URIRef + result = str(obj) + return result + + def _distribution_format(self, distribution, normalize_ckan_format=True): + """ + Returns the Internet Media Type and format label for a distribution + + Given a reference (URIRef or BNode) to a dcat:Distribution, it will + try to extract the media type (previously knowm as MIME type), eg + `text/csv`, and the format label, eg `CSV` + + Values for the media type will be checked in the following order: + + 1. literal value of dcat:mediaType + 2. literal value of dct:format if it contains a '/' character + 3. value of dct:format if it is an instance of dct:IMT, eg: + + + + + 4. value of dct:format if it is an URIRef and appears to be an IANA type + + Values for the label will be checked in the following order: + + 1. literal value of dct:format if it not contains a '/' character + 2. label of dct:format if it is an instance of dct:IMT (see above) + 3. value of dct:format if it is an URIRef and doesn't look like an IANA type + + If `normalize_ckan_format` is True the label will + be tried to match against the standard list of formats that is included + with CKAN core + (https://github.com/ckan/ckan/blob/master/ckan/config/resource_formats.json) + This allows for instance to populate the CKAN resource format field + with a format that view plugins, etc will understand (`csv`, `xml`, + etc.) + + Return a tuple with the media type and the label, both set to None if + they couldn't be found. + """ + + imt = None + label = None + + imt = self._object_value(distribution, DCAT.mediaType) + + _format = self._object(distribution, DCT["format"]) + if isinstance(_format, Literal): + if not imt and "/" in _format: + imt = str(_format) + else: + label = str(_format) + elif isinstance(_format, (BNode, URIRef)): + if self._object(_format, RDF.type) == DCT.IMT: + if not imt: + imt = str(self.g.value(_format, default=None)) + label = self._object_value(_format, RDFS.label) + elif isinstance(_format, URIRef): + # If the URIRef does not reference a BNode, it could reference an IANA type. + # Otherwise, use it as label. 
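For reference, the `dct:IMT` pattern mentioned in point 3 of the media-type checking order above typically looks like the RDF/XML fragment below (illustrative values, not taken from the patch); the code reads `rdf:value` for the media type and `rdfs:label` for the label:

# Fragment only, shown as a Python string for readability; namespace
# declarations for dct, rdf and rdfs are assumed to exist in the full document.
DCT_IMT_FORMAT = """
<dct:format>
  <dct:IMT rdf:value="text/csv" rdfs:label="CSV"/>
</dct:format>
"""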
+ format_uri = str(_format) + if "iana.org/assignments/media-types" in format_uri and not imt: + imt = format_uri + else: + label = format_uri + + if (imt or label) and normalize_ckan_format: + + format_registry = resource_formats() + + if imt in format_registry: + label = format_registry[imt][1] + elif label in format_registry: + label = format_registry[label][1] + + return imt, label + + def _get_dict_value(self, _dict, key, default=None): + """ + Returns the value for the given key on a CKAN dict + + By default a key on the root level is checked. If not found, extras + are checked, both with the key provided and with `dcat_` prepended to + support legacy fields. + + If not found, returns the default value, which defaults to None + """ + + if key in _dict: + return _dict[key] + + for extra in _dict.get("extras", []): + if extra["key"] == key or extra["key"] == "dcat_" + key: + return extra["value"] + + return default + + def _read_list_value(self, value): + items = [] + # List of values + if isinstance(value, list): + items = value + elif value and isinstance(value, str): + try: + items = json.loads(value) + if isinstance(items, ((int, float, complex))): + items = [items] # JSON list + except ValueError: + if "," in value: + # Comma-separated list + items = value.split(",") + else: + items = [value] # Normal text value + return items + + def _add_spatial_value_to_graph(self, spatial_ref, predicate, value): + """ + Adds spatial triples to the graph. Assumes that value is a GeoJSON string + or object. + """ + spatial_formats = aslist( + config.get( + "ckanext.dcat.output_spatial_format", DEFAULT_SPATIAL_FORMATS + ) + ) + + if isinstance(value, str): + try: + value = json.loads(value) + except (TypeError, ValueError): + return + + if "wkt" in spatial_formats: + # WKT, because GeoDCAT-AP says so + try: + self.g.add( + ( + spatial_ref, + predicate, + Literal( + wkt.dumps(value, decimals=4), + datatype=GSP.wktLiteral, + ), + ) + ) + except (TypeError, ValueError, InvalidGeoJSONException): + pass + + if "geojson" in spatial_formats: + # GeoJSON + self.g.add((spatial_ref, predicate, Literal(json.dumps(value), datatype=GEOJSON_IMT))) + + + def _add_spatial_to_dict(self, dataset_dict, key, spatial): + if spatial.get(key): + dataset_dict["extras"].append( + { + "key": "spatial_{0}".format(key) if key != "geom" else "spatial", + "value": spatial.get(key), + } + ) + + def _schema_field(self, key): + """ + Returns the schema field information if the provided key exists as a field in + the dataset schema (if one was provided) + """ + if not self._dataset_schema: + return None + + for field in self._dataset_schema["dataset_fields"]: + if field["field_name"] == key: + return field + + def _schema_resource_field(self, key): + """ + Returns the schema field information if the provided key exists as a field in + the resources fields of the dataset schema (if one was provided) + """ + if not self._dataset_schema: + return None + + for field in self._dataset_schema["resource_fields"]: + if field["field_name"] == key: + return field + + def _set_dataset_value(self, dataset_dict, key, value): + """ + Sets the value for a given key in a CKAN dataset dict + If a dataset schema was provided, the schema will be checked to see if + a custom field is present for the key. If so the key will be stored at + the dict root level, otherwise it will be stored as an extra. + Standard CKAN fields (defined in ROOT_DATASET_FIELDS) are always stored + at the root level. 
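`_read_list_value` above accepts real lists, JSON-encoded lists, legacy comma-separated strings and plain strings. A standalone mirror of that behaviour (illustrative only, not the extension's own function):

import json


def read_list_value(value):
    # Standalone mirror of RDFProfile._read_list_value for illustration
    if isinstance(value, list):
        return value
    if value and isinstance(value, str):
        try:
            items = json.loads(value)
            return [items] if isinstance(items, (int, float, complex)) else items
        except ValueError:
            return value.split(",") if "," in value else [value]
    return []


assert read_list_value('["en", "es"]') == ["en", "es"]
assert read_list_value("en,es") == ["en", "es"]
assert read_list_value("en") == ["en"]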
+ """ + if self._schema_field(key) or key in ROOT_DATASET_FIELDS: + dataset_dict[key] = value + else: + if not dataset_dict.get("extras"): + dataset_dict["extras"] = [] + dataset_dict["extras"].append({"key": key, "value": value}) + + return dataset_dict + + def _set_list_dataset_value(self, dataset_dict, key, value): + schema_field = self._schema_field(key) + if schema_field and "scheming_multiple_text" in schema_field["validators"]: + return self._set_dataset_value(dataset_dict, key, value) + else: + return self._set_dataset_value(dataset_dict, key, json.dumps(value)) + + def _set_list_resource_value(self, resource_dict, key, value): + schema_field = self._schema_resource_field(key) + if schema_field and "scheming_multiple_text" in schema_field["validators"]: + resource_dict[key] = value + else: + resource_dict[key] = json.dumps(value) + + return resource_dict + + def _get_dataset_value(self, dataset_dict, key, default=None): + """ + Returns the value for the given key on a CKAN dict + + Check `_get_dict_value` for details + """ + return self._get_dict_value(dataset_dict, key, default) + + def _get_resource_value(self, resource_dict, key, default=None): + """ + Returns the value for the given key on a CKAN dict + + Check `_get_dict_value` for details + """ + return self._get_dict_value(resource_dict, key, default) + + def _add_date_triples_from_dict(self, _dict, subject, items): + self._add_triples_from_dict(_dict, subject, items, date_value=True) + + def _add_list_triples_from_dict(self, _dict, subject, items): + self._add_triples_from_dict(_dict, subject, items, list_value=True) + + def _add_triples_from_dict( + self, _dict, subject, items, list_value=False, date_value=False + ): + for item in items: + key, predicate, fallbacks, _type = item + self._add_triple_from_dict( + _dict, + subject, + predicate, + key, + fallbacks=fallbacks, + list_value=list_value, + date_value=date_value, + _type=_type, + ) + + def _add_triple_from_dict( + self, + _dict, + subject, + predicate, + key, + fallbacks=None, + list_value=False, + date_value=False, + _type=Literal, + _datatype=None, + value_modifier=None, + ): + """ + Adds a new triple to the graph with the provided parameters + + The subject and predicate of the triple are passed as the relevant + RDFLib objects (URIRef or BNode). As default, the object is a + literal value, which is extracted from the dict using the provided key + (see `_get_dict_value`). If the value for the key is not found, then + additional fallback keys are checked. + Using `value_modifier`, a function taking the extracted value and + returning a modified value can be passed. + If a value was found, the modifier is applied before adding the value. + + If `list_value` or `date_value` are True, then the value is treated as + a list or a date respectively (see `_add_list_triple` and + `_add_date_triple` for details. 
+ """ + value = self._get_dict_value(_dict, key) + if not value and fallbacks: + for fallback in fallbacks: + value = self._get_dict_value(_dict, fallback) + if value: + break + + # if a modifying function was given, apply it to the value + if value and callable(value_modifier): + value = value_modifier(value) + + if value and list_value: + self._add_list_triple(subject, predicate, value, _type, _datatype) + elif value and date_value: + self._add_date_triple(subject, predicate, value, _type) + elif value: + # Normal text value + # ensure URIRef items are preprocessed (space removal/url encoding) + if _type == URIRef: + _type = CleanedURIRef + if _datatype: + object = _type(value, datatype=_datatype) + else: + object = _type(value) + self.g.add((subject, predicate, object)) + + def _add_list_triple( + self, subject, predicate, value, _type=Literal, _datatype=None + ): + """ + Adds as many triples to the graph as values + + Values are literal strings, if `value` is a list, one for each + item. If `value` is a string there is an attempt to split it using + commas, to support legacy fields. + """ + items = self._read_list_value(value) + + for item in items: + # ensure URIRef items are preprocessed (space removal/url encoding) + if _type == URIRef: + _type = CleanedURIRef + if _datatype: + object = _type(item, datatype=_datatype) + else: + object = _type(item) + self.g.add((subject, predicate, object)) + + def _add_date_triple(self, subject, predicate, value, _type=Literal): + """ + Adds a new triple with a date object + + If the value is one of xsd:gYear, xsd:gYearMonth or xsd:date. If not + the value will be parsed using dateutil, and if the date obtained is correct, + added to the graph as an xsd:dateTime value. + + If there are parsing errors, the literal string value is added. + """ + if not value: + return + + if is_year(value): + self.g.add((subject, predicate, _type(value, datatype=XSD.gYear))) + elif is_year_month(value): + self.g.add((subject, predicate, _type(value, datatype=XSD.gYearMonth))) + elif is_date(value): + self.g.add((subject, predicate, _type(value, datatype=XSD.date))) + else: + try: + default_datetime = datetime.datetime(1, 1, 1, 0, 0, 0) + _date = parse_date(value, default=default_datetime) + + self.g.add( + (subject, predicate, _type(_date.isoformat(), datatype=XSD.dateTime)) + ) + except ValueError: + self.g.add((subject, predicate, _type(value))) + + def _last_catalog_modification(self): + """ + Returns the date and time the catalog was last modified + + To be more precise, the most recent value for `metadata_modified` on a + dataset. + + Returns a dateTime string in ISO format, or None if it could not be + found. + """ + context = {"ignore_auth": True} + result = get_action("package_search")( + context, + { + "sort": "metadata_modified desc", + "rows": 1, + }, + ) + if result and result.get("results"): + return result["results"][0]["metadata_modified"] + return None + + def _add_mailto(self, mail_addr): + """ + Ensures that the mail address has an URIRef-compatible mailto: prefix. + Can be used as modifier function for `_add_triple_from_dict`. + """ + if mail_addr: + return PREFIX_MAILTO + self._without_mailto(mail_addr) + else: + return mail_addr + + def _without_mailto(self, mail_addr): + """ + Ensures that the mail address string has no mailto: prefix. 
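`_add_date_triple` above now types partial dates precisely (gYear, gYearMonth, date) and only falls back to dateutil for anything else. The resulting literals, sketched with plain rdflib and dateutil (illustrative only, not part of the patch):

import datetime
from dateutil.parser import parse as parse_date
from rdflib import Literal
from rdflib.namespace import XSD

year = Literal("1904", datatype=XSD.gYear)                  # matched by is_year
year_month = Literal("1904-06", datatype=XSD.gYearMonth)    # matched by is_year_month
date_only = Literal("1904-06-16", datatype=XSD.date)        # matched by is_date

# Anything else goes through dateutil and is stored as xsd:dateTime
default = datetime.datetime(1, 1, 1, 0, 0, 0)
parsed = parse_date("16 June 1904 11:00", default=default)
stamp = Literal(parsed.isoformat(), datatype=XSD.dateTime)
print(stamp)  # 1904-06-16T11:00:00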
+ """ + if mail_addr: + return str(mail_addr).replace(PREFIX_MAILTO, "") + else: + return mail_addr + + def _get_source_catalog(self, dataset_ref): + """ + Returns Catalog reference that is source for this dataset. + + Catalog referenced in dct:hasPart is returned, + if dataset is linked there, otherwise main catalog + will be returned. + + This will not be used if ckanext.dcat.expose_subcatalogs + configuration option is set to False. + """ + if not asbool(config.get(DCAT_EXPOSE_SUBCATALOGS, False)): + return + catalogs = set(self.g.subjects(DCAT.dataset, dataset_ref)) + root = self._get_root_catalog_ref() + try: + catalogs.remove(root) + except KeyError: + pass + assert len(catalogs) in (0, 1,), ( + "len %s" % catalogs + ) + if catalogs: + return catalogs.pop() + return root + + def _get_root_catalog_ref(self): + roots = list(self.g.subjects(DCT.hasPart)) + if not roots: + roots = list(self.g.subjects(RDF.type, DCAT.Catalog)) + return roots[0] + + def _get_or_create_spatial_ref(self, dataset_dict, dataset_ref): + for spatial_ref in self.g.objects(dataset_ref, DCT.spatial): + if spatial_ref: + return spatial_ref + + # Create new spatial_ref + spatial_uri = self._get_dataset_value(dataset_dict, "spatial_uri") + if spatial_uri: + spatial_ref = CleanedURIRef(spatial_uri) + else: + spatial_ref = BNode() + self.g.add((spatial_ref, RDF.type, DCT.Location)) + self.g.add((dataset_ref, DCT.spatial, spatial_ref)) + return spatial_ref + + # Public methods for profiles to implement + + def parse_dataset(self, dataset_dict, dataset_ref): + """ + Creates a CKAN dataset dict from the RDF graph + + The `dataset_dict` is passed to all the loaded profiles before being + yielded, so it can be further modified by each one of them. + `dataset_ref` is an rdflib URIRef object + that can be used to reference the dataset when querying the graph. + + Returns a dataset dict that can be passed to eg `package_create` + or `package_update` + """ + return dataset_dict + + def _extract_catalog_dict(self, catalog_ref): + """ + Returns list of key/value dictionaries with catalog + """ + + out = [] + sources = ( + ( + "source_catalog_title", + DCT.title, + ), + ( + "source_catalog_description", + DCT.description, + ), + ( + "source_catalog_homepage", + FOAF.homepage, + ), + ( + "source_catalog_language", + DCT.language, + ), + ( + "source_catalog_modified", + DCT.modified, + ), + ) + + for key, predicate in sources: + val = self._object_value(catalog_ref, predicate) + if val: + out.append({"key": key, "value": val}) + + out.append( + { + "key": "source_catalog_publisher", + "value": json.dumps(self._publisher(catalog_ref, DCT.publisher)), + } + ) + return out + + def graph_from_catalog(self, catalog_dict, catalog_ref): + """ + Creates an RDF graph for the whole catalog (site) + + The class RDFLib graph (accessible via `self.g`) should be updated on + this method + + `catalog_dict` is a dict that can contain literal values for the + dcat:Catalog class like `title`, `homepage`, etc. `catalog_ref` is an + rdflib URIRef object that must be used to reference the catalog when + working with the graph. + """ + pass + + def graph_from_dataset(self, dataset_dict, dataset_ref): + """ + Given a CKAN dataset dict, creates an RDF graph + + The class RDFLib graph (accessible via `self.g`) should be updated on + this method + + `dataset_dict` is a dict with the dataset metadata like the one + returned by `package_show`. `dataset_ref` is an rdflib URIRef object + that must be used to reference the dataset when working with the graph. 
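The three public methods above (`parse_dataset`, `graph_from_catalog`, `graph_from_dataset`) are the extension points for custom profiles. A minimal hypothetical profile built on this base class (illustrative only; the profile name, the `my_provenance` extra and registration through the `ckan.rdf.profiles` entry point are assumptions, not part of the patch):

from rdflib import Literal
from ckanext.dcat.profiles.base import RDFProfile, DCT


class ExampleProvenanceProfile(RDFProfile):

    def parse_dataset(self, dataset_dict, dataset_ref):
        # Import dct:provenance into a hypothetical custom extra
        value = self._object_value(dataset_ref, DCT.provenance)
        if value:
            dataset_dict.setdefault("extras", []).append(
                {"key": "my_provenance", "value": value}
            )
        return dataset_dict

    def graph_from_dataset(self, dataset_dict, dataset_ref):
        # Write the same value back out when serialising the dataset
        value = self._get_dataset_value(dataset_dict, "my_provenance")
        if value:
            self.g.add((dataset_ref, DCT.provenance, Literal(value)))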
+ """ + pass diff --git a/ckanext/dcat/profiles/euro_dcat_ap.py b/ckanext/dcat/profiles/euro_dcat_ap.py new file mode 100644 index 00000000..b0057110 --- /dev/null +++ b/ckanext/dcat/profiles/euro_dcat_ap.py @@ -0,0 +1,615 @@ +import json +from decimal import Decimal, DecimalException + +from rdflib import term, URIRef, BNode, Literal +import ckantoolkit as toolkit + +from ckan.lib.munge import munge_tag + +from ckanext.dcat.utils import ( + resource_uri, + DCAT_EXPOSE_SUBCATALOGS, + DCAT_CLEAN_TAGS, + publisher_uri_organization_fallback, +) +from .base import RDFProfile, URIRefOrLiteral, CleanedURIRef +from .base import ( + RDF, + XSD, + SKOS, + RDFS, + DCAT, + DCT, + ADMS, + VCARD, + FOAF, + SCHEMA, + LOCN, + GSP, + OWL, + SPDX, + GEOJSON_IMT, + namespaces, +) + +config = toolkit.config + + +DISTRIBUTION_LICENSE_FALLBACK_CONFIG = "ckanext.dcat.resource.inherit.license" + + +class EuropeanDCATAPProfile(RDFProfile): + """ + An RDF profile based on the DCAT-AP for data portals in Europe + + More information and specification: + + https://joinup.ec.europa.eu/asset/dcat_application_profile + + """ + + def parse_dataset(self, dataset_dict, dataset_ref): + + dataset_dict["extras"] = [] + dataset_dict["resources"] = [] + + # Basic fields + for key, predicate in ( + ("title", DCT.title), + ("notes", DCT.description), + ("url", DCAT.landingPage), + ("version", OWL.versionInfo), + ): + value = self._object_value(dataset_ref, predicate) + if value: + dataset_dict[key] = value + + if not dataset_dict.get("version"): + # adms:version was supported on the first version of the DCAT-AP + value = self._object_value(dataset_ref, ADMS.version) + if value: + dataset_dict["version"] = value + + # Tags + # replace munge_tag to noop if there's no need to clean tags + do_clean = toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False)) + tags_val = [ + munge_tag(tag) if do_clean else tag for tag in self._keywords(dataset_ref) + ] + tags = [{"name": tag} for tag in tags_val] + dataset_dict["tags"] = tags + + # Extras + + # Simple values + for key, predicate in ( + ("issued", DCT.issued), + ("modified", DCT.modified), + ("identifier", DCT.identifier), + ("version_notes", ADMS.versionNotes), + ("frequency", DCT.accrualPeriodicity), + ("provenance", DCT.provenance), + ("dcat_type", DCT.type), + ): + value = self._object_value(dataset_ref, predicate) + if value: + dataset_dict["extras"].append({"key": key, "value": value}) + + # Lists + for key, predicate, in ( + ("language", DCT.language), + ("theme", DCAT.theme), + ("alternate_identifier", ADMS.identifier), + ("conforms_to", DCT.conformsTo), + ("documentation", FOAF.page), + ("related_resource", DCT.relation), + ("has_version", DCT.hasVersion), + ("is_version_of", DCT.isVersionOf), + ("source", DCT.source), + ("sample", ADMS.sample), + ): + values = self._object_value_list(dataset_ref, predicate) + if values: + dataset_dict["extras"].append({"key": key, "value": json.dumps(values)}) + + # Contact details + contact = self._contact_details(dataset_ref, DCAT.contactPoint) + if not contact: + # adms:contactPoint was supported on the first version of DCAT-AP + contact = self._contact_details(dataset_ref, ADMS.contactPoint) + + if contact: + for key in ("uri", "name", "email"): + if contact.get(key): + dataset_dict["extras"].append( + {"key": "contact_{0}".format(key), "value": contact.get(key)} + ) + + # Publisher + publisher = self._publisher(dataset_ref, DCT.publisher) + for key in ("uri", "name", "email", "url", "type"): + if publisher.get(key): + 
dataset_dict["extras"].append( + {"key": "publisher_{0}".format(key), "value": publisher.get(key)} + ) + + # Temporal + start, end = self._time_interval(dataset_ref, DCT.temporal) + if start: + dataset_dict["extras"].append({"key": "temporal_start", "value": start}) + if end: + dataset_dict["extras"].append({"key": "temporal_end", "value": end}) + + # Spatial + spatial = self._spatial(dataset_ref, DCT.spatial) + for key in ("uri", "text", "geom"): + self._add_spatial_to_dict(dataset_dict, key, spatial) + + # Dataset URI (explicitly show the missing ones) + dataset_uri = str(dataset_ref) if isinstance(dataset_ref, term.URIRef) else "" + dataset_dict["extras"].append({"key": "uri", "value": dataset_uri}) + + # access_rights + access_rights = self._access_rights(dataset_ref, DCT.accessRights) + if access_rights: + dataset_dict["extras"].append( + {"key": "access_rights", "value": access_rights} + ) + + # License + if "license_id" not in dataset_dict: + dataset_dict["license_id"] = self._license(dataset_ref) + + # Source Catalog + if toolkit.asbool(config.get(DCAT_EXPOSE_SUBCATALOGS, False)): + catalog_src = self._get_source_catalog(dataset_ref) + if catalog_src is not None: + src_data = self._extract_catalog_dict(catalog_src) + dataset_dict["extras"].extend(src_data) + + # Resources + for distribution in self._distributions(dataset_ref): + + resource_dict = {} + + # Simple values + for key, predicate in ( + ("name", DCT.title), + ("description", DCT.description), + ("access_url", DCAT.accessURL), + ("download_url", DCAT.downloadURL), + ("issued", DCT.issued), + ("modified", DCT.modified), + ("status", ADMS.status), + ("license", DCT.license), + ): + value = self._object_value(distribution, predicate) + if value: + resource_dict[key] = value + + resource_dict["url"] = self._object_value( + distribution, DCAT.downloadURL + ) or self._object_value(distribution, DCAT.accessURL) + # Lists + for key, predicate in ( + ("language", DCT.language), + ("documentation", FOAF.page), + ("conforms_to", DCT.conformsTo), + ): + values = self._object_value_list(distribution, predicate) + if values: + resource_dict[key] = json.dumps(values) + + # rights + rights = self._access_rights(distribution, DCT.rights) + if rights: + resource_dict["rights"] = rights + + # Format and media type + normalize_ckan_format = toolkit.asbool( + config.get("ckanext.dcat.normalize_ckan_format", True) + ) + imt, label = self._distribution_format(distribution, normalize_ckan_format) + + if imt: + resource_dict["mimetype"] = imt + + if label: + resource_dict["format"] = label + elif imt: + resource_dict["format"] = imt + + # Size + size = self._object_value_int(distribution, DCAT.byteSize) + if size is not None: + resource_dict["size"] = size + + # Checksum + for checksum in self.g.objects(distribution, SPDX.checksum): + algorithm = self._object_value(checksum, SPDX.algorithm) + checksum_value = self._object_value(checksum, SPDX.checksumValue) + if algorithm: + resource_dict["hash_algorithm"] = algorithm + if checksum_value: + resource_dict["hash"] = checksum_value + + # Distribution URI (explicitly show the missing ones) + resource_dict["uri"] = ( + str(distribution) if isinstance(distribution, term.URIRef) else "" + ) + + # Remember the (internal) distribution reference for referencing in + # further profiles, e.g. 
for adding more properties + resource_dict["distribution_ref"] = str(distribution) + + dataset_dict["resources"].append(resource_dict) + + if self.compatibility_mode: + # Tweak the resulting dict to make it compatible with previous + # versions of the ckanext-dcat parsers + for extra in dataset_dict["extras"]: + if extra["key"] in ( + "issued", + "modified", + "publisher_name", + "publisher_email", + ): + + extra["key"] = "dcat_" + extra["key"] + + if extra["key"] == "language": + extra["value"] = ",".join(sorted(json.loads(extra["value"]))) + + return dataset_dict + + def graph_from_dataset(self, dataset_dict, dataset_ref): + + g = self.g + + for prefix, namespace in namespaces.items(): + g.bind(prefix, namespace) + + g.add((dataset_ref, RDF.type, DCAT.Dataset)) + + # Basic fields + items = [ + ("title", DCT.title, None, Literal), + ("notes", DCT.description, None, Literal), + ("url", DCAT.landingPage, None, URIRef), + ("identifier", DCT.identifier, ["guid", "id"], URIRefOrLiteral), + ("version", OWL.versionInfo, ["dcat_version"], Literal), + ("version_notes", ADMS.versionNotes, None, Literal), + ("frequency", DCT.accrualPeriodicity, None, URIRefOrLiteral), + ("access_rights", DCT.accessRights, None, URIRefOrLiteral), + ("dcat_type", DCT.type, None, Literal), + ("provenance", DCT.provenance, None, Literal), + ] + self._add_triples_from_dict(dataset_dict, dataset_ref, items) + + # Tags + for tag in dataset_dict.get("tags", []): + g.add((dataset_ref, DCAT.keyword, Literal(tag["name"]))) + + # Dates + items = [ + ("issued", DCT.issued, ["metadata_created"], Literal), + ("modified", DCT.modified, ["metadata_modified"], Literal), + ] + self._add_date_triples_from_dict(dataset_dict, dataset_ref, items) + + # Lists + items = [ + ("language", DCT.language, None, URIRefOrLiteral), + ("theme", DCAT.theme, None, URIRef), + ("conforms_to", DCT.conformsTo, None, Literal), + ("alternate_identifier", ADMS.identifier, None, URIRefOrLiteral), + ("documentation", FOAF.page, None, URIRefOrLiteral), + ("related_resource", DCT.relation, None, URIRefOrLiteral), + ("has_version", DCT.hasVersion, None, URIRefOrLiteral), + ("is_version_of", DCT.isVersionOf, None, URIRefOrLiteral), + ("source", DCT.source, None, URIRefOrLiteral), + ("sample", ADMS.sample, None, URIRefOrLiteral), + ] + self._add_list_triples_from_dict(dataset_dict, dataset_ref, items) + + # Contact details + if any( + [ + self._get_dataset_value(dataset_dict, "contact_uri"), + self._get_dataset_value(dataset_dict, "contact_name"), + self._get_dataset_value(dataset_dict, "contact_email"), + self._get_dataset_value(dataset_dict, "maintainer"), + self._get_dataset_value(dataset_dict, "maintainer_email"), + self._get_dataset_value(dataset_dict, "author"), + self._get_dataset_value(dataset_dict, "author_email"), + ] + ): + + contact_uri = self._get_dataset_value(dataset_dict, "contact_uri") + if contact_uri: + contact_details = CleanedURIRef(contact_uri) + else: + contact_details = BNode() + + g.add((contact_details, RDF.type, VCARD.Organization)) + g.add((dataset_ref, DCAT.contactPoint, contact_details)) + + self._add_triple_from_dict( + dataset_dict, + contact_details, + VCARD.fn, + "contact_name", + ["maintainer", "author"], + ) + # Add mail address as URIRef, and ensure it has a mailto: prefix + self._add_triple_from_dict( + dataset_dict, + contact_details, + VCARD.hasEmail, + "contact_email", + ["maintainer_email", "author_email"], + _type=URIRef, + value_modifier=self._add_mailto, + ) + + # Publisher + publisher_ref = None + + if 
dataset_dict.get("publisher"): + # Scheming publisher field: will be handled in a separate profile + pass + elif any( + [ + self._get_dataset_value(dataset_dict, "publisher_uri"), + self._get_dataset_value(dataset_dict, "publisher_name"), + ] + ): + # Legacy publisher_* extras + publisher_uri = self._get_dataset_value(dataset_dict, "publisher_uri") + publisher_name = self._get_dataset_value(dataset_dict, "publisher_name") + if publisher_uri: + publisher_ref = CleanedURIRef(publisher_uri) + else: + # No publisher_uri + publisher_ref = BNode() + publisher_details = { + "name": publisher_name, + "email": self._get_dataset_value(dataset_dict, "publisher_email"), + "url": self._get_dataset_value(dataset_dict, "publisher_url"), + "type": self._get_dataset_value(dataset_dict, "publisher_type"), + } + elif dataset_dict.get("organization"): + # Fall back to dataset org + org_id = dataset_dict["organization"]["id"] + org_dict = None + if org_id in self._org_cache: + org_dict = self._org_cache[org_id] + else: + try: + org_dict = toolkit.get_action("organization_show")( + {"ignore_auth": True}, {"id": org_id} + ) + self._org_cache[org_id] = org_dict + except toolkit.ObjectNotFound: + pass + if org_dict: + publisher_ref = CleanedURIRef( + publisher_uri_organization_fallback(dataset_dict) + ) + publisher_details = { + "name": org_dict.get("title"), + "email": org_dict.get("email"), + "url": org_dict.get("url"), + "type": org_dict.get("dcat_type"), + } + # Add to graph + if publisher_ref: + g.add((publisher_ref, RDF.type, FOAF.Organization)) + g.add((dataset_ref, DCT.publisher, publisher_ref)) + items = [ + ("name", FOAF.name, None, Literal), + ("email", FOAF.mbox, None, Literal), + ("url", FOAF.homepage, None, URIRef), + ("type", DCT.type, None, URIRefOrLiteral), + ] + self._add_triples_from_dict(publisher_details, publisher_ref, items) + + # Temporal + start = self._get_dataset_value(dataset_dict, "temporal_start") + end = self._get_dataset_value(dataset_dict, "temporal_end") + if start or end: + temporal_extent = BNode() + + g.add((temporal_extent, RDF.type, DCT.PeriodOfTime)) + if start: + self._add_date_triple(temporal_extent, SCHEMA.startDate, start) + if end: + self._add_date_triple(temporal_extent, SCHEMA.endDate, end) + g.add((dataset_ref, DCT.temporal, temporal_extent)) + + # Spatial + spatial_text = self._get_dataset_value(dataset_dict, "spatial_text") + spatial_geom = self._get_dataset_value(dataset_dict, "spatial") + + if spatial_text or spatial_geom: + spatial_ref = self._get_or_create_spatial_ref(dataset_dict, dataset_ref) + + if spatial_text: + g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text))) + + if spatial_geom: + self._add_spatial_value_to_graph( + spatial_ref, LOCN.geometry, spatial_geom + ) + + # Use fallback license if set in config + resource_license_fallback = None + if toolkit.asbool(config.get(DISTRIBUTION_LICENSE_FALLBACK_CONFIG, False)): + if "license_id" in dataset_dict and isinstance( + URIRefOrLiteral(dataset_dict["license_id"]), URIRef + ): + resource_license_fallback = dataset_dict["license_id"] + elif "license_url" in dataset_dict and isinstance( + URIRefOrLiteral(dataset_dict["license_url"]), URIRef + ): + resource_license_fallback = dataset_dict["license_url"] + + # Resources + for resource_dict in dataset_dict.get("resources", []): + + distribution = CleanedURIRef(resource_uri(resource_dict)) + + g.add((dataset_ref, DCAT.distribution, distribution)) + + g.add((distribution, RDF.type, DCAT.Distribution)) + + # Simple values + items = [ + ("name", DCT.title, 
None, Literal), + ("description", DCT.description, None, Literal), + ("status", ADMS.status, None, URIRefOrLiteral), + ("rights", DCT.rights, None, URIRefOrLiteral), + ("license", DCT.license, None, URIRefOrLiteral), + ("access_url", DCAT.accessURL, None, URIRef), + ("download_url", DCAT.downloadURL, None, URIRef), + ] + + self._add_triples_from_dict(resource_dict, distribution, items) + + # Lists + items = [ + ("documentation", FOAF.page, None, URIRefOrLiteral), + ("language", DCT.language, None, URIRefOrLiteral), + ("conforms_to", DCT.conformsTo, None, Literal), + ] + self._add_list_triples_from_dict(resource_dict, distribution, items) + + # Set default license for distribution if needed and available + if resource_license_fallback and not (distribution, DCT.license, None) in g: + g.add( + ( + distribution, + DCT.license, + URIRefOrLiteral(resource_license_fallback), + ) + ) + + # Format + mimetype = resource_dict.get("mimetype") + fmt = resource_dict.get("format") + + # IANA media types (either URI or Literal) should be mapped as mediaType. + # In case format is available and mimetype is not set or identical to format, + # check which type is appropriate. + if fmt and (not mimetype or mimetype == fmt): + if ( + "iana.org/assignments/media-types" in fmt + or not fmt.startswith("http") + and "/" in fmt + ): + # output format value as dcat:mediaType instead of dct:format + mimetype = fmt + fmt = None + else: + # Use dct:format + mimetype = None + + if mimetype: + g.add((distribution, DCAT.mediaType, URIRefOrLiteral(mimetype))) + + if fmt: + g.add((distribution, DCT["format"], URIRefOrLiteral(fmt))) + + # URL fallback and old behavior + url = resource_dict.get("url") + download_url = resource_dict.get("download_url") + access_url = resource_dict.get("access_url") + # Use url as fallback for access_url if access_url is not set and download_url is not equal + if url and not access_url: + if (not download_url) or (download_url and url != download_url): + self._add_triple_from_dict( + resource_dict, distribution, DCAT.accessURL, "url", _type=URIRef + ) + + # Dates + items = [ + ("issued", DCT.issued, ["created"], Literal), + ("modified", DCT.modified, ["metadata_modified"], Literal), + ] + + self._add_date_triples_from_dict(resource_dict, distribution, items) + + # Numbers + if resource_dict.get("size"): + try: + g.add( + ( + distribution, + DCAT.byteSize, + Literal(Decimal(resource_dict["size"]), datatype=XSD.decimal), + ) + ) + except (ValueError, TypeError, DecimalException): + g.add((distribution, DCAT.byteSize, Literal(resource_dict["size"]))) + # Checksum + if resource_dict.get("hash"): + checksum = BNode() + g.add((checksum, RDF.type, SPDX.Checksum)) + g.add( + ( + checksum, + SPDX.checksumValue, + Literal(resource_dict["hash"], datatype=XSD.hexBinary), + ) + ) + + if resource_dict.get("hash_algorithm"): + g.add( + ( + checksum, + SPDX.algorithm, + URIRefOrLiteral(resource_dict["hash_algorithm"]), + ) + ) + + g.add((distribution, SPDX.checksum, checksum)) + + def graph_from_catalog(self, catalog_dict, catalog_ref): + + g = self.g + + for prefix, namespace in namespaces.items(): + g.bind(prefix, namespace) + + g.add((catalog_ref, RDF.type, DCAT.Catalog)) + + # Basic fields + items = [ + ("title", DCT.title, config.get("ckan.site_title"), Literal), + ( + "description", + DCT.description, + config.get("ckan.site_description"), + Literal, + ), + ("homepage", FOAF.homepage, config.get("ckan.site_url"), URIRef), + ( + "language", + DCT.language, + config.get("ckan.locale_default", "en"), + 
URIRefOrLiteral, + ), + ] + for item in items: + key, predicate, fallback, _type = item + if catalog_dict: + value = catalog_dict.get(key, fallback) + else: + value = fallback + if value: + g.add((catalog_ref, predicate, _type(value))) + + # Dates + modified = self._last_catalog_modification() + if modified: + self._add_date_triple(catalog_ref, DCT.modified, modified) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py new file mode 100644 index 00000000..02c726d3 --- /dev/null +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -0,0 +1,328 @@ +import json +from decimal import Decimal, DecimalException + +from rdflib import URIRef, BNode, Literal +from ckanext.dcat.utils import resource_uri + +from .base import URIRefOrLiteral, CleanedURIRef +from .base import ( + RDF, + SKOS, + DCAT, + DCATAP, + DCT, + XSD, + SCHEMA, +) + +from .euro_dcat_ap import EuropeanDCATAPProfile + + +class EuropeanDCATAP2Profile(EuropeanDCATAPProfile): + """ + An RDF profile based on the DCAT-AP 2 for data portals in Europe + + More information and specification: + + https://joinup.ec.europa.eu/asset/dcat_application_profile + + """ + + def parse_dataset(self, dataset_dict, dataset_ref): + + # call super method + super(EuropeanDCATAP2Profile, self).parse_dataset(dataset_dict, dataset_ref) + + # Standard values + value = self._object_value(dataset_ref, DCAT.temporalResolution) + if value: + dataset_dict["extras"].append({"key": "temporal_resolution", "value": value}) + + # Lists + for key, predicate in ( + ("is_referenced_by", DCT.isReferencedBy), + ("applicable_legislation", DCATAP.applicableLegislation), + ("hvd_category", DCATAP.hvdCategory), + ): + values = self._object_value_list(dataset_ref, predicate) + if values: + dataset_dict["extras"].append({"key": key, "value": json.dumps(values)}) + # Temporal + start, end = self._time_interval(dataset_ref, DCT.temporal, dcat_ap_version=2) + if start: + self._insert_or_update_temporal(dataset_dict, "temporal_start", start) + if end: + self._insert_or_update_temporal(dataset_dict, "temporal_end", end) + + # Spatial + spatial = self._spatial(dataset_ref, DCT.spatial) + for key in ("bbox", "centroid"): + self._add_spatial_to_dict(dataset_dict, key, spatial) + + # Spatial resolution in meters + spatial_resolution = self._object_value_float_list( + dataset_ref, DCAT.spatialResolutionInMeters + ) + if spatial_resolution: + # For some reason we incorrectly allowed lists in this property at some point + # keep support for it but default to single value + value = ( + spatial_resolution[0] if len(spatial_resolution) == 1 + else json.dumps(spatial_resolution) + ) + dataset_dict["extras"].append( + { + "key": "spatial_resolution_in_meters", + "value": value, + } + ) + + # Resources + for distribution in self._distributions(dataset_ref): + distribution_ref = str(distribution) + for resource_dict in dataset_dict.get("resources", []): + # Match distribution in graph and distribution in resource dict + if resource_dict and distribution_ref == resource_dict.get( + "distribution_ref" + ): + # Simple values + for key, predicate in ( + ("availability", DCATAP.availability), + ("compress_format", DCAT.compressFormat), + ("package_format", DCAT.packageFormat), + ): + value = self._object_value(distribution, predicate) + if value: + resource_dict[key] = value + + # Lists + for key, predicate in ( + ("applicable_legislation", DCATAP.applicableLegislation), + ): + values = self._object_value_list(distribution, predicate) + if values: + 
resource_dict[key] = json.dumps(values) + + # Access services + access_service_list = [] + + for access_service in self.g.objects( + distribution, DCAT.accessService + ): + access_service_dict = {} + + # Simple values + for key, predicate in ( + ("availability", DCATAP.availability), + ("title", DCT.title), + ("endpoint_description", DCAT.endpointDescription), + ("license", DCT.license), + ("access_rights", DCT.accessRights), + ("description", DCT.description), + ): + value = self._object_value(access_service, predicate) + if value: + access_service_dict[key] = value + # List + for key, predicate in ( + ("endpoint_url", DCAT.endpointURL), + ("serves_dataset", DCAT.servesDataset), + ): + values = self._object_value_list(access_service, predicate) + if values: + access_service_dict[key] = values + + # Access service URI (explicitly show the missing ones) + access_service_dict["uri"] = ( + str(access_service) + if isinstance(access_service, URIRef) + else "" + ) + + # Remember the (internal) access service reference for referencing in + # further profiles, e.g. for adding more properties + access_service_dict["access_service_ref"] = str(access_service) + + access_service_list.append(access_service_dict) + + if access_service_list: + resource_dict["access_services"] = json.dumps( + access_service_list + ) + + return dataset_dict + + def graph_from_dataset(self, dataset_dict, dataset_ref): + + # call super method + super(EuropeanDCATAP2Profile, self).graph_from_dataset( + dataset_dict, dataset_ref + ) + + # Standard values + self._add_triple_from_dict( + dataset_dict, + dataset_ref, + DCAT.temporalResolution, + "temporal_resolution", + _datatype=XSD.duration, + ) + + # Lists + for key, predicate, fallbacks, type, datatype in ( + ("is_referenced_by", DCT.isReferencedBy, None, URIRefOrLiteral, None), + ( + "applicable_legislation", + DCATAP.applicableLegislation, + None, + URIRefOrLiteral, + None, + ), + ("hvd_category", DCATAP.hvdCategory, None, URIRefOrLiteral, None), + ): + self._add_triple_from_dict( + dataset_dict, + dataset_ref, + predicate, + key, + list_value=True, + fallbacks=fallbacks, + _type=type, + _datatype=datatype, + ) + + # Temporal + + # The profile for DCAT-AP 1 stored triples using schema:startDate, + # remove them to avoid duplication + for temporal in self.g.objects(dataset_ref, DCT.temporal): + if SCHEMA.startDate in [t for t in self.g.predicates(temporal, None)]: + self.g.remove((temporal, None, None)) + self.g.remove((dataset_ref, DCT.temporal, temporal)) + + start = self._get_dataset_value(dataset_dict, "temporal_start") + end = self._get_dataset_value(dataset_dict, "temporal_end") + if start or end: + temporal_extent_dcat = BNode() + + self.g.add((temporal_extent_dcat, RDF.type, DCT.PeriodOfTime)) + if start: + self._add_date_triple(temporal_extent_dcat, DCAT.startDate, start) + if end: + self._add_date_triple(temporal_extent_dcat, DCAT.endDate, end) + self.g.add((dataset_ref, DCT.temporal, temporal_extent_dcat)) + + # spatial + spatial_bbox = self._get_dataset_value(dataset_dict, "spatial_bbox") + spatial_cent = self._get_dataset_value(dataset_dict, "spatial_centroid") + + if spatial_bbox or spatial_cent: + spatial_ref = self._get_or_create_spatial_ref(dataset_dict, dataset_ref) + + if spatial_bbox: + self._add_spatial_value_to_graph(spatial_ref, DCAT.bbox, spatial_bbox) + + if spatial_cent: + self._add_spatial_value_to_graph( + spatial_ref, DCAT.centroid, spatial_cent + ) + + # Spatial resolution in meters + spatial_resolution_in_meters = self._read_list_value( + 
self._get_dataset_value(dataset_dict, "spatial_resolution_in_meters") + ) + if spatial_resolution_in_meters: + for value in spatial_resolution_in_meters: + try: + self.g.add( + ( + dataset_ref, + DCAT.spatialResolutionInMeters, + Literal(Decimal(value), datatype=XSD.decimal), + ) + ) + except (ValueError, TypeError, DecimalException): + self.g.add( + (dataset_ref, DCAT.spatialResolutionInMeters, Literal(value)) + ) + + # Resources + for resource_dict in dataset_dict.get("resources", []): + + distribution = CleanedURIRef(resource_uri(resource_dict)) + + # Simple values + items = [ + ("availability", DCATAP.availability, None, URIRefOrLiteral), + ("compress_format", DCAT.compressFormat, None, URIRefOrLiteral), + ("package_format", DCAT.packageFormat, None, URIRefOrLiteral), + ] + + self._add_triples_from_dict(resource_dict, distribution, items) + + # Lists + items = [ + ( + "applicable_legislation", + DCATAP.applicableLegislation, + None, + URIRefOrLiteral, + ), + ] + self._add_list_triples_from_dict(resource_dict, distribution, items) + + # Access services + access_service_list = resource_dict.get("access_services", []) + if isinstance(access_service_list, str): + try: + access_service_list = json.loads(access_service_list) + except ValueError: + access_service_list = [] + + for access_service_dict in access_service_list: + + access_service_uri = access_service_dict.get("uri") + if access_service_uri: + access_service_node = CleanedURIRef(access_service_uri) + else: + access_service_node = BNode() + # Remember the (internal) access service reference for referencing in + # further profiles + access_service_dict["access_service_ref"] = str(access_service_node) + + self.g.add((distribution, DCAT.accessService, access_service_node)) + + self.g.add((access_service_node, RDF.type, DCAT.DataService)) + + # Simple values + items = [ + ("availability", DCATAP.availability, None, URIRefOrLiteral), + ("license", DCT.license, None, URIRefOrLiteral), + ("access_rights", DCT.accessRights, None, URIRefOrLiteral), + ("title", DCT.title, None, Literal), + ("endpoint_description", DCAT.endpointDescription, None, URIRefOrLiteral), + ("description", DCT.description, None, Literal), + ] + + self._add_triples_from_dict( + access_service_dict, access_service_node, items + ) + + # Lists + items = [ + ("endpoint_url", DCAT.endpointURL, None, URIRefOrLiteral), + ("serves_dataset", DCAT.servesDataset, None, URIRefOrLiteral), + ] + self._add_list_triples_from_dict( + access_service_dict, access_service_node, items + ) + + if access_service_list: + resource_dict["access_services"] = json.dumps(access_service_list) + + def graph_from_catalog(self, catalog_dict, catalog_ref): + + # call super method + super(EuropeanDCATAP2Profile, self).graph_from_catalog( + catalog_dict, catalog_ref + ) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py new file mode 100644 index 00000000..5fdd4ced --- /dev/null +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -0,0 +1,220 @@ +import json + +from rdflib import URIRef, BNode, Literal +from .base import RDFProfile, CleanedURIRef, URIRefOrLiteral +from .base import ( + RDF, + XSD, + DCAT, + DCT, + VCARD, + FOAF, + SCHEMA, + SKOS, + LOCN, +) + + +class EuropeanDCATAPSchemingProfile(RDFProfile): + """ + This is a compatibilty profile meant to add support for ckanext-scheming to the existing + `euro_dcat_ap` and `euro_dcat_ap_2` profiles. 
+ It does not add or remove any properties from these profiles, it just transforms the + resulting dataset_dict so it is compatible with a ckanext-scheming schema + """ + + def parse_dataset(self, dataset_dict, dataset_ref): + """ + Modify the dataset_dict generated by the euro_dcat_ap andeuro_dcat_ap_2 profiles + to make it compatible with the scheming file definitions: + * Move extras to root level fields + * Parse lists (multiple text preset) + * Turn namespaced extras into repeating subfields + """ + + if not self._dataset_schema: + # Not using scheming + return dataset_dict + + # Move extras to root + + extras_to_remove = [] + extras = dataset_dict.get("extras", []) + for extra in extras: + if self._schema_field(extra["key"]): + # This is a field defined in the dataset schema + dataset_dict[extra["key"]] = extra["value"] + extras_to_remove.append(extra["key"]) + + dataset_dict["extras"] = [e for e in extras if e["key"] not in extras_to_remove] + + # Parse lists + def _parse_list_value(data_dict, field_name): + schema_field = self._schema_field( + field_name + ) or self._schema_resource_field(field_name) + + if schema_field and "scheming_multiple_text" in schema_field.get( + "validators", [] + ): + if isinstance(data_dict[field_name], str): + try: + data_dict[field_name] = json.loads(data_dict[field_name]) + except ValueError: + pass + + for field_name in dataset_dict.keys(): + _parse_list_value(dataset_dict, field_name) + + for resource_dict in dataset_dict.get("resources", []): + for field_name in resource_dict.keys(): + _parse_list_value(resource_dict, field_name) + + # Repeating subfields + new_fields_mapping = { + "temporal_coverage": "temporal" + } + for schema_field in self._dataset_schema["dataset_fields"]: + if "repeating_subfields" in schema_field: + # Check if existing extras need to be migrated + field_name = schema_field["field_name"] + new_extras = [] + new_dict = {} + check_name = new_fields_mapping.get(field_name, field_name) + for extra in dataset_dict.get("extras", []): + if extra["key"].startswith(f"{check_name}_"): + subfield = extra["key"][extra["key"].index("_") + 1 :] + if subfield in [ + f["field_name"] for f in schema_field["repeating_subfields"] + ]: + new_dict[subfield] = extra["value"] + else: + new_extras.append(extra) + else: + new_extras.append(extra) + if new_dict: + dataset_dict[field_name] = [new_dict] + dataset_dict["extras"] = new_extras + + # Repeating subfields: resources + for schema_field in self._dataset_schema["resource_fields"]: + if "repeating_subfields" in schema_field: + # Check if value needs to be load from JSON + field_name = schema_field["field_name"] + for resource_dict in dataset_dict.get("resources", []): + if resource_dict.get(field_name) and isinstance( + resource_dict[field_name], str + ): + try: + # TODO: load only subfields in schema? 
+ resource_dict[field_name] = json.loads( + resource_dict[field_name] + ) + except ValueError: + pass + + return dataset_dict + + def graph_from_dataset(self, dataset_dict, dataset_ref): + """ + Add triples to the graph from new repeating subfields + """ + + def _not_empty_dict(data_dict): + return any(data_dict.values()) + + contact = dataset_dict.get("contact") + if isinstance(contact, list) and len(contact) and _not_empty_dict(contact[0]): + for item in contact: + contact_uri = item.get("uri") + if contact_uri: + contact_details = CleanedURIRef(contact_uri) + else: + contact_details = BNode() + + self.g.add((contact_details, RDF.type, VCARD.Organization)) + self.g.add((dataset_ref, DCAT.contactPoint, contact_details)) + + self._add_triple_from_dict(item, contact_details, VCARD.fn, "name") + # Add mail address as URIRef, and ensure it has a mailto: prefix + self._add_triple_from_dict( + item, + contact_details, + VCARD.hasEmail, + "email", + _type=URIRef, + value_modifier=self._add_mailto, + ) + + publisher = dataset_dict.get("publisher") + if isinstance(publisher, list) and len(publisher) and _not_empty_dict(publisher[0]): + publisher = publisher[0] + publisher_uri = publisher.get("uri") + if publisher_uri: + publisher_ref = CleanedURIRef(publisher_uri) + else: + publisher_ref = BNode() + + self.g.add((publisher_ref, RDF.type, FOAF.Organization)) + self.g.add((dataset_ref, DCT.publisher, publisher_ref)) + + self._add_triple_from_dict(publisher, publisher_ref, FOAF.name, "name") + self._add_triple_from_dict( + publisher, publisher_ref, FOAF.homepage, "url", _type=URIRef + ) + self._add_triple_from_dict( + publisher, publisher_ref, DCT.type, "type", _type=URIRefOrLiteral + ) + self._add_triple_from_dict( + publisher, + publisher_ref, + VCARD.hasEmail, + "email", + _type=URIRef, + value_modifier=self._add_mailto, + ) + + temporal = dataset_dict.get("temporal_coverage") + if isinstance(temporal, list) and len(temporal) and _not_empty_dict(temporal[0]): + for item in temporal: + temporal_ref = BNode() + self.g.add((temporal_ref, RDF.type, DCT.PeriodOfTime)) + if item.get("start"): + self._add_date_triple(temporal_ref, DCAT.startDate, item["start"]) + if item.get("end"): + self._add_date_triple(temporal_ref, DCAT.endDate, item["end"]) + self.g.add((dataset_ref, DCT.temporal, temporal_ref)) + + spatial = dataset_dict.get("spatial_coverage") + if isinstance(spatial, list) and len(spatial) and _not_empty_dict(spatial[0]): + for item in spatial: + if item.get("uri"): + spatial_ref = CleanedURIRef(item["uri"]) + else: + spatial_ref = BNode() + self.g.add((spatial_ref, RDF.type, DCT.Location)) + self.g.add((dataset_ref, DCT.spatial, spatial_ref)) + + if item.get("text"): + self.g.add((spatial_ref, SKOS.prefLabel, Literal(item["text"]))) + + for field in [ + ("geom", LOCN.geometry), + ("bbox", DCAT.bbox), + ("centroid", DCAT.centroid), + ]: + if item.get(field[0]): + self._add_spatial_value_to_graph( + spatial_ref, field[1], item[field[0]] + ) + + resources = dataset_dict.get("resources", []) + for resource in resources: + if resource.get("access_services"): + if isinstance(resource["access_services"], str): + try: + resource["access_services"] = json.loads( + resource["access_services"] + ) + except ValueError: + pass diff --git a/ckanext/dcat/profiles/schemaorg.py b/ckanext/dcat/profiles/schemaorg.py new file mode 100644 index 00000000..3b3ec3b0 --- /dev/null +++ b/ckanext/dcat/profiles/schemaorg.py @@ -0,0 +1,339 @@ +import datetime + +from dateutil.parser import parse as parse_date +from rdflib 
import URIRef, BNode, Literal +from ckantoolkit import url_for, config + +from ckanext.dcat.utils import resource_uri, publisher_uri_organization_fallback +from .base import RDFProfile, CleanedURIRef +from .base import ( + RDF, + SCHEMA, +) + + +class SchemaOrgProfile(RDFProfile): + """ + An RDF profile based on the schema.org Dataset + + More information and specification: + + http://schema.org/Dataset + + Mapping between schema.org Dataset and DCAT: + + https://www.w3.org/wiki/WebSchemas/Datasets + """ + + def graph_from_dataset(self, dataset_dict, dataset_ref): + + g = self.g + + # Namespaces + self._bind_namespaces() + + g.add((dataset_ref, RDF.type, SCHEMA.Dataset)) + + # Basic fields + self._basic_fields_graph(dataset_ref, dataset_dict) + + # Catalog + self._catalog_graph(dataset_ref, dataset_dict) + + # Groups + self._groups_graph(dataset_ref, dataset_dict) + + # Tags + self._tags_graph(dataset_ref, dataset_dict) + + # Lists + self._list_fields_graph(dataset_ref, dataset_dict) + + # Publisher + self._publisher_graph(dataset_ref, dataset_dict) + + # Temporal + self._temporal_graph(dataset_ref, dataset_dict) + + # Spatial + self._spatial_graph(dataset_ref, dataset_dict) + + # Resources + self._resources_graph(dataset_ref, dataset_dict) + + # Additional fields + self.additional_fields(dataset_ref, dataset_dict) + + def additional_fields(self, dataset_ref, dataset_dict): + """ + Adds any additional fields. + + For a custom schema you should extend this class and + implement this method. + """ + pass + + def _add_date_triple(self, subject, predicate, value, _type=Literal): + """ + Adds a new triple with a date object + + Dates are parsed using dateutil, and if the date obtained is correct, + added to the graph as an SCHEMA.DateTime value. + + If there are parsing errors, the literal string value is added. 
+ """ + if not value: + return + try: + default_datetime = datetime.datetime(1, 1, 1, 0, 0, 0) + _date = parse_date(value, default=default_datetime) + + self.g.add((subject, predicate, _type(_date.isoformat()))) + except ValueError: + self.g.add((subject, predicate, _type(value))) + + def _bind_namespaces(self): + self.g.namespace_manager.bind("schema", SCHEMA, replace=True) + + def _basic_fields_graph(self, dataset_ref, dataset_dict): + items = [ + ("identifier", SCHEMA.identifier, None, Literal), + ("title", SCHEMA.name, None, Literal), + ("notes", SCHEMA.description, None, Literal), + ("version", SCHEMA.version, ["dcat_version"], Literal), + ("issued", SCHEMA.datePublished, ["metadata_created"], Literal), + ("modified", SCHEMA.dateModified, ["metadata_modified"], Literal), + ("license", SCHEMA.license, ["license_url", "license_title"], Literal), + ] + self._add_triples_from_dict(dataset_dict, dataset_ref, items) + + items = [ + ("issued", SCHEMA.datePublished, ["metadata_created"], Literal), + ("modified", SCHEMA.dateModified, ["metadata_modified"], Literal), + ] + + self._add_date_triples_from_dict(dataset_dict, dataset_ref, items) + + # Dataset URL + dataset_url = url_for("dataset.read", id=dataset_dict["name"], _external=True) + self.g.add((dataset_ref, SCHEMA.url, Literal(dataset_url))) + + def _catalog_graph(self, dataset_ref, dataset_dict): + data_catalog = BNode() + self.g.add((dataset_ref, SCHEMA.includedInDataCatalog, data_catalog)) + self.g.add((data_catalog, RDF.type, SCHEMA.DataCatalog)) + self.g.add((data_catalog, SCHEMA.name, Literal(config.get("ckan.site_title")))) + self.g.add( + ( + data_catalog, + SCHEMA.description, + Literal(config.get("ckan.site_description")), + ) + ) + self.g.add((data_catalog, SCHEMA.url, Literal(config.get("ckan.site_url")))) + + def _groups_graph(self, dataset_ref, dataset_dict): + for group in dataset_dict.get("groups", []): + group_url = url_for( + controller="group", action="read", id=group.get("id"), _external=True + ) + about = BNode() + + self.g.add((about, RDF.type, SCHEMA.Thing)) + + self.g.add((about, SCHEMA.name, Literal(group["name"]))) + self.g.add((about, SCHEMA.url, Literal(group_url))) + + self.g.add((dataset_ref, SCHEMA.about, about)) + + def _tags_graph(self, dataset_ref, dataset_dict): + for tag in dataset_dict.get("tags", []): + self.g.add((dataset_ref, SCHEMA.keywords, Literal(tag["name"]))) + + def _list_fields_graph(self, dataset_ref, dataset_dict): + items = [ + ("language", SCHEMA.inLanguage, None, Literal), + ] + self._add_list_triples_from_dict(dataset_dict, dataset_ref, items) + + def _publisher_graph(self, dataset_ref, dataset_dict): + if any( + [ + self._get_dataset_value(dataset_dict, "publisher_uri"), + self._get_dataset_value(dataset_dict, "publisher_name"), + dataset_dict.get("organization"), + ] + ): + + publisher_uri = self._get_dataset_value(dataset_dict, "publisher_uri") + publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict) + publisher_name = self._get_dataset_value(dataset_dict, "publisher_name") + if publisher_uri: + publisher_details = CleanedURIRef(publisher_uri) + elif not publisher_name and publisher_uri_fallback: + # neither URI nor name are available, use organization as fallback + publisher_details = CleanedURIRef(publisher_uri_fallback) + else: + # No publisher_uri + publisher_details = BNode() + + self.g.add((publisher_details, RDF.type, SCHEMA.Organization)) + self.g.add((dataset_ref, SCHEMA.publisher, publisher_details)) + + # In case no name and URI are available, again 
fall back to organization. + # If no name but an URI is available, the name literal remains empty to + # avoid mixing organization and dataset values. + if ( + not publisher_name + and not publisher_uri + and dataset_dict.get("organization") + ): + publisher_name = dataset_dict["organization"]["title"] + self.g.add((publisher_details, SCHEMA.name, Literal(publisher_name))) + + contact_point = BNode() + self.g.add((contact_point, RDF.type, SCHEMA.ContactPoint)) + self.g.add((publisher_details, SCHEMA.contactPoint, contact_point)) + + self.g.add((contact_point, SCHEMA.contactType, Literal("customer service"))) + + publisher_url = self._get_dataset_value(dataset_dict, "publisher_url") + if not publisher_url and dataset_dict.get("organization"): + publisher_url = dataset_dict["organization"].get("url") or config.get( + "ckan.site_url" + ) + + self.g.add((contact_point, SCHEMA.url, Literal(publisher_url))) + items = [ + ( + "publisher_email", + SCHEMA.email, + ["contact_email", "maintainer_email", "author_email"], + Literal, + ), + ( + "publisher_name", + SCHEMA.name, + ["contact_name", "maintainer", "author"], + Literal, + ), + ] + + self._add_triples_from_dict(dataset_dict, contact_point, items) + + def _temporal_graph(self, dataset_ref, dataset_dict): + start = self._get_dataset_value(dataset_dict, "temporal_start") + end = self._get_dataset_value(dataset_dict, "temporal_end") + if start or end: + if start and end: + self.g.add( + ( + dataset_ref, + SCHEMA.temporalCoverage, + Literal("%s/%s" % (start, end)), + ) + ) + elif start: + self._add_date_triple(dataset_ref, SCHEMA.temporalCoverage, start) + elif end: + self._add_date_triple(dataset_ref, SCHEMA.temporalCoverage, end) + + def _spatial_graph(self, dataset_ref, dataset_dict): + spatial_uri = self._get_dataset_value(dataset_dict, "spatial_uri") + spatial_text = self._get_dataset_value(dataset_dict, "spatial_text") + spatial_geom = self._get_dataset_value(dataset_dict, "spatial") + + if spatial_uri or spatial_text or spatial_geom: + if spatial_uri: + spatial_ref = URIRef(spatial_uri) + else: + spatial_ref = BNode() + + self.g.add((spatial_ref, RDF.type, SCHEMA.Place)) + self.g.add((dataset_ref, SCHEMA.spatialCoverage, spatial_ref)) + + if spatial_text: + self.g.add((spatial_ref, SCHEMA.description, Literal(spatial_text))) + + if spatial_geom: + geo_shape = BNode() + self.g.add((geo_shape, RDF.type, SCHEMA.GeoShape)) + self.g.add((spatial_ref, SCHEMA.geo, geo_shape)) + + # the spatial_geom typically contains GeoJSON + self.g.add((geo_shape, SCHEMA.polygon, Literal(spatial_geom))) + + def _resources_graph(self, dataset_ref, dataset_dict): + g = self.g + for resource_dict in dataset_dict.get("resources", []): + distribution = URIRef(resource_uri(resource_dict)) + g.add((dataset_ref, SCHEMA.distribution, distribution)) + g.add((distribution, RDF.type, SCHEMA.DataDownload)) + + self._distribution_graph(distribution, resource_dict) + + def _distribution_graph(self, distribution, resource_dict): + # Simple values + self._distribution_basic_fields_graph(distribution, resource_dict) + + # Lists + self._distribution_list_fields_graph(distribution, resource_dict) + + # Format + self._distribution_format_graph(distribution, resource_dict) + + # URL + self._distribution_url_graph(distribution, resource_dict) + + # Numbers + self._distribution_numbers_graph(distribution, resource_dict) + + def _distribution_basic_fields_graph(self, distribution, resource_dict): + items = [ + ("name", SCHEMA.name, None, Literal), + ("description", SCHEMA.description, 
None, Literal), + ("license", SCHEMA.license, ["rights"], Literal), + ] + + self._add_triples_from_dict(resource_dict, distribution, items) + + items = [ + ("issued", SCHEMA.datePublished, None, Literal), + ("modified", SCHEMA.dateModified, None, Literal), + ] + + self._add_date_triples_from_dict(resource_dict, distribution, items) + + def _distribution_list_fields_graph(self, distribution, resource_dict): + items = [ + ("language", SCHEMA.inLanguage, None, Literal), + ] + self._add_list_triples_from_dict(resource_dict, distribution, items) + + def _distribution_format_graph(self, distribution, resource_dict): + if resource_dict.get("format"): + self.g.add( + (distribution, SCHEMA.encodingFormat, Literal(resource_dict["format"])) + ) + elif resource_dict.get("mimetype"): + self.g.add( + ( + distribution, + SCHEMA.encodingFormat, + Literal(resource_dict["mimetype"]), + ) + ) + + def _distribution_url_graph(self, distribution, resource_dict): + url = resource_dict.get("url") + download_url = resource_dict.get("download_url") + if download_url: + self.g.add((distribution, SCHEMA.contentUrl, Literal(download_url))) + if (url and not download_url) or (url and url != download_url): + self.g.add((distribution, SCHEMA.url, Literal(url))) + + def _distribution_numbers_graph(self, distribution, resource_dict): + if resource_dict.get("size"): + self.g.add( + (distribution, SCHEMA.contentSize, Literal(resource_dict["size"])) + ) diff --git a/ckanext/dcat/schemas/__init__.py b/ckanext/dcat/schemas/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml new file mode 100644 index 00000000..8f9f4afc --- /dev/null +++ b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml @@ -0,0 +1,384 @@ +scheming_version: 2 +dataset_type: dataset +about: Full DCAT AP 2.1 schema +about_url: http://github.com/ckan/ckanext-dcat + +dataset_fields: + +- field_name: title + label: Title + preset: title + required: true + help_text: A descriptive title for the dataset. + +- field_name: name + label: URL + preset: dataset_slug + form_placeholder: eg. my-dataset + +- field_name: notes + label: Description + required: true + form_snippet: markdown.html + help_text: A free-text account of the dataset. + +- field_name: tag_string + label: Keywords + preset: tag_string_autocomplete + form_placeholder: eg. economy, mental health, government + help_text: Keywords or tags describing the dataset. Use commas to separate multiple values. + +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact information for enquiries about the dataset. + +- field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + help_text: Entity responsible for making the dataset available. + +- field_name: license_id + label: License + form_snippet: license.html + help_text: License definitions and additional information can be found at http://opendefinition.org/. 
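The `contact` and `publisher` entries above are scheming repeating subfields. For datasets that still store this information as flat namespaced extras, the scheming compatibility profile's `parse_dataset` shown earlier folds matching `<field>_<subfield>` extras into these subfields. A simplified, standalone sketch of that migration (keys, values and the prefix handling are illustrative only, not the actual implementation):

```python
# Illustrative sketch of the extras-to-repeating-subfields migration performed
# by the scheming compatibility profile above; not the actual implementation.
legacy_extras = [
    {"key": "contact_name", "value": "Data Office"},
    {"key": "contact_email", "value": "data@example.org"},
    {"key": "some_other_extra", "value": "kept as is"},
]

subfields = {"uri", "name", "email"}  # subfields declared for "contact" above
migrated, remaining = {}, []

for extra in legacy_extras:
    if extra["key"].startswith("contact_"):
        subfield = extra["key"][len("contact_"):]
        if subfield in subfields:
            migrated[subfield] = extra["value"]
            continue
    remaining.append(extra)

dataset_dict = {"contact": [migrated] if migrated else [], "extras": remaining}
print(dataset_dict)
# {'contact': [{'name': 'Data Office', 'email': 'data@example.org'}],
#  'extras': [{'key': 'some_other_extra', 'value': 'kept as is'}]}
```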
+ +- field_name: owner_org + label: Organization + preset: dataset_organization + help_text: The CKAN organization the dataset belongs to. + +- field_name: url + label: Landing page + form_placeholder: http://example.com/dataset.json + display_snippet: link.html + help_text: Web page that can be navigated to gain access to the dataset, its distributions and/or additional information. + + # Note: this will fall back to metadata_created if not present +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the dataset. + + # Note: this will fall back to metadata_modified if not present +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the dataset was changed, updated or modified. + +- field_name: version + label: Version + validators: ignore_missing unicode_safe package_version_validator + help_text: Version number or other version designation of the dataset. + +- field_name: version_notes + label: Version notes + validators: ignore_missing unicode_safe + form_snippet: markdown.html + display_snippet: markdown.html + help_text: A description of the differences between this version and a previous version of the dataset. + + # Note: CKAN will generate a unique identifier for each dataset +- field_name: identifier + label: Identifier + help_text: A unique identifier of the dataset. + +- field_name: frequency + label: Frequency + help_text: The frequency at which dataset is published. + +- field_name: provenance + label: Provenance + form_snippet: markdown.html + display_snippet: markdown.html + help_text: A statement about the lineage of the dataset. + +- field_name: dcat_type + label: Type + help_text: The type of the dataset. + # TODO: controlled vocabulary? + +- field_name: temporal_coverage + label: Temporal coverage + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + help_text: The temporal period or periods the dataset covers. + +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the dataset. + +- field_name: spatial_coverage + label: Spatial coverage + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: text + label: Label + + - field_name: geom + label: Geometry + + - field_name: bbox + label: Bounding Box + + - field_name: centroid + label: Centroid + help_text: A geographic region that is covered by the dataset. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in a dataset, measured in meters. + +- field_name: access_rights + label: Access rights + validators: ignore_missing unicode_safe + form_snippet: markdown.html + display_snippet: markdown.html + help_text: Information that indicates whether the dataset is Open Data, has access restrictions or is not public. + +- field_name: alternate_identifier + label: Other identifier + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: This property refers to a secondary identifier of the dataset, such as MAST/ADS, DataCite, DOI, etc. + +- field_name: theme + label: Theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A category of the dataset. A Dataset may be associated with multiple themes. 
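With a scheming schema like this one in place, list fields and repeating subfields travel through the API as real JSON structures rather than JSON strings packed into `extras`. A hedged sketch of a create call against this schema using ckanapi; the instance URL, API token and all values are placeholders, and the field names are the ones defined in this schema file:

```python
# Placeholder URL/token; field names come from the sample schema in this file.
from ckanapi import RemoteCKAN

ckan = RemoteCKAN("https://ckan.example.org", apikey="XXX")

dataset = ckan.action.package_create(
    name="example-dataset",
    title="Example dataset",
    notes="A dataset described with the DCAT-AP scheming schema.",
    owner_org="example-org",
    contact=[{"name": "Data Office", "email": "data@example.org"}],
    temporal_coverage=[{"start": "2024-01-01", "end": "2024-12-31"}],
    spatial_coverage=[{"text": "Example City", "uri": "http://example.org/places/example-city"}],
    theme=["http://example.org/themes/environment"],
)
```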
+ +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the dataset. + # TODO: language form snippet / validator / graph + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this dataset. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An implementing rule or other specification that the dataset follows. + +- field_name: is_referenced_by + label: Is referenced by + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A related resource, such as a publication, that references, cites, or otherwise points to the dataset. + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the dataset. + +#- field_name: hvd_category +# label: HVD Category +# preset: multiple_text +# validators: ignore_missing scheming_multiple_text +# TODO: implement separately as part of wider HVD support + +# Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this dataset (if not provided it will be autogenerated). + +# TODO: relation-based properties are not yet included (e.g. is_version_of, source, sample, etc) +# +resource_fields: + +- field_name: url + label: URL + preset: resource_url_upload + +- field_name: name + label: Name + form_placeholder: + help_text: A descriptive title for the resource. + +- field_name: description + label: Description + form_snippet: markdown.html + help_text: A free-text account of the resource. + +- field_name: format + label: Format + preset: resource_format_autocomplete + help_text: File format. If not provided it will be guessed. + +- field_name: mimetype + label: Media type + validators: if_empty_guess_format ignore_missing unicode_safe + help_text: Media type for this format. If not provided it will be guessed. + +- field_name: compress_format + label: Compress format + help_text: The format of the file in which the data is contained in a compressed form. + +- field_name: package_format + label: Package format + help_text: The format of the file in which one or more data files are grouped together. + +- field_name: size + label: Size + validators: ignore_missing int_validator + form_snippet: number.html + display_snippet: file_size.html + help_text: File size in bytes + +- field_name: hash + label: Hash + help_text: Checksum of the downloaded file. + +- field_name: hash_algorithm + label: Hash Algorithm + help_text: Algorithm used to calculate to checksum. + +- field_name: rights + label: Rights + form_snippet: markdown.html + display_snippet: markdown.html + help_text: Some statement about the rights associated with the resource. + +- field_name: availability + label: Availability + help_text: Indicates how long it is planned to keep the resource available. 
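The `format` and `mimetype` fields above feed the media-type mapping implemented in the DCAT profile code earlier in this diff: values that look like IANA media types are emitted as `dcat:mediaType`, plain labels as `dct:format`. A simplified restatement of that decision rule (not the actual helper):

```python
def media_type_and_format(resource_dict):
    # Mirrors the decision in the profile code above: returns the values that
    # would be emitted as (dcat:mediaType, dct:format).
    mimetype = resource_dict.get("mimetype")
    fmt = resource_dict.get("format")
    if fmt and (not mimetype or mimetype == fmt):
        if ("iana.org/assignments/media-types" in fmt
                or (not fmt.startswith("http") and "/" in fmt)):
            return fmt, None   # looks like a media type
        return None, fmt       # plain format label
    return mimetype, fmt       # both kept when they genuinely differ

print(media_type_and_format({"format": "text/csv"}))                     # ('text/csv', None)
print(media_type_and_format({"format": "CSV"}))                          # (None, 'CSV')
print(media_type_and_format({"format": "CSV", "mimetype": "text/csv"}))  # ('text/csv', 'CSV')
```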
+ +- field_name: status + label: Status + preset: select + choices: + - value: http://purl.org/adms/status/Completed + label: Completed + - value: http://purl.org/adms/status/UnderDevelopment + label: Under Development + - value: http://purl.org/adms/status/Deprecated + label: Deprecated + - value: http://purl.org/adms/status/Withdrawn + label: Withdrawn + help_text: The status of the resource in the context of maturity lifecycle. + +- field_name: license + label: License + help_text: License in which the resource is made available. If not provided will be inherited from the dataset. + + # Note: this falls back to the standard resource url field +- field_name: access_url + label: Access URL + help_text: URL that gives access to the dataset (defaults to the standard resource URL). + + # Note: this falls back to the standard resource url field +- field_name: download_url + label: Download URL + help_text: URL that provides a direct link to a downloadable file (defaults to the standard resource URL). + +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the resource. + +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the resource was changed, updated or modified. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the resource. + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this resource. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An established schema to which the described resource conforms. + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the resource. + +- field_name: access_services + label: Access services + repeating_label: Access service + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: title + label: Title + + - field_name: endpoint_description + label: Endpoint description + + - field_name: endpoint_url + label: Endpoint URL + preset: multiple_text + + - field_name: serves_dataset + label: Serves dataset + preset: multiple_text + validators: ignore_missing scheming_multiple_text + + help_text: A data service that gives access to the resource. + + # Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this resource (if not provided it will be autogenerated). diff --git a/ckanext/dcat/schemas/dcat_ap_2.1_recommended.yaml b/ckanext/dcat/schemas/dcat_ap_2.1_recommended.yaml new file mode 100644 index 00000000..ed386d67 --- /dev/null +++ b/ckanext/dcat/schemas/dcat_ap_2.1_recommended.yaml @@ -0,0 +1,147 @@ +scheming_version: 2 +dataset_type: dataset +about: Recommended fields for DCAT AP 2.1 schema +about_url: http://github.com/ckan/ckanext-dcat + +dataset_fields: + +- field_name: title + label: Title + preset: title + required: true + help_text: A descriptive title for the dataset. + +- field_name: name + label: URL + preset: dataset_slug + form_placeholder: eg. my-dataset + +- field_name: notes + label: Description + required: true + form_snippet: markdown.html + help_text: A free-text account of the dataset. 
+ +- field_name: tag_string + label: Keywords + preset: tag_string_autocomplete + form_placeholder: eg. economy, mental health, government + help_text: Keywords or tags describing the dataset. Use commas to separate multiple values. + +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact information for enquiries about the dataset. + +- field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + help_text: Entity responsible for making the dataset available. + +- field_name: license_id + label: License + form_snippet: license.html + help_text: License definitions and additional information can be found at http://opendefinition.org/. + +- field_name: owner_org + label: Organization + preset: dataset_organization + help_text: The CKAN organization the dataset belongs to. + +- field_name: temporal_coverage + label: Temporal coverage + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + help_text: The temporal period or periods the dataset covers. + +- field_name: spatial_coverage + label: Spatial coverage + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: text + label: Label + + - field_name: geom + label: Geometry + + - field_name: bbox + label: Bounding Box + + - field_name: centroid + label: Centroid + help_text: A geographic region that is covered by the dataset. + +- field_name: theme + label: Theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A category of the dataset. A Dataset may be associated with multiple themes. + +resource_fields: + +- field_name: url + label: URL + preset: resource_url_upload + +- field_name: name + label: Name + form_placeholder: + help_text: A descriptive title for the resource. + +- field_name: description + label: Description + form_snippet: markdown.html + help_text: A free-text account of the resource. + +- field_name: format + label: Format + preset: resource_format_autocomplete + help_text: File format. If not provided it will be guessed. + +- field_name: availability + label: Availability + help_text: Indicates how long it is planned to keep the resource available. + +- field_name: license + label: License + help_text: License in which the resource is made available. If not provided will be inherited from the dataset. 
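To use these sample schemas a portal needs ckanext-scheming loaded alongside the DCAT plugin, with the scheming compatibility profile listed after the base profile so it can transform that profile's output. A hedged `ckan.ini` sketch; the option values are assumptions based on the files added in this changeset and the standard ckanext-scheming options:

```ini
ckan.plugins = dcat scheming_datasets

scheming.dataset_schemas = ckanext.dcat.schemas:dcat_ap_2.1_recommended.yaml
scheming.presets = ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml

ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming
```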
diff --git a/ckanext/dcat/schemas/presets.yaml b/ckanext/dcat/schemas/presets.yaml new file mode 100644 index 00000000..88be7b0c --- /dev/null +++ b/ckanext/dcat/schemas/presets.yaml @@ -0,0 +1,12 @@ +scheming_presets_version: 1 +about: Presets for the ckanext-dcat extension +about_url": "http://github.com/ckan/ckanext-dcat" + +presets: + +- preset_name: dcat_date + values: + # Note: use datetime.html or datetime_tz.html if you want to inclue an input for time + form_snippet: date.html + display_snippet: dcat_date.html + validators: ignore_missing dcat_date convert_to_json_if_datetime diff --git a/ckanext/dcat/schemas/publisher_organization.yaml b/ckanext/dcat/schemas/publisher_organization.yaml new file mode 100644 index 00000000..3d1f7d3b --- /dev/null +++ b/ckanext/dcat/schemas/publisher_organization.yaml @@ -0,0 +1,35 @@ +scheming_version: 2 +about_url: http://github.com/ckan/ckanext-dcat +description: > + An organization schema that implements the properties supported + by default in the dct:publisher property of a dcat:Dataset + +fields: + +- field_name: title + label: Name + validators: ignore_missing unicode_safe + form_snippet: large_text.html + form_attrs: {data-module: slug-preview-target} + +- field_name: name + label: URL + validators: not_empty unicode_safe name_validator group_name_validator + form_snippet: slug.html + form_placeholder: my-theme + +- field_name: notes + label: Description + form_snippet: markdown.html + form_placeholder: A little information about this organization. + +- field_name: email + label: Email + display_snippet: email.html + +- field_name: url + label: URL + display_snippet: link.html + +- field_name: dcat_type + label: Type diff --git a/ckanext/dcat/templates/home/index.html b/ckanext/dcat/templates/home/index.html index d92671e4..28e45378 100644 --- a/ckanext/dcat/templates/home/index.html +++ b/ckanext/dcat/templates/home/index.html @@ -3,7 +3,7 @@ {{ super() }} {% with endpoint=h.dcat_get_endpoint('catalog') %} - + {% endwith %} diff --git a/ckanext/dcat/templates/package/read_base.html b/ckanext/dcat/templates/package/read_base.html index ea178d06..4c6023c6 100644 --- a/ckanext/dcat/templates/package/read_base.html +++ b/ckanext/dcat/templates/package/read_base.html @@ -1,14 +1,18 @@ {% ckan_extends %} {% block links %} + {{ super() }} - {% with endpoint=h.dcat_get_endpoint('dataset') %} - - - - - {% endwith %} + + {% if h.dcat_endpoints_enabled() %} + {% with endpoint=h.dcat_get_endpoint('dataset') %} + + + + + {% endwith %} + {% endif %} {% endblock -%} -{% block body_extras %} +{% block scripts %} {{ super() }} {% block structured_data %} {# @@ -18,7 +22,6 @@ More information about structured data: https://developers.google.com/search/docs/guides/intro-structured-data #} - {% if h.helper_available('structured_data') %}