forked from ckan/ckan-docker
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathuri_label_extractor.py
62 lines (53 loc) · 1.92 KB
/
uri_label_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# SPDX-FileCopyrightText: 2024 PNED G.I.E.
#
# SPDX-License-Identifier: Apache-2.0
import xml.etree.ElementTree as ET
import requests
def get_rdf_about(url, timeout=30):
    """
    Fetch an RDF/XML document and list the `rdf:about` value of every `rdf:Description`.

    Args:
        url: Address of the RDF/XML document to download.
        timeout: Seconds to wait for the server (forwarded to `requests.get`)
            so that a stalled connection cannot block the caller forever.

    Returns:
        A list with the `rdf:about` attribute values of all `rdf:Description`
        elements below the document root. Elements that carry no `rdf:about`
        attribute are skipped, so the list never contains `None`.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    rdf_ns = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page as RDF.
    response.raise_for_status()
    root = ET.fromstring(response.content)
    about_values = (
        description.attrib.get(f"{{{rdf_ns}}}about")
        for description in root.findall(".//rdf:Description", {"rdf": rdf_ns})
    )
    # Descriptions without rdf:about yield None, which is useless as a URI.
    return [about for about in about_values if about is not None]
def get_rdf_prefLabel(url, timeout=30):
    """
    Fetch an RDF/XML document and collect the `skos:prefLabel` texts found inside its `rdf:Description` elements.

    Args:
        url: Address of the RDF/XML document to download.
        timeout: Seconds to wait for the server (forwarded to `requests.get`)
            so that a stalled connection cannot block the caller forever.

    Returns:
        A dict mapping the `xml:lang` attribute of each `skos:prefLabel`
        to the text of that element. A label without `xml:lang` is stored
        under the key `None`; when several labels share a language, the one
        visited last overwrites the earlier ones.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    namespaces = {
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "skos": "http://www.w3.org/2004/02/skos/core#",
    }
    lang_attr = "{http://www.w3.org/XML/1998/namespace}lang"

    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page as RDF.
    response.raise_for_status()
    root = ET.fromstring(response.content)

    rdf_prefLabel = {}
    for description in root.findall(".//rdf:Description", namespaces):
        # ".//" matches prefLabel at any depth below the description.
        for prefLabel in description.findall(".//skos:prefLabel", namespaces):
            rdf_prefLabel[prefLabel.attrib.get(lang_attr)] = prefLabel.text
    return rdf_prefLabel
def write_file(file_name, data, encoding="utf-8"):
    """
    Write an iterable of strings to a file, one string per line.

    The file is created or truncated. Each item of `data` is written
    followed by a newline character.

    Args:
        file_name: Path of the file to write.
        data: Iterable of strings; each becomes one line of the file.
        encoding: Text encoding of the output file. Defaults to UTF-8 so
            that non-ASCII text is written the same way on every platform
            instead of depending on the locale's default encoding.
    """
    with open(file_name, "w", encoding=encoding) as f:
        f.writelines(line + "\n" for line in data)
if __name__ == "__main__":
    # Export "<uri>,"<English label>",en" for every entry of the EU
    # Publications Office country authority table into uri_labels.csv.
    url = "https://publications.europa.eu/resource/authority/country"
    lines = []
    for uri in get_rdf_about(url):
        # An entry without a URI cannot be dereferenced; skip it.
        if not uri:
            continue
        labels = get_rdf_prefLabel(uri)
        # Only the English label is exported; look it up directly rather
        # than scanning every language in the dict.
        if "en" not in labels:
            continue
        # Double any embedded quote so the quoted CSV field stays well-formed.
        label = str(labels["en"]).replace('"', '""')
        lines.append(f'{uri},"{label}",en')
    write_file("uri_labels.csv", lines)