-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxpoJson2rdf.py
194 lines (167 loc) · 9.65 KB
/
xpoJson2rdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/env python
# coding: utf-8
# XPO JSON to RDF
# * Converts the JSON representation of the XPO knowledge graph to RDF
# * The version of the XPO JSON file is from this [GitHub repository](https://github.com/e-spaulding/xpo)
# * a 2023 paper describing it is [here](https://github.com/Ebiquity/xpo_rdf/blob/main/ISA2023_DWD_Overlay_Workshop_Paper.pdf)
# * The resulting RDF graph uses two name space prefixes defined in purl that redirect to the xpo.nt file in the repository on GitHub
# * xpo: http://purl.org/xpo/
# * dwd: http://purl.org/dwd/
# * We tried a simpler approach that would make minor changes to the JSON file to turn it into JSON-LD and then use a standard tool to convert it to an RDF representation, but the format of the original XPO JSON file required too many structural changes.
#
# * We might want to extend this with additional schema statements to allow more inferences to be drawn
from rdflib import Graph, URIRef, Namespace, BNode, Literal
from rdflib.namespace import RDF, OWL, RDFS
import shortuuid
import json
def bnode(prefix = ''):
    """Return a BNode whose id is a 5-character slice of a short uuid.

    A non-empty *prefix* is joined to the slice with an underscore
    (``<prefix>_<slice>``); with an empty prefix the id is the bare slice.
    """
    fragment = shortuuid.uuid()[:5]
    if not prefix:
        return BNode(fragment)
    return BNode(prefix + '_' + fragment)
# Create an initial RDF graph and bind some namespace prefixes in it
GRAPH = Graph()

# unsure if we should use xpo or dwd as the prefix
XPO = Namespace('http://purl.org/xpo/')
DWD = Namespace('http://purl.org/dwd/')

# use this namespace for the ontology
ONT = XPO

WD = Namespace('http://www.wikidata.org/wiki/')
WDP = Namespace('http://www.wikidata.org/wiki/Property:')

# Register prefixes in the same order as before; WDP is defined above but
# intentionally left unbound, exactly as in the original script.
for _prefix, _namespace in (
    ("dwd", DWD),
    ("xpo", XPO),
    ("owl", OWL),
    ("rdf", RDF),
    ('rdfs', RDFS),
    ('wd', WD),
):
    GRAPH.bind(_prefix, _namespace)
# Get the XPO JSON file and extract the four subsets of data
# * each is a dictionary where the keys are node ids and the values are dictionaries of their properties and values.
# * for some properties the value is a list -- for these we generate multiple edges, one for each value in the list.
# * we start by extracting the four top-level categories: events, entities, relations, and temporal_relations
# The context manager closes the file handle as soon as parsing finishes, and the
# explicit encoding keeps the read independent of the platform's default codec.
with open('xpo.json', encoding='utf-8') as xpo_file:
    xpo_data = json.load(xpo_file)
event = xpo_data['events']
entity = xpo_data['entities']
relation = xpo_data['relations']
temporal_relation = xpo_data['temporal_relations']
# Every property name found in version 5.4.7 of the XPO JSON file
# (extracted from the file itself), kept in alphabetical order.
all_properties = [
    "arguments",
    "comment",
    "constraints",
    "curated_by",
    "dwd_arg_name",
    "entities",
    "events",
    "ldc_argument_output_value",
    "ldc_arguments",
    "ldc_code",
    "ldc_constraints",
    "ldc_name",
    "ldc_types",
    "mapping_types",
    "name",
    "other_pb_rolesets",
    "overlay_parents",
    "pb_mapping",
    "pb_roleset",
    "related_qnodes",
    "relations",
    "short_name",
    "similar_nodes",
    "similarity_type",
    "template",
    "template_curation",
    "temporal_relations",
    "type",
    "version",
    "wd_description",
    "wd_node",
    "wd_slot",
]
def _add_ldc_type(node, ldc_type, DWD, prop):
    """Attach one ``ldc_types`` entry (a dict) to *node* through a fresh blank node."""
    # Keys of an LDC argument whose value is emitted as a single literal.
    ldc_arg_literal_keys = ("ldc_name", "ldc_argument_output_value", "dwd_arg_name")

    ldc_type_node = bnode('LDCTYPE')
    GRAPH.add((node, DWD.ldc_type, ldc_type_node))
    for (vname, vvalue) in ldc_type.items():
        if vname == 'name':
            GRAPH.add((ldc_type_node, DWD.name, Literal(vvalue)))
        elif vname == 'ldc_code':
            GRAPH.add((ldc_type_node, DWD.ldc_code, Literal(vvalue)))
        elif vname == 'other_pb_rolesets':
            for pb_roleset in vvalue:
                GRAPH.add((ldc_type_node, DWD.other_pb_roleset, Literal(pb_roleset)))
        elif vname == 'ldc_arguments':
            for ldc_arg in vvalue:
                ldc_arg_node = bnode('LDCARG')
                GRAPH.add((ldc_type_node, DWD.ldc_argument, ldc_arg_node))
                for (ldc_arg_name, ldc_arg_value) in ldc_arg.items():
                    if ldc_arg_name in ldc_arg_literal_keys:
                        # Use the key itself as the predicate; previously all
                        # three keys were written as ldc_code, which made them
                        # indistinguishable in the output graph.
                        GRAPH.add((ldc_arg_node, URIRef(DWD + ldc_arg_name), Literal(ldc_arg_value)))
                    elif ldc_arg_name == "ldc_constraints":
                        # One edge per constraint value.
                        for ent_type in ldc_arg_value:
                            GRAPH.add((ldc_arg_node, DWD.ldc_constraint, Literal(ent_type)))
        else:
            print(f"Bad LDC_types property (unrecognized): {node} {prop} {vname}")


def convert_generic(data, DWD=XPO, type="?", stop=0):
    """Add RDF triples to ``GRAPH`` for every node of one XPO category.

    Args:
        data: mapping of node id -> dict of that node's properties.
        DWD: namespace used to build node and property URIs.  Note that this
            parameter shadows the module-level ``DWD`` and defaults to ``XPO``.
        type: category label; only used in diagnostic messages.
        stop: maximum number of nodes to convert; a value of 0 or less means
            no limit (a small positive value is handy for testing).

    Returns:
        The number of nodes that were converted.
    """
    # Properties whose value is emitted directly as one literal.
    literal_props = ('type', 'comment', 'curated_by', 'description',
                     'wd_node', 'name', 'wd_description', 'template',
                     'template_curation', 'pb_roleset')

    count = 0
    for (node_id, edges) in data.items():
        # Check the limit before counting so the return value is exactly the
        # number of nodes processed, whether or not the limit was reached.
        if stop > 0 and count >= stop:
            break
        count += 1
        node = URIRef(DWD + node_id)
        for (prop, value) in edges.items():
            if prop in literal_props:
                GRAPH.add((node, URIRef(DWD + prop), Literal(value)))
            elif prop == "overlay_parents":
                if isinstance(value, list):
                    for v in value:
                        overlay_node = bnode('OVERLAY')
                        GRAPH.add((node, DWD.overlay, overlay_node))
                        GRAPH.add((overlay_node, DWD.overlay_parent, URIRef(WD + v['wd_node'])))
                        GRAPH.add((overlay_node, DWD.overlay_parent_name, Literal(v['name'])))
                else:
                    print(f"Bad property-values {prop} {value}")
            elif prop == 'similar_nodes':
                if isinstance(value, list):
                    for v in value:
                        similar_node = bnode('SIMILAR')
                        GRAPH.add((node, DWD.similarNode, similar_node))
                        GRAPH.add((similar_node, DWD.wd_node, URIRef(WD + v['wd_node'])))
                        GRAPH.add((similar_node, DWD.name, Literal(v['name'])))
                        GRAPH.add((similar_node, DWD.similarity_type, URIRef(DWD + v['similarity_type'])))
                else:
                    print(f"Bad property-values {prop} {value}")
            elif prop == 'ldc_types':
                if isinstance(value, list):
                    # should be a list of dicts
                    for v in value:
                        _add_ldc_type(node, v, DWD, prop)
                else:
                    print(f"Bad property-values {prop} {value} (not a list)")
            elif prop == "arguments":
                if isinstance(value, list):
                    for arg in value:
                        # an arg should have a name, short_name and constraints;
                        # each of them is optional here so a sparse entry does
                        # not abort the whole conversion
                        arg_node = bnode('ARG')
                        GRAPH.add((node, DWD.argument, arg_node))
                        if 'name' in arg:
                            GRAPH.add((arg_node, DWD.name, Literal(arg['name'])))
                        if 'short_name' in arg:
                            GRAPH.add((arg_node, DWD.short_name, Literal(arg['short_name'])))
                        for arg_constraint in arg.get('constraints', []):
                            const_node = bnode('CONSTRAINT')
                            GRAPH.add((arg_node, DWD.constraint, const_node))
                            GRAPH.add((const_node, DWD.name, Literal(arg_constraint['name'])))
                            GRAPH.add((const_node, DWD.wd_node, URIRef(WD + arg_constraint['wd_node'])))
                else:
                    print(f"Bad property-values {node} {prop} {value} (not a list)")
            elif prop == "related_qnodes":
                if isinstance(value, list):
                    for v in value:
                        wdnode = bnode('WDNODE')
                        GRAPH.add((node, DWD.related_qnode, wdnode))
                        GRAPH.add((wdnode, DWD.wd_node, URIRef(WD + v['wd_node'])))
                        GRAPH.add((wdnode, DWD.name, Literal(v['name'])))
                else:
                    print(f"Bad property-values {node} {prop} {value}")
            else:
                # Unrecognized property: report it, but still emit its value(s)
                # as literals so no data is dropped.
                if isinstance(value, list):
                    print(f"Unrecognized property with list of values for type {type}: {prop} {value}")
                    # an empty list simply produces no triples
                    for v in value:
                        GRAPH.add((node, URIRef(DWD + prop), Literal(v)))
                else:
                    print(f"Unrecognized property for {type}: {prop} {value}")
                    GRAPH.add((node, URIRef(DWD + prop), Literal(value)))
    return count
# Per-category node limit handed to convert_generic as `stop`;
# set to a small number for testing.
N = 0
for xpo_type, data in zip(
    ('event', 'entity', 'relation', 'temporal_relation'),
    (event, entity, relation, temporal_relation),
):
    n = convert_generic(data, type=xpo_type, stop=N)
    print(f"Found {n} {xpo_type}")
# ### Output RDF graph to xpo.ttl (Turtle) and xpo.nt (N-Triples)
for _destination, _rdf_format in (("xpo.ttl", 'turtle'), ("xpo.nt", 'nt')):
    GRAPH.serialize(destination=_destination, format=_rdf_format)