-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyze.py
124 lines (104 loc) · 3.79 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python3
import json
def cleanup(br):
    """Return a new dict with the empty fields of ``br`` removed.

    A field counts as empty when its value equals ``[]`` or ``""``; every
    other value (including ``0`` and ``None``) is kept unchanged.
    """
    return {
        name: value
        for name, value in br.items()
        if value != [] and value != ""
    }
def title(br):
    """Return the value of the first field whose name ends in ``_title``.

    Falls back to a fixed placeholder string when no such field exists.
    """
    candidates = (
        value for name, value in br.items() if name.endswith("_title")
    )
    return next(candidates, "[Not Title found]")
def date(br):
    """Return the year of the first ``*_publicationDate`` field of ``br``.

    The year is taken as the first four characters of the field value.
    Returns ``"s.d."`` when ``br`` has no such field.
    """
    date_fields = [name for name in br if name.endswith("_publicationDate")]
    if not date_fields:
        return "s.d."
    # Only the first matching field is considered; keep its first 4 characters.
    return br[date_fields[0]][:4]
def type(br):
    """Return the bracketed initials of the resource type, e.g. ``"[JA]"``.

    The value stored under the ``"type"`` key is split on underscores and the
    first character of every non-empty part is kept, so ``"JOURNAL_ARTICLE"``
    becomes ``"[JA]"``. Returns ``"[]"`` when ``br`` has no ``"type"`` key or
    the type consists only of underscores / is empty.

    NOTE: this function shadows the builtin ``type`` inside this module; the
    name is kept because other code in the file calls it.
    """
    if "type" not in br:
        return "[]"
    # Only output the initial of each word. Empty parts (from an empty value
    # or from leading, trailing or doubled underscores) are skipped so that
    # indexing the first character can never raise IndexError.
    initials = [word[0] for word in br["type"].split("_") if word]
    return "[" + "".join(initials) + "]"
def authors(br):
    """Return the author names of ``br`` joined by ``" / "``.

    Only the first non-empty field whose name ends in ``_contributors`` is
    inspected. A contributor is included when its ``roleType`` is
    ``"AUTHOR"`` and it has a ``heldBy`` entry; the name is read from
    ``heldBy["nameString"]``, or the placeholder ``"[no nameString]"`` is
    used when that key is missing. Authors without ``heldBy`` are skipped.

    Returns an empty string when there is no non-empty contributors field or
    when none of the contributors qualifies, so the result is always a string.
    """
    for field in br:
        if not field.endswith("_contributors") or br[field] == []:
            continue
        names = []
        for contributor in br[field]:
            if "roleType" not in contributor or contributor["roleType"] != "AUTHOR":
                continue
            if "heldBy" not in contributor:
                continue
            held_by = contributor["heldBy"]
            if "nameString" in held_by:
                names.append(held_by["nameString"])
            else:
                names.append("[no nameString]")
        return " / ".join(names)
    # No usable contributors field: return the same empty string produced for
    # "no authors among the contributors" instead of an implicit None.
    return ""
def citation(br):
    """Build a one-line citation: ``<type> <title> (<year>) <_id>``.

    The pieces come from ``type()``, ``title()`` and ``date()`` of this
    module plus the mandatory ``"_id"`` field of ``br``.
    """
    pieces = [type(br), title(br), "(" + date(br) + ")", br["_id"]]
    return " ".join(pieces)
# Read the JSON data dump; it is iterated as a sequence of records below.
with open('data.json', encoding='utf8') as f:
    fullData = json.load(f)
# Clean every record up and store it in a dict keyed by its "_id".
# A comprehension is used so that no module-level loop variable named `id`
# is left behind shadowing the builtin.
data = {record["_id"]: cleanup(record) for record in fullData}
# Some statistics: total number of entries and number of entries per type.
print("There are currently", len(data), "entries in this data dump.")
types = {}
for entry_id, entry in data.items():
    if "type" not in entry:
        # Entries without a type are reported (in red) and not counted.
        print("\033[91mWARNING:\033[0m No type given for", entry_id)
        continue
    entry_type = entry["type"]
    types[entry_type] = types.get(entry_type, 0) + 1
print("Type statistics", types)
# Show single entries by id. The ids are hard-coded samples, so an id that is
# not part of the loaded dump is reported instead of aborting the whole
# script with a KeyError.
for sample_id in ("5c17665613b8bb034c4e7239", "5bab9d86c3bd212c24356625"):
    if sample_id in data:
        print("Show one entry by id:",
              json.dumps(data[sample_id], indent=4))
    else:
        print("\033[91mWARNING:\033[0m No entry with id", sample_id)
# Look for duplicate identifiers: the same "<scheme>:<literalValue>" pair
# appearing in more than one place.
seen = {}  # maps "<scheme>:<literalValue>" to the _id of the first entry using it
for i in data:
    for field in data[i]:
        if not field.endswith("_identifiers"):
            continue
        for identifier in data[i][field]:
            if "scheme" not in identifier or "literalValue" not in identifier:
                print("Error with identifiers in", i, identifier)
                continue
            if identifier["scheme"] in ("ISSN", "ZDB_ID", "ISBN"):
                # These schemes are excluded from the duplicate check
                # (presumably because several entries may legitimately share
                # them -- confirm). Skip only this identifier: a `break` here
                # would also drop every identifier listed after it.
                continue
            key = identifier["scheme"] + ":" + identifier["literalValue"]
            if key in seen:
                print("Duplicate found", key, "is in", seen[key], "and", data[i]["_id"])
            else:
                seen[key] = data[i]["_id"]
# Derive the reverse of the partOf relation: every parent entry gets a
# "children" list holding the ids of the entries that name it as partOf.
for child_id, entry in data.items():
    parent_id = entry.get("partOf", "")
    if parent_id == "":
        continue
    if parent_id not in data:
        print("WARNING: Key", parent_id, "not found but given as partOf in", child_id)
        continue
    data[parent_id].setdefault("children", []).append(child_id)
# List all resources ordered by lower-cased title: parents ("+") followed by
# their children ("|_"), and standalone entries ("-"). Entries that have a
# partOf value but no children of their own are not printed at the top level.
dataList = sorted(data.items(), key=lambda item: title(item[1]).lower())
for _, element in dataList:
    if "children" not in element:
        if "partOf" not in element:
            print("\n- " + citation(element))
        continue
    print("\n+ " + citation(element))
    for child in element["children"]:
        print("|_", citation(data[child]))