-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathaada.py
81 lines (64 loc) · 1.91 KB
/
aada.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup, Tag
from SPARQLWrapper import SPARQLWrapper, JSON
# Get content from the AADA website.
# The page is read from a local copy, "temp.html".  Opening it in a ``with``
# block closes the file handle as soon as BeautifulSoup has parsed it.
with open("temp.html") as html_file:
    soup = BeautifulSoup(html_file, 'lxml')

# One <figcaption> per alumnus inside the ".alumni_casting_listing" container.
alumni = soup.select(".alumni_casting_listing figcaption")

imdb_ids = []     # IMDb identifiers, in page order
ibdb_ids = []     # IBDB identifiers, in page order
imdb_years = {}   # IMDb identifier -> text of that entry's <em><span>
qid_years = {}    # Wikidata QID -> year text; filled by the matching step below

for alumnus in alumni:
    # Text of the <span> nested in the entry's <em>; later formatted as a year.
    year = alumnus.em.span.text

    # Entries without an <a> tag carry no external identifier to collect.
    if not isinstance(alumnus.a, Tag):
        continue

    url = alumnus.a['href']
    if 'imdb' in url:
        # Fifth '/'-separated piece of the link.
        # NOTE(review): assumes links shaped like
        # https://www.imdb.com/name/<id>/ - confirm against the page.
        imdb = url.split('/')[4]
        imdb_ids.append(imdb)
        imdb_years[imdb] = year
    if 'ibdb' in url:
        # Everything between the first and second '=' of the link
        # (presumably an "...?id=<n>" query string - verify).
        ibdb = url.split('=')[1]
        ibdb_ids.append(ibdb)
# Join the identifiers with `" "` so that, once wrapped in the outer pair of
# quotes below, every identifier becomes its own quoted VALUES entry.
imdb_ids_list = '" "'.join(imdb_ids)

# For each identifier, ask for the item whose wdt:P345 value equals it.
# The OPTIONAL clause keeps identifiers without such an item in the result,
# just without an ?item binding.
query = (
    '\n'
    'SELECT ?item ?imdb WHERE {\n'
    'VALUES ?imdb { "' + imdb_ids_list + '" }\n'
    'OPTIONAL { ?item wdt:P345 ?imdb . }\n'
    '}\n'
)
print(query)
# Get what is already in Wikidata: send the query to the Wikidata SPARQL
# endpoint and keep only the list of result bindings from the JSON reply.
endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
sparql = SPARQLWrapper(endpoint)
sparql.setReturnFormat(JSON)
sparql.setQuery(query)
response = sparql.query().convert()
results = response['results']['bindings']
print(results)
# Split the scraped IMDb identifiers into those that already have a Wikidata
# item ("matched", stored as QIDs) and those that do not ("unmatched", stored
# as IMDb identifiers).

# Index the bindings once: IMDb identifier -> list of QIDs bound to it.
# A binding without an 'item' key means the identifier has no item.
# The raw bindings are printed right after the query runs, so they are not
# printed again here.
qids_by_imdb = {}
for binding in results:
    if 'item' not in binding:
        continue
    # The item value is an entity URI; the QID is its last path component.
    qid = binding['item']['value'].split('/')[-1]
    if qid:
        qids_by_imdb.setdefault(binding['imdb']['value'], []).append(qid)

matched = []
unmatched = []
for imdb in imdb_ids:
    qids = qids_by_imdb.get(imdb)
    if not qids:
        # No item came back for this identifier.
        unmatched.append(imdb)
        continue
    for qid in qids:
        matched.append(qid)
        # Carry the year text over from the IMDb identifier to the QID.
        qid_years[qid] = imdb_years[imdb]

print('Matched')
print(matched)
print('Unmatched')
print(unmatched)
# Print one tab-separated row per matched item.
# NOTE(review): the rows look like QuickStatements commands (item, property,
# value, qualifier, qualifier value, source property, source URL) - confirm
# the intended consumer and the meaning of P69 / Q389336 / P582 / S854.
reference_url = '"https://www.aada.edu/alumni/notable-alumni#decade:all/orderby:all/display:panel/perpage:All"'
for qid in matched:
    # The stored year text is placed in a 1 January timestamp with a "/09"
    # suffix.
    end_time = "+{}-01-01T00:00:00Z/09".format(qid_years[qid])
    fields = (qid, 'P69', 'Q389336', 'P582', end_time, 'S854', reference_url)
    print(*fields, sep='\t')