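"""Enrich data.json with source text for each person and proposition.

For every entry, fetch the linked Wikipedia article, Ballotpedia page, and
campaign site ("first_url"), attach the text as *_text fields, and write the
result to data-enrich.json.
"""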
import json
from pathlib import Path

import requests
import wikipediaapi
from bs4 import BeautifulSoup, Comment


def get_wiki_text(wiki_url):
    """Return the plain text of a Wikipedia page, or "" if it does not exist."""
    # First argument is a user-agent string, second is the language edition
    wiki_wiki = wikipediaapi.Wikipedia('PythonRequests', 'en')
    # Parse the page title from the URL
    page_title = wiki_url.split("/")[-1]
    # Fetch the Wikipedia page
    page = wiki_wiki.page(page_title)
    # Return the text content of the page if it exists
    if page.exists():
        return page.text
    return ""


def get_bp_text(url):
    """Return the text of a Ballotpedia article's main content area, or "" on failure."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch the page '{url}'.")
        return ""
    # Only parse the page once we know the fetch succeeded
    soup = BeautifulSoup(response.text, 'html.parser')
    # The article body lives in the element with id "content"
    content_element = soup.find(id="content")
    if content_element:
        return content_element.get_text()
    print(f"Failed to find content id on '{url}'.")
    return ""


def get_web_text(url):
    """Return the visible text of an arbitrary web page, or "" on failure."""
    try:
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Failed to fetch the page '{url}'.")
            return ""
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove script and style tags
        for tag in soup(["script", "style"]):
            tag.extract()
        # Remove HTML comments ("string=" replaces the deprecated "text=" keyword)
        for element in soup(string=lambda text: isinstance(text, Comment)):
            element.extract()
        # Join the remaining visible text fragments
        return " ".join(soup.stripped_strings)
    except Exception as e:
        # Catch network and parsing errors alike, as the original bare except did
        print(f"Fetching '{url}' failed: {e}")
        return ""


if __name__ == "__main__":
    # Sanity-check fetches for one known entry (results are unused below)
    w = get_wiki_text("https://en.wikipedia.org/wiki/Loren_Taylor")
    #print(w)
    m = get_bp_text("https://ballotpedia.org/Loren_Taylor")
    # Read data.json and load its JSON content
    file_path = Path("data.json")
    with file_path.open('r', encoding='utf-8') as f:
        data = json.load(f)
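    # Expected shape of data.json (a sketch inferred from the lookups below):
    # {
    #     "People": {"<name>": {"bp_url": "...", "wiki_url": "...", "first_url": "..."}},
    #     "Propositions": {"<name>": {"bp_url": "...", "wiki_url": "...", "first_url": "..."}}
    # }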
    # Add text for people
    for person, person_data in data["People"].items():
        print(person)
        if person_data.get("bp_url"):
            person_data["bp_text"] = get_bp_text(person_data["bp_url"])
        if person_data.get("wiki_url"):
            person_data["wiki_text"] = get_wiki_text(person_data["wiki_url"])
        if person_data.get("first_url"):
            person_data["first_text"] = get_web_text(person_data["first_url"])
    # Add text for propositions
    for prop, prop_data in data["Propositions"].items():
        print(prop)
        if prop_data.get("bp_url"):
            prop_data["bp_text"] = get_bp_text(prop_data["bp_url"])
        if prop_data.get("wiki_url"):
            prop_data["wiki_text"] = get_wiki_text(prop_data["wiki_url"])
        if prop_data.get("first_url"):
            # first_url points at an arbitrary site, so use the generic fetcher
            prop_data["first_text"] = get_web_text(prop_data["first_url"])
    output_file_path = Path("data-enrich.json")
    with output_file_path.open('w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)