-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapeIndividualPage.py
87 lines (72 loc) · 2.53 KB
/
scrapeIndividualPage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# scrapeIndividualPage.py
#
# Team Spider Silks
# Author: Charlie Bushman
#
# Takes in the URL for an individual event page and then scrapes the following
# data fields:
# Start Time, End Time, Location, From Site, Description, Category, For
#
# Takes a single command-line argument that packs together the URL to be
# scraped, then the name of the event, and then the date of the event,
# separated by "***" (i.e. URL***name***date)
import urllib.request
import re
import sys
# Reads in html
# The single command-line argument holds the URL, the event name and the event
# date, separated by "***"; the URL is the first piece.
params = sys.argv[1].split("***")
url = params[0]
# The with-block closes the response even when reading raises, which the
# previous manual fp.close() call did not guarantee.
with urllib.request.urlopen(url) as fp:
    byteStr = fp.read()
htmlStr = byteStr.decode("utf8")
# RegEx to find start and end times
# Preceding string is:
# <span class="time">
# and following is:
# </span> <span class="divider">/</span>
# The capture is non-greedy so it stops at the first closing marker instead of
# running on to the last one found on the same line.
searches = re.findall(r'<span class="time">(.+?)</span> <span class="divider">/</span>', htmlStr)
# Defaults that are kept when the page contains no time span at all.
startTime = "00:00:00"
endTime = "00:00:00"
if searches:
    # A time range is written with an en dash, which can arrive either as the
    # HTML entity &#8211; or as the literal character. Normalise the entity to
    # the character so both forms are split the same way.
    enDash = "\u2013"
    timeText = searches[0].replace("&#8211;", enDash)
    if enDash in timeText:
        # Start is the text before the first dash, end is the text after the
        # last one; whitespace around the dash is dropped.
        timeParts = re.split(r"\s*" + enDash + r"\s*", timeText)
        startTime = timeParts[0]
        endTime = timeParts[-1]
    else:
        # Only a single time was given, so there is no end time.
        startTime = timeText
        endTime = ""
# TODO: presumably the times should be converted from 12-hour am/pm form to
# 24-hour form here; this was requested but never implemented.
# RegEx to find location
# Preceding string is:
# <span class="divider">/</span> <span class="location">
# and following string is:
# </span></div>
# The capture group is non-greedy so the location ends at the first
# </span></div> rather than swallowing everything up to the last one on the
# same line.
searches = re.findall(r'<span class="divider">/</span> <span class="location">(.+?)</span></div>', htmlStr)
# When several locations are found the last one wins; "Nowhere" is the
# fallback when the page has none.
location = searches[-1] if searches else "Nowhere"
# RegEx to find description
# Preceding string is:
# <p class="description">
# and following string is:
# </p>
# The capture group is non-greedy so the description ends at its own </p>
# rather than at the last </p> on the same line.
# NOTE: "." does not match a newline, so a description that spans several
# lines of HTML is not found and the default below is used instead.
searches = re.findall(r'<p class="description">(.+?)</p>', htmlStr)
# When several descriptions are found the last one wins; "Boring" is the
# fallback when the page has none.
description = searches[-1] if searches else "Boring"
# RegEx to find the For field (not implemented yet)
# Preceding string is:
# </div><div class="audiences"><h4>For:</h4>
# and the following line is:
# </div><div class="export">
# Append this event's record to the per-date text file allData<date>.txt.
# The event name and date are the second and third pieces of the
# "***"-separated command-line argument.
name = params[1]
date = params[2]
recordFields = [name, date, url, startTime, endTime, location, description]
outputPath = "allData" + date + ".txt"
# NOTE(review): no newline is written after the record, so successive runs
# append their records back to back on one line - confirm this is intended.
with open(outputPath, "a+") as outFile:
    outFile.write(", ".join(recordFields))