-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawlMovies.py
103 lines (76 loc) · 2.3 KB
/
crawlMovies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import json, requests, re, db
from bs4 import BeautifulSoup
from string import ascii_uppercase
from pymongo import MongoClient
from datetime import datetime
# Crawl Box Office Mojo's alphabetical movie index ("NUM" plus A-Z), scrape
# each title's gross / release date / directors, and upsert the result into
# the `oscar.boxoffice_movies` Mongo collection.
client = MongoClient(db.conn_string)
oscar_db = client.oscar  # renamed: avoid shadowing the imported `db` config module

# Strips every character except digits and dots; compiled once, reused per row.
non_decimal = re.compile(r'[^\d.]+')

# Box Office Mojo groups titles under "NUM" (digits/symbols) plus A..Z.
major_sites = ["NUM"] + list(ascii_uppercase)

for site in major_sites:
    page = 1
    while True:
        # Fetch one paginated index page for this letter group.
        url = ("http://www.boxofficemojo.com/movies/alphabetical.htm?letter="
               + site + "&p=.htm&page=" + str(page))
        r = requests.get(url)
        print(url)
        soup = BeautifulSoup(r.text, "html.parser")

        parsedCounter = 0  # data rows seen on this page; 0 means we ran off the end
        for row in soup.find_all("tr"):
            # Heuristic: a real data row links to a movie page and shows a dollar figure.
            if "<a href=\"/movies/?id=" not in str(row) or "$" not in str(row):
                continue
            cells = row.find_all("td")
            parsedCounter += 1

            # --- total gross --------------------------------------------------
            totalGross = cells[2].get_text().replace("$", "").replace(",", "")
            totalGross = non_decimal.sub('', totalGross)
            # BUG FIX: an empty string used to be stored verbatim; store None so
            # the field is either an int or a proper null.
            totalGross = int(totalGross) if totalGross else None

            # --- release date (M/D/Y on the index page) -----------------------
            # BUG FIX: `year` previously leaked across loop iterations — it was a
            # NameError on the first dateless row and a stale value after that,
            # so the recency filter below compared against the wrong year.
            startDate = None
            year = None
            rawDate = cells[6].get_text()
            if rawDate and "/" in rawDate:
                dateParts = rawDate.split("/")
                if len(dateParts) == 3:
                    month = int(dateParts[0])
                    day = int(dateParts[1])
                    year = int(dateParts[2])
                    startDate = datetime(year, month, day)

            # Keep only 2015-and-later releases; rows with no parseable date pass
            # through (the original only filtered when a year was known).
            if year is not None and year < 2015:
                continue

            link = cells[0].find("a").get("href")
            if link:
                link = link.replace("/movies/?id=", "").replace(".htm", "")
                # Fetch the movie detail page to scrape director credits.
                dir_url = "http://www.boxofficemojo.com/movies/?id=" + link + ".htm"
                dir_r = requests.get(dir_url)
                dir_soup = BeautifulSoup(dir_r.text, "html.parser")
                directors = []
                for a in dir_soup.find_all("a"):
                    href = a.get("href")
                    # BUG FIX: anchors without an href returned None, which made
                    # the `in` membership test raise TypeError.
                    if href and "/people/chart/?view=Director" in href:
                        directors.append(a.get_text())
                movie = {
                    "name": cells[0].get_text(),
                    "boxOfficeId": link,
                    "totalGross": totalGross,
                    "release": startDate,
                    "directors": directors,
                }
                # Upsert keyed on boxOfficeId so re-running the crawl refreshes
                # existing documents instead of duplicating them.
                oscar_db.boxoffice_movies.replace_one(
                    {"boxOfficeId": movie["boxOfficeId"]}, movie, upsert=True)

        page += 1
        # "NUM" is a single page; for letters, stop when a page yields no rows.
        if site == "NUM" or parsedCounter == 0:
            break

print("done")