main.py
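# Scrape Amazon.ca product reviews (and, where available, reviewer profiles)
# for the product URLs listed in data/in.csv, then print the results as a
# table and save them to data/out.xlsx.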
import os
import math
import openpyxl  # required as the engine for pandas.DataFrame.to_excel
import requests
import pandas as pd
from tabulate import tabulate
from bs4 import BeautifulSoup
# from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as webdriver

MAX_REVIEWS_PAGES_PER_PRODUCT = 10
baseURL = "https://www.amazon.ca"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5'}
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
# options.add_argument('--headless')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
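
# Fetch a page with requests and parse it with BeautifulSoup (no JavaScript rendering).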
def scrape_page(url):
    print("scraping: ", url)
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    return soup
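
# Load a page in the Chrome driver so JavaScript runs, then parse the rendered HTML.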
def scrape_rendered_page(url):
    driver.get(url)
    # time.sleep(1)
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    return soup
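
# Extract name, rating, title, body, date, and location from each review card;
# if the reviewer has a public profile, also scrape their profile page.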
def scrape_reviews(review_cards, reviews):
    for count, card in enumerate(review_cards, start=1):
        print("Scraping review ", count, "/", len(review_cards), "...")
        profile_link = card.find("a", class_='a-profile')
        a_row = card.find("div", class_="a-row")
        name = a_row.find("span", class_="a-profile-name").text.strip()
        rating = float(a_row.find("span", class_="a-icon-alt").text.split(" ")[0])
        review_title = a_row.find(attrs={'data-hook': 'review-title'}).find("span").text.strip()
        review_body = a_row.find("span", {'data-hook': 'review-body'}).find("span").text.strip()
        review_date_and_location = a_row.find("span", {'data-hook': 'review-date'}).text.strip()
        review_date = review_date_and_location.split(" on ")[1]
        review_location = review_date_and_location.split(" in ")[1].split(" on ")[0]
        location_and_or_occupation = ""
        if profile_link:
            print("Performing rendered profile scrape...")
            profile_url = baseURL + profile_link['href']
            profile_page = scrape_rendered_page(profile_url)
            profile_card = profile_page.find(id="profile_v5")
            name = profile_card.find("span", class_="a-size-extra-large")
            name = name.text.strip() if name else ""
            occupation_and_location = profile_card.find("span", class_="a-size-base a-color-base")
            if occupation_and_location:
                location_and_or_occupation = occupation_and_location.text.strip()
                # separator = " | "
                # if separator in location_and_or_occupation:
                #     occupation = location_and_or_occupation.split(' | ')[0]
                #     profile_location = location_and_or_occupation.split(' | ')[1]
                # else:
                #     profile_location = location_and_or_occupation
        review = {'Name': name, 'Location': review_location, 'Rating': rating, 'Date': review_date,
                  'Occupation and or location': location_and_or_occupation,
                  'Title': review_title,
                  'Body': review_body}
        reviews.append(review)
    return reviews
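
# Read the list of product URLs to scrape from data/in.csv (one URL per row, no header).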
def get_urls():
    df = pd.read_csv('data/in.csv', header=None)
    return df.values
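
# Follow a product page to its "see all reviews" page, work out how many review
# pages to visit (capped at MAX_REVIEWS_PAGES_PER_PRODUCT), and scrape each one.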
def scrape_product(url, reviews):
    try:
        print("Scraping product page for reviews list page... ", url)
        soup = scrape_rendered_page(url)
        results = soup.find(id="cr-pagination-footer-0")
        print("Reviews list page found")
        link_url = results.find("a", class_="a-link-emphasis a-text-bold")['href']
        new_url = baseURL + link_url
    except Exception:
        print("ERROR: User-Agent Expired, try another: https://developers.whatismybrowser.com/useragents/explore/")
        return reviews
    print("Scraping reviews list page...")
    soup2 = scrape_rendered_page(new_url)
    # find number of pages to scrape in reviews list, maximum 10 pages (100 reviews)
    total_num_reviews = \
        soup2.find("div", {'data-hook': 'cr-filter-info-review-rating-count'}).text.split(', ')[1].split(" ")[0]
    total_num_reviews = int(total_num_reviews)
    # Amazon shows 10 reviews per page
    number_of_review_pages_to_scrape = math.ceil(
        total_num_reviews / 10) if total_num_reviews < MAX_REVIEWS_PAGES_PER_PRODUCT * 10 else MAX_REVIEWS_PAGES_PER_PRODUCT
    # go through the review pages and scrape the review data
    for page_number in range(1, number_of_review_pages_to_scrape + 1):
        pageUrl = new_url + '&pageNumber=' + str(page_number)
        print("Scraping page number ", page_number, "... ", pageUrl)
        soup3 = scrape_rendered_page(pageUrl)
        review_list = soup3.find(id="cm_cr-review_list")
        review_cards = review_list.find_all("div", {'data-hook': 'review'})
        reviews = scrape_reviews(review_cards, reviews)
    return reviews
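
# Scrape every product URL, then print the collected reviews and write them to
# data/out.xlsx, even if scraping is interrupted partway through.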
def main():
    urls = get_urls()
    reviews = []
    try:
        for url in urls:
            print("url", url[0])
            reviews = scrape_product(url[0], reviews)
    finally:
        print("Scraping Finished, converting data into cool tabular view...")
        df = pd.DataFrame(reviews)
        print(tabulate(df, headers='keys', tablefmt='github'))
        os.makedirs('data', exist_ok=True)
        df.to_excel('data/out.xlsx', index=False)
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    main()