# webscrap.py
# import libraries
import urllib.request
from bs4 import BeautifulSoup
# specify the url
# urlpage = 'https://groceries.asda.com/search/yogurt'
# print(urlpage)
# page = urllib.request.urlopen(urlpage)
# soup = BeautifulSoup(page, 'html.parser')
# results = soup.find_all('div', attrs={'class': 'co-product'})
# print('Number of results', len(results))
#With Selenium
import urllib.request
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import time
import pandas as pd
# specify the url
urlpage = 'https://groceries.asda.com/search/yogurt'
print(urlpage)

# Run Chrome without a visible window. The "--headless" argument alone is
# sufficient, so the redundant `headless = True` property assignment is dropped.
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(executable_path="./chromedriver.exe", options=chrome_options)

# Collected rows: one {"product": ..., "link": ...} dict per matched title.
data = []

# try/finally guarantees the browser process is shut down even when the page
# load, the element lookup, or the per-result extraction raises.
try:
    # get web page
    driver.get(urlpage)
    # execute script to scroll down the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    # sleep for 30s BEFORE locating elements, so the lookup below runs against
    # the page after it has had time to finish rendering (previously the sleep
    # happened after the lookup, which made the wait pointless).
    time.sleep(30)
    # at time of publication, Nov 2018:
    # results = driver.find_elements_by_xpath("//*[@id='componentsContainer']//*[contains(@id,'listingsContainer')]//*[@class='product active']//*[@class='title productTitle']")
    # updated Nov 2019:
    results = driver.find_elements_by_xpath("//*[@class=' co-product-list__main-cntr']//*[@class=' co-item ']//*[@class='co-product']//*[@class='co-item__title-container']//*[@class='co-product__title']")
    print('Number of results', len(results))

    # loop over results; text and href must be read while the driver is alive
    for result in results:
        product_name = result.text
        link = result.find_element_by_tag_name('a')
        product_link = link.get_attribute("href")
        # append dict to array
        data.append({"product": product_name, "link": product_link})
finally:
    driver.quit()

# `data` now holds plain strings, so the browser is no longer needed here.
df = pd.DataFrame(data)
print(df)
df.to_csv('asdaYogurtLink.csv')