import random

from botasaurus import *
from botasaurus.cache import DontCache
from urls import urls
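# `urls` is assumed here to be a sitemap-style list of dicts defined in a
# local urls.py, each entry holding a product URL under a "loc" key, e.g.
# (hypothetical values):
#
#     urls = [
#         {"loc": "https://www.dentalkart.com/example-product.html"},
#     ]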


def write_output(data, result):
    # Botasaurus output hook: write the collected results as JSON and CSV,
    # dropping the None entries produced by failed pages.
    bt.write_json(bt.remove_nones(result), 'products')
    bt.write_csv(bt.remove_nones(result), 'products')


def clean_url(url):
    # Derive a filesystem-safe name from the product URL (used for screenshots).
    return url.replace("https://www.dentalkart.com/", "").replace("/", '-')


# Run up to 16 browsers in parallel (capped by available resources), cache
# successful results, reuse drivers across tasks, and persist output via
# write_output.
@browser(parallel=lambda: min(16, bt.calc_max_parallel_browsers()), headless=True, cache=True, output=write_output, reuse_driver=True)
def scrape_products_task(driver: AntiDetectDriver, url):
    driver.get(url)
    save_name = clean_url(url)

    title_selector = 'h1.productInformation_heading__28NgY'
    price_selector = 'span.productInformation_discounted_price__nmQpW > span:nth-child(2)'

    title = driver.get_element_or_none_by_selector(title_selector, bt.Wait.VERY_LONG)
    # Only wait for the price if the title appeared; otherwise the page is
    # probably not a product page or failed to load.
    price = driver.get_element_or_none_by_selector(price_selector, bt.Wait.VERY_LONG) if title else None

    if price is None or title is None:
        # Capture a screenshot for debugging, and return DontCache so the
        # URL is retried on the next run instead of caching the failure.
        driver.save_screenshot(save_name)
        return DontCache(None)
    else:
        new_data = {"title": title.text, "price": price.text, "url": url}
        print(new_data)
        return new_data


if __name__ == "__main__":
    my_urls = [url['loc'] for url in urls]
    # Shuffle the URLs so the crawl order looks natural rather than automated.
    random.shuffle(my_urls)
    # Kick off the scraping task across all product URLs.
    scrape_products_task(my_urls)
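# A minimal way to smoke-test the pipeline before a full run would be to
# pass a small slice of URLs instead (inside the __main__ block above), e.g.:
#
#     scrape_products_task(my_urls[:5])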