-
Notifications
You must be signed in to change notification settings - Fork 4
/
example_compare_crawlers.py
68 lines (50 loc) · 2.05 KB
/
example_compare_crawlers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""
With this script you can compare the crawler solutions.
The timings should be taken with a grain of salt, since every crawler is invoked through an OS subprocess.
All crawlers are invoked that way so that their results are more comparable.
On my setup the timings were around:
requests: 2.9 [s]
beautiful soup: 4.1 [s]
playwright: 10.42 [s]
selenium: not installed / missing
selenium undetected: 12.62 [s]
# TODO check if status code is valid for all
"""
import time
import subprocess
__version__ = "0.0.1"
# URL handed to every crawler script through its --url option.
# Change it to check whether other pages can be scraped by the different scrapers.
test_webpage = "https://google.com"
def call_process(input_script):
    """Run one crawler script through ``poetry`` and time it.

    The script is started as ``poetry run python <input_script> --url
    <test_webpage> --output-file out.txt --timeout 20`` and the subprocess
    itself is limited to 20 seconds.

    Args:
        input_script: Path of the crawler script to execute.

    Returns:
        Elapsed wall-clock time in seconds on success, or the sentinel
        ``100000000`` when the command cannot be started, exits with a
        non-zero status, or exceeds the timeout.
    """
    import sys  # local import: only needed for the failure diagnostic

    # Pass argv as a list: a single command string without shell=True is
    # treated as the program name on POSIX, so the launch would always fail.
    command = [
        "poetry", "run", "python", str(input_script),
        "--url", test_webpage,
        "--output-file", "out.txt",
        "--timeout", "20",
    ]

    # perf_counter is monotonic, so the measurement is not affected by
    # system clock adjustments during the run.
    start_time = time.perf_counter()
    try:
        subprocess.check_call(command, timeout=20)
    except (subprocess.SubprocessError, OSError) as e:
        # Covers non-zero exit, timeout, and a missing executable. Report
        # why, but keep the sentinel so the remaining crawlers still run.
        print(f"{input_script} failed: {e!r}", file=sys.stderr)
        return 100000000
    return time.perf_counter() - start_time
def call_requests():
    """Time the ``crawlerrequests.py`` script and return call_process's result."""
    script_name = "crawlerrequests.py"
    elapsed = call_process(script_name)
    return elapsed
def call_crawleebeautiful():
    """Time the ``crawleebeautifulsoup.py`` script and return call_process's result."""
    script_name = "crawleebeautifulsoup.py"
    elapsed = call_process(script_name)
    return elapsed
def call_crawleeplaywright():
    """Time the ``crawleeplaywright.py`` script and return call_process's result."""
    script_name = "crawleeplaywright.py"
    elapsed = call_process(script_name)
    return elapsed
def call_seleniumchromeheadless():
    """Time the ``crawlerseleniumheadless.py`` script and return call_process's result."""
    script_name = "crawlerseleniumheadless.py"
    elapsed = call_process(script_name)
    return elapsed
def call_seleniumchromeundetected():
    """Time the ``crawlerseleniumundetected.py`` script and return call_process's result."""
    script_name = "crawlerseleniumundetected.py"
    elapsed = call_process(script_name)
    return elapsed
def call_seleniumbase():
    """Time the ``crawlerseleniumbase.py`` script and return call_process's result."""
    script_name = "crawlerseleniumbase.py"
    elapsed = call_process(script_name)
    return elapsed
def main():
    """Run every crawler benchmark, then print one timing line per crawler."""
    # Output label paired with the function that times that crawler,
    # in the order they are run and reported.
    benchmarks = (
        ("Requests", call_requests),
        ("crawleebeautifulsoup", call_crawleebeautiful),
        ("crawleeplaywright", call_crawleeplaywright),
        ("seleniumchromeheadless", call_seleniumchromeheadless),
        ("seleniumchromeundetected", call_seleniumchromeundetected),
        ("seleniumbase", call_seleniumbase),
    )

    # All crawlers are run before anything is printed.
    results = [(label, run()) for label, run in benchmarks]

    for label, elapsed in results:
        print(f"{label}:{elapsed} [s]")
# Run the comparison only when this file is executed as a script, so that
# importing the module does not start the subprocess benchmarks.
if __name__ == "__main__":
    main()