"""
This is an example web scraper for indeed.com used in scrapfly blog article:
https://scrapfly.io/blog/how-to-scrape-indeedcom/
To run this scraper set env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export $SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import json
import math
import os
import re
import urllib.parse
from typing import Dict, List

from loguru import logger as log
from scrapfly import ScrapeApiResponse, ScrapeConfig, ScrapflyClient, ScrapflyScrapeError

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])

BASE_CONFIG = {
    # Indeed.com requires the Anti Scraping Protection bypass feature
    "asp": True,
    "country": "US",
}


def parse_search_page(result: ScrapeApiResponse) -> Dict:
    """Find hidden web data of search results in Indeed.com search page HTML"""
    data = re.findall(r'window\.mosaic\.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});', result.content)
    data = json.loads(data[0])
    return {
        "results": data["metaData"]["mosaicProviderJobCardsModel"]["results"],
        "meta": data["metaData"]["mosaicProviderJobCardsModel"]["tierSummaries"],
    }


def _add_url_parameter(url: str, **kwargs) -> str:
    """Add or replace GET parameters in a URL"""
    url_parts = list(urllib.parse.urlparse(url))
    query = dict(urllib.parse.parse_qsl(url_parts[4]))
    query.update(kwargs)
    url_parts[4] = urllib.parse.urlencode(query)
    return urllib.parse.urlunparse(url_parts)
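
# For example, _add_url_parameter("https://www.indeed.com/jobs?q=python", start=10)
# returns "https://www.indeed.com/jobs?q=python&start=10".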


async def scrape_search(url: str, max_results: int = 1000) -> List[Dict]:
    """Scrape Indeed.com search for job listing previews"""
    log.info(f"scraping search: {url}")
    result_first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
    data_first_page = parse_search_page(result_first_page)
    results = data_first_page["results"]
    total_results = sum(category["jobCount"] for category in data_first_page["meta"])
    # Indeed.com caps each search at 1000 results, so never request more than max_results
    if total_results > max_results:
        total_results = max_results
    log.info("found {} total search pages", math.ceil(total_results / 10))
    # each results page holds 10 listings; the first page is already scraped above
    other_pages = [
        ScrapeConfig(_add_url_parameter(url, start=offset), **BASE_CONFIG)
        for offset in range(10, total_results, 10)
    ]
    async for result in SCRAPFLY.concurrent_scrape(other_pages):
        if not isinstance(result, ScrapflyScrapeError):
            data = parse_search_page(result)
            results.extend(data["results"])
        else:
            log.error(f"failed to scrape {result.api_response.config['url']}, got: {result.message}")
    return results


def parse_job_page(result: ScrapeApiResponse) -> Dict:
    """Parse job data from a job listing page"""
    data = re.findall(r"_initialData=(\{.+?\});", result.content)
    data = json.loads(data[0])
    data = data["jobInfoWrapperModel"]["jobInfoModel"]
    return {
        "description": data["sanitizedJobDescription"],
        **data["jobMetadataHeaderModel"],
        **(data["jobTagModel"] or {}),
        **data["jobInfoHeaderModel"],
    }


async def scrape_jobs(job_keys: List[str]) -> List[Dict]:
    """Scrape job listing pages for the given job keys"""
    log.info(f"scraping {len(job_keys)} job listings")
    results = []
    urls = [
        f"https://www.indeed.com/viewjob?jk={job_key}"
        for job_key in job_keys
    ]
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    async for result in SCRAPFLY.concurrent_scrape(to_scrape):
        results.append(parse_job_page(result))
    return results
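

# Example usage (a minimal sketch, not part of the original scraper): run both
# scrapers end-to-end and print the scraped jobs as JSON. The search URL is a
# placeholder and the "jobkey" field name is assumed from the mosaic search results.
if __name__ == "__main__":
    import asyncio

    async def run():
        search_results = await scrape_search(
            "https://www.indeed.com/jobs?q=python&l=Texas", max_results=50
        )
        job_keys = [job["jobkey"] for job in search_results if "jobkey" in job]
        jobs = await scrape_jobs(job_keys[:5])
        print(json.dumps(jobs, indent=2))

    asyncio.run(run())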