-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.js
68 lines (55 loc) · 2.45 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
/**
 * Entry point: launches a visible (non-headless) Chromium window, scrapes the
 * listing index, then visits every listing to collect its details.
 *
 * @returns {Promise<void>} Resolves once scraping has finished and the browser
 *   has been closed.
 */
async function main() {
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    const listings = await scrapListing(page);
    // Enriches (and prints) the listing objects in place.
    await scrapeJobDescriptions(page, listings);
  } finally {
    // Always shut Chromium down, even when a navigation or parse step throws;
    // otherwise the browser window stays open and the Node process never exits.
    await browser.close();
  }
}
/**
 * Visits each listing's page in turn and adds `jobDescription` and
 * `compensation` to the listing object (mutating it in place), printing every
 * listing as soon as it has been enriched.
 *
 * @param {Object} page - Puppeteer page; the same page is reused for every listing.
 * @param {Array<Object>} listings - Listing records, each carrying a `url` to visit.
 * @returns {Promise<Array<Object>>} The same `listings` array after enrichment.
 */
async function scrapeJobDescriptions(page, listings) {
  for (const listing of listings) {
    // Navigate the shared page to the listing and parse its rendered HTML.
    await page.goto(listing.url);
    const html = await page.content();
    const $ = cheerio.load(html);

    // The posting body text includes a "QR Code Link to This Post" label; strip it.
    listing.jobDescription = $("#postingbody")
      .text()
      .replace("QR Code Link to This Post", "")
      .trim();

    // Compensation is read from the bold text of the first attribute-group span.
    listing.compensation = $(".attrgroup > span:nth-child(1) > b").text();

    printInfo(listing);

    // Wait one second before the next page load (presumably to throttle requests).
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }
  // Return the enriched records so callers can use the result of the call.
  return listings;
}
/**
 * Opens the Craigslist SF Bay software-jobs search page and builds one record
 * per ".result-info" row found in its HTML.
 *
 * @param {Object} page - Puppeteer page used to load the search results.
 * @returns {Promise<Array<{title: string, url: string, datePosted: Date, neighborhood: string}>>}
 *   One record per result row, in page order.
 */
async function scrapListing(page) {
  await page.goto("https://sfbay.craigslist.org/d/software-qa-dba-etc/search/sof");

  // Hand the rendered HTML to cheerio for querying.
  const $ = cheerio.load(await page.content());

  // Converts a single result row element into a plain listing record.
  const toListing = (_position, row) => {
    const $row = $(row);
    const $title = $row.find(".result-title");
    const postedAt = $row.find(".result-date").attr("datetime");
    // Neighborhood text is wrapped in parentheses on the page; remove them.
    const hood = $row.find(".result-hood").text().trim().replace(/[()]/g,'');

    return {
      title: $title.text(),
      url: $title.attr("href"),
      datePosted: new Date(postedAt),
      neighborhood: hood,
    };
  };

  // .get() unwraps the cheerio collection into a plain array.
  return $(".result-info").map(toListing).get();
}
/**
 * Logs every own enumerable property of a listing as "KEY: value" (key
 * upper-cased), followed by a dashed separator line.
 *
 * @param {Object} listing - Record whose fields should be printed.
 * @returns {void}
 */
function printInfo(listing) {
  Object.entries(listing).forEach(([field, fieldValue]) => {
    const label = field.toUpperCase();
    console.log(label + ": " + fieldValue);
  });
  console.log("-------------------------------------------------------");
}
// Run the scraper; report any failure and mark the process as failed instead
// of leaving the returned promise unhandled.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});