// listcrawler.js
var request = require("request");
var fs = require("fs");
var Crawler = require("simplecrawler");
var exportUrl = "http://en.wikipedia.org/w/index.php?title=Special:Export";
var domain = "en.wikipedia.org";
var basePath = "/wiki/";
/*** ADJUST THIS ARRAY TO CHOOSE WHICH PAGES TO CRAWL ***/
// entries must match Wikipedia page names (the part of the URL after /wiki/)
var sitesToCrawl = [ "Lists_of_writers" ];
var dependentPages = [];
var pageCrawler = new Crawler(domain);
// add page entries to crawler queue
sitesToCrawl.forEach(function(name) {
    pageCrawler.queue.add(
        pageCrawler.initialProtocol,
        domain,
        pageCrawler.initialPort,
        basePath + name,
        1
    );
});
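// e.g. with simplecrawler's default protocol and port this queues http://en.wikipedia.org/wiki/Lists_of_writers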
// configure crawler
pageCrawler.parseHTMLComments = false;
pageCrawler.parseScriptTags = false;
pageCrawler.downloadUnsupported = false;
pageCrawler.interval = 5; // delay between spawning requests, in ms
pageCrawler.maxConcurrency = 50; // maximum number of simultaneous requests
pageCrawler.domainWhitelist = [ "en.wikipedia.org" ];
pageCrawler.maxDepth = 2; // only follow links up to two levels deep from the start pages
// only follow internal wiki links that point to "List(s) of ..." pages
var articleCondition = pageCrawler.addFetchCondition(function(parsedURL) {
    return parsedURL.uriPath.match(/^\/wiki\/list(s)?_of_/i);
});
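// e.g. "/wiki/List_of_poets" and "/wiki/Lists_of_writers" pass this condition, "/wiki/Poetry" does not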
// remove base path from uri
function getNameFromUriPath(uriPath) {
    return uriPath.replace(basePath, "");
}
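// e.g. "/wiki/List_of_poets" -> "List_of_poets"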
// build the query string for the wiki exporter from the list of page names
function buildExportQuery(pages) {
    function buildPagesQuery() {
        var pageDelimiter = "%0A"; // URL-encoded newline; Special:Export expects one page title per line
        return pages.join(pageDelimiter);
    }
    var queryOptions = {
        action: "submit",
        curonly: true,
        pages: buildPagesQuery()
    };
    return "&action=" + queryOptions.action + "&pages=" + queryOptions.pages + "&curonly=" + queryOptions.curonly + "&limit=1";
}
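// e.g. pages ["List_of_poets", "List_of_novelists"] yields
// "&action=submit&pages=List_of_poets%0AList_of_novelists&curonly=true&limit=1"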
// send the collected page names to the wiki exporter and store the result in dump.xml
function exportDependentPages(pages, callback) {
    var url = exportUrl + buildExportQuery(pages);
    var options = {
        url: url,
        // let request negotiate and transparently decompress gzip/deflate,
        // so dump.xml ends up as plain XML rather than a compressed stream
        gzip: true
    };
    var ws = fs.createWriteStream("dump.xml");
    ws.on("finish", callback);
    request
        .post(options)
        .on("error", function(err) {
            console.error(err);
        })
        .pipe(ws);
}
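// Special:Export responds with MediaWiki's XML export format (one <page> element per requested title),
// so dump.xml can afterwards be processed with any XML parser.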
// start crawler
console.log("START CRAWLING, BE PATIENT.");
pageCrawler
    .on("fetchcomplete", function(queueItem) {
        console.log(queueItem.url, getNameFromUriPath(queueItem.path), pageCrawler.queue.length);
        dependentPages.push(getNameFromUriPath(queueItem.path));
    })
.on("fetcherror", function() {
throw new Error("fetcherror with", arguments);
})
.on("complete", function() {
console.log("COMPLETED CRAWLING, START DOWNLOAD, BE PATIENT.");
exportDependentPages(
dependentPages,
function() { console.log("FINISHED DOWNLOADING. Look at dump.xml!"); }
);
});
pageCrawler.start();
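// Usage (assuming Node.js and the two dependencies are available):
//   npm install request simplecrawler
//   node listcrawler.js
// The crawl writes the exported articles to dump.xml in the working directory.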