main.py
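# Asynchronous crawler that fetches a page plus its images, scripts and
# stylesheets, and packs them into a Safari .webarchive property list.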
from asyncio import ensure_future, get_event_loop, wait_for, Queue
from cgi import parse_header
from urllib.parse import urljoin, urldefrag, unquote

from aiohttp import ClientSession
from biplist import writePlist
from cssutils import getUrls, parseString
from lxml import html

from config import log, TARGET_URL, ACCEPT_HEADERS, CONCURRENCY, TIMEOUT, OUTPUT_FILENAME
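

# Worker coroutine: pulls URLs off the shared queue, fetches them with the
# configured headers, stores each response in the archive, and queues any
# subresources referenced by HTML or CSS responses.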
async def crawler(client, url_queue, archive):
    while True:
        url = await url_queue.get()
        try:
            log.debug(url)
            # copy so the shared ACCEPT_HEADERS mapping is not mutated by each worker
            headers = dict(ACCEPT_HEADERS)
            headers['Referer'] = archive['top']
            response = await client.get(url, headers=headers)
            if response.status != 200:
                log.warning('BAD RESPONSE: {}: {}'.format(response.status, url))
            else:
                data = await response.read()
                content_type, params = parse_header(response.headers['content-type'])
                item = {
                    "WebResourceData": data,
                    "WebResourceMIMEType": content_type,
                    "WebResourceURL": url
                }
                if 'charset' in params:
                    item['WebResourceTextEncodingName'] = params['charset']
                # TODO: attempt to reproduce the way HTTP headers are stored (NSKeyedArchiver?)
                archive['items'].append(item)
                archive['seen'][url] = True
                if 'text/html' == content_type:
                    dom = html.fromstring(data)
                    patterns = ['//img/@src', '//img/@data-src', '//img/@data-src-retina',
                                '//script/@src', "//link[@rel='stylesheet']/@href"]
                    for path in patterns:
                        for attr in dom.xpath(path):
                            log.debug("{}: {} {}".format(path, url, attr))
                            # resolve against the page URL under a new name so `url`
                            # keeps pointing at the page for later joins
                            link = unquote(urljoin(url, urldefrag(attr)[0]))
                            if link not in archive['seen']:
                                archive['seen'][link] = True
                                await url_queue.put(link)
                elif 'text/css' == content_type:
                    # TODO: nested @import and better path inference
                    for attr in getUrls(parseString(data)):
                        log.debug(attr)
                        link = unquote(urljoin(url, urldefrag(attr)[0]))
                        if link not in archive['seen']:
                            archive['seen'][link] = True
                            await url_queue.put(link)
        except Exception as exc:
            log.warning('Exception {}:'.format(exc), exc_info=True)
        finally:
            url_queue.task_done()
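

# Fetch the target URL and everything it references using CONCURRENCY workers,
# then write the collected resources out as a .webarchive plist.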
async def scrape(client, url):
    tasks = []
    url_queue = Queue()
    archive = {
        'top': url,
        'seen': {},
        'items': []
    }
    # seed the queue with the page itself; workers add subresources as they find them
    await url_queue.put(url)

    def task_completed(future):
        exc = future.exception()
        if exc:
            log.error('Worker finished with error: {}'.format(exc), exc_info=True)

    for _ in range(CONCURRENCY):
        crawler_future = ensure_future(crawler(client, url_queue, archive))
        crawler_future.add_done_callback(task_completed)
        tasks.append(crawler_future)
    await wait_for(url_queue.join(), TIMEOUT)
    for task in tasks:
        task.cancel()
    # note: on aiohttp 3.x, ClientSession.close() is a coroutine and should be awaited
    client.close()
    # the first resource fetched is the page itself; everything else is a subresource
    webarchive = {
        'WebMainResource': archive['items'].pop(0),
        'WebSubresources': archive['items']
    }
    writePlist(webarchive, OUTPUT_FILENAME)
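

# entry point: share one HTTP session across all crawler workers and run the
# scrape to completion on the default event loop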
if __name__ == '__main__':
    client = ClientSession()
    loop = get_event_loop()
    loop.run_until_complete(scrape(client, TARGET_URL))
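
main.py pulls its settings from a separate config module that is not shown on this page. Purely as an illustrative sketch of the names that import expects, not the project's actual configuration, a minimal config.py could look like this (every value below is a placeholder):

# config.py (hypothetical example; the real values live in the repo's own config module)
import logging

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger('webarchive')

TARGET_URL = 'https://example.com/'  # page to archive (placeholder)
ACCEPT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; webarchive-crawler)',
    'Accept': '*/*',
}
CONCURRENCY = 5  # number of concurrent crawler workers
TIMEOUT = 30  # seconds to wait for the URL queue to drain
OUTPUT_FILENAME = 'output.webarchive'  # path for the generated plist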