-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
98 lines (78 loc) · 3.12 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pycurl
import re
################
# The following code was originally based on pycurl's examples/quickstart/response_headers.py
try: # Flipped the try and except clauses to silence a persistent inspection warning
from StringIO import StringIO as BytesIO
except ImportError:
from io import BytesIO
def setupResp():
    """Build and return a pycurl handle preconfigured for scraping.

    The handle sends a desktop Chrome user agent, follows redirects, and
    verifies both the peer certificate and the host name. No URL or CA
    bundle is set here; scrapingTool.getBody() and
    scrapingTool.setCertFile() set those on the handle later.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/60.0.3112.113 Safari/537.36"
    )
    # Applied in order, one setopt() call per (option, value) pair.
    options = (
        (pycurl.USERAGENT, user_agent),
        (pycurl.FOLLOWLOCATION, 1),
        (pycurl.SSL_VERIFYPEER, 1),
        (pycurl.SSL_VERIFYHOST, 2),
    )
    handle = pycurl.Curl()
    for option, value in options:
        handle.setopt(option, value)
    return handle
class scrapingTool(object):
    """Wrapper around one reusable pycurl handle that fetches decoded page bodies.

    Attributes (both set in ``__init__``):
        headers: dict mapping lowercased header names to their stripped
            values; filled by ``header_function`` while a transfer runs.
        r: the pycurl handle returned by ``setupResp()``.
    """

    # Encoding assumed when the response declares none, or declares one
    # that Python does not know.
    DEFAULT_ENCODING = 'iso-8859-1'

    # Captures the charset label from a (lowercased) Content-Type value.
    # Stops at whitespace, ';' and quotes so that values such as
    # 'charset="utf-8"' or 'charset=utf-8; boundary=x' yield a clean label.
    _CHARSET_RE = re.compile(r"""charset=["']?([^\s;"']+)""")

    def __init__(self):
        self.headers = {}
        self.r = setupResp()

    def setCertFile(self, certFile):
        """Use *certFile* as the CA bundle (pycurl.CAINFO) on the handle."""
        self.r.setopt(pycurl.CAINFO, certFile)

    def close(self):
        """Close the underlying pycurl handle."""
        self.r.close()

    def header_function(self, header_line):
        """Record one raw response header line in ``self.headers``.

        Installed as the pycurl HEADERFUNCTION callback by ``getBody``.

        Args:
            header_line: one header line as bytes, including its trailing
                newline.

        Lines without a colon (the status line, the blank separator line)
        are ignored. A repeated header name overwrites the earlier value,
        and headers folded over several lines are not reassembled.
        """
        # HTTP standard specifies that headers are encoded in iso-8859-1.
        # On Python 3, decoding step is required.
        text = header_line.decode('iso-8859-1')
        if ':' not in text:
            return
        name, value = text.split(':', 1)
        # Header names are case insensitive, so store them lowercased;
        # strip the trailing newline and any whitespace around the colon.
        self.headers[name.strip().lower()] = value.strip()

    def _response_encoding(self):
        """Return the charset named by the recorded Content-Type header.

        Falls back to ``DEFAULT_ENCODING`` when the header is absent or
        carries no charset parameter.
        """
        content_type = self.headers.get('content-type', '').lower()
        match = self._CHARSET_RE.search(content_type)
        return match.group(1) if match else self.DEFAULT_ENCODING

    def getBody(self, url):
        """Fetch *url* and return the response body decoded to text.

        Args:
            url: the address to fetch; must be non-empty.

        Returns:
            The body decoded with the charset from the Content-Type
            header, or with iso-8859-1 when no usable charset is declared.

        Raises:
            ValueError: if *url* is None or empty.
            pycurl.error: if the transfer itself fails.
            UnicodeDecodeError: if the body is not valid in the chosen
                encoding.
        """
        if not url:
            # Should never occur, but just to be safe.
            raise ValueError("No url to scrape.")

        handle = self.r
        handle.setopt(pycurl.URL, url)
        body_buffer = BytesIO()
        handle.setopt(pycurl.WRITEFUNCTION, body_buffer.write)
        handle.setopt(pycurl.HEADERFUNCTION, self.header_function)

        # The handle and the headers dict are reused across calls; clear in
        # place so a Content-Type left over from an earlier response cannot
        # pick the encoding for this one.
        self.headers.clear()
        handle.perform()

        raw_body = body_buffer.getvalue()
        try:
            return raw_body.decode(self._response_encoding())
        except LookupError:
            # The server named a charset Python has no codec for.
            return raw_body.decode(self.DEFAULT_ENCODING)
# Shared module-level instance, created when this module is imported;
# getScraper() returns this same object to every caller.
scraper = scrapingTool()
def getScraper():
    """Return the shared module-level scrapingTool instance, ``scraper``.

    Every call returns the same object, giving callers global access to
    one scraper rather than a new one per call.
    """
    return scraper