# -*- coding: utf-8 -*-
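"""spider.py -- a small recursive website mirror.

Fetches a start page, extracts the resources it references
(<a>, <link>, <script>, <img>, <frame>), and saves them under a
local directory, retrying with a growing sleep time on network
errors. Written for Python 2 (urllib2, urlparse, print statements).
"""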
import urllib2
import urllib
import os
import errno
import urlparse
from bs4 import BeautifulSoup
import socket
import time
import re
class Spider:
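    """Recursive downloader for a site and the resources its pages reference."""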
    def __init__(self):
        self.base_url = ''
        self.save_path = 'downloads/'
        timeout = 25
        # Set the timeout once at the socket layer; every later socket user
        # (urllib/urllib2 included) inherits it, so nothing else needs to set it.
        socket.setdefaulttimeout(timeout)
        self.downloadSuccess = False
        self.index_page = 'index.html'
    # TODO: merge the two methods below
    def get_content(self, url):
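        """Fetch `url` and return its body decoded as UTF-8."""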
        response = urllib2.urlopen(url)
        content = response.read().decode('utf-8')
        response.close()
        return content
    def getContent(self, url, save_path):
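        """Save the page at `url` as the local index page and return its HTML."""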
        self.base_url = url
        # TODO: download directly when the base path is itself the index page
        if not os.path.exists(save_path):
            self.mkdir_p(save_path)
        urllib.urlretrieve(url, save_path + self.index_page, self.download_callbackfunc)
        return self.get_content(url)
    def get_img_in_css(self, url):
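        """Return image URLs referenced via url("...") in the stylesheet at `url`."""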
        request = urllib2.Request(url)
        html = urllib2.urlopen(request).read().decode('utf-8')
        # background: url("../images/arrow.png") no-repeat scroll right 14px top 18px rgba(0, 0, 0, 0);
        #re_string = r'background.*url\("(.*)"\)'
        re_string = r'url\("(.*?)"\)'  # non-greedy, so several url() on one line stay separate
        match = re.compile(re_string)
        img_links = re.findall(match, html)
        img_result = []
        for img in img_links:
            # TODO: only one level of '../' is handled; resolve arbitrarily nested relative paths
            if img.count('../') == 1:
                img_result.append(url[:url.rfind('/')+1] + img.replace('../', ''))
                #print img, img.count('../'), url.rfind('/'), url[:url.rfind('/')]
        return img_result
    def getUrls(self, content):
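        """Collect the href/src attributes of resource-bearing tags in `content`."""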
        soup = BeautifulSoup(content)
        result = []
        # (tag, attribute) pairs that point at downloadable resources
        for tag, attr in [('a', 'href'), ('link', 'href'),
                          ('script', 'src'), ('img', 'src'), ('frame', 'src')]:
            for link in soup.findAll(tag):
                try:
                    temp = link[attr]
                    if tag == 'a' and temp.startswith('#'):
                        continue  # skip in-page anchors
                    result.append(temp)
                except KeyError:
                    pass  # the tag lacks this attribute
        return result
    def getdirc(self, url):
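        """Return everything before the last '/' of a URL or path."""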
        idx = url.rfind('/')
        return url[:idx]
    def download_callbackfunc(self, blocknum, blocksize, totalsize):
        '''Progress callback for urllib.urlretrieve.
        @blocknum:  number of blocks transferred so far
        @blocksize: size of one block in bytes
        @totalsize: size of the remote file in bytes
        '''
        if totalsize <= 0:
            return  # some servers send no Content-Length
        percent = 100.0 * blocknum * blocksize / totalsize
        if percent >= 100:
            percent = 100
            self.downloadSuccess = True
        print "%.2f%%" % percent
    # Download one resource locally (recursing into HTML sub-pages)
    def download(self, url, save_path):
        # TODO: handle the case where the base path is itself the index page
        if len(save_path) == 0:
            save_path = self.save_path
        if not os.path.exists(save_path):
            self.mkdir_p(save_path)
        if self.base_url.endswith('index.html'):
            self.base_url = self.base_url.replace('index.html', '')
        absolute_url = urlparse.urljoin(self.base_url, url)
        dirs = self.getdirc(save_path + url)
        if not os.path.exists(dirs):
            self.mkdir_p(dirs)
            print 'create directory:' + dirs
        if not os.path.exists(save_path + url):
            sleep_time = 1
            self.downloadSuccess = False  # reset, or an earlier success would end the retry loop early
            while True:
                try:
                    time.sleep(sleep_time)
                    print 'downloading file:' + absolute_url
                    urllib.urlretrieve(absolute_url, save_path + url, self.download_callbackfunc)
                    # TODO: recurse into sub-pages; only HTML can be parsed for further resources
                    if url.endswith('.html') or url.endswith('.htm'):
                        content = self.get_content(absolute_url)
                        for sub_url in set(self.getUrls(content)):
                            if sub_url:
                                self.download(sub_url, save_path)
                    # exit only once the download actually succeeded
                    if self.downloadSuccess:
                        break
                # e.g. error 10054 (connection reset), IOError, and the like
                except Exception:
                    # Sleep five seconds longer and run the download again; existing
                    # files are skipped, so the retry picks up where the exception
                    # occurred and an unstable connection cannot abort the crawl.
                    sleep_time += 5
                    print 'enlarge sleep time: %d' % sleep_time
    # Create a directory tree recursively, ignoring "already exists"
    def mkdir_p(self, path):
        try:
            os.makedirs(path)
        except OSError as exc:  # Python >2.5 (use `except OSError, exc:` on Python <2.5)
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
    def isAbsolute(self, link):
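        """Return True if `link` is an absolute http(s) URL."""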
        for item in ['http://', 'https://']:
            if link.startswith(item):
                return True
        return False
if __name__ == "__main__":
    spider = Spider()
    #base_url = 'http://infinite-woodland-5276.herokuapp.com/index.html'
    base_url = 'http://feathersui.com/documentation/'
    save_path = "test/"
    content = spider.getContent(base_url, save_path)
    for href in set(spider.getUrls(content)):
        if 'googleapis' not in href:
            spider.download(href, save_path)
            print href
    # Separately download the background images referenced from a stylesheet
    '''imgs = spider.get_img_in_css('http://infinite-woodland-5276.herokuapp.com/assets/stylesheets/themes.min.css')
    for img in imgs:
        temp = img.replace(base_url, '')
        dirs = save_path + temp[:temp.rfind('/')]
        if not os.path.exists(dirs):
            spider.mkdir_p(dirs)
        urllib.urlretrieve(img, save_path + temp)
        print 'downloading img: ', img'''