# coding:utf-8
"""
file: crawl_Occident_girls.py
user: 五根弦的吉他
time: 2019-2-2
function: crawl adult images
"""
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import os
import random, time, requests, re, bs4
from datetime import datetime
from io import BytesIO
from PIL import Image
class Crawl(object):
    def __init__(self):
        self.server_main = 'http://你以为我会告诉你网址吗/'  # placeholder URL ("you think I'd tell you the site?")
        #self.server_type = ''
        self.num = 23  # number of index pages to crawl (user-configurable)
        self.EveryList = []
        self.PicNum = 0
        self.agent = UserAgent()
        self.headers = {'User-Agent': self.agent.random}
        os.makedirs('pic', exist_ok=True)  # make sure the output directory exists
        # Proxy pool -- very crude!!
        self.proxies = [{"http": "http://35.193.0.225:80"},
                        {"http": "http://142.93.251.113:8080"},
                        {"http": "http://34.201.67.252:80"},
                        {"http": "http://157.230.178.46:8080"},
                        {"http": "http://142.93.177.182:8080"},
                        {"http": "http://104.248.51.135:8080"},
                        {"http": "http://68.183.20.164:8080"}
                        ]
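
    # Hedged sketch (an addition, not in the original script): free proxies like
    # the ones above go stale quickly, so a quick liveness probe could prune dead
    # entries before crawling. The method name and test URL are illustrative
    # assumptions.
    def proxy_alive(self, proxy, test_url='http://httpbin.org/ip'):
        # Return True if the proxy answers a simple GET within a short timeout.
        try:
            requests.get(test_url, proxies=proxy, timeout=5).close()
            return True
        except requests.RequestException:
            return False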
    def alwaysGet(self, url):
        # Fetch a page with up to 10 attempts, picking a random proxy each time.
        for i in range(10):
            proxy = random.choice(self.proxies)
            print('selected proxy:', proxy)
            try:
                print('now 1')
                res = requests.get(url=url, headers=self.headers, proxies=proxy, timeout=20)
                res.close()
                time.sleep(random.uniform(1, 2.5))
            except requests.RequestException:
                if i == 3:
                    # drop a proxy that keeps failing
                    self.proxies.remove(proxy)
                if i == 9:
                    return None
                print('have a rest...')
                time.sleep(random.uniform(4.0, 5.5))
                continue
            else:
                print('now 2')
                res.encoding = 'gbk'
                self.bf = BeautifulSoup(res.text, features='lxml')
                if self.bf is None:
                    print('None again!')
                    time.sleep(random.randint(1, 2))
                    continue
                return self.bf
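
    # Hedged sketch (not in the original script): requests' built-in retry
    # machinery could replace the hand-rolled loop in alwaysGet. The library
    # calls below are standard requests/urllib3 API; the method itself is an
    # illustrative assumption.
    def make_session(self):
        from requests.adapters import HTTPAdapter
        from urllib3.util.retry import Retry
        session = requests.Session()
        retry = Retry(total=5, backoff_factor=1.0,
                      status_forcelist=[500, 502, 503, 504])
        session.mount('http://', HTTPAdapter(max_retries=retry))
        session.headers.update(self.headers)
        return session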
    def GetType(self):
        bf = self.alwaysGet(self.server_main)
        if bf is None:
            return None
        div = bf.find_all('div', class_='i_tit')
        print('div:\n', div)
        bf_a = BeautifulSoup(str(div), features='lxml')
        a = bf_a.find_all('a')
        print('a:\n', a, '\nlength:', len(a))
        # Delete the even-indexed items, i.e. keep only the odd-indexed anchors.
        for i in range(len(a)):
            if i % 2 == 0:
                del a[0]
            else:
                a.append(a[0])
                del a[0]
        print("final a:\n", a)
        return a
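
    # Note (an observation, not in the original): the rotation loop above is
    # equivalent to one slice:
    #   a = a[1::2]  # keep the odd-indexed anchors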
    def EnterType(self):
        self.FirstPage = self.server_main + self.GetType()[2].get('href')  # keep these two lines
        self.EveryList.append(self.FirstPage)
    def GetNextPage(self):
        # Follow a picture set's "next page" links until none are left.
        bf6 = self.alwaysGet(url=self.EveryDetailPics[-1])
        time.sleep(random.uniform(1, 2))
        if bf6 is None:
            print('NoneType again!! Keep going!')
            time.sleep(random.randint(3, 5))
            self.GetNextPage()
        else:
            Flag = False
            div = bf6.find_all('div', class_='page page_c')
            bf_a = BeautifulSoup(str(div), features='lxml')
            for each in bf_a.find_all('a'):
                if each.string == '下一页':  # the site's "next page" link text
                    self.EveryDetailPics.append(self.FirstPage + each.get('href'))
                    Flag = True
                    break  # stop at the first match so later anchors cannot reset the flag
            if Flag:
                self.GetNextPage()
            else:
                return self.EveryDetailPics
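
    # Hedged sketch (not in the original script): long picture sets could hit
    # Python's recursion limit in GetNextPage; an iterative variant avoids that.
    # The method name is an assumption for illustration.
    def GetNextPageIter(self):
        while True:
            bf = self.alwaysGet(url=self.EveryDetailPics[-1])
            if bf is None:
                time.sleep(random.randint(3, 5))
                continue
            div = bf.find_all('div', class_='page page_c')
            bf_a = BeautifulSoup(str(div), features='lxml')
            link = None
            for each in bf_a.find_all('a'):
                if each.string == '下一页':
                    link = self.FirstPage + each.get('href')
                    break
            if link is None:
                return self.EveryDetailPics
            self.EveryDetailPics.append(link)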
    def GetMainPics(self):
        # For every collected index page: gather the picture-set links, expand
        # each set into its detail pages, then download every image found.
        for i in range(len(self.EveryList)):
            time.sleep(random.randint(5, 9))
            print('start crawling index page %s' % str(i + 1))
            self.EveryMainPics = []
            self.EveryDetailPics = []
            bf5 = self.alwaysGet(self.EveryList[i])
            if bf5 is None:
                continue
            time.sleep(1)
            print('get through 1')
            if isinstance(bf5, bs4.BeautifulSoup):
                ul = bf5.find_all('ul', class_='i_pic')
                a = BeautifulSoup(str(ul), features='lxml')
                print('a.find_all(a):', a.find_all('a'))
                for each in a.find_all('a'):
                    self.EveryMainPics.append(self.server_main + each.get('href'))
            for j in range(len(self.EveryMainPics)):  # j, so the outer index i is not shadowed
                self.EveryDetailPics.append(self.EveryMainPics[j])
                bf2 = self.alwaysGet(self.EveryMainPics[j])
                if bf2 is None:
                    continue
                time.sleep(random.uniform(1.0, 3.0))
                div2 = bf2.find_all('div', class_='page page_c')
                bf2_a = BeautifulSoup(str(div2), features='lxml')
                for each2 in bf2_a.find_all('a'):
                    if each2.string == '下一页':  # the site's "next page" link text
                        self.EveryDetailPics.append(self.FirstPage + each2.get('href'))
                self.GetNextPage()
            for eachLink in self.EveryDetailPics:
                bf3 = self.alwaysGet(eachLink)
                if bf3 is None:
                    continue
                time.sleep(random.uniform(1.5, 3.5))
                try:
                    bf4 = BeautifulSoup(str(bf3.find_all('a', href="javascript:dPlayNext();")[0]), features='lxml')
                except IndexError:
                    print('Error')
                    time.sleep(random.randint(1, 2))
                    continue
                jpg_link = bf4.find_all('img', src=re.compile(r'\.jpg$'))[0]  # dot escaped so only ".jpg" matches
                JPGLINK = self.server_main + jpg_link.get('src')
                try:
                    response = requests.get(url=JPGLINK, headers=self.headers,
                                            proxies=random.choice(self.proxies), timeout=20)
                    response.close()
                    time.sleep(random.randint(1, 3))
                except requests.RequestException:
                    print("error fetching picture %s" % str(self.PicNum + 1))
                    time.sleep(random.uniform(4.5, 7.5))
                    self.PicNum += 1
                    continue
                try:
                    image = Image.open(BytesIO(response.content))
                    image.save('pic/%s.png' % str(self.PicNum + 1))
                except (IOError, OSError):
                    # Pillow could not decode the payload; write the raw bytes instead.
                    path = 'pic/%s.png' % str(self.PicNum + 1)
                    with open(path, 'wb') as f:
                        f.write(response.content)
                self.PicNum += 1
                print('saved picture %s' % str(self.PicNum))
                time.sleep(random.randint(1, 3))
    def GetMainPage(self):
        # Follow the index's "next page" link until self.num pages are collected.
        bf7 = self.alwaysGet(self.EveryList[-1])
        if bf7 is None:
            return None
        time.sleep(random.uniform(1.0, 3.5))
        div = bf7.find_all('div', class_='page page_l')
        bf_a = BeautifulSoup(str(div), features='lxml')
        for each in bf_a.find_all('a'):
            if each.string == '下一页':  # the site's "next page" link text
                NextPageLink = self.FirstPage + each.get('href')
                self.EveryList.append(NextPageLink)
                print("length of list:", len(self.EveryList))
                time.sleep(random.randint(1, 3))
                if len(self.EveryList) == self.num:  # user-configurable page count
                    print("self.EveryList:", self.EveryList)
                    return self.EveryList
        print('get done')
        self.GetMainPage()
        #return self.EveryList
if __name__ == '__main__':
    start = datetime.now()
    getdiv = Crawl()
    #getdiv.GetType()
    getdiv.EnterType()
    getdiv.GetMainPage()
    getdiv.GetMainPics()
    #getdiv.GetNextPage()
    end = datetime.now()
    print("Done!\nspent time:", end - start)