#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import time

from bs4 import BeautifulSoup

from tools import get_text
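
# `get_text` is imported from the local tools module, which is not shown in
# this file. A minimal sketch of what it presumably does, assuming it fetches
# a page with a random User-Agent via requests and fake_useragent
# (hypothetical; the real helper may retry or handle encodings differently):
#
#     from fake_useragent import UserAgent
#     import requests
#
#     def get_text(url):
#         headers = {'User-Agent': UserAgent().random}  # assumed UA rotation
#         resp = requests.get(url, headers=headers, timeout=10)
#         resp.encoding = resp.apparent_encoding
#         return resp.text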


class CrawlMetaclass(type):
    """Collect the names of all crawl_* methods into `functions_name`."""

    def __new__(mcs, name, bases, attrs):
        # Register every attribute whose name starts with 'crawl_'.
        attrs['functions_name'] = [key for key in attrs if key.startswith('crawl_')]
        return type.__new__(mcs, name, bases, attrs)
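
# After class creation, Crawl.functions_name holds the registered method
# names, e.g. ['crawl_three', 'crawl_four', ...], so callers can discover
# every crawler source at runtime.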


class Crawl(metaclass=CrawlMetaclass):
    def get_proxies(self, name):
        """Run the crawler method called `name` and collect its proxies."""
        # Look up the method by name; getattr avoids eval for dynamic dispatch.
        return list(getattr(self, name)())

    # Sources one and two (iphai.com) are disabled; kept for reference.
    # def crawl_one(self):
    #     url_one = 'http://www.iphai.com/free/ng'
    #     text = get_text(url_one)
    #     pattern = re.compile(r'<td>\s*(.*?)\s*</td>.*?<td>\s*(.*?)\s*</td>.*?</tr>', re.S)
    #     result = re.findall(pattern, text)
    #     for ip, port in result:
    #         yield ip + ':' + port
    #
    # def crawl_two(self):
    #     url_two = 'http://www.iphai.com/free/wg'
    #     text = get_text(url_two)
    #     pattern = re.compile(r'<td>\s*(.*?)\s*</td>.*?<td>\s*(.*?)\s*</td>.*?</tr>', re.S)
    #     result = re.findall(pattern, text)
    #     for ip, port in result:
    #         yield ip + ':' + port

    def crawl_three(self):
        # xicidaili.com: the IP and port sit in adjacent <td> cells of a row.
        for i in range(1, 4):
            url_three = 'https://www.xicidaili.com/nn/{}'.format(i)
            text = get_text(url_three)
            pattern = re.compile(r'</td>\s*<td>(.*?)</td>\s*<td>(.*?)</td>.*?</tr>', re.S)
            result = re.findall(pattern, text)
            for ip, port in result:
                yield ip + ':' + port

    def crawl_four(self):
        # kuaidaili.com tags its cells with data-title attributes, so
        # BeautifulSoup is simpler than a regex here.
        for i in range(1, 4):
            url_four = 'https://www.kuaidaili.com/free/inha/{}'.format(i)
            text = get_text(url_four)
            soup = BeautifulSoup(text, 'lxml')
            ips = soup.find_all(attrs={'data-title': 'IP'})
            ports = soup.find_all(attrs={'data-title': 'PORT'})
            for ip, port in zip(ips, ports):
                yield ip.string + ':' + port.string
            time.sleep(1)  # throttle between pages

    def crawl_five(self):
        for i in range(3, 6):
            url_five = 'http://www.66ip.cn/{}.html'.format(i)
            text = get_text(url_five)
            pattern = re.compile(r'<tr><td>(.*?)</td><td>(.*?)</td>.*?</tr>', re.S)
            result = re.findall(pattern, text)
            for ip, port in result:
                if ip != 'ip':  # skip the table header row
                    yield ip + ':' + port

    def crawl_six(self):
        for i in range(1, 3):
            url_six = 'https://www.freeip.top/?page={}'.format(i)
            text = get_text(url_six)
            pattern = re.compile(r'<tr><td>(.*?)</td><td>(.*?)</td>')
            result = re.findall(pattern, text)
            for ip, port in result:
                yield ip + ':' + port

    def crawl_seven(self):
        for i in range(1, 3):
            url_seven = 'http://www.ip3366.net/free/?stype=1&page={}'.format(i)
            text = get_text(url_seven)
            pattern = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
            result = re.findall(pattern, text)
            for ip, port in result:
                yield ip + ':' + port

    def crawl_eight(self):
        # premproxy.com labels each cell with a data-label attribute; the
        # trailing space in 'IP:port ' matches the site's markup.
        for country in ['China-02', 'China-03', 'China-04', 'Taiwan-01']:
            url_eight = 'https://premproxy.com/proxy-by-country/{}.htm'.format(country)
            text = get_text(url_eight)
            soup = BeautifulSoup(text, 'lxml')
            result = soup.find_all(attrs={'data-label': 'IP:port '})
            for tag in result:
                yield tag.string


if __name__ == '__main__':
    c = Crawl()
    print(list(c.crawl_three()))
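    # To harvest from every registered source instead of a single one (sketch,
    # using the functions_name list that CrawlMetaclass builds):
    #
    #     for name in c.functions_name:
    #         print(name, c.get_proxies(name))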