test.py
"""Crawl Baidu News search results for 163.com articles and append the
extracted article text to link.txt."""
import re
import time
import requests
import tldextract
from bs4 import BeautifulSoup

def get_text_from_163(html_doc):
    # Walk the known container classes of a desktop 163.com article page.
    # Returns None instead of raising when the layout does not match.
    soup = BeautifulSoup(html_doc, 'html.parser')
    try:
        s = soup.body.find(
            "div", ['wrapper clearfix', 'container post_auto clearfix',
                    'a_adtemp a_topad js-topad', 'content area']).find(
            "div", ['post_main', 'article-content', 'article-box l']).find(
            "div", ['post_content', 'content', 'article-text']).find(
            "div", ['post_body', 'ql-align-justify'])
    except AttributeError:
        return None
    return s.get_text() if s is not None else None
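
# Hypothetical alternative (a sketch, not called above): the chained .find()
# calls can be expressed as a data-driven walk that stops at the first missing
# container, which avoids wrapping every chain in try/except.
def safe_find_chain(root, selectors):
    # selectors: a list of (tag_name, class_list) pairs, applied in order.
    node = root
    for name, classes in selectors:
        if node is None:
            return None
        node = node.find(name, classes)
    return node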

def get_from_163_3g(html_doc):
    # Extract the article body from a mobile (3g.163.com) page; the class
    # names are specific to that template, so any miss returns None.
    soup = BeautifulSoup(html_doc, 'html.parser')
    try:
        s = soup.body.find("main").find('article').find(
            'div', class_='article-content').find('div', class_='content').find(
            'div', class_='page js-page on n-platform-android')
    except AttributeError:
        return None
    if s is None:
        return None
    return s.get_text()

def save_to_db(url, html):
    # url is the page URL, html is the raw page content.
    print('%s : %s' % (url, len(html)))
    if len(html) == 0:
        return
    # Re-fetch so requests can detect the real encoding; pages served without
    # a charset header would otherwise decode as mojibake.
    r = requests.get(url)
    if r.apparent_encoding == 'Windows-1254':
        return
    r.encoding = r.apparent_encoding
    tld = tldextract.extract(url)
    print(tld)
    if tld.subdomain == '3g':
        news_content = get_from_163_3g(r.text)
    elif tld.subdomain == 'www':
        return
    else:
        news_content = get_text_from_163(r.text)
    if news_content is None:
        return
    # Append the URL and the extracted text together, so link.txt never ends
    # up with a URL that has no content after it.
    with open('link.txt', 'a', encoding='utf-8') as file:
        file.write(url + '\n')
        file.write(news_content + '\n')
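
# Hypothetical helper (a sketch, not called below): crawl() pulls links out of
# raw HTML with a regex; parsing the DOM instead tolerates unquoted or oddly
# spaced href attributes.
def extract_links_dom(html):
    soup = BeautifulSoup(html, 'html.parser')
    return [a['href'] for a in soup.find_all('a', href=True)]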

def crawl():
    # 1. Download the Baidu News result page.
    # The commented-out query is "阿里巴巴 合作" (Alibaba + cooperation); the
    # active one is "特斯拉 投资" (Tesla + investment), both limited to 163.com.
    #hub_url = 'https://www.baidu.com/s?ie=utf-8&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&cl=2&wd=%E9%98%BF%E9%87%8C%E5%B7%B4%E5%B7%B4+%E5%90%88%E4%BD%9C+site%3A+163.com&tn=news&rsv_bp=1&oq=&rsv_sug3=15&rsv_sug2=0&rsv_btype=t&f=8&inputT=4494&rsv_sug4=4494'
    hub_url = 'https://www.baidu.com/s?ie=utf-8&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&cl=2&wd=%E7%89%B9%E6%96%AF%E6%8B%89+%E6%8A%95%E8%B5%84site%3A+163.com&tn=news&rsv_bp=1&rsv_sug3=1&rsv_sug2=0&oq=&rsv_btype=t&f=8&inputT=1&rsv_sug4=542'
    headers = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
    }
    res = requests.get(hub_url, headers=headers)
    html = res.text
    # 2. Extract news links.
    ## 2.1 Extract all links that carry an href attribute.
    links = re.findall(r'href=[\'"]?(.*?)[\'"\s]', html)
    print('find links:', len(links))
    news_links = []
    ## 2.2 Filter out non-news links.
    for link in links:
        if not link.startswith('http'):
            continue
        tld = tldextract.extract(link)
        print(tld.domain)
        print(tld.subdomain)
        if tld.domain == 'baidu':
            continue
        #if tld.subdomain == 'passport' or tld.subdomain == 'i' or tld.subdomain == 'cache' or tld.subdomain == 'www' or tld.subdomain == 'tieba' or tld.subdomain == 'map' or tld.subdomain == 'b2b' or tld.subdomain == 'zhidao' or tld.subdomain == 'image' or tld.subdomain == 'wenku':
        if tld.subdomain == 'i':
            continue
        news_links.append(link)
    print('find news links:', len(news_links))
    # news_links = ['https://www.163.com/dy/article/GTQJKS6J05525UOI.html']
    # 3. Download each news page and save it via save_to_db (appends to link.txt).
    news_links = list(set(news_links))
    print(news_links)
    for link in news_links:
        html = requests.get(link).text
        save_to_db(link, html)
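        # Politeness delay between downloads (assumed rate: roughly one page
        # per second; tune as needed).
        time.sleep(1)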
    print('work done!')

def main():
    crawl()

if __name__ == '__main__':
    main()
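
# Usage (assumed invocation): `python test.py` crawls the active Baidu News
# query and appends each article's URL and text to link.txt.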