"""
爬取原网页的html,过滤新闻内容并重新拼接,保留原网页样式。
"""
from news_cqu import *
import sys
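# NOTE: the star import above is expected to provide (at least) requests, lxml's etree,
# json, time, datetime, os, pdfkit, the DB handles `cur`/`conn`, `headers`, `insert_result`,
# `task_id`, `confg` (wkhtmltopdf configuration), `spider_name`, `spider_url`, and the helpers
# get_conf_id, get_xpath_content, sensitive_word_filter, path_rewrite, not_found_judge,
# all_urls_list and get_url_list (all defined in news_cqu).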
# Sleep interval between successive requests, in seconds
sleep_time = 0.1
# Global dictionary of crawled records, stored as key-value pairs (key: URL, value: title).
dict_data = dict()


# Read the URL of every page in the news module, collect the archival metadata of each
# news item, and save each page in PDF format.
def get_news_info(url_list, module_url, all_urls, f_data):
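    """
    Crawl one news-module column.

    url_list:   URLs of the column's listing pages
    module_url: the column's entry URL (used to resolve its heading via xpath)
    all_urls:   all column URLs (the index of module_url selects the heading xpath)
    f_data:     open handle of dict_data.txt, the URL/title de-duplication record
    """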
    global dict_data
    # Load previously crawled records from the data file
    f_data.seek(0, 0)
    content = f_data.read()
    if content:
        dict_data = json.loads(content)
    # Get the config-table id; it is written into the result table
    conf_id = get_conf_id('新闻模块')
    # Running count of news items in this module
    sum_i = 0
    # Page counter for this module
    page = 1
    # Column heading (resolved below via xpath)
    news_heading = ''
    dict_news = {'网站名称': spider_name, '网站域名': spider_url}
    r = requests.get(module_url, headers=headers)
    r.encoding = 'UTF-8'
    html = etree.HTML(r.text)
    cur.execute("SELECT xpath from t_spider_config_xpath where name = %s", '新闻类栏目标题xpath')
    xpath = cur.fetchone()
    xpath = xpath[0]
    # Pick a different xpath for each column
    index = all_urls.index(module_url)
    xpath = xpath.replace('?', str(index + 2))
    try:
        news_heading = html.xpath(xpath)
        news_heading = ''.join(news_heading)
        # print(news_heading)
    except IndexError:
        print("xpath配置错误!")
    except etree.XPathEvalError:
        print("数据库里未找到记录!")
    # Create the output folder if it does not exist yet
    now_dir = os.getcwd()
    new_dir = now_dir + '/' + news_heading
    dir_judge = os.path.exists(new_dir)
    if not dir_judge:
        os.mkdir(new_dir)
    # URL of each listing page
    for url in url_list:
        r = requests.get(url, headers=headers)
        r.encoding = 'UTF-8'
        raw_html = r.text
        html = etree.HTML(raw_html)
        links_list = get_xpath_content(html, '新闻模块网址xpath')
        title_list = get_xpath_content(html, '新闻模块标题xpath')
        # URL and title of each news item on this page
        for each_url, title in zip(links_list, title_list):
            print('正在爬取 {} 栏目下,第 {} 页 总第 {} 条新闻。'.format(news_heading, page, sum_i + 1))
            # Record each news URL in the dictionary and the database; skip it if it already exists
            judge = each_url in dict_data.keys()
            try:
                if not judge:
                    dict_data[each_url] = title
                    r = requests.get(each_url, headers=headers)
                    r.encoding = 'UTF-8'
                    raw_html = r.text
                    html = etree.HTML(raw_html)
                    html_filter = sensitive_word_filter(raw_html)
                    html_filter = path_rewrite(html_filter)
                    timestamp = round(time.time())
                    html_file = new_dir + '/' + str(timestamp) + '.html'
                    pdf_file = new_dir + '/' + str(timestamp) + '.pdf'
                    dict_news['所属栏目'] = news_heading
                    try:
                        cur.execute("SELECT name from t_spider_config_xpath where name like %s", '新闻模块' + '%')
                        xpath_name = cur.fetchall()
                        for each in xpath_name:
                            # [4:-5] strips the leading '新闻模块' (4 characters) and the trailing 'xpath' (5 characters)
                            dict_news[each[0][4:-5]] = get_xpath_content(html, each[0])
                    except IndexError:
                        print("xpath配置错误!")
                    except etree.XPathEvalError:
                        print("数据库里未找到记录!")
                    dict_news['标题'] = title
                    dict_news['网址'] = each_url
                    time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    dict_news['采集时间'] = time_now
                    dict_news['采集人'] = '档案馆'
                    if dict_news['发布时间']:
                        release_time = dict_news['发布时间']
                    else:
                        release_time = None
                    json_dict = json.dumps(dict_news, ensure_ascii=False, indent=4)
                    # print(json_dict)
                    if release_time:
                        date_release_time = datetime.datetime.strptime(release_time, '%Y-%m-%d').date()
                        if news_timefilter == date_release_time.year:
                            print(json_dict)
                            judge_identifier = not_found_judge(raw_html, r)
                            # Check whether the page is a 404 not found
                            if judge_identifier:
                                cur.execute(insert_result, (conf_id, 'detail', each_url, html_filter, html_file, pdf_file,
                                                            time_now, news_heading, release_time, json_dict))
                                conn.commit()
                                json_data = json.dumps(dict_data)
                                f_data.seek(0, 0)
                                f_data.write(json_data)
                                sum_i += 1
                                with open(html_file, 'w+', encoding='UTF-8') as f1:
                                    f1.write(html_filter)
                                # Convert the page to PDF
                                pdfkit.from_url(each_url, pdf_file, configuration=confg)
                                print('该新闻《{}》pdf格式已转换成功。'.format(title))
                                time.sleep(sleep_time)
                            else:
                                # Record the 404 not found page in the database
                                html_filter = '404 not found'
                                cur.execute(insert_result, (conf_id, 'detail', each_url, html_filter, '', '',
                                                            time_now, news_heading, None, json_dict))
                                conn.commit()
                                json_data = json.dumps(dict_data)
                                f_data.seek(0, 0)
                                f_data.write(json_data)
                                print('该新闻《{}》网页不存在, 以‘404 not found’为网页内容存入数据库。'.format(title))
                                sum_i += 1
                        elif date_release_time.year == news_timefilter - 1:
                            print('该新闻发布时间早于 {} 年,已退出。'.format(news_timefilter))
                            break
                        # Skip news not published in the target year
                        else:
                            print('该新闻发布时间晚于 {} 年,已跳过。'.format(news_timefilter))
                            sum_i += 1
                    # Skip news without a publication date
                    else:
                        print('该新闻没有发布时间,已跳过。')
                        sum_i += 1
                else:
                    sum_i += 1
                    print('{} 栏目 的 第 {} 条新闻 已爬取过且保存在数据库中!'.format(news_heading, sum_i))
            except IOError:
                print("Warning: wkhtmltopdf读取文件失败, 可能是网页无法打开或者图片/css样式丢失。")
            except IndexError:
                print("该栏目《{}》下的新闻已全部爬取完!".format(news_heading))
                break
        # for-else: if the inner loop was broken, break out of the outer loop as well
        else:
            print('第{}页已经爬取完'.format(page))
            page += 1
            continue
        break
    print("该栏目《{}》下 {} 年的所有新闻已全部爬取完!".format(news_heading, news_timefilter))
    # print('{} 栏目下 共有{}页 {}条新闻'.format(news_heading, page - 1, sum_i))


# Read the URL of every page of the 通知公告简报 (notices/announcements/briefings) column,
# collect the archival metadata of each item, and save each page in PDF format.
def get_notice_info(url_list, f_data):
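    """
    Crawl the 通知公告简报 column.

    url_list: URLs of the column's listing pages
    f_data:   open handle of dict_data.txt, the URL/title de-duplication record
    """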
    global dict_data
    # Load previously crawled records from the data file
    f_data.seek(0, 0)
    content = f_data.read()
    if content:
        dict_data = json.loads(content)
    # Running count of notices
    sum_i = 0
    # Page counter for this column
    page = 1
    news_heading = '通知公告简报'
    # Get the config-table id; it is written into the result table
    conf_id = get_conf_id(news_heading)
    # Metadata record for each notice
    dict_notice = {'网站名称': spider_name, '网站域名': spider_url}
    # Create the output folder if it does not exist yet
    now_dir = os.getcwd()
    new_dir = now_dir + '/' + news_heading
    dir_judge = os.path.exists(new_dir)
    if not dir_judge:
        os.mkdir(new_dir)
    # URL of each listing page
    for url in url_list:
        r = requests.get(url, headers=headers)
        r.encoding = 'UTF-8'
        raw_html = r.text
        html = etree.HTML(raw_html)
        links_list = get_xpath_content(html, '通知公告简报网址xpath')
        title_list = get_xpath_content(html, '通知公告简报标题xpath')
        # URL and title of each notice on this page
        for each_url, title in zip(links_list, title_list):
            print('正在爬取 {} 栏目下,第 {} 页 总第 {} 条通知公告。'.format(news_heading, page, sum_i + 1))
            # Record each notice URL in the dictionary and the database; skip it if it already exists
            judge = each_url in dict_data.keys()
            try:
                if not judge:
                    dict_data[each_url] = title
                    r = requests.get(each_url, headers=headers)
                    r.encoding = 'UTF-8'
                    raw_html = r.text
                    html = etree.HTML(raw_html)
                    html_filter = sensitive_word_filter(raw_html)
                    html_filter = path_rewrite(html_filter)
                    timestamp = round(time.time())
                    html_file = new_dir + '/' + str(timestamp) + '.html'
                    pdf_file = new_dir + '/' + str(timestamp) + '.pdf'
                    # Handle links that redirect to WeChat official-account articles
                    if 'weixin' in each_url:
                        title = html.xpath('//h2[@class="rich_media_title"]/text()')
                        title = ''.join(title)
                        title = title.strip()
                    dict_notice['所属栏目'] = news_heading
                    # Fetch the xpath entries from the database and extract the matching content
                    try:
                        cur.execute("SELECT name from t_spider_config_xpath where name like %s",
                                    news_heading + '%')
                        xpath_name = cur.fetchall()
                        for each in xpath_name:
                            # [6:-5] strips the leading '通知公告简报' (6 characters) and the trailing 'xpath' (5 characters)
                            dict_notice[each[0][6:-5]] = get_xpath_content(html, each[0])
                    except IndexError:
                        print("xpath配置错误!")
                    except etree.XPathEvalError:
                        print("数据库里未找到记录!")
                    dict_notice['标题'] = title
                    dict_notice['网址'] = each_url
                    time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    dict_notice['采集时间'] = time_now
                    dict_notice['采集人'] = '档案馆'
                    if dict_notice['发布时间']:
                        release_time = dict_notice['发布时间']
                    else:
                        release_time = None
                    json_dict = json.dumps(dict_notice, ensure_ascii=False, indent=4)
                    # print(json_dict)
                    if release_time:
                        date_release_time = datetime.datetime.strptime(release_time, '%Y-%m-%d').date()
                        if news_timefilter == date_release_time.year:
                            print(json_dict)
                            judge_identifier = not_found_judge(raw_html, r)
                            # Check whether the page is a 404 not found
                            if judge_identifier:
                                cur.execute(insert_result, (conf_id, 'detail', each_url, html_filter, html_file, pdf_file,
                                                            time_now, news_heading, release_time, json_dict))
                                conn.commit()
                                json_data = json.dumps(dict_data)
                                f_data.seek(0, 0)
                                f_data.write(json_data)
                                sum_i += 1
                                with open(html_file, 'w+', encoding='UTF-8') as f1:
                                    f1.write(html_filter)
                                # Convert the page to PDF
                                pdfkit.from_url(each_url, pdf_file, configuration=confg)
                                print('该通知《{}》pdf格式已转换成功。'.format(title))
                                time.sleep(sleep_time)
                            else:
                                # Record the 404 not found page in the database
                                html_filter = '404 not found'
                                cur.execute(insert_result, (conf_id, 'detail', each_url, html_filter, '', '',
                                                            time_now, news_heading, None, json_dict))
                                conn.commit()
                                json_data = json.dumps(dict_data)
                                f_data.seek(0, 0)
                                f_data.write(json_data)
                                print('该通知《{}》网页不存在, 以‘404 not found’为网页内容存入数据库。'.format(title))
                                sum_i += 1
                        elif date_release_time.year == news_timefilter - 1:
                            print('该新闻发布时间早于 {} 年,已退出。'.format(news_timefilter))
                            break
                        # Skip notices not published in the target year
                        else:
                            print('该新闻不是发布于 {} 年的新闻,已跳过。'.format(news_timefilter))
                            sum_i += 1
                    # Skip notices without a publication date
                    else:
                        print('该新闻没有发布时间,已跳过。')
                        sum_i += 1
                else:
                    sum_i += 1
                    print('{} 栏目 的 第 {} 条通知 已爬取过且保存在数据库中!'.format(news_heading, sum_i))
            except IOError:
                print("Warning: wkhtmltopdf读取文件失败, 可能是网页无法打开或者图片/css样式丢失。")
            except IndexError:
                print("该栏目《{}》下的通知公告简报已全部爬取完!".format(news_heading))
                break
        # for-else: if the inner loop was broken, break out of the outer loop as well
        else:
            print('第{}页已经爬取完'.format(page))
            page += 1
            continue
        break
    print("该栏目《{}》下 {} 年的所有新闻已全部爬取完!".format(news_heading, news_timefilter))
    # print('{} 栏目下 共有{}页 {}条通知公告简报'.format(news_heading, page - 1, sum_i))


def main():
"""
获取所有的栏目链接
all_news_urls[0-4]: 爬取的第一大类:新闻模块(包括综合新闻、教学科研、招生就业、交流合作、校园生活栏目)
all_news_urls[5]:爬取的第二大类:媒体重大
all_news_urls[6]:爬取的第三大类:通知公告简报
all_news_urls[7]:爬取的第四大类:学术预告
all_news_urls[8]:爬取的第五大类:快讯
all_news_urls[9]:爬取的第六大类:专题
"""
    with open('dict_data.txt', 'r+') as f_data:
        all_news_urls = all_urls_list(f_data)
        # Get the page URLs under each column
        # First major category: the news module (综合新闻, 教学科研, 招生就业, 交流合作, 校园生活)
        for url in all_news_urls[:5]:
            url_list = get_url_list(url, all_news_urls, f_data)
            get_news_info(url_list, url, all_news_urls, f_data)
            time.sleep(sleep_time)
        # Third major category: 通知公告简报
        url = all_news_urls[6]
        url_list = get_url_list(url, all_news_urls, f_data)
        get_notice_info(url_list, f_data)
        time.sleep(sleep_time)
    print('{} {} 的 {} 年的新闻 爬虫任务已完成!'.format(spider_name, spider_url, news_timefilter))


if __name__ == '__main__':
    news_timefilter = 0
    try:
        if sys.argv[1]:
            # Target year for the crawl, passed on the command line
            news_timefilter = int(sys.argv[1])
    except IndexError:
        print('未使用年份参数!')
        print('请使用指令python3 news_cqu_year.py [指定年份] 传入年份参数!')
    if news_timefilter:
        main()
    # Crawl finished: update the crawler status to -1 (stopped)
    cur.execute("UPDATE t_spider_task SET status = -1 WHERE id = %s", task_id)
    cur.close()
    conn.commit()
    conn.close()
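# Example invocation (the year value is illustrative):
#   python3 news_cqu_year.py 2022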