# XhsTitle.py
import csv
import hashlib
import json
from datetime import datetime
from urllib import parse

import requests
from bs4 import BeautifulSoup

# Today's date, used in the default CSV filename
today = datetime.today()
today_str = today.strftime('%Y%m%d')
class XhsTitle:
    def __init__(self, key_name, authorization, sorted_way):
        self.key_name = key_name            # search keyword
        self.authorization = authorization  # mini-program auth token (wxmp.XXXX)
        self.sorted_way = sorted_way        # general / hot_desc / create_time_desc
        self.host = 'https://www.xiaohongshu.com'
    @staticmethod
    def get_x_sign(api):
        # The X-Sign header is "X" plus the MD5 of the request path
        # concatenated with the salt "WSUDD"
        m = hashlib.md5()
        m.update((api + "WSUDD").encode())
        return "X" + m.hexdigest()
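
    # For example, signing the search path used in spider() below yields
    # "X" followed by 32 hex characters:
    #   XhsTitle.get_x_sign('/fe_api/burdock/weixin/v2/search/notes?keyword=...')
    #   -> 'X5d41402abc4b2a76b9719d911017c592'  (illustrative digest, not a real value)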
    def spider(self, d_page, sort_by='general'):
        # Query one page (20 notes) of the WeChat mini-program search API
        url = f'/fe_api/burdock/weixin/v2/search/notes?keyword={parse.quote(self.key_name)}&sortBy={sort_by}' \
              f'&page={d_page + 1}&pageSize=20&prependNoteIds=&needGifCover=true'
        headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.38(0x1800262c) NetType/WIFI Language/zh_CN',
            'Referer': 'https://servicewechat.com',
            'Authorization': self.authorization,
            'X-Sign': self.get_x_sign(url)
        }
        resp = requests.get(url=self.host + url, headers=headers, timeout=5)
        if resp.status_code == 200:
            res = json.loads(resp.text)
            return res['data']['notes'], res['data']['totalCount']
        # Return an empty page on failure so callers can stop cleanly
        # instead of crashing on a None result
        print(f'Fail: {resp.text}')
        return [], 0
    def getlist_by_name(self, page_range=5):
        notes = []
        # A mini-program search currently returns ~100 results,
        # fetched here as 5 pages of 20 notes each
        for i in range(page_range):
            page_notes, _ = self.spider(d_page=i, sort_by=self.sorted_way)
            if not page_notes:
                break
            notes.extend(page_notes)
            print(page_notes)
        return notes
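
    # Each note dict returned above (as consumed by get_title_url below)
    # is expected to carry at least 'id', 'title', 'likes', 'time' and a
    # nested 'user' dict with 'id', 'nickname' and 'officialVerified';
    # these field names are inferred from how they are used further down.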
    @staticmethod
    def get_info(ids):
        # Fetch each note's detail page and parse the embedded ld+json blob
        infolist = []
        for note_id in ids:
            url = f"https://www.xiaohongshu.com/explore/{note_id}"
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh-Hans;q=0.9",
                "Connection": "keep-alive",
                "Host": "www.xiaohongshu.com",
                "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 "
                              "(KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/604.1"
            }
            resp = requests.get(url, headers=headers, timeout=5)
            resp.encoding = resp.apparent_encoding
            soup = BeautifulSoup(resp.text, 'lxml')
            json_str = soup.find(attrs={'type': 'application/ld+json'}).text
            json_str = json_str.replace('\r', '').replace('\n', '')
            info_dic = json.loads(json_str, strict=False)
            info_dic['link'] = url
            if info_dic['name'] != '':
                infolist.append(info_dic)
        return infolist
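
    # get_info returns one dict per note built from the page's ld+json data;
    # the code above relies only on its 'name' key and on the 'link' key it
    # adds itself. It is not wired into __main__ below; a usage sketch is at
    # the end of the file.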
    @staticmethod
    def get_title_url(xhs_data):
        # Flatten each raw note dict into the CSV columns defined in `fields`
        new_data = []
        for item in xhs_data:
            new_data.append({
                'post_link': f"https://www.xiaohongshu.com/explore/{item['id']}",
                'author_profile': f'https://www.xiaohongshu.com/user/profile/{item["user"]["id"]}',
                'author_nickname': item['user']['nickname'],
                'title': item['title'],
                'likes': item['likes'],
                'publish_time': item['time'],
                'official_verified': item['user']['officialVerified']
            })
        return new_data
    def xhs_to_csv(self, data, field, path=None):
        # Default filename combines today's date, the keyword and the sort order
        if path is None:
            path = f'{today_str}{self.key_name}{self.sorted_way}.csv'
        with open(path, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=field)
            writer.writeheader()
            writer.writerows(data)
        print(f'Saved successfully, filename: {path}')
if __name__ == "__main__":
    # The following three values are the XhsTitle constructor arguments.
    # Keyword to search for
    keyName = "enter a search keyword here"
    # Authorization token; can be captured with Charles (many tutorials online)
    authorization = "wxmp.XXXX"
    # Sort order, one of: general (relevance), hot_desc (popularity),
    # create_time_desc (publish time)
    sortedWay = "general"
    fields = ['post_link', 'author_profile', 'author_nickname', 'title',
              'likes', 'publish_time', 'official_verified']
    # Build the scraper
    xhs_spider = XhsTitle(keyName, authorization, sortedWay)
    # Fetch the raw notes from the mini-program search pages
    noteList = xhs_spider.getlist_by_name()
    # Flatten them into CSV rows; content scraping also starts from the
    # links collected here
    xhs_title = xhs_spider.get_title_url(noteList)
    # Collect the post links as a list
    links = [d['post_link'] for d in xhs_title]
    # Write to CSV; without an explicit path the filename would combine
    # keyName, sortedWay and today's date
    xhs_spider.xhs_to_csv(xhs_title, fields, path='basic.csv')
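
    # A hedged follow-up sketch: fetch full details for each collected post
    # by recovering the note id from each link (every link ends in the id).
    # Only 'name' and 'link' are guaranteed by get_info above; any other
    # ld+json keys are assumptions about the page and may vary.
    ids = [link.rsplit('/', 1)[-1] for link in links]
    details = XhsTitle.get_info(ids)
    rows = [{'name': d['name'], 'link': d['link']} for d in details]
    xhs_spider.xhs_to_csv(rows, ['name', 'link'], path='details.csv')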