Skip to content

Commit

Permalink
perf: 去掉补环境 使用纯算法计算xs
Browse files Browse the repository at this point in the history
  • Loading branch information
Cloxl committed Oct 17, 2024
1 parent 619afda commit 29eeb74
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 55 deletions.
79 changes: 79 additions & 0 deletions EncryptHelper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import base64
import hashlib
import json
import struct
import time

from Crypto.Cipher import AES
from Crypto.Util.Padding import pad


class EncryptHelper:
    """Pure-Python implementation of Xiaohongshu's ``x-s`` request signature.

    Computes the signature directly with MD5 + AES-CBC + base64 instead of
    emulating the site's JavaScript environment.
    """

    # AES-128 key material: four big-endian 32-bit words packed into 16 bytes.
    words = [1735160886, 1748382068, 1631021929, 1936684855]
    key_bytes = b''.join(struct.pack('>I', word) for word in words)
    # Fixed 16-byte CBC initialization vector used by the web client.
    iv = b'4hrivgw5s342f9b2'

    @staticmethod
    def get_md5(url: str) -> str:
        """Return the MD5 hex digest of ``'url=' + url``.

        :param url: API URL (path plus query string) being signed
        :return: 32-character lowercase hex digest
        """
        return hashlib.md5(('url=' + url).encode('utf-8')).hexdigest()

    @staticmethod
    def encrypt_text(text: str) -> str:
        """AES-CBC-encrypt ``text`` and return the ciphertext as base64.

        The plaintext is base64-encoded first (mirroring the site's JS),
        then PKCS#7-padded and encrypted with the class key/IV.

        :param text: string to encrypt
        :return: base64-encoded ciphertext
        """
        text_encoded = base64.b64encode(text.encode())

        cipher = AES.new(EncryptHelper.key_bytes, AES.MODE_CBC, EncryptHelper.iv)
        ciphertext = cipher.encrypt(pad(text_encoded, AES.block_size))

        return base64.b64encode(ciphertext).decode()

    @staticmethod
    def base64_to_hex(encoded_data: str) -> str:
        """Decode a base64 string and return its bytes as lowercase hex.

        :param encoded_data: base64-encoded input
        :return: lowercase hex representation of the decoded bytes
        """
        # bytes.hex() is the idiomatic, C-speed equivalent of a
        # per-byte format('02x') / join loop.
        return base64.b64decode(encoded_data).hex()

    @staticmethod
    def encrypt_payload(payload: str) -> str:
        """Wrap an encrypted payload in the x-s signature envelope.

        The base64 payload is converted to hex, embedded in the fixed JSON
        envelope the server expects, and the compact JSON is base64-encoded.

        :param payload: base64-encoded AES ciphertext (from ``encrypt_text``)
        :return: base64-encoded JSON envelope string
        """
        obj = {
            "signSvn": "55",
            "signType": "x2",
            "appID": "xhs-pc-web",
            "signVersion": "1",
            "payload": EncryptHelper.base64_to_hex(payload)
        }
        # separators=(',', ':') produces compact JSON matching JS JSON.stringify.
        return base64.b64encode(json.dumps(obj, separators=(',', ':')).encode()).decode()

    @staticmethod
    def encrypt_xs(url: str, a1: str, ts: str) -> str:
        """Build the final ``x-s`` signature header value.

        :param url: API request URL (including query string)
        :param a1: the ``a1`` cookie value
        :param ts: millisecond timestamp string (must match the ``x-t`` header)
        :return: signature string prefixed with ``XYW_``
        """
        text = (f'x1={EncryptHelper.get_md5(url)};'
                f'x2=0|0|0|1|0|0|1|0|0|0|1|0|0|0|0|1|0|0|0;'
                f'x3={a1};'
                f'x4={ts};')
        return 'XYW_' + EncryptHelper.encrypt_payload(EncryptHelper.encrypt_text(text))
15 changes: 8 additions & 7 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
loguru
openpyxl
requests
pandas
PyExecJS
beautifulsoup4
pillow
loguru~=0.7.2
openpyxl~=3.1.5
requests~=2.32.3
pandas~=2.2.2
PyExecJS~=1.5.1
beautifulsoup4~=4.12.3
pillow~=10.4.0
pycryptodome~=3.21.0
109 changes: 61 additions & 48 deletions xhs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,34 +14,39 @@
from loguru import logger
from PIL import Image

from EncryptHelper import EncryptHelper

cookies = {
"a1": "",
"web_session": "",
}
user_id = ""
target_like_count = 100
formatter_type = "评论数量"

tmp_json_path = './tmp.json'
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "no-cache",
"pragma": "no-cache",
"priority": "u=0, i",
"referer": "https://www.xiaohongshu.com/explore",
"sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
'cache-control': 'no-cache',
'origin': 'https://www.xiaohongshu.com',
'pragma': 'no-cache',
'priority': 'u=1, i',
'referer': 'https://www.xiaohongshu.com/',
'sec-ch-ua': '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
}
if not user_id:
logger.error("未提取到用户id 无法进行下一步检索!")
exit(10086)
if formatter_type not in ["点赞数量", "收藏数量", "分享数量", "评论数量"]:
logger.error(f"错误: 排序字段 '{formatter_type}' 无效")
exit(10087)

# 处理获取index.html的数据
url = f"https://www.xiaohongshu.com/user/profile/{user_id}"
Expand Down Expand Up @@ -141,28 +146,28 @@ def fetch(url: str) -> Any | None:
user_profile_data = user_profile_data['user']['notes'][0]

rows = []
for profile_data in user_profile_data:
note_card = profile_data["noteCard"]
for fetch_profile_data in user_profile_data:
note_card = fetch_profile_data["noteCard"]
row = {
"笔记标题": note_card.get("displayTitle", ""),

"点赞数量": note_card['interactInfo'].get("likedCount", ""),
"收藏数量": note_card['interactInfo'].get("collectedCount", "未满足要求不爬取"),
"分享数量": note_card['interactInfo'].get("shareCount", "未满足要求不爬取"),
"评论数量": note_card['interactInfo'].get("commentCount", "未满足要求不爬取"),
"收藏数量": note_card['interactInfo'].get("collectedCount", 0),
"分享数量": note_card['interactInfo'].get("shareCount", 0),
"评论数量": note_card['interactInfo'].get("commentCount", 0),

"内容形式": "图文" if note_card.get("type") == "normal" else "视频"
if note_card.get("type") == "video" else note_card.get("type", ""),

"用户昵称": note_card['user'].get("nickname", ""),
"笔记ID": profile_data.get("id", ""),
"笔记链接": f"https://www.xiaohongshu.com/explore/{profile_data['id']}?xsec_token{profile_data['xsecToken']}"
"笔记ID": fetch_profile_data.get("id", ""),
"笔记链接": f"https://www.xiaohongshu.com/explore/{fetch_profile_data['id']}?xsec_token{fetch_profile_data['xsecToken']}"
}
rows.append(row)

if not rows:
logger.error("首次爬取未获取到用户的任何内容")
exit(10086)
exit(10088)

# 处理后续的笔记链接
while True:
Expand All @@ -177,13 +182,9 @@ def fetch(url: str) -> Any | None:

c = f'{url}?{"&".join([f"{key}={value}" for key, value in params.items()])}'

with open('./static/xs.js', 'r', encoding='utf-8') as f:
xsxt = execjs.compile(f.read()).call(
"get_xsxt", c, "undefined", cookies['a1']
)

headers['x-s'] = xsxt['X-s']
headers['x-t'] = str(xsxt['X-t'])
t = str(round(int(time.time()) * 1000))
xs = EncryptHelper.encrypt_xs(url=c, a1=cookies["a1"], ts=t)
headers['x-s'], headers['x-t'] = xs, t

# xsc并不检测 如果需要可以放开这里的注释
with open('static/xs_common.js', 'r', encoding='utf-8') as f:
Expand All @@ -208,16 +209,21 @@ def fetch(url: str) -> Any | None:

data = response.json()["data"]["notes"]
has_more = response.json()["data"]["has_more"]
for profile_data in data:
for fetch_profile_data in data:
row = {
"笔记标题": profile_data.get("display_title", ""),
"点赞数量": profile_data['interact_info'].get("liked_count", ""),
"内容形式": "图文" if profile_data.get("type") == "normal" else "视频"
if profile_data.get("type") == "video" else profile_data.get("type", ""),

"用户昵称": profile_data['user'].get("nickname", ""),
"笔记ID": profile_data['note_id'],
"笔记链接": f"https://www.xiaohongshu.com/explore/{profile_data['note_id']}?xsec_token{profile_data['xsec_token']}"
"笔记标题": fetch_profile_data.get("display_title", ""),

"点赞数量": fetch_profile_data['interact_info'].get("liked_count", ""),
"收藏数量": fetch_profile_data['interact_info'].get("collectedCount", 0),
"分享数量": fetch_profile_data['interact_info'].get("shareCount", 0),
"评论数量": fetch_profile_data['interact_info'].get("commentCount", 0),

"内容形式": "图文" if fetch_profile_data.get("type") == "normal" else "视频"
if fetch_profile_data.get("type") == "video" else fetch_profile_data.get("type", ""),

"用户昵称": fetch_profile_data['user'].get("nickname", ""),
"笔记ID": fetch_profile_data['note_id'],
"笔记链接": f"https://www.xiaohongshu.com/explore/{fetch_profile_data['note_id']}?xsec_token{fetch_profile_data['xsec_token']}"
}
rows.append(row)

Expand All @@ -229,15 +235,12 @@ def fetch(url: str) -> Any | None:
logger.success(f"单次数据爬取成功, 延时{sleeper_time}秒后继续爬取")
time.sleep(sleeper_time)

# 排序 在excel文件中无需再次排序
rows = sorted(rows, key=lambda x: int(x["点赞数量"]), reverse=True)


# 对点赞数量大于指定内容的笔记获取更多数据 并进行下载
pre_path = f"./output/{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
for row in rows:
if int(row['点赞数量']) < target_like_count:
break # 经过排序 前面的项一定 >= target_like_count
continue

response = fetch(row['笔记链接'])
if not response:
Expand All @@ -250,15 +253,19 @@ def fetch(url: str) -> Any | None:
logger.warning(f"网页请求成功 但是提取数据发生错误 跳过当前文章 {row['笔记ID']}")
continue

user_note_info = user_note_data['note']['noteDetailMap'][row['笔记ID']]['note']
row['收藏数量'] = user_note_info['interactInfo'].get('collectedCount', '获取异常')
row['分享数量'] = user_note_info['interactInfo'].get('shareCount', '获取异常')
row['评论数量'] = user_note_info['interactInfo'].get('commentCount', '获取异常')
try:
user_note_info = user_note_data['note']['noteDetailMap'][row['笔记ID']]['note']
row['收藏数量'] = user_note_info['interactInfo'].get('collectedCount', 0)
row['分享数量'] = user_note_info['interactInfo'].get('shareCount', 0)
row['评论数量'] = user_note_info['interactInfo'].get('commentCount', 0)
except KeyError:
logger.warning("小红书返回了空的数据 具体问题还在排查中 \n 相关数据保存在output.xlsx \n 退出程序")
break

pre_path += f"/{row['笔记ID']}"
os.makedirs(pre_path, exist_ok=True)

# 获取文章内容
# 获取文章内容row['笔记ID']
with open(f"{pre_path}/content.txt", "w", encoding="utf-8") as f:
f.write(row['笔记标题'] + '\n\n' + user_note_info['desc'])

Expand All @@ -281,5 +288,11 @@ def fetch(url: str) -> Any | None:
logger.success(f"单次文章爬取成功, 延时{sleeper_time}秒后继续爬取")
time.sleep(sleeper_time)

try:
rows = sorted(rows, key=lambda x: int(x[formatter_type]), reverse=True)
except ValueError:
logger.error(f"排序时出现问题,请确认字段 {formatter_type} 是否存在并且是可以转换为整数的内容")
exit(10089)

df = pd.DataFrame(rows)
df.to_excel('output.xlsx', index=False)

0 comments on commit 29eeb74

Please sign in to comment.