# -*- coding: UTF-8 -*-
# I'm a beginner and there is plenty of room for improvement; feedback is welcome!
# This is a simple program that fetches all comments under a NetEase Cloud Music
# song, finds every comment left by a given user (user_id), and saves them to a
# database.
# The song id, the target user_id, and the database writes can be changed or
# commented out to fit your own needs.
import base64
import json
import random
import sqlite3
import sys
import threading
import time

import requests
from Crypto.Cipher import AES
# Take music_id and music_name from the command line instead of interactive input
music_id = sys.argv[1]
music_name = sys.argv[2]
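# A hypothetical invocation (the id and title below come from the sample comment
# JSON quoted further down in this file):
#   python one_song.py 5283862 '忘了我吧!我的最爱'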
# Create the table, preparing for the SQLite writes
conn = sqlite3.connect('./music.db', check_same_thread=False)
cur = conn.cursor()
sql0 = ("CREATE TABLE IF NOT EXISTS music" + str(music_id) +
        "(music_name text, comment_id real, user_id text, user_name text, "
        "avatar_url text, time real, liked_count real, comment text)")
cur.execute(sql0)
conn.commit()
sql = "INSERT INTO music" + str(music_id) + " VALUES(?,?,?,?,?,?,?,?)"
# One shared lock so the crawler threads don't interleave writes on the cursor
db_lock = threading.Lock()
user_agent_list = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
user_agent = random.choice(user_agent_list)  # pick a random User-Agent
raw_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'no-cache',
    'Cookie': '_ntes_nnid=73794490c88b2790756a23cb36d25ec1,1507099821594; _ntes_nuid=73794490c88b2790756a23cb36d25ec1; _ngd_tid=LtmNY2JGJkw6wR3HF%2FpG2bY%2BtHhQDmOj; usertrack=c+xxC1nazueHBQJiCi7XAg==; JSESSIONID-WYYY=sJg6dw45PFKjn0VD2OuD0mzqC03xb3CnU3h4ac43kp7r9q9GJos%2BFDVyZmeGtz%5CHciN66cY5KAEW6jlHT%5COv0qzP8T3O3R5cq28%2BXJ3rc%2BkqsI4Y%2BrJIwZczDZGlvq225U%5CNWBP0iEjTnfdUG21swAhZA%5CfX29F4s9M6tz2EK7%2FESIpW%3A1507612773856; _iuqxldmzr_=32; MUSIC_U=e58d5af1daeedff199dcb9d14e06692f2db7395809fd3b393c0d6d53e13de2f484b4ab9877ef4e4ca1595168b12a45da86e425b9057634fc; __remember_me=true; __csrf=63e549f853ed105c4590d6fe622fb4f6',
    'Host': 'music.163.com',
    'Referer': 'http://music.163.com/',
    'User-Agent': user_agent
}
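# Note: the Cookie above (including MUSIC_U and __csrf) is a snapshot of one
# login session; a stale value may still work for public comment pages, but
# replacing it with your own session cookie is the safer assumption.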
# The encSecKey / AES_encrypt helpers below are not original; they come from
# this Zhihu thread: https://www.zhihu.com/question/36081767
# Build params. Note: the params value is different for every page of comments.
def get_params(first_param, forth_param):
    iv = "0102030405060708"
    first_key = forth_param
    second_key = 16 * 'F'
    # Two rounds of AES-CBC: first with the fixed weapi key, then with 'F' * 16
    h_encText = AES_encrypt(first_param, first_key.encode(), iv.encode())
    h_encText = AES_encrypt(
        h_encText.decode(), second_key.encode(), iv.encode())
    return h_encText.decode()
# Return encSecKey. Because the second AES key is fixed to 'F' * 16 rather than
# randomly generated, its RSA-encrypted form (encSecKey) is a constant and can
# be hard-coded here.
def get_encSecKey():
    encSecKey = "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"
    return encSecKey
# AES-CBC encryption with PKCS#7-style padding; returns base64-encoded bytes
def AES_encrypt(text, key, iv):
    pad = 16 - len(text) % 16
    text = text + pad * chr(pad)
    encryptor = AES.new(key, AES.MODE_CBC, iv)
    encrypt_text = encryptor.encrypt(text.encode())
    encrypt_text = base64.b64encode(encrypt_text)
    return encrypt_text
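# A minimal sanity-check sketch (kept commented out, like the proxy block near
# the bottom of this file): the output is base64 text whose decoded length is
# a multiple of the 16-byte AES block size.
# ciphertext = AES_encrypt('{"offset":"0"}', b"0CoJUm6Qyw8W8jud", b"0102030405060708")
# assert len(base64.b64decode(ciphertext)) % 16 == 0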
# POST the request and return the raw JSON bytes
def get_json(url, data):
    headers = raw_headers
    response = requests.post(url=url, headers=headers, data=data)  # proxies=proxies
    global index
    index += 1
    if index > 1500:
        index -= 1500  # sleep after roughly every 30000 comments crawled
        print('Crawled ~30000 comments, sleeping a few seconds....ZZzzzzz......Go on')
        time.sleep(random.randint(1, 3))
    return response.content
# Build the url and the encrypted POST data
def crypt_api(id, offset):
    url = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_%s/?csrf_token=" % id
    first_param = "{rid:\"\", offset:\"%s\", total:\"true\", limit:\"20\", csrf_token:\"\"}" % offset
    forth_param = "0CoJUm6Qyw8W8jud"
    params = get_params(first_param, forth_param)
    encSecKey = get_encSecKey()
    data = {
        "params": params,
        "encSecKey": encSecKey
    }
    return url, data
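# Usage sketch (commented out): build the signed payload for the second page
# (offset 20) of a song; 5283862 is the sample song id referenced below.
# url, data = crypt_api(5283862, 20)
# resp_bytes = get_json(url, data)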
###################################################################################################################
# Fetch the first half of the comments
def get_the_first_half_comment(id, comments_sum, music_name):
    raw_page = 0  # used to work out which page the target user's comments are on
    for i in range(0, comments_sum // 2, 20):  # 20 comments per page
        # proxies = random.choice(proxy_pool)  # pick a random proxy
        # each page needs one request and uses one proxy
        offset = i
        url, data = crypt_api(id, offset)
        json_text = get_json(url, data)
        json_dict = json.loads(json_text.decode("utf-8"))
        json_comment = json_dict['comments']
        for comment in json_comment:
            # Each comment entry holds one user's full comment record.
            # For a song that has only one comment, the response looks like this:
            # music_id: 5283862  music_name: 忘了我吧!我的最爱
            # {"isMusician":false,"userId":-1,"topComments":[],"moreHot":false,"hotComments":[],"code":200,
            # "comments":[{"user":{"locationInfo":null,"experts":null,"authStatus":0,"remarkName":null,"avatarUrl":"http://p1.music.126.net/8N882UcPox32hcrYCpfOxw==/19083123811686650.jpg","userId":429847262,"expertTags":null,"vipType":0,"nickname":"故事偷盗者","userType":0},
            # "beReplied":[],"likedCount":0,"liked":false,"commentId":321330017,"time":1488441683356,"content":"为了遮羞才把书包挡住屁股给你学牛看,从此每天乐此不疲逗你开心。你初一的时候开始不好好学习,谈了男朋友,最后跟他私奔,现在都还杳无音讯!但不管怎样,我都希望你现在能像以前一样,找到那头可以逗你哈哈大笑的牛,幸福下去。晚安[牵手]",
            # "isRemoveHotComment":false}],"total":1,"more":false}
            comment_id = comment['commentId']
            user_id = comment['user']['userId']
            user_name = comment['user']['nickname']
            avatar_url = comment['user']['avatarUrl']
            comment_time = comment['time']
            liked_count = comment['likedCount']
            content = comment['content']
            print(comment_id, user_id, user_name, avatar_url, comment_time, liked_count, content)
            # Store in SQLite under the shared lock
            try:
                db_lock.acquire(True)
                cur.execute(sql, (music_name, comment_id, user_id, user_name, avatar_url, comment_time, liked_count, content))
            finally:
                db_lock.release()
        # page is the target comment's position counted from the end:
        # the last page is -1, the second to last -2, ...
        page = -(comments_sum // 20 + 1 - raw_page)
        # print(page)
        raw_page += 1
    conn.commit()  # commit the database changes
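# Worked example for the page bookkeeping above: with comments_sum = 95 there
# are 95 // 20 + 1 = 5 pages; raw_page = 0 gives page = -(5 - 0) = -5, i.e.
# the first page is the fifth from the end, while raw_page = 4 would give
# page = -1, the last page.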
# Fetch the second half of the comments
def get_the_second_half_comment(id, comments_sum, music_name):
    raw_page = 0
    # Count the pages in the first half and find where the second half starts;
    # guard against a first half smaller than one page, which would otherwise
    # leave the loop variable undefined.
    first_half_offsets = list(range(0, comments_sum // 2, 20))
    half_page = len(first_half_offsets)
    start = first_half_offsets[-1] + 20 if first_half_offsets else 0
    for i in range(start, comments_sum, 20):
        offset = i
        url, data = crypt_api(id, offset)
        json_text = get_json(url, data)
        json_dict = json.loads(json_text.decode("utf-8"))
        json_comment = json_dict['comments']
        for comment in json_comment:
            comment_id = comment['commentId']
            user_id = comment['user']['userId']
            user_name = comment['user']['nickname']
            avatar_url = comment['user']['avatarUrl']
            comment_time = comment['time']
            liked_count = comment['likedCount']
            content = comment['content']
            print(comment_id, user_id, user_name, avatar_url, comment_time, liked_count, content)
            # Store in SQLite under the shared lock
            try:
                db_lock.acquire(True)
                cur.execute(sql, (music_name, comment_id, user_id, user_name, avatar_url, comment_time, liked_count, content))
            finally:
                db_lock.release()
        page = -(comments_sum // 20 + 1 - raw_page) + half_page
        # print('《' + str(music_name) + '》' + '——' + ':', content, page)
        raw_page += 1
    conn.commit()  # commit the database changes
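# Design note on the half_page offset above: with comments_sum = 95 the first
# half covers 3 pages, so the second half starts at offset 60 and its first
# iteration gives page = -(5 - 0) + 3 = -2, the second-to-last page. The two
# halves therefore report consistent from-the-end positions if the
# commented-out t2 thread in get_music_info is enabled.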
# Build a random proxy pool from a local file
# raw_proxy_pool = []
# with open("/home/hardly/文档/proxy.txt") as fin:
#     for line in fin.readlines():
#         line = line.strip("\n")
#         pro = {'https': 'https:' + line}
#         # print(proxies)
#         raw_proxy_pool.append(pro)
index = 0  # used to sleep after every 1500 pages (30000 comments) crawled
def get_music_info():
    num = 0  # tally of all comments crawled
    try:
        offset = 0
        url, data = crypt_api(music_id, offset)  # returns url, data
        json_text = get_json(url, data)
        # json_dict is the dict holding all the comment data
        json_dict = json.loads(json_text.decode("utf-8"))
        comments_sum = json_dict['total']  # total number of comments
        print('《' + str(music_name) + '》 has {} comments in total, crawling........'.format(comments_sum))
        num += comments_sum  # summing every comments_sum gives the crawl total
        threads = []
        t1 = threading.Thread(target=get_the_first_half_comment, args=(music_id, comments_sum, music_name))
        threads.append(t1)
        # t2 = threading.Thread(target=get_the_second_half_comment, args=(music_id, comments_sum, music_name))
        # threads.append(t2)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        print('Successfully crawled ' + str(num) + ' comments so far')
        time.sleep(random.randint(1, 3))
    except Exception as e:
        print('An error occurred:', e)
if __name__ == '__main__':
    get_music_info()
    conn.close()