通过F12抓包获取XHR数据,发现评论在 get?csrf_token=
中,同时发现 params
和 encSecKey
是被加密过的
因此,需要找到未加密前的数据,以及如何加密,最后模拟实现加密,请求到网易拿到评论数据
从调用堆栈中,发现相应位置
找到程序还未加密的最后一个堆栈位置
因此确定,u0x.be1x
为加密步骤,开始分析
// Request wrapper. For URLs matching /api it gathers all request parameters
// into one object (i0x), encrypts that object with window.asrsea, and sends
// only the encrypted pair as a POST body; finally it delegates to czh5m.
// X0x: request URL string. e0x: request options object.
u0x.be1x = function(X0x, e0x) {
var i0x = {},
// Work on the result of NEJ.X({}, e0x) — presumably a shallow copy, so the caller's options are not modified (confirm).
e0x = NEJ.X({}, e0x),
mf4j = X0x.indexOf("?");
// Encrypt only when window.GEnc is set, the URL matches /api, e0x.noEnc is not set,
// and the header keyed by et2x.Ct2x is not equal to et2x.Hx4B.
if (window.GEnc && /(^|\.com)\/api/.test(X0x) && !(e0x.headers && e0x.headers[et2x.Ct2x] == et2x.Hx4B) && !e0x.noEnc) {
// Move the query string (everything after "?") out of the URL and into i0x;
// j0x.hg3x presumably parses a query string into an object (confirm).
if (mf4j != -1) {
i0x = j0x.hg3x(X0x.substring(mf4j + 1));
X0x = X0x.substring(0, mf4j)
}
// Merge e0x.query into i0x; it goes through j0x.hg3x first when j0x.fU2x(...) is truthy
// (presumably an "is string" check — confirm).
if (e0x.query) {
i0x = NEJ.X(i0x, j0x.fU2x(e0x.query) ? j0x.hg3x(e0x.query) : e0x.query)
}
// Same merge for e0x.data.
if (e0x.data) {
i0x = NEJ.X(i0x, j0x.fU2x(e0x.data) ? j0x.hg3x(e0x.data) : e0x.data)
}
// Add the CSRF token; u0x.gW3x("__csrf") presumably reads the __csrf cookie (confirm).
i0x["csrf_token"] = u0x.gW3x("__csrf");
// String pattern, so only the first "api" occurrence in the URL becomes "weapi".
X0x = X0x.replace("api", "weapi");
// The encrypted request is always a POST and carries no separate query.
e0x.method = "post";
delete e0x.query;
// Encryption step: the JSON text of i0x plus three bsG2x(...) values are passed to window.asrsea.
var bWf9W = window.asrsea(JSON.stringify(i0x), bsG2x(["流泪", "强"]), bsG2x(XH8z.md), bsG2x(["爱心", "女孩", "惊恐", "大笑"]));
// Only the encrypted pair is sent as the body; j0x.cr1x presumably serializes it (confirm).
e0x.data = j0x.cr1x({
params: bWf9W.encText,
encSecKey: bWf9W.encSecKey
})
}
// When the page is served from y.music.163.com, requests are redirected to interface.music.163.com.
var cdnHost = "y.music.163.com";
var apiHost = "interface.music.163.com";
if (location.host === cdnHost) {
X0x = X0x.replace(cdnHost, apiHost);
// Root-relative /api or /weapi paths get the API host prepended as a protocol-relative URL.
if (X0x.match(/^\/(we)?api/)) {
X0x = "//" + apiHost + X0x
}
// NOTE(review): presumably tells the request layer to send cookies cross-origin — confirm.
e0x.cookie = true
}
// Hand the (possibly rewritten) URL and options to czh5m, presumably the underlying request function.
czh5m(X0x, e0x)
};
经断点调试,发现在第20行(即 window.asrsea 调用处)进行了加密,将 i0x
计算后生成 bWf9W
临时变量,然后将 encText
和 encSecKey
赋给 e0x.data
的 params
和 encSecKey
,在这步拿出 i0x
的数据
找到加密函数
"""
/**
 * Build a random string of the requested length from the characters
 * a-z, A-Z and 0-9, picking each character with Math.random().
 *
 * @param {number} a - Number of characters to generate.
 * @returns {string} The random string; empty when `a` is not greater than 0.
 */
function a(a) {
    const alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
    let out = "";
    let count = 0;
    while (a > count) {
        const pos = Math.floor(Math.random() * alphabet.length);
        out += alphabet.charAt(pos);
        count += 1;
    }
    return out;
}
/**
 * Encrypt a string with CryptoJS AES in CBC mode, using the fixed IV
 * "0102030405060708".
 *
 * @param {string} a - Text to encrypt (parsed as UTF-8).
 * @param {string} b - Key string (parsed as UTF-8).
 * @returns {string} The cipher result converted with toString()
 *     (Base64 text under CryptoJS's default formatter).
 */
function b(a, b) {
    const key = CryptoJS.enc.Utf8.parse(b);
    const iv = CryptoJS.enc.Utf8.parse("0102030405060708");
    const plaintext = CryptoJS.enc.Utf8.parse(a);
    const options = { iv: iv, mode: CryptoJS.mode.CBC };
    return CryptoJS.AES.encrypt(plaintext, key, options).toString();
}
/**
 * RSA-encrypt a string with the page's RSA helpers.
 *
 * @param {string} a - Text to encrypt.
 * @param {string} b - First argument of RSAKeyPair (the encryption exponent).
 * @param {string} c - Third argument of RSAKeyPair (the modulus).
 * @returns {string} Whatever encryptedString(keyPair, a) returns.
 */
function c(a, b, c) {
    // setMaxDigits must run before the key pair is constructed, as in the page's code.
    setMaxDigits(131);
    const keyPair = new RSAKeyPair(b, "", c);
    return encryptedString(keyPair, a);
}
/**
 * Produce the encrypted request pair.
 *
 * The payload is AES-encrypted twice (first with key `g`, then with a fresh
 * 16-character random key `i`), and `i` itself is RSA-encrypted with `e`/`f`.
 *
 * @param {string} d - Payload text to encrypt.
 * @param {string} e - Passed to c() as its second argument.
 * @param {string} f - Passed to c() as its third argument.
 * @param {string} g - Key for the first AES pass.
 * @returns {{encText: string, encSecKey: string}} Encrypted payload and encrypted random key.
 */
function d(d, e, f, g) {
    const i = a(16);
    const firstPass = b(d, g);
    const result = {};
    result.encText = b(firstPass, i);
    result.encSecKey = c(i, e, f);
    return result;
}
对于此页面的 function d(d, e, f, g)
:
d
为数据
e
为bsG2x(["流泪", "强"])
=010001
f
为bsG2x(XH8z.md)
=一串字符串
g
为bsG2x(["爱心", "女孩", "惊恐", "大笑"])
=0CoJUm6Qyw8W8jud
// Annotated copy of d.
// d: payload, e: "010001", f: a long constant string, g: "0CoJUm6Qyw8W8jud"
function d(d, e, f, g) {
    var h = {},                          // empty result object
        i = a(16);                       // 16-character random value; pin i to a constant when reproducing the request
    return h.encText = b(d, g),          // first AES pass; g is the key
        h.encText = b(h.encText, i),     // second AES pass; i is the key, and the result is sent as params
        h.encSecKey = c(i, e, f),        // sent as encSecKey; e and f are constants, so a fixed i should give a fixed value
        h
}
先分析函数 function c
,发现传入参数 e
和 f
是定死的,如果第一项参数 i
固定,那么该函数的返回值也就固定了
// Annotated copy of c.
// b and c are constants, so a is the only varying argument: a fixed a gives a
// fixed return value, i.e. this function adds no randomness of its own.
function c(a, b, c) {
    var d, e;
    return setMaxDigits(131),
        d = new RSAKeyPair(b, "", c),
        e = encryptedString(d, a)
}
最后来看函数 a
和 b
:
a生成了一个16位的随机字符串(由大小写字母和数字组成),这里可以通过F12获取到一个定值,那么剩下的变量只有b中的AES加密了
// Annotated copy of b.
// a: the data to encrypt, b: the key
function b(a, b) {
    var c = CryptoJS.enc.Utf8.parse(b),
        d = CryptoJS.enc.Utf8.parse("0102030405060708"),
        e = CryptoJS.enc.Utf8.parse(a),          // e is the data
        f = CryptoJS.AES.encrypt(e, c, {         // AES encryption; c is the key
            iv: d,                               // initialization vector
            mode: CryptoJS.mode.CBC              // cipher mode: CBC
        }
        );
    return f.toString()
}
通过以上分析可以得到爬虫代码:
import requests
# pip install pycryptodome
from Crypto.Cipher import AES
from base64 import b64encode
import json
# Comment-list endpoint (the "weapi" form of the URL); csrf_token is left empty.
url = "https://music.163.com/weapi/comment/resource/comments/get?csrf_token="
# Headers sent with the request; the Referer points at the same song id used in `data`.
headers = {
"Referer": "https://music.163.com/#/song?id=1857602696",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67"
}
# Request method: POST
# Unencrypted payload; it is JSON-serialized and encrypted before being sent.
data = {
"rid": "R_SO_4_1857602696",
"threadId": "R_SO_4_1857602696",
"pageNo": "1",
"pageSize": "20",
"cursor": "-1",
"offset": "0",
"orderType": "1",
"csrf_token": ""
}
# Values used by the encryption step.
# e, f: the constants the page passes to its function d as the 2nd and 3rd arguments
# (forwarded to the RSA function c); not referenced by the functions visible here.
e = "010001"
f = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# g: key for the first AES pass.
g = "0CoJUm6Qyw8W8jud"
# i: fixed stand-in for the page's 16-character random key; used for the second AES pass.
i = "7MdgDcaOz6fkX1ZB"
def get_encSecKey():
    """Return the hard-coded ``encSecKey`` form value.

    The value is a constant rather than being computed here; presumably it is
    the value observed in the browser for the fixed key ``i`` (confirm).
    """
    enc_sec_key = "1df401f4e77fc902f766446af730404c7b8137ed65798effb5ac4fca109ff24ff86696a4fa124505ad324655ddef3d1edf7e6effc7eb03ab906e59d93f46047d747a54e291d79e5d9650e378cad5a4e511e32172cbc63a4ad99fcaf233be481e144dc255febeb45ec529a08107a8ae5ed58a2a483694b7e31b04a34030116ecf"
    return enc_sec_key
def get_params(data):
    """Build the ``params`` form value from the JSON text of the payload.

    Mirrors the page's function ``d``: the text is encrypted with ``g`` and
    the result is encrypted again with ``i``.

    :param data: payload as a string (the JSON-serialized request data).
    :return: the doubly encrypted text, sent as ``params``.
    """
    return enc_params(enc_params(data, g), i)
def to_16(data):
    """Pad ``data`` so that its UTF-8 encoding is a multiple of 16 bytes.

    PKCS#7-style padding: ``pad`` copies of ``chr(pad)`` are appended, with
    ``pad`` in 1..16, so already-aligned input still gets a full 16-character
    block.

    The pad length is derived from the encoded byte length, not the character
    count, because the caller encrypts ``data.encode("utf-8")``; counting
    characters would leave non-ASCII text misaligned.

    :param data: text to pad.
    :return: the padded text.
    """
    pad = 16 - len(data.encode("utf-8")) % 16
    return data + chr(pad) * pad
def enc_params(data, key):
    """AES-CBC encrypt ``data`` with ``key`` and return Base64 text.

    Uses the fixed IV "0102030405060708", matching the page's function ``b``.

    :param data: text to encrypt; it is padded with ``to_16`` first because
        the cipher requires input whose length is a multiple of 16.
    :param key: key text, encoded as UTF-8.
    :return: Base64-encoded ciphertext as ``str``.
    """
    padded = to_16(data)
    iv_bytes = "0102030405060708".encode("utf-8")
    key_bytes = key.encode("utf-8")
    cipher = AES.new(key=key_bytes, IV=iv_bytes, mode=AES.MODE_CBC)
    encrypted = cipher.encrypt(padded.encode("utf-8"))
    return b64encode(encrypted).decode("utf-8")
# Send the encrypted form fields. The explicit timeout keeps the script from
# blocking forever if the server never answers (requests has no default timeout).
resp = requests.post(
    url,
    headers=headers,
    data={
        "params": get_params(json.dumps(data)),
        "encSecKey": get_encSecKey(),
    },
    timeout=10,
)
print(resp.text)
结果如下,成功爬取评论: