学渣的Python学习日记之爬取QQ音乐
15
2020-12-02
正文
爬取QQ音乐时,大部分时间都花在弄清其接口的调用逻辑上;找到规律后,用Python模拟请求并抓取数据就没什么难度了。
import urllib.request
import requests
import os
import time
import json
import re
import execjs
from bs4 import BeautifulSoup
from urllib.parse import urlparse
# Endpoint of the keyword-search API used by searchMusic().
searchUrl = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp"

# HTTP headers shared by the search request and the play-URL request.
# NOTE(review): "Host" is pinned to u.y.qq.com although searchUrl lives on
# c.y.qq.com, and the cache-control header is listed twice with different
# casing -- confirm both are intentional before changing either.
musicHeaders = {
    "Content-Type": "application/x-www-form-urlencoded",
    "Referer": "https://y.qq.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept": "*/*",
    "Cache-Control": "no-cache",
    "Host": "u.y.qq.com",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "cache-control": "no-cache",
}

# Base directory for the files this script writes.
musicPath = "music"
def searchMusic(title, page=1, num=10):
    """Run a keyword search against ``searchUrl`` and return the raw body.

    Args:
        title: Search keyword, sent as the ``w`` query parameter.
        page: Page number, sent as ``p``.
        num: Page size, sent as ``n``.

    Returns:
        The response body as text, or ``""`` if building or sending the
        request raised; the exception is written to the run log.
    """
    query = {
        "ct": 24,  # unknown
        "qqmusic_ver": 1298,  # unknown
        "new_json": 1,  # unknown
        "remoteplace": 1,  # unknown
        "searchid": "01215",  # random value
        "t": 0,  # unknown
        "aggr": 1,  # unknown
        "cr": 1,  # unknown
        "catZhida": 1,  # unknown
        "lossless": 0,  # unknown
        "flag_qc": 0,  # unknown
        "p": page,  # page number
        "n": num,  # page size
        "w": title,  # keyword
        "g_tk_new_20200303": "1114457699",  # unknown
        "g_tk": "1114457699",  # unknown
        "loginUin": "0",  # login id (QQ number)
        "hostUin": "0",  # unknown
        "format": "json",  # response format
        "inCharset": "utf-8",  # request charset
        "outCharset": "utf-8",  # response charset
        "notice": "0",  # unknown
        "platform": "yqq.json",  # unknown
        "needNewCode": "0",  # unknown
    }
    try:
        # The query is pre-encoded and handed to requests as a string.
        encoded = urllib.parse.urlencode(query)
        reply = requests.get(searchUrl, params=encoded,
                             headers=musicHeaders, timeout=10)
        return reply.text
    except Exception as err:
        log(f"获取{searchUrl}页面出现异常:{err}")
        return ""
def log(content):
    """Append one UTF-8 line (terminated by CRLF) to ``run.log`` in ``musicPath``.

    The directory is created on demand and the path is joined with the
    platform separator, so logging works on any OS and never fails merely
    because the output directory does not exist yet.

    Args:
        content: Text of the log line, without a trailing newline.
    """
    os.makedirs(musicPath, exist_ok=True)
    logFile = os.path.join(musicPath, "run.log")
    # Binary append keeps the explicit "\r\n" terminator untouched.
    with open(logFile, "ab") as file:
        file.write((content + "\r\n").encode("utf-8"))
def findValues(a, n):
    """Return ``item[n]`` for each mapping in ``a`` that contains key ``n``.

    Items without the key are skipped; the order of ``a`` is preserved.
    """
    return [entry[n] for entry in a if n in entry.keys()]
def readContent(path, encoding=None):
    """Return the full text of the file at ``path``.

    Args:
        path: File to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        The file content as a string.
    """
    # The context manager closes the handle even if read() raises.
    with open(path, encoding=encoding) as f:
        return f.read()
def getSign(data):
    """Compute the request signature for ``data``.

    Loads ``index.umd_20200305.js`` from the working directory, compiles it
    with execjs and returns the result of calling its ``getSign`` function
    with ``data``. The script is read and compiled on every call.
    """
    script = readContent("index.umd_20200305.js")
    return execjs.compile(script).call("getSign", data)
def getMusicUrl(data):
    """Request play-URL information for the songs described by ``data``.

    Args:
        data: Request payload; it is signed with getSign() and sent as
            compact JSON in the ``data`` query parameter.

    Returns:
        The response body as text, or ``""`` if the HTTP request raised;
        the exception is written to the run log.
    """
    endpoint = "https://u.y.qq.com/cgi-bin/musics.fcg"
    # The signature is computed before the payload is serialised, and the
    # key order below is the order in which the query string is emitted.
    query = urllib.parse.urlencode({
        "-": "getplaysongvkey5717492774896651",  # unknown
        "g_tk": "1948815153",  # unknown
        "sign": getSign(data),
        "loginUin": "0",  # login id
        "hostUin": "0",  # unknown
        "format": "json",  # response format
        "inCharset": "utf8",  # request charset
        "outCharset": "utf-8",  # response charset
        "notice": "0",  # unknown
        "platform": "yqq.json",  # unknown
        "needNewCode": "0",  # unknown
        "data": json.dumps(data, separators=(',', ':')),
    })
    target = endpoint + "?" + query
    try:
        reply = requests.get(target, headers=musicHeaders, timeout=10)
        return reply.text
    except Exception as err:
        log(f"获取{endpoint}页面出现异常:{err}")
        return ""
def schedule(a, b, c):
    """Print download progress; usable as a ``urlretrieve`` report hook.

    Args:
        a: Number of blocks transferred so far.
        b: Block size in bytes.
        c: Total size of the remote file; zero or negative when the size
            is empty or unknown.

    While the transfer is in progress the line ends with ``"\\r"`` so the
    next call overwrites it; once 100% is reached it ends with a newline.
    When the total size is not positive, the transferred byte count is
    printed instead of a percentage.
    """
    if c <= 0:
        # No usable total: a percentage would divide by zero or be negative.
        print(' 🚀 %d bytes' % (a * b), end="\r")
        return
    per = 100.0 * a * b / c
    if per >= 100:
        # The last block usually overshoots the total; clamp the display.
        print(' 🚀 %.2f%%' % 100, end="\n")
    else:
        print(' 🚀 %.2f%%' % per, end="\r")
def downloadFile(url, path):
    """Download ``url`` to ``path``, reporting progress through schedule().

    Returns:
        True when the transfer finished, False when it raised; the
        exception is written to the run log.
    """
    try:
        urllib.request.urlretrieve(url, path, schedule)
    except Exception as err:
        log("下载" + url + "音乐出现异常:" + str(err))
        return False
    return True
# Module-level search defaults. main() reads searchNum as the page size; it
# keeps its own local page counter and keyword.
searchPage, searchNum, searchTitle = 1, 10, ""
def _parseSongIndex(text, count):
    """Return ``text`` as a 1-based position in a list of ``count`` songs, or None."""
    try:
        value = int(text)
    except ValueError:
        return None
    return value if 1 <= value <= count else None


def _safeFileName(name):
    """Replace characters that are not allowed in file names with ``_``."""
    return name.translate(str.maketrans({ch: "_" for ch in '\\/:*?"<>|'}))


def _songSection(result):
    """Return the ``data.song`` mapping of a parsed search response, or None."""
    try:
        section = result["data"]["song"]
    except (KeyError, TypeError):
        return None
    return section if isinstance(section, dict) else None


def _downloadParts(music):
    """Return ``(host, purl)`` from a parsed play-URL response, or None if malformed."""
    try:
        data = music["req_0"]["data"]
        return data["sip"][1], data["midurlinfo"][0]["purl"]
    except (KeyError, IndexError, TypeError):
        return None


def main():
    """Interactive loop: search for songs, list one page, download a choice.

    At the keyword prompt ``e`` quits, ``n``/``p`` move one page forward or
    back for the current keyword, and anything else becomes the new keyword
    (starting again at page 1). At the index prompt the same three commands
    are accepted; otherwise the input must be a number between 1 and the
    number of listed songs. Files are saved as
    ``<singers> - <title>.m4a`` inside ``musicPath``.
    """
    paging = False  # True when the next pass should skip the keyword prompt
    searchPage = 1
    searchTitle = ""
    while True:
        if not paging:
            print("🔸 n下一页 p上一页 e退出")
            title = input("🔸 请输入要下载的歌曲名:")
            if title == "e":
                break
            if title == "n":
                searchPage += 1
            elif title == "p":
                # Never go below the first page.
                searchPage = max(1, searchPage - 1)
            else:
                searchTitle = title
                searchPage = 1
        # The flag only applies to a single pass through the loop.
        paging = False
        if not searchTitle:
            print("歌曲名不能为空")
            continue

        response = searchMusic(searchTitle, searchPage, searchNum)
        try:
            j = json.loads(response)
        except ValueError as e:
            # searchMusic returns "" on request failure, which is not JSON.
            log("解析搜索结果失败:" + str(e))
            print("❌ 搜索失败,请检查日志")
            continue

        songList = []
        section = _songSection(j)
        if section is not None:
            print("页码/页长:"+str(searchPage) +
                  "/"+str(searchNum)+" 共"+str(section.get("totalnum", 0))+"条数据")
            for index, _song in enumerate(section.get("list", []), start=1):
                singer = findValues(_song["singer"], "title")
                songList.append((_song["mid"], singer, _song["title"]))
                print("🍀 "+str(index)+"."+"&".join(singer) +
                      " - " + _song["title"])
        if not songList:
            # Nothing to choose from; go back to the keyword prompt.
            print("❌ 没有找到歌曲")
            continue

        i = input("🔸 输入需要下载的歌曲序号=>")
        _i = None
        while i not in ("e", "n", "p"):
            _i = _parseSongIndex(i, len(songList))
            if _i is not None:
                break
            i = input("❌ 请输入正确的歌曲序号=>")
        if i == "e":
            break
        if i == "n":
            searchPage += 1
            paging = True
            continue
        if i == "p":
            searchPage = max(1, searchPage - 1)
            paging = True
            continue

        mid, singer, songTitle = songList[_i-1]
        # Build the request payload; key order is kept as-is because the
        # payload is both signed and serialised by getMusicUrl().
        data = {"req_0": {"module": "vkey.GetVkeyServer", "method": "CgiGetVkey", "param": {"guid": "1643097364", "songmid": [
            mid], "songtype": [0], "uin": "0", "loginflag": 1, "platform": "20"}}, "comm": {"uin": 0, "format": "json", "ct": 24, "cv": 0}}
        musicResponse = getMusicUrl(data)
        if not musicResponse:
            print("❌ 查询下载地址失败,请检查日志")
            continue
        try:
            music = json.loads(musicResponse)
        except ValueError:
            print("❌ 返回下载信息错误")
            continue
        parts = _downloadParts(music)
        if parts is None:
            print("❌ 返回下载信息错误")
            continue
        host, purl = parts
        if not purl:
            # An empty purl is reported to the user as a VIP-only track.
            print("❌ VIP版权音乐")
            continue

        os.makedirs(musicPath, exist_ok=True)
        fileName = _safeFileName("&".join(singer)+" - "+songTitle+".m4a")
        if downloadFile(host + purl, os.path.join(musicPath, fileName)):
            print("✔ 下载成功")
        else:
            print("❌ 下载失败")
            # Show the raw response to help diagnose the failure.
            print(musicResponse)


if __name__ == "__main__":
    main()
附件
- 0
- 0
-
分享