月色真美

月色真美

学渣的Python学习日记之爬取QQ音乐

15
2020-12-02

正文

爬取QQ音乐时,大部分时间其实都花在了解其接口的对接逻辑上;找到规律后,再用Python模拟请求进行抓取处理,就没什么难度了。

import urllib.request
import requests
import os
import time
import json
import re
import execjs
from bs4 import BeautifulSoup
from urllib.parse import urlparse
# Endpoint queried by searchMusic() for keyword search.
searchUrl = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp"
# Request headers shared by searchMusic() and getMusicUrl().
# NOTE(review): "Host" is fixed to u.y.qq.com while searchUrl points at
# c.y.qq.com -- confirm the search endpoint accepts this value.
# NOTE(review): 'Cache-Control' and 'cache-control' differ only in case, so
# they are two separate dict keys carrying the same value.
musicHeaders = {
    "Content-Type": "application/x-www-form-urlencoded",
    "Referer": "https://y.qq.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Accept-Language": 'zh-CN,zh;q=0.9',
    'Accept': "*/*",
    'Cache-Control': "no-cache",
    'Host': "u.y.qq.com",
    'Accept-Encoding': "gzip, deflate",
    'Connection': "keep-alive",
    'cache-control': "no-cache"
}
# Directory prefix that log() uses for run.log.
musicPath = "music"
def searchMusic(title, page=1, num=10):
    """Query the search endpoint for ``title`` and return the raw response text.

    Args:
        title: Keyword to search for.
        page: Page number to request.
        num: Page length (results per page).

    Returns:
        The response body as text, or "" when the request raised; in that
        case the error is recorded through log().
    """
    url = searchUrl
    fields = {
        "ct": 24,  # unknown
        "qqmusic_ver": 1298,  # unknown
        "new_json": 1,  # unknown
        "remoteplace": 1,  # unknown
        "searchid": "01215",  # random value
        "t": 0,  # unknown
        "aggr": 1,  # unknown
        "cr": 1,  # unknown
        "catZhida": 1,  # unknown
        "lossless": 0,  # unknown
        "flag_qc": 0,  # unknown
        "p": page,  # page number
        "n": num,  # page length
        "w": title,  # keyword
        "g_tk_new_20200303": "1114457699",  # unknown
        "g_tk": "1114457699",  # unknown
        "loginUin": "0",  # login ID (QQ number)
        "hostUin": "0",  # unknown
        "format": "json",  # response format
        "inCharset": "utf-8",  # charset of the request strings
        "outCharset": "utf-8",  # charset of the response strings
        "notice": "0",  # unknown
        "platform": "yqq.json",  # unknown
        "needNewCode": "0"  # unknown
    }
    try:
        # The query is passed to requests as an already-encoded string.
        query = urllib.parse.urlencode(fields)
        reply = requests.get(url, params=query,
                             headers=musicHeaders, timeout=10)
        return reply.text
    except Exception as err:
        log("获取" + url + "页面出现异常:"+str(err))
        return ""
def log(content):
    """Append ``content`` as one UTF-8, CRLF-terminated line to run.log under musicPath.

    The directory is created when missing, and the path is assembled with
    os.path.join so the file lands inside that directory on every platform
    (a literal backslash is only a separator on Windows).

    Args:
        content: Text of the log line, without the line terminator.
    """
    if musicPath:
        os.makedirs(musicPath, exist_ok=True)
    with open(os.path.join(musicPath, "run.log"), "ab") as file:
        file.write((content + "\r\n").encode("utf-8"))
def findValues(a, n):
    """Collect the value stored under key ``n`` from every mapping in ``a`` that has it.

    Args:
        a: Iterable of mappings.
        n: Key to look up in each mapping.

    Returns:
        A list of the matching values, in the order the mappings appear;
        mappings without the key are skipped.
    """
    return [entry[n] for entry in a if n in entry.keys()]
def readContent(path):
    """Return the whole text of the file at ``path``, decoded as UTF-8.

    The file is opened in a ``with`` block so the handle is released even
    when reading fails, and the encoding is fixed to UTF-8 (the encoding
    this script uses for the text it writes) instead of depending on the
    platform locale.

    Args:
        path: Path of the file to read.

    Returns:
        The file content as a string.
    """
    with open(path, encoding="utf-8") as f:
        return f.read()
def getSign(data):
    """Compute the request signature for ``data``.

    Loads index.umd_20200305.js from the working directory, compiles it
    with execjs and calls the script's ``getSign`` function on ``data``.

    Args:
        data: Request payload handed to the JavaScript ``getSign`` function.

    Returns:
        Whatever the JavaScript ``getSign`` call returns.
    """
    script = readContent("index.umd_20200305.js")
    return execjs.compile(script).call("getSign", data)
def getMusicUrl(data):
    """Send ``data`` to the musics.fcg endpoint and return the raw response text.

    The query carries a signature from getSign(data) and ``data`` itself
    serialised as compact JSON.

    Args:
        data: Request payload (a JSON-serialisable structure).

    Returns:
        The response body as text, or "" when the request raised; in that
        case the error is recorded through log().
    """
    url = "https://u.y.qq.com/cgi-bin/musics.fcg"
    fields = {
        "-": "getplaysongvkey5717492774896651",  # unknown
        "g_tk": "1948815153",  # unknown
        "sign": getSign(data),
        "loginUin": "0",  # login ID
        "hostUin": "0",  # unknown
        "format": "json",
        "inCharset": "utf8",
        "outCharset": "utf-8",  # charset of the response strings
        "notice": "0",  # unknown
        "platform": "yqq.json",  # unknown
        "needNewCode": "0",  # unknown
        "data": json.dumps(data, separators=(',', ':')),
    }
    # The query string is appended by hand rather than passed via params=.
    target = url + "?" + urllib.parse.urlencode(fields)
    try:
        reply = requests.get(target, headers=musicHeaders, timeout=10)
        return reply.text
    except Exception as err:
        log("获取" + url + "页面出现异常:"+str(err))
        return ""
def schedule(a, b, c):
    """Progress hook for urllib.request.urlretrieve; prints the completion percentage.

    Intermediate updates end with a carriage return so they overwrite each
    other on one terminal line; the final 100% update ends with a newline.

    Args:
        a: Number of data blocks transferred so far.
        b: Size of one data block.
        c: Size of the remote file. urlretrieve may pass a non-positive
            value when the size is not known.
    """
    if c <= 0:
        # Without a usable total a percentage would divide by zero or come
        # out negative, so report the transferred amount instead.
        print(' 🚀 %d bytes' % (a * b), end="\r")
        return
    # The last block usually overshoots the file size; cap at 100%.
    per = min(100.0, 100.0 * a * b / c)
    print(' 🚀 %.2f%%' % per, end="\n" if per >= 100 else "\r")
def downloadFile(url, path):
    """Download ``url`` to ``path``, reporting progress through schedule().

    Args:
        url: Address of the file to fetch.
        path: Local file path to write to.

    Returns:
        True when the download completed, False when it raised; in that
        case the error is recorded through log().
    """
    try:
        urllib.request.urlretrieve(url, path, schedule)
    except Exception as err:
        log("下载" + url + "音乐出现异常:"+str(err))
        return False
    return True
# Module-level search defaults. main() reads searchNum (the page length) from
# here; it assigns its own local searchPage and searchTitle, so these two
# module-level values are not modified by it.
searchPage = 1
searchNum = 10
searchTitle = ""
def _parseSongList(response):
    """Turn the search response text into ``(songs, total)``; ``songs`` is None if the payload is unusable."""
    try:
        song = json.loads(response)["data"]["song"]
        songs = [
            (item["mid"], findValues(item["singer"], "title"), item["title"])
            for item in song["list"]
        ]
        return songs, song["totalnum"]
    except (ValueError, KeyError, TypeError, AttributeError):
        return None, 0


def _extractDownloadUrl(musicResponse):
    """Return the download URL from the vkey response, "" if the purl is empty, None if the payload is malformed."""
    try:
        info = json.loads(musicResponse)["req_0"]["data"]
        sip = info["sip"]
        purl = info["midurlinfo"][0]["purl"]
        # The second server entry is the one used, so two entries are needed.
        if len(sip) < 2:
            return None
        return sip[1] + purl if purl else ""
    except (ValueError, KeyError, IndexError, TypeError):
        return None


def _parseIndex(text, count):
    """Return ``text`` as a 1-based index within ``1..count``, or None when it is not one."""
    try:
        value = int(text)
    except ValueError:
        return None
    return value if 1 <= value <= count else None


def main():
    """Run the interactive search-and-download loop until the user enters "e".

    Each round asks for a song title (or "n"/"p" to page through the current
    search), lists the hits, asks for the 1-based number of the hit to
    download, resolves its URL through getMusicUrl() and saves the file under
    musicPath as "<singers> - <title>.m4a".
    """
    searchPage = 1
    # Must be initialised here: the name is assigned further down and is
    # therefore local, so paging before entering any title would otherwise
    # read an unbound variable.
    searchTitle = ""
    # True right after paging from the song-number prompt: repeat the search
    # for the same title without asking for a title again.
    skipTitlePrompt = False
    # The download target lives in this directory; make sure it exists.
    os.makedirs(musicPath, exist_ok=True)
    while True:
        if not skipTitlePrompt:
            print("🔸 n下一页 p上一页 e退出")
            title = input("🔸 请输入要下载的歌曲名:")
            if title == "e":
                return
            if title == "n":
                searchPage += 1
            elif title == "p":
                # Never page below the first page.
                searchPage = max(1, searchPage - 1)
            else:
                searchTitle = title
                # A new query starts from its first page.
                searchPage = 1
        skipTitlePrompt = False
        if not searchTitle:
            print("歌曲名不能为空")
            continue

        response = searchMusic(searchTitle, searchPage, searchNum)
        if not response:
            # searchMusic() returns "" after logging a request failure.
            print("❌ 搜索失败,请检查日志")
            continue
        songList, total = _parseSongList(response)
        if songList is None:
            print("❌ 返回搜索信息错误")
            continue
        print("页码/页长:"+str(searchPage) +
              "/"+str(searchNum)+" 共"+str(total)+"条数据")
        if not songList:
            # Nothing to choose from, so do not ask for a song number.
            print("❌ 没有找到相关歌曲")
            continue
        for index, (_mid, singers, songTitle) in enumerate(songList, 1):
            print("🍀 "+str(index)+"."+"&".join(singers) +
                  " - " + songTitle)

        choice = input("🔸 输入需要下载的歌曲序号=>")
        # Input is parsed with int() rather than evaluated, and must fall
        # inside 1..len(songList); the commands stay available on re-prompt.
        while choice not in ("e", "n", "p") and _parseIndex(choice, len(songList)) is None:
            choice = input("❌ 请输入正确的歌曲序号=>")
        if choice == "e":
            return
        if choice == "n":
            searchPage += 1
            skipTitlePrompt = True
            continue
        if choice == "p":
            searchPage = max(1, searchPage - 1)
            skipTitlePrompt = True
            continue
        mid, singers, songTitle = songList[_parseIndex(choice, len(songList)) - 1]

        # Build the request payload; key order is kept as-is because the
        # payload is both signed and serialised by getMusicUrl().
        data = {"req_0": {"module": "vkey.GetVkeyServer", "method": "CgiGetVkey", "param": {"guid": "1643097364", "songmid": [
            mid], "songtype": [0], "uin": "0", "loginflag": 1, "platform": "20"}}, "comm": {"uin": 0, "format": "json", "ct": 24, "cv": 0}}
        musicResponse = getMusicUrl(data)
        if len(musicResponse) == 0:
            print("❌ 查询下载地址失败,请检查日志")
            continue
        downloadUrl = _extractDownloadUrl(musicResponse)
        if downloadUrl is None:
            print("❌ 返回下载信息错误")
            continue
        if not downloadUrl:
            print("❌ VIP版权音乐")
            continue

        # Replace characters that are not allowed in file names (or that
        # would be read as path separators) before building the target path.
        fileName = re.sub(r'[\\/:*?"<>|]', "_",
                          "&".join(singers)+" - "+songTitle+".m4a")
        if downloadFile(downloadUrl, os.path.join(musicPath, fileName)):
            print("✔ 下载成功")
        else:
            print("❌ 下载失败")
            print(musicResponse)
# Start the interactive downloader.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so this call
# also runs when the file is imported as a module.
main()

附件