import re   #python正则表达式模块
import requests     #requests外部模块
import time
####################################页面规则####################################
# 首页
# http://www.htqyy.com/top/hot
# 第二页
# http://www.htqyy.com/top/musicList/hot?pageIndex=1&pageSize=20
# 第三页
# http://www.htqyy.com/top/musicList/hot?pageIndex=2&pageSize=20
####################################歌曲地址规则####################################
# 歌曲url: http://f2.htqyy.com/play7/20/mp3/2

def make_url(route,name,page,month):
    """Build the list-page URLs for the chart called ``name`` and scrape them.

    Each generated URL is handed to ``wash_data`` together with ``route`` and
    ``month``, which are passed through unchanged.

    Args:
        route: Passed through to ``wash_data``.
        name: Chart name typed by the user. Recognised values are "热播榜",
            "新曲榜", "最新推荐", "最新单曲" and "好听热歌榜".
        page: Number of list pages to visit, counted from page index 0.
            Ignored for "好听热歌榜", which has a single URL without paging.
        month: Passed through to ``wash_data``.

    Returns:
        None. An unrecognised ``name`` (or ``page`` <= 0 for a paged chart)
        results in no call to ``wash_data`` at all.
    """
    base = 'http://www.htqyy.com/top/musicList/'

    # Chart name -> path segment of its paged list endpoint.
    paged_charts = {
        "热播榜": 'hot',
        "新曲榜": 'new',
        "最新推荐": 'recommend',
        "最新单曲": 'latest',
    }

    if name in paged_charts:
        segment = paged_charts[name]
        for index in range(page):
            url = base + segment + '?pageIndex=' + str(index) + '&pageSize=20'
            wash_data(url, route, month)
    elif "好听热歌榜" == name:
        # This chart is served from one fixed URL; ``page`` does not apply.
        wash_data(base + 'gedan', route, month)

def wash_data(url,route,month):
    """Fetch one list page, extract song ids and titles, and download them.

    A URL ending in ``/gedan`` is requested with a POST that carries paging
    form fields and a mobile User-Agent, and the body is decoded with
    ``bytes.decode()``. Any other URL is fetched with a plain GET and read
    through ``Response.text``. The page is then scanned with two regular
    expressions and the matches are handed to ``download``.

    Args:
        url: List-page URL to fetch.
        route: Passed through to ``download``.
        month: Passed through to ``download``.

    Returns:
        None.
    """
    # Match on the path rather than on the URL's length, so the check does not
    # silently break if the scheme or host part of the URL ever changes.
    if url.endswith('/gedan'):
        data = {
            "pageIndex" : "0",       # POST request: the first page has pageIndex 0
            "pageSize" : "20",
            "cateId" : ""
        }
        headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
        }
        view_web = requests.post(url,data=data,headers=headers).content.decode()
    else:
        view_web = requests.get(url).text

    # Pick the song ids and titles out of the page markup.
    songs_sid = re.findall('<input type="checkbox" name="checked" checked="checked" value="(.*?)"><span', view_web)
    songs_title = re.findall('" target="play" title="(.*?)" sid="', view_web)

    download(songs_sid, songs_title, url,route,month)

def download(songs_sid,songs_title,url,route,month):
    """Download every listed song and save it as ``<route><title>.mp3``.

    Ids and titles are paired positionally. If the two lists differ in length
    only the pairs that exist in both are downloaded, instead of failing with
    an IndexError part-way through.

    Args:
        songs_sid: Song ids; each one is inserted into the mp3 URL.
        songs_title: Song titles, used for the file name and progress output.
        url: Unused; kept so existing callers keep working.
        route: Prefix prepended verbatim to each file name, so a directory
            path has to end with its path separator.
        month: Last path segment of the mp3 URL.

    Returns:
        None.
    """
    # Characters that are not allowed in Windows file names (this also covers
    # "/", which would otherwise be read as a directory separator).
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    for number, (sid, title) in enumerate(zip(songs_sid, songs_title), start=1):
        song_url = 'http://f2.htqyy.com/play7/' + str(sid) + '/mp3/' + str(month)
        song_bytes = requests.get(song_url).content

        print('正在下载第',str(number),'首>>>',title)

        # Only the file name is sanitised; ``route`` is used exactly as given.
        file_name = '{}.mp3'.format(title).translate(unsafe_chars)
        with open(route + file_name, "wb") as f:
            f.write(song_bytes)
        print(title,'下载完成')

        # Zero-length pause kept from the original; raise it to throttle requests.
        time.sleep(0)

if __name__ == '__main__':
    # Interactive entry point: ask for the target folder, the chart name, the
    # page count and the current month, then start scraping.

    # Raw string: the backslashes in the example Windows path are literal, and
    # writing them un-escaped in a normal string is an invalid escape sequence.
    print(r"歌曲默认保存至C:\Desktop\<你创建的文件夹>里面")
    route = input("输入歌曲保存的文件夹的完整路径(如上,反斜杠结尾,确保已经新建了该文件夹):")
    name = input("输入需要爬取的排行榜分类,歌曲名称或歌手:")
    # int() raises ValueError if the user types something that is not a number.
    page = int(input('输入需要爬取的页数:'))
    month = int(input('现在是几月份,输入错误则无法下载:'))
    make_url(route,name,page,month)

# 扫描二维码,在手机上阅读!
# 最后修改:2020 年 04 月 26 日 10:23 PM
# 如果觉得我的文章对你有用,请随意赞赏