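# First version of the song crawler for 好听音乐网 (htqyy.com); currently supports crawling the ranking charts (hot, new, recommended, latest).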
import os
import re  # Python regular-expression module
import time

import requests  # third-party requests module
####################################页面规则####################################
# 首页
# http://www.htqyy.com/top/hot
# 第二页
# http://www.htqyy.com/top/musicList/hot?pageIndex=1&pageSize=20
# 第三页
# http://www.htqyy.com/top/musicList/hot?pageIndex=2&pageSize=20
####################################歌曲地址规则####################################
# 歌曲url: http://f2.htqyy.com/play7/20/mp3/2
# Maps each supported chart name (as typed by the user) to the path segment
# that identifies the chart in its paginated listing URL.
_CHART_SLUGS = {
    '热播榜': 'hot',
    '新曲榜': 'new',
    '最新推荐': 'recommend',
    '最新单曲': 'latest',
}


def make_url(name, page, month):
    """Build the listing URL of every requested chart page and process it.

    Args:
        name: Chart name; must be one of the keys of ``_CHART_SLUGS``.
        page: Number of listing pages to crawl, starting at page index 0.
        month: Month number entered by the user; passed through unchanged
            to ``wash_data``.

    An unsupported ``name`` prints a notice and crawls nothing.
    """
    slug = _CHART_SLUGS.get(name)
    if slug is None:
        # Previously an unknown name fell through every branch silently,
        # leaving the user with no hint why nothing was downloaded.
        print('暂不支持的分类:', name)
        return
    for i in range(page):
        url = ('http://www.htqyy.com/top/musicList/' + slug
               + '?pageIndex=' + str(i) + '&pageSize=20')
        wash_data(url, month)
def wash_data(url, month):
    """Fetch one chart listing page, extract its songs and download them.

    Args:
        url: Address of the chart listing page to fetch.
        month: Month number entered by the user; forwarded to ``download``,
            which uses it as the last segment of each song URL.
    """
    # A timeout keeps an unresponsive server from hanging the crawl forever.
    view_web = requests.get(url, timeout=30).text
    # Pull the song ids and titles out of the page with regular expressions.
    songs_sid = re.findall(
        r'<input type="checkbox" name="checked" checked="checked" value="(.*?)"><span',
        view_web)
    songs_title = re.findall(r'" target="play" title="(.*?)" sid="', view_web)
    download(songs_sid, songs_title, url, month)


def download(songs_sid, songs_title, url, month=4,
             save_dir=r'A:\Desktop\music', delay=0):
    """Download each song as an MP3 file named after its title.

    Args:
        songs_sid: Song ids, in page order.
        songs_title: Song titles, in the same order as ``songs_sid``.
        url: Unused; kept so existing callers keep working.
        month: Last path segment of the song URL. Presumably the current
            month (the input prompt says a wrong month makes downloads
            fail) -- TODO confirm against the site. Defaults to the value
            that used to be hard-coded.
        save_dir: Directory the MP3 files are written to; created if it
            does not exist.
        delay: Seconds to pause after each song (0 disables the pause).

    A song whose request fails is reported and skipped; the remaining songs
    are still downloaded.
    """
    # zip() stops at the shorter list, so a page where the two patterns
    # matched a different number of times cannot raise IndexError.
    songs = list(zip(songs_sid, songs_title))
    if not songs:
        return
    os.makedirs(save_dir, exist_ok=True)
    for number, (sid, title) in enumerate(songs, start=1):
        song_url = ('http://f2.htqyy.com/play7/' + str(sid)
                    + '/mp3/' + str(month))
        print('正在下载第', str(number), '首>>>', title)
        try:
            response = requests.get(song_url, timeout=60)
            # Do not save an HTML error page under an .mp3 name.
            response.raise_for_status()
        except requests.RequestException as err:
            print(title, '下载失败:', err)
            continue
        # Replace characters that are not allowed in Windows file names.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
        with open(os.path.join(save_dir, safe_title + '.mp3'), 'wb') as f:
            f.write(response.content)
        print(title, '下载完成')
        if delay:
            time.sleep(delay)
if __name__ == '__main__':
    # Ask for the chart name, the number of pages and the current month,
    # then start the crawl with those answers.
    chart_name = input("输入需要爬取的排行榜分类,歌曲名称或歌手:")
    page_count = int(input('输入需要爬取的页数:'))
    current_month = int(input('现在是几月份,输入错误则无法下载:'))
    make_url(chart_name, page_count, current_month)