网易云歌手爬取
程序员文章站
2022-03-08 08:01:19
#需求:获取每一个大分类下的每个子分类下的所有歌手名。思路:1. 获取大分类的 URL 并请求;2. 获取小分类的 URL 并请求;3. 获取歌手名。完整代码(使用 requests 与 lxml)见下文。
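# Goal: collect every artist name under each sub-category of each top-level category.
# Approach:
# 1. Get the top-level category URLs and request each one.
# 2. Get the sub-category URLs and request each one.
# 3. Extract the artist names.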
from urllib.parse import urljoin

import requests
from lxml import etree
def get_requests(url, timeout=10):
    """Download ``url`` and return its body parsed as an lxml HTML tree.

    The request is sent with the module-level ``headers`` mapping, which this
    script assigns in its ``__main__`` section before the crawl starts.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The result of ``etree.HTML`` applied to the response text.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return etree.HTML(response.text)
def get_big_type(url):
    """Visit every top-level category linked from the page at ``url``.

    Each category link found on the page is resolved to an absolute URL and
    handed to ``get_small_type``.

    Args:
        url: Address of the artist index page.
    """
    # Parse the index page into an HTML element tree.
    html = get_requests(url)
    # One <li> per top-level category.
    for big_type in html.xpath('//div[@class="blk"]/ul/li'):
        hrefs = big_type.xpath('./a/@href')
        if not hrefs:
            # An entry without a link has nothing to follow; skip it instead
            # of aborting the whole crawl with an IndexError.
            continue
        # urljoin resolves the href against the site root, so a leading "/"
        # does not produce "//" and an absolute href is kept intact.
        big_href_full = urljoin('https://music.163.com/', hrefs[0])
        get_small_type(big_href_full)
def get_small_type(big_href_full):
    """Visit every sub-category linked from one top-level category page.

    For each sub-category entry the link text and the absolute link target
    are passed to ``get_singer``.

    Args:
        big_href_full: Absolute URL of the top-level category page.
    """
    html = get_requests(big_href_full)
    # One <li> per sub-category in the selector list.
    small_li_list = html.xpath(
        '//ul[@class="n-ltlst f-cb" and @id="initial-selector"]/li'
    )
    for small_li in small_li_list:
        names = small_li.xpath('./a/text()')
        hrefs = small_li.xpath('./a/@href')
        if not names or not hrefs:
            # Skip entries that lack a label or a link rather than letting an
            # IndexError stop the remaining sub-categories.
            continue
        # urljoin resolves the href against the site root, so a leading "/"
        # does not produce "//" and an absolute href is kept intact.
        small_href_full = urljoin('https://music.163.com/', hrefs[0])
        get_singer(names[0], small_href_full)
def get_singer(small_name, small_href_full):
    """Extract the artist names from one sub-category page and save them.

    Args:
        small_name: Label of the sub-category; forwarded to ``write_to_txt``.
        small_href_full: Absolute URL of the sub-category page.
    """
    # Original author's notes on the three layouts used by the artist list:
    #   first five:  //ul[@id="m-artist-box"]/li/p/a/text()
    #   middle five: //ul[@id="m-artist-box"]/li[@class="line"]/p/a/text()
    #   the others:  //ul[@id="m-artist-box"]/li[@class="sml"]/a/text()
    # Selecting every <a> below an <li> covers all three with one query.
    artist_xpath = '//ul[@id="m-artist-box"]/li//a/text()'
    page = get_requests(small_href_full)
    # Persist the names under this sub-category's label.
    write_to_txt(small_name, page.xpath(artist_xpath))
def write_to_txt(small_name, singer):
    """Append one sub-category record to ``网易云歌手.txt``.

    The record is written as the ``str()`` form of a two-key dict, followed
    by a newline.

    Args:
        small_name: Value stored under the "小标题" key.
        singer: Value stored under the "歌手" key.
    """
    record = {"小标题": small_name, "歌手": singer}
    with open('网易云歌手.txt', 'a', encoding='utf_8') as out:
        # print() emits str(record) plus a trailing newline.
        print(record, file=out)
if __name__ == '__main__':
    # Request headers; get_requests reads this module-level name.
    headers = {}
    headers['user-agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    # Artist index page where the crawl starts.
    base_url = 'https://music.163.com/discover/artist/'
    get_big_type(base_url)
本文地址:https://blog.csdn.net/program_Mr_lu/article/details/110873540