欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

网易云歌手爬取

程序员文章站 2022-03-08 08:01:19
#需求:获取每一个大分类下的每个子分类下的所有歌手名#思路:#1.获取大分类的URL,并请求#2.获取小分类的URL,并请求#3.获取歌手名import requestsfrom lxml import etree#定义请求函数def get_requests(url): response = requests.get(url=url, headers=headers) return etree.HTML(response.text)def get_big_typ...
#需求:获取每一个大分类下的每个子分类下的所有歌手名

#思路:
#1.获取大分类的URL,并请求
#2.获取小分类的URL,并请求
#3.获取歌手名


import requests
from lxml import etree

#定义请求函数
def get_requests(url, timeout=10):
    """Download *url* and return its body parsed as an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, passed straight through to
            ``requests.get``. Defaults to 10.

    Returns:
        The result of ``etree.HTML`` applied to the response text.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            *timeout* seconds.
    """
    # ``headers`` is a module-level global; it is looked up at call time.
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would freeze the whole crawl on one bad page.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return etree.HTML(response.text)

def get_big_type(url):
    """Visit every top-level category linked from the page at *url*.

    The page is fetched with ``get_requests``; each ``<li>`` under
    ``div.blk > ul`` is expected to hold one ``<a>`` whose ``href`` points at
    a top-level category. Every such link is resolved against the site root
    and handed to ``get_small_type``.

    Args:
        url: Address of the page that lists the top-level categories.
    """
    from urllib.parse import urljoin

    html = get_requests(url)
    for big_type in html.xpath('//div[@class="blk"]/ul/li'):
        hrefs = big_type.xpath('./a/@href')
        if not hrefs:
            # A list item without a link has nothing to follow; skip it
            # instead of crashing the whole crawl with an IndexError.
            continue
        # urljoin copes with root-relative hrefs ("/discover/...") without
        # producing a double slash, and leaves absolute hrefs untouched.
        big_href_full = urljoin('https://music.163.com/', hrefs[0])
        get_small_type(big_href_full)


#定义获取小分类的URL函数
def get_small_type(big_href_full):
    """Visit every sub-category listed on a top-level category page.

    The page is fetched with ``get_requests``; each ``<li>`` of the
    ``ul#initial-selector`` list is expected to hold one ``<a>`` carrying the
    sub-category name (its text) and link (its ``href``). Each name and
    resolved link is passed to ``get_singer``.

    Args:
        big_href_full: Absolute URL of a top-level category page.
    """
    from urllib.parse import urljoin

    html = get_requests(big_href_full)
    small_li_list = html.xpath(
        '//ul[@class="n-ltlst f-cb" and @id="initial-selector"]/li'
    )
    for small_li in small_li_list:
        names = small_li.xpath('./a/text()')
        hrefs = small_li.xpath('./a/@href')
        if not names or not hrefs:
            # An entry missing its label or link cannot be crawled; skip it
            # rather than aborting with an IndexError.
            continue
        # urljoin avoids the double slash that plain concatenation produces
        # for root-relative hrefs and leaves absolute hrefs untouched.
        small_href_full = urljoin('https://music.163.com/', hrefs[0])
        get_singer(names[0], small_href_full)



def get_singer(small_name, small_href_full):
    """Collect the artist names on one sub-category page and save them.

    Args:
        small_name: Label of the sub-category, stored alongside the names.
        small_href_full: Absolute URL of the sub-category page.
    """
    # Per the original author's notes, the artist list mixes three layouts:
    #   first five:  //ul[@id="m-artist-box"]/li/p/a/text()
    #   middle five: //ul[@id="m-artist-box"]/li[@class="line"]/p/a/text()
    #   the rest:    //ul[@id="m-artist-box"]/li[@class="sml"]/a/text()
    # A single descendant query covers all of them.
    artist_xpath = '//ul[@id="m-artist-box"]/li//a/text()'
    page = get_requests(small_href_full)
    # Persist the sub-category label together with every name found.
    write_to_txt(small_name, page.xpath(artist_xpath))


def write_to_txt(small_name, singer):
    """Append one record to ``网易云歌手.txt`` in the working directory.

    The record is the ``str()`` form of a two-key dict (sub-category label
    and artist names), followed by a newline.

    Args:
        small_name: Sub-category label stored under the first key.
        singer: Artist names stored under the second key.
    """
    record = {"小标题": small_name, "歌手": singer}
    line = str(record) + '\n'
    # Append mode: every call adds a line, earlier records are kept.
    with open('网易云歌手.txt', mode='a', encoding='utf_8') as out:
        out.write(line)


# Request headers read by get_requests(). Defined at module level rather than
# under the __main__ guard so the scraping functions also work when this file
# is imported instead of being run as a script (otherwise: NameError).
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
}


if __name__ == '__main__':
    # Entry page from which the top-level category links are collected.
    base_url = 'https://music.163.com/discover/artist/'
    get_big_type(base_url)

本文地址:https://blog.csdn.net/program_Mr_lu/article/details/110873540

相关标签: 爬虫 python