欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

爬取网易云音乐所有歌单信息

程序员文章站 2022-05-08 10:57:39
...

使用 python + requests + lxml + selenium

  1. 使用 requests 发起请求,获取到所有分类的 url
  2. 使用 selenium 发送请求取到每页的每个歌单信息
  3. 点击下一页,爬取下一页的歌单信息
  4. 存储信息
import requests, time
from selenium import webdriver
from lxml import etree
from requests.utils import unquote
from selenium.webdriver.common.keys import Keys


class WangyiMusic:
    def __init__(self):
        self.start_url = "https://music.163.com/discover/playlist/"
        self.url_temp = "https://music.163.com"
        self.headers = {
            "Referer": "https://music.163.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3608.4 Safari/537.36"
        }
        self.session = requests.Session()
        self.driver = webdriver.Chrome()

    def parse_get_url(self):
        resp = self.session.get(self.start_url, headers=self.headers)
        return resp.content.decode()

    def get_category_list(self, html_str):
        el = etree.HTML(html_str)
        dl_list = el.xpath("//div[@class='bd']/dl")
        category_list = []
        for dl in dl_list:
            a_list = dl.xpath(".//a[@class='s-fc1 ']")
            for a in a_list:
                items = {}
                items["cate_name"] = a.xpath("./text()")[0]
                items["cate_url"] = self.url_temp + a.xpath("./@href")[0]
                print(items)
                category_list.append(items)

        cate_url_list = [category["cate_url"] for category in category_list]
        print(cate_url_list)
        return category_list, cate_url_list

    def save_category_list(self, category_list):
        pass

    def get_playlist_list(self):
        li_list = self.driver.find_elements_by_xpath("//ul[@id='m-pl-container']/li")
        playlist_list = []
        for li in li_list:
            items = {}
            items["playlist_name"] = li.find_element_by_xpath(".//a[@class='tit f-thide s-fc0']").text
            items["playlist_url"] = li.find_element_by_xpath(".//a[@class='tit f-thide s-fc0']").get_attribute("href")
            items["playlist_author"] = li.find_element_by_xpath(".//a[@class='nm nm-icn f-thide s-fc3']").text
            items["playlist_num"] = li.find_element_by_xpath(".//span[@class='nb']").text
            print(items)
            playlist_list.append(items)

        next_url = self.driver.find_elements_by_xpath(".//a[@class='zbtn znxt']")
        # next_url = self.driver.find_elements_by_link_text("下一页")
        next_url = next_url[0] if len(next_url) > 0 else None
        print(next_url)
        return playlist_list, next_url

    def save_playlist_list(self, playlist_list):
        for playlist in playlist_list:
            with open("music_163.txt", 'a', encoding="utf-8") as f:
                f.write(
                    "歌单名:" + playlist["playlist_name"] + " | | " + "歌单作者: " + playlist[
                        "playlist_author"] + " | | " + "歌单播放量:" + playlist["playlist_num"] + " | | " + "歌单地址:" +
                    playlist["playlist_url"])
                f.write("\n\n")

    def run(self):
        # 先取到所有分类的名称和 url
        # 发起请求
        html_str = self.parse_get_url()
        # 提取分类名和 url 数据
        category_list, cate_url_list = self.get_category_list(html_str)
        # 保存分类名和 url 数据
        self.save_category_list(category_list)

        # 遍历请求每个分类 url,取到分类中的歌单名称与 url
        for cate_url in cate_url_list:
            # 请求每个分类 url
            self.driver.get(cate_url)
            # 进入到 iframe 框架中
            self.driver.switch_to.frame(self.driver.find_elements_by_tag_name("iframe")[0])
            # 设置等待时间
            time.sleep(5)
            # 取到数据
            print("*" * 100)
            print(unquote(self.driver.current_url))  # 输出当前 url

            playlist_list, next_url = self.get_playlist_list()  # 取到第一页的歌单数据
            # 保存数据
            self.save_playlist_list(playlist_list)
            # 请求下一页
            while next_url is not None:
                next_url.send_keys(Keys.ENTER)  # 元素被覆盖,无法点击  用Enter代替click
                time.sleep(5)
                # 提取数据
                playlist_list, next_url = self.get_playlist_list()
                # 保存数据
                self.save_playlist_list(playlist_list)

            print("*" * 100)

        self.driver.quit()


if __name__ == '__main__':
    wangyimusic = WangyiMusic()
    wangyimusic.run()



中间遇到的问题

selenium 获取到了下一页的元素信息,但是点击失败

原因:元素被覆盖,无法点击

解决:

用Enter代替click

导入from selenium.webdriver.common.keys import Keys

next_url.send_keys(Keys.ENTER) # 元素被覆盖,无法点击 用Enter代替click

发现用Enter代替click后,如果不是下拉加载的页面的话,不用下拉滚动条就能用enter点到

也有其他的解决办法:

手动将页面拉到最下面、

使用加载页面的方法等。

相关标签: spider