欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识。
您现在的位置是: 首页

Python 网页抓取

程序员文章站 2022-07-08 16:00:51
...
import json
import os
import requests
import bs4
from lxml import etree

# Request headers that imitate a real desktop browser
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/50.0.2661.102 Safari/537.36'
    ),
}

# Walk the album's paging links and download the tracks of every page
def get_album(url):
    """Download every track of the album at *url*, one listing page at a time.

    The album page is fetched and its paging links are collected; each page
    is then handed to ``get_url``. When no paging links are found, *url*
    itself is processed as the only page.

    Args:
        url: Address of the album's first page.

    Raises:
        requests.RequestException: If the album page cannot be fetched
            (including a timeout).
    """
    # A timeout keeps an unresponsive server from blocking the script forever.
    res = requests.get(url, headers=header, timeout=30)
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    # "pagingBar_page" is the class prefix of the paging links, as seen in
    # the browser's developer tools.
    links = soup.select('a[class^="pagingBar_page"]')
    # The "next page" link is navigation, not a page of its own.
    pages = [link for link in links if link.text != "下一页"]

    if not pages:
        # No paging links matched (presumably a single-page album): the
        # given URL is the only page, so process it instead of doing nothing.
        print("本频道共有{}个页面".format(1))
        print("正在下载第{}/{}个页面".format(1, 1))
        get_url(url)
        return

    total = len(pages)
    print("本频道共有{}个页面".format(total))
    for link in pages:
        print("正在下载第{}/{}个页面".format(link.text, total))

        # Page 1 is the URL we were given; other pages use the link's
        # site-relative href.
        if link.text == "1":
            page_url = url
        else:
            page_url = "http://www.ximalaya.com" + link.attrs["href"]

        get_url(page_url)

def get_url(url):
    """Download every track listed on one album page.

    Each ``<li>`` element carrying a ``sound_id`` attribute is treated as a
    track. For every track the JSON description is fetched from
    ``/tracks/<sound_id>.json`` and its ``play_path`` is passed to
    ``get_m4a``. A failure on one track is reported and the loop moves on
    to the next track.

    Args:
        url: Address of the album listing page.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched (including a timeout).
    """
    res = requests.get(url, headers=header, timeout=30)
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    # sound_id is the per-track attribute seen in the browser's developer tools
    tracks = soup.select('li[sound_id]')
    total = len(tracks)

    for index, track in enumerate(tracks, start=1):
        sound_id = track.attrs["sound_id"]
        # Known before the JSON is parsed so the failure message can always
        # be formatted, even when the title was never obtained.
        title = ""
        try:
            murl = 'http://www.ximalaya.com/tracks/{}.json'.format(sound_id)
            html = requests.get(murl, headers=header, timeout=30).text
            dic = json.loads(html)
            title = dic["title"]
            print("正在下载第{}/{}文件,文件名{}:{}。".format(index, total,
                                                     sound_id, title))
            get_m4a(dic["play_path"], sound_id)
        except (requests.RequestException, OSError, ValueError,
                KeyError, TypeError):
            # Network errors, malformed JSON, missing keys and file errors
            # are per-track failures. KeyboardInterrupt is deliberately not
            # caught so the user can still abort the run.
            print("下载{}/{}文件,文件名{}:{}时失败。".format(index, total,
                                                     sound_id, title))


def get_m4a(url, id):
    """Download the audio at *url* into the output folder, named after *id*.

    Args:
        url: Direct address of the audio file.
        id: Track identifier used as the file name (only its base name is
            used). The name shadows the ``id`` builtin but is kept so
            existing callers keep working.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        OSError: If the folder or file cannot be created or written.
    """
    folder = "郭德纲相声"    # output folder name; change as needed
    # Create the folder on first use; without it open() fails for every track.
    os.makedirs(folder, exist_ok=True)
    target = os.path.join(folder, os.path.basename(id))

    # stream=True writes the body chunk by chunk instead of loading it all
    # into memory; the context managers release the connection and the file
    # handle even when a write or read fails midway.
    with requests.get(url, stream=True, timeout=30) as res:
        # Do not save an HTTP error page as if it were audio data.
        res.raise_for_status()
        with open(target, 'wb') as file:
            for chunk in res.iter_content(100000):
                file.write(chunk)


if __name__ == '__main__':
    # Address of the album to download
    album_url = "http://www.ximalaya.com/1000202/album/2667276/"
    get_album(album_url)