Python 网页抓取
程序员文章站
2022-07-08 16:00:51
...
"""Download every audio track of a ximalaya.com album into a local folder.

The script walks the album's pagination links, collects the ``sound_id`` of
each track listed on every page, looks up the track's JSON metadata and saves
the audio stream referenced by its ``play_path`` field.
"""
import json
import os

import bs4
import requests
from lxml import etree  # noqa: F401  (unused below; kept from the original import list)

# Browser-like request headers sent with every HTTP request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}

# Seconds to wait on any single request so a stalled connection cannot hang
# the whole run.
REQUEST_TIMEOUT = 30


def get_album(url):
    """Download all tracks of the album at *url*, page by page.

    Args:
        url: Address of the album's first page.
    """
    res = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    # "pagingBar_page" is the class prefix of the pagination links as seen in
    # the browser's developer tools.
    elems = soup.select('a[class^="pagingBar_page"]')
    # The "next page" link is navigation, not a page of its own.
    pages = [link for link in elems if link.text != "下一页"]

    if not pages:
        # NOTE(review): presumably an album that fits on one page renders no
        # paging bar; treat the album URL itself as the only page instead of
        # reporting "-1" pages and downloading nothing.
        print("本频道共有{}个页面".format(1))
        print("正在下载第{}/{}个页面".format(1, 1))
        get_url(url)
        return

    print("本频道共有{}个页面".format(len(pages)))
    for link in pages:
        print("正在下载第{}/{}个页面".format(link.text, len(pages)))
        if link.text == "1":
            # Page 1 is the album URL that was already requested above.
            page_url = url
        else:
            page_url = "http://www.ximalaya.com" + link.attrs["href"]
        get_url(page_url)


def get_url(url):
    """Download every track listed on the album page at *url*.

    A failure on one track (network error, malformed JSON, missing key,
    file-system error) is reported and the loop moves on to the next track.

    Args:
        url: Address of one page of the album.
    """
    res = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    # Each track is an <li> carrying a "sound_id" attribute (as seen in the
    # browser's developer tools).
    elems = soup.select('li[sound_id]')
    total = len(elems)
    for index, elem in enumerate(elems, start=1):
        sound_id = elem.attrs["sound_id"]
        murl = 'http://www.ximalaya.com/tracks/{}.json'.format(sound_id)
        # Placeholder so the failure message never has to touch the parsed
        # JSON, which may not exist when the metadata request itself failed.
        title = "unknown"
        try:
            html = requests.get(murl, headers=header, timeout=REQUEST_TIMEOUT).text
            dic = json.loads(html)
            title = dic["title"]
            print("正在下载第{}/{}文件,文件名{}:{}。".format(index, total, sound_id, title))
            get_m4a(dic["play_path"], sound_id)
        except (requests.RequestException, OSError, KeyError, TypeError, ValueError) as err:
            # Only expected failures are caught; KeyboardInterrupt and
            # SystemExit still stop the script.
            print("下载{}/{}文件,文件名{}:{}时失败。".format(index, total, sound_id, title))
            print("  reason: {!r}".format(err))


def get_m4a(url, id):
    """Save the audio stream at *url* into the download folder.

    Args:
        url: Direct address of the audio file.
        id: Track identifier; its base name is used as the file name.
            (The name shadows the ``id`` builtin but is kept for callers.)

    Raises:
        requests.RequestException: The request failed or returned an HTTP
            error status.
        OSError: The folder or file could not be created or written.
    """
    folder = "郭德纲相声"  # Custom output folder name.
    # Create the folder on first use; open() would otherwise fail on a fresh run.
    os.makedirs(folder, exist_ok=True)
    target = os.path.join(folder, os.path.basename(id))
    with requests.get(url, headers=header, stream=True, timeout=REQUEST_TIMEOUT) as res:
        # Do not write an HTTP error page to disk as if it were audio.
        res.raise_for_status()
        with open(target, 'wb') as file:
            for chunk in res.iter_content(100000):
                file.write(chunk)


if __name__ == '__main__':
    url = "http://www.ximalaya.com/1000202/album/2667276/"  # Album address.
    get_album(url)
上一篇: 堆与堆排序