欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识。
您现在的位置是: 首页

Python爬虫 爬取音频文件 #只用于学习

程序员文章站 2022-05-04 14:08:04
...
from lxml import etree
import requests
import os
from urllib import request,parse
url = 'https://www.ximalaya.com/lishi/4164479/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
# JSON endpoint queried once per track; the track id is appended to it.
TRACKS_API = 'https://www.ximalaya.com/revision/play/tracks?trackIds='
# Directory (relative to the working directory) that receives the .m4a files.
DOWNLOAD_DIR = 'Down'
# Seconds passed as `timeout` to every requests call so a stalled server
# cannot hang the whole run.
REQUEST_TIMEOUT = 10
# Path separators, characters Windows rejects in file names, and line
# breaks/tabs. Each one is replaced by "_" when building a file name.
_UNSAFE_CHARS = '\\/:*?"<>|\r\n\t'
_UNSAFE_TABLE = str.maketrans(_UNSAFE_CHARS, '_' * len(_UNSAFE_CHARS))


def track_id_from_href(base_url, href):
    """Return the last path segment of *href* resolved against *base_url*.

    Trailing slashes and any query string/fragment are ignored, so
    ``/album/1/42/`` and ``/album/1/42?x=1`` both yield ``"42"``.
    """
    path = parse.urlparse(parse.urljoin(base_url, href)).path
    return path.rstrip('/').rsplit('/', 1)[-1]


def safe_filename(name, fallback='track'):
    """Turn a remote-supplied title into a single, safe file-name component.

    Separators and reserved characters become ``_``; surrounding spaces and
    dots are stripped so the result can never be ``.``/``..`` or leave the
    download directory. Returns *fallback* when nothing usable remains.
    """
    if name is None:
        return fallback
    cleaned = str(name).translate(_UNSAFE_TABLE).strip(' .')
    return cleaned or fallback


def first_track(payload):
    """Extract ``(src, trackName)`` of the first track from the API payload.

    Returns ``None`` when the payload does not have the
    ``data -> tracksForAudioPlay -> [0]`` shape or when ``src`` is empty.
    """
    data = payload.get('data') if isinstance(payload, dict) else None
    tracks = data.get('tracksForAudioPlay') if isinstance(data, dict) else None
    if not isinstance(tracks, list) or not tracks:
        return None
    entry = tracks[0]
    if not isinstance(entry, dict) or not entry.get('src'):
        return None
    return entry['src'], entry.get('trackName')


def download_track(track_id):
    """Look up one track by id and save its audio under ``DOWNLOAD_DIR``.

    Returns the path of the saved file, or ``None`` when the track was
    skipped (no audio address) or a network/file error occurred. Errors are
    reported on stdout instead of aborting the remaining downloads.
    """
    try:
        info = requests.get(TRACKS_API + str(track_id), headers=headers,
                            timeout=REQUEST_TIMEOUT)
        info.raise_for_status()
        payload = info.json()
    except (requests.RequestException, ValueError) as err:
        print('获取音频信息失败: ' + str(track_id) + ' ' + str(err))
        return None

    track = first_track(payload)
    if track is None:
        print('没有可下载的音频地址,已跳过: ' + str(track_id))
        return None

    m4a_url, m4a_name = track
    fallback = str(track_id) or 'track'
    filename = os.path.join(DOWNLOAD_DIR,
                            safe_filename(m4a_name, fallback) + '.m4a')
    # Announce before the (blocking) download so the message is truthful.
    print(m4a_url + (m4a_name or fallback) + '正在下载ding...。')
    try:
        request.urlretrieve(m4a_url, filename)
    except (OSError, ValueError) as err:
        # URLError/ContentTooShortError are OSError subclasses; ValueError
        # is raised for a malformed or scheme-less URL.
        print('下载失败: ' + str(track_id) + ' ' + str(err))
        return None
    return filename


def main():
    """Fetch the album page at ``url`` and download every track it links to."""
    page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    # NOTE(review): the class name "dOi2" looks auto-generated; confirm it
    # still matches the live page if no links are found.
    hrefs = etree.HTML(page.text).xpath('//ul[@class="dOi2"]/li/div[2]/a/@href')
    if not hrefs:
        print('未找到任何音频链接: ' + url)
        return

    # Create the target directory once; exist_ok avoids the check-then-create race.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    # Walk the relative track links found on the album page.
    for href in hrefs:
        track_id = track_id_from_href(url, href)
        if not track_id:
            continue
        download_track(track_id)
        print('---' * 50)


if __name__ == '__main__':
    main()