欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

python3爬取—梨视频的短视频链接

程序员文章站 2022-05-05 09:16:55
python3爬取梨视频的视频链接。运行效果如图。 #____author:"xie" #date:2020-11-12 # -*- coding: utf-8 -*- import requests import re,time #需求:爬取梨视频的视频数据 headers = { 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/7...

python3爬取梨视频的视频链接

运行效果如图:python3爬取—梨视频的短视频链接
python3爬取—梨视频的短视频链接

#____author:"xie"
#date:2020-11-12
# -*- coding: utf-8 -*-
import requests
import re,time
# Goal: scrape video data from Pear Video (pearvideo.com).
# Desktop-Chrome User-Agent string; it is the only header set by default.
_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Headers passed to every requests.get() call made by this script.
headers = {'User-Agent': _USER_AGENT}
# Request the category listing pages, then resolve each video's detail-page
# URL, its title and its real download link.
def _real_download_url(src_url, cont_id):
    """Rewrite the disguised ``srcUrl`` into the real download link.

    The first path segment after ``short/<dir>/`` (or, failing that,
    ``third/<dir>/``) up to the next ``-`` is replaced by ``cont-<cont_id>``.

    Returns the rewritten URL, or ``None`` when neither layout is present.
    """
    # "short" is tried before "third", matching the original lookup order.
    for pattern in ("short/.*?/(.*?)-.*?", "third/.*?/(.*?)-.*?"):
        match = re.search(pattern, src_url)
        # An empty capture must be ignored: str.replace('', x) would insert
        # x between every character of the URL.
        if match and match.group(1):
            return src_url.replace(match.group(1), 'cont-' + cont_id)
    return None


def video_url():
    """Print the title and real download URL of videos in category 8.

    Walks listing pages 0..10 (``start`` offset advances by 12 per page),
    opens every video's detail page to read its title, queries
    ``videoStatus.jsp`` for the disguised source URL and rewrites it into the
    real link. One line is printed per resolved video; nothing is returned.

    Entries that cannot be parsed (no id in the cover URL, no title on the
    detail page, no ``srcUrl`` in the status response, unknown URL layout)
    are skipped so a single bad entry does not abort the whole crawl.

    Raises:
        requests.RequestException: on network failures, including timeouts.
    """
    listing_api = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=8&start={}'
    status_api = 'https://www.pearvideo.com/videoStatus.jsp?'
    last_page = 10
    step = 12
    timeout = 10  # seconds; without it a stalled connection blocks forever

    for page in range(last_page + 1):
        listing = requests.get(url=listing_api.format(page * step),
                               headers=headers, timeout=timeout)
        covers = re.findall(r'style="background-image: url(.*?);', listing.text, re.I)
        for cover in covers:
            # The content id is the second '-'-separated field of the cover URL.
            pieces = str(cover).split('-')
            if len(pieces) < 2:
                continue
            link_id = pieces[1]
            # NOTE(review): 6-character ids are deliberately ignored, as in
            # the original code; the reason is not documented - confirm.
            if len(link_id) == 6:
                continue

            detail_url = 'https://www.pearvideo.com/video_' + link_id
            time.sleep(0.5)  # throttle between detail-page requests
            detail = requests.get(url=detail_url, headers=headers, timeout=timeout)
            titles = re.findall(r' <h1 class="video-tt">(.*?)</h1>', detail.text, re.I)
            if not titles:
                continue
            data_name = titles[0] + '.mp4'

            # Send the detail page as Referer for this one request only,
            # using a copy so the shared module-level headers stay untouched.
            status_headers = dict(headers, Referer=detail_url)
            status = requests.get(url=status_api, params={'contId': link_id},
                                  headers=status_headers, timeout=timeout)
            try:
                # Disguised download address reported by the status endpoint.
                src_url = status.json()['videoInfo']['videos']['srcUrl']
            except (ValueError, KeyError, TypeError):
                # Non-JSON body or a response without video info.
                continue

            down_url = _real_download_url(src_url, link_id)
            if down_url is None:
                continue
            print('第{}页的内容有:{} url:{}'.format(page, data_name, down_url))

# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    video_url()

本文地址:https://blog.csdn.net/ranranran52/article/details/109646809

相关标签: python 爬虫