
Scraping Weibo Short Videos with Python

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# Scrape Weibo short videos
__author__ = 'lz'

import os
import re
import time

import requests
from urllib.parse import quote

base_url = 'https://m.weibo.cn/api/container/getIndex?'

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
    "Accept": "application/json, text/plain, */*",
}

def get_homeInfo(name):
    hurl = base_url + 'containerid=100103type%3D1%26q%3D' + quote(name) + '&page_type=searchall'
    try:
        hresp = requests.get(hurl, headers=header)
        if hresp.status_code == 200:
            return hresp.json()
    except requests.ConnectionError as e:
        print("Connection error:", e)
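
# In get_homeInfo, %3D and %26 are the pre-encoded '=' and '&', so the
# containerid parameter decodes to '100103type=1&q=<name>', the container id
# of the mobile search page.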

# Request one page of the feed
def get_page(uid, luicode, lfid, q, containerid, sinceid):
    # Unless you are very sure about dicts and URL encoding, do not build the
    # query string from a dict; build it as a plain string instead.
    # Once it works, strip the non-essential parameters one by one,
    # then swap the parts that vary for variables.
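    # For reference, the dict route that the comment above warns about would
    # look roughly like the hypothetical sketch below. urlencode
    # percent-encodes every value, so a value that is already encoded (lfid
    # can be) gets double-encoded and the API answers with no content:
    #   from urllib.parse import urlencode
    #   params = urlencode({'uid': uid, 'luicode': luicode, 'lfid': lfid,
    #                       'type': 'uid', 'q': q, 'value': uid,
    #                       'containerid': containerid})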
    params = ('uid=' + str(uid) + '&t=0&luicode=' + str(luicode) +
              '&lfid=' + str(lfid) + '&type=3&q=' + str(q) +
              '&t=0&type=uid&value=' + str(uid) +
              '&containerid=' + str(containerid))
    if sinceid != 1:
        params += '&since_id=' + str(sinceid)

    # quote must leave the URL delimiters (= & ? # etc.) unencoded; otherwise
    # the request no longer matches the original URL and returns no content
    url = base_url + quote(params, safe="/:=&?#+!$,;'@()*[]")
    try:
        # Send the browser-like headers built above, otherwise the API blocks
        # the request
        resp = requests.get(url, headers=header)
        if resp.status_code == 200:
            # The response body is JSON; parse it and return the object
            return resp.json()
    except requests.ConnectionError as e:
        print("Connection error:", e)
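
# Pagination note: every response carries data.cardlistInfo.since_id; feeding
# it back as since_id fetches the next page. This script uses the sentinel
# value 1 to mean "first page, no cursor yet".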

# Parse the JSON payload into (title, url) pairs
def parse_page_get_name2url(resp_json_obj):
    # In a JSON viewer, right-click the key and choose "copy property path":
    # data.cards[0].card_group[0].mblog.page_info.media_info.stream_url
    name2url = []
    for index in range(len(resp_json_obj['data']['cards'])):
        # Bracket access is the convenient way to read a parsed JSON object
        # (a plain dict in Python), but a wrong key raises an exception
        try:
            page_info = resp_json_obj['data']['cards'][index]['mblog']['page_info']
            # The .get() style below is wordier but never raises:
            # data = obj_json.get('data', None)
            # cards = data.get('cards', None)
            # card_groups = cards[0].get('card_group', None)
            # mblog = card_groups[0].get('mblog', None)
            # page_info = mblog['page_info']
            media_name = page_info['content2']
            media_url = page_info['media_info']['stream_url_hd']
            # Collect the pair
            name2url.append({media_name: media_url})
        except (KeyError, IndexError, TypeError):
            # Cards without video info (plain posts, ads) are skipped
            continue
    return name2url

# Create the directory downloads are saved to
def mkdir(path):
    # Strip surrounding whitespace
    path = path.strip()
    # Strip a trailing backslash
    path = path.rstrip("\\")
    # Check whether the path already exists
    isExists = os.path.exists(path)
    if not isExists:
        # Create the directory (and any missing parents)
        os.makedirs(path)
        print(path + ' created')
        return True
    else:
        # Leave an existing directory alone and report it
        print(path + ' already exists')
        return False
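
# Note: on Python >= 3.2 the whole check-then-create dance above collapses to
# a single call:
#   os.makedirs(path, exist_ok=True)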

# Download a single video
def download_video(path, filename, url):
    resp = requests.get(url)
    # Crude throttle so requests are not fired back to back
    time.sleep(3)
    with open(path + filename + '.mp4', 'wb') as file:
        file.write(resp.content)
        print(filename, 'downloaded')
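
# For long videos, resp.content buffers the entire file in memory before
# writing. A streamed variant (sketch, same requests API) keeps memory flat:
#   with requests.get(url, stream=True) as resp:
#       with open(path + filename + '.mp4', 'wb') as file:
#           for chunk in resp.iter_content(chunk_size=8192):
#               file.write(chunk)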

# Fetch the containerid of the profile tab to crawl
def getContained(url):
    res = requests.get(url, headers=header)
    res_json = res.json()
    # Some bloggers have no dedicated video tab; fall back from the video
    # tab (index 2) to the posts tab (index 1)
    try:
        container = res_json['data']['tabsInfo']['tabs'][2]['containerid']
    except (IndexError, KeyError):
        container = res_json['data']['tabsInfo']['tabs'][1]['containerid']
    return container
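
# A sturdier variant would scan the tab list for its title instead of
# hard-coding indexes (sketch; assumes each tab dict carries a 'title' key,
# which the API is not guaranteed to keep):
#   for tab in res_json['data']['tabsInfo']['tabs']:
#       if tab.get('title') == '视频':
#           return tab['containerid']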

# Collect the names of files saved by previous runs
def last_files(file_dir):
    file_names = []
    for root, dirs, files in os.walk(file_dir):
        # print('root_dir:', root)  # current directory path
        # print('sub_dirs:', dirs)  # subdirectories under it
        # print('files:', files)    # plain files under it
        file_names.extend(files)
    return set(file_names)
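
# os.walk descends into every timestamped subfolder under rootPath, so files
# saved by earlier runs are collected too; that is what lets recursiveGetPage
# stop at the first filename it has already downloaded.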

def recursiveGetPage(uid, back, lfid, q, containerid, since_id, pageNo, count):
    obj_json = get_page(uid, back, lfid, q, containerid, since_id)
    since_id = obj_json['data']['cardlistInfo']['since_id']
    total = obj_json['data']['cardlistInfo']['total']
    pageSize = len(obj_json['data']['cards'])
    name2urls = parse_page_get_name2url(obj_json)
    if len(name2urls) != pageSize:
        count += pageSize - len(name2urls)
    print('Downloading page %d, %d records on this page >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>' % (pageNo, pageSize))
    for name2url in name2urls:
        for (name, url) in name2url.items():
            try:
                # Strip everything except CJK characters, digits and letters
                filename_temp = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", name)
                # # Read the last downloaded filename back from a record file
                # with open(lastRecordPath, 'r') as f:
                #     data = f.read()
                # if filename_temp == data:
                #     print('Download stopped!')
                #     return
                if filename_temp + '.mp4' in allfile:
                    print(filename_temp + '.mp4 already exists, stopping!')
                    return
                download_video(path, filename_temp, url)
                count += 1
                if count > total:
                    print('===================== All Weibo videos for %s downloaded! Saved under: %s =====================' % (q, path))
                    return 'Done'

                print('Downloaded {:.2f} %    page {} | {}/{}'.format(count / total * 100, pageNo, count, total))
                print('---------------------------------------------------------------')
            except Exception:
                print('%s failed to download!' % (name))
                continue
    print('Page %d downloaded >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>' % (pageNo))
    pageNo += 1

    return recursiveGetPage(uid, back, lfid, q, containerid, since_id, pageNo, count)
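
# Note: CPython's default recursion limit is about 1000 frames, so one call
# per page will eventually fail on a profile with very many pages. An
# iterative shape (sketch) avoids that:
#   since_id, pageNo, count = 1, 1, 0
#   while True:
#       # ...same body as above, with `break` in place of each `return`,
#       # updating since_id / pageNo / count at the end of every pass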

if __name__ =='__main__':
    q = '陈翔六点半'
    rootPath = 'D:\\weibovideo\\' + q + '\\'
    path = rootPath + time.strftime('%Y%m%d%H%M%S',time.localtime(time.time())) + '\\'
    mkdir(path)
    # Collect the names of already-downloaded files, deduplicated via set
    allfile = last_files(rootPath)

    hres = get_homeInfo(q)
    lfid = hres['data']['scheme'].split('lfid=')[1].split('_-_')[0]
    uid = hres['data']['cards'][0]['card_group'][0]['user']['id']
    profile_url = hres['data']['cards'][0]['card_group'][0]['user']['profile_url']
    back = profile_url.split('&luicode=')[1]

    url2 = base_url + 'uid={0}&luicode={1}&type=uid&value={2}'.format(uid, back, uid)
    containerid = getContained(url2)
    # Walk all pages recursively
    recursiveGetPage(uid, back, lfid, q, containerid, 1, 1, 0)
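
    # To crawl a different account, change q to that account's display name
    # and adjust rootPath; uid, lfid and containerid are all derived from the
    # search response at runtime.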

Original article: https://blog.csdn.net/qq_15031941/article/details/106786541