今日头条爬取新闻视频用户

程序员文章站 2024-01-09 23:34:52

import requests import time import random import pymongo headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36" } base_url = 'https://www.toutiao.com/api/search/co...

import requests
import time
import random
import pymongo

# Keyword to search for; change this to crawl a different topic.
keyword = "新闻"

# Browser-like User-Agent string sent with the search requests.
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
headers = {"user-agent": _USER_AGENT}

# Query-string suffixes selecting the video search and the user search.
users_url = '&from=media&pd=user'
video_url = '&from=video&pd=video'

# Search API URL template; the two placeholders are filled with
# (offset, keyword) via str.format by the crawler functions.
base_url = 'https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset={}&format=json&keyword={}&'

def get_user():
    """Crawl user search results for ``keyword`` and store them in MongoDB.

    Requests the search API with the user suffix (``users_url``), advancing
    the offset by 20 per request. For every entry the ``name`` and
    ``description`` fields are collected into a record; once more than 200
    records have accumulated they are passed to ``set_comments`` and the
    batch is reset. Paging stops when the response carries no ``data``
    (missing, ``None`` or empty); any remaining records are flushed then.

    Entries lacking ``name`` or ``description`` are reported and skipped, so
    a single malformed entry can neither drop the rest of its page nor stall
    the paging.
    """
    offset = 0
    batch = []
    while True:
        url = base_url.format(offset, keyword) + users_url
        # Random pause of 1.0-5.0 seconds between requests.
        time.sleep(random.randint(10, 50) * 0.1)
        # Send the same headers as the other crawlers; the timeout keeps a
        # stalled connection from blocking the crawl forever.
        response = requests.get(url, headers=headers, timeout=10).json()
        data = response.get('data')
        if not data:
            break
        # Iterate over every entry (the last one on the page included).
        for item in data:
            try:
                record = {
                    'name': item['name'],
                    'description': item['description'],
                }
            except (KeyError, TypeError) as e:
                # Entry without the expected fields: report it and move on.
                print(e)
                continue
            batch.append(record)
        # Always advance, even if some entries on this page were skipped.
        offset += 20
        if len(batch) > 200:
            set_comments(batch)
            batch = []
    if batch:
        set_comments(batch)


def get_new():
    """Crawl news search results for ``keyword`` and print them.

    Requests the search API without any suffix, advancing the offset by 20
    per request. For every entry that has both a non-empty ``title`` and a
    non-empty ``source``, prints ``[title, source]`` and counts it. Paging
    stops when the response carries no ``data`` (missing, ``None`` or
    empty); the total count is printed at the end.
    """
    offset = 0
    num = 0
    while True:
        url = base_url.format(offset, keyword)
        # Random pause of 1.0-5.0 seconds between requests.
        time.sleep(random.randint(10, 50) * 0.1)
        # The timeout keeps a stalled connection from blocking the crawl.
        response = requests.get(url, headers=headers, timeout=10).json()
        data = response.get('data')
        if not data:
            print('data 为空，到达底部')
            break
        # Iterate over every entry (the last one on the page included).
        for item in data:
            # The JSON contains placeholder entries; they are filtered out by
            # requiring both a title and a source.
            if not isinstance(item, dict):
                continue
            title = item.get('title')
            source = item.get('source')  # origin of the article
            if title and source:
                print([title, source])
                num += 1
        offset += 20
    print("爬取了{}条新闻".format(num))


def get_vedio():
    """Crawl video search results for ``keyword`` and print them.

    Requests the search API with the video suffix (``video_url``) appended
    to the keyword, advancing the offset by 20 per request. For every entry
    that has both a non-empty ``title`` and a non-empty ``source``, prints
    ``[title, source]`` and counts it. Paging stops when the response carries
    no ``data`` (missing, ``None`` or empty); the total count is printed at
    the end.
    """
    offset = 0
    num = 0
    while True:
        # The suffix is formatted in together with the keyword, so it lands
        # directly after the ``keyword=`` parameter of the template.
        url = base_url.format(offset, keyword + video_url)
        # Random pause of 1.0-5.0 seconds between requests.
        time.sleep(random.randint(10, 50) * 0.1)
        # The timeout keeps a stalled connection from blocking the crawl.
        response = requests.get(url, headers=headers, timeout=10).json()
        data = response.get('data')
        if not data:
            print('data 为空，到达底部')
            break
        for item in data:
            # The JSON contains placeholder entries; they are filtered out by
            # requiring both a title and a source. Some placeholder entries
            # are exact duplicates of each other and are not de-duplicated.
            if not isinstance(item, dict):
                continue
            video_title = item.get('title')
            video_source = item.get('source')  # origin of the video
            if video_title and video_source:
                print([video_title, video_source])
                num += 1
        offset += 20
    print("爬取了{}条视频".format(num))

def set_comments(tmp):
    """Insert a batch of documents into the ``weibo.comments`` collection.

    Opens a MongoDB client, inserts every document of ``tmp`` with
    ``insert_many`` and closes the client again. Errors raised by the insert
    are printed rather than propagated, so a failed write does not abort the
    crawl.

    Args:
        tmp: list of dicts to insert. An empty list is ignored.
    """
    if not tmp:
        # insert_many requires a non-empty list; skip the connection entirely.
        return
    # NOTE(review): the host string looks truncated; confirm the real address.
    connection = pymongo.MongoClient('81.69.')
    try:
        connection.weibo.comments.insert_many(tmp)
    except Exception as e1:
        # Best-effort write: report the failure and let the caller carry on.
        print(e1)
    finally:
        # Release the client's sockets; a new client is created on each call.
        connection.close()
# Script entry point: only the video crawl runs when the file is executed
# directly; get_new() and get_user() are defined above but not called here.
if __name__ == '__main__':
    get_vedio()

本文地址：https://blog.csdn.net/qq_43751489/article/details/110943688

上一篇：如何查看网易云音乐等级特权?查看网易云音乐等级特权的方法

下一篇：用Python预测2020年双十一交易额

今日头条爬取新闻视频用户

今日头条爬取新闻视频用户

Node批量爬取头条视频并保存方法

爬取今日头条Ajax请求

129款侵害用户权益App被通报：今日头条、腾讯新闻被点名

Python小程序爬取今日新闻拿走就能用

python 简单爬取今日头条热点新闻(一)

递归爬取今日头条指定用户一个月内发表的所有文章，视频，微头条

Python3 爬取今日头条街拍图片

python爬取今日头条街拍美图

Node批量爬取头条视频并保存方法