欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

今日头条爬取新闻、视频、用户

程序员文章站 2024-01-09 23:34:52
import requestsimport timeimport randomimport pymongoheaders = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}base_url = 'https://www.toutiao.com/api/search/co...
import requests
import time
import random
import pymongo

# HTTP headers sent with the search requests (a desktop Chrome User-Agent).
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/87.0.4280.88 Safari/537.36"
    ),
}

# Search endpoint template: the first placeholder is the paging offset,
# the second is the search keyword.
base_url = (
    'https://www.toutiao.com/api/search/content/'
    '?aid=24&app_name=web_search&offset={}&format=json&keyword={}&'
)
# Query-string suffixes that switch the search to videos / users.
video_url = '&from=video&pd=video'
users_url = '&from=media&pd=user'

# Set the keyword to crawl here.
keyword = "新闻"

def get_user():
    """Crawl user search results for ``keyword`` and store them via ``set_comments``.

    Pages through the search endpoint 20 results at a time, collecting a
    ``{'name': ..., 'description': ...}`` dict for every entry. Collected
    dicts are handed to ``set_comments`` whenever more than 200 have
    accumulated, and once more at the end for any remainder.

    The loop stops when the response carries no ``data`` (``None``, missing,
    or an empty list).
    """
    offset = 0
    batch = []
    while True:
        url = base_url.format(offset, keyword) + users_url
        # Random 1-5 second pause between requests.
        time.sleep(random.randint(10, 50) * 0.1)
        # Send the same headers as the other crawlers in this file.
        response = requests.get(url, headers=headers).json()
        entries = response.get('data')
        if not entries:
            break
        # Iterate over every entry; the previous ``range(0, length - 1)``
        # silently dropped the last result of each page.
        for entry in entries:
            try:
                batch.append({
                    'name': entry['name'],
                    'description': entry['description'],
                })
            except (KeyError, TypeError) as e:
                # Skip only the malformed entry and keep the rest of the page.
                print(e)
        # Always advance, even if some entries were malformed; otherwise the
        # same page would be requested forever.
        offset += 20
        if len(batch) > 200:
            set_comments(batch)
            batch = []
    if batch:
        set_comments(batch)


def get_new():
    """Crawl news search results for ``keyword`` and print each hit.

    Pages through the search endpoint 20 results at a time. For every entry
    that has both a non-empty ``title`` and ``source``, prints
    ``[title, source]`` and counts it. Entries without those fields (the
    filler records the response contains) are skipped.

    The loop stops when the response carries no ``data`` (``None``, missing,
    or an empty list); the total count is printed at the end.
    """
    offset = 0
    num = 0
    while True:
        url = base_url.format(offset, keyword)
        # Random 1-5 second pause between requests.
        time.sleep(random.randint(10, 50) * 0.1)
        response = requests.get(url, headers=headers).json()
        entries = response.get('data')
        if not entries:
            print('data 为空,到达底部')
            break
        # Visit every entry; the previous ``range(0, length - 1)`` skipped
        # the last result of each page.
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            # The JSON contains filler records; keep only those that have
            # both a title and a source.
            title = entry.get('title')
            source = entry.get('source')
            if not (title and source):
                continue
            print([title, source])
            num += 1
        offset += 20
    print("爬取了{}条新闻".format(num))


def get_vedio():
    """Crawl video search results for ``keyword`` and print each hit.

    Pages through the search endpoint 20 results at a time. For every entry
    that has both a non-empty ``title`` and ``source``, prints
    ``[title, source]`` and counts it. Entries without those fields are
    skipped. Duplicate entries are not filtered out.

    The loop stops when the response carries no ``data`` (``None``, missing,
    or an empty list); the total count is printed at the end.
    """
    offset = 0
    num = 0
    while True:
        # The video suffix is substituted together with the keyword, so it
        # lands right after ``keyword=...`` in the query string.
        url = base_url.format(offset, keyword + video_url)
        # Random 1-5 second pause between requests.
        time.sleep(random.randint(10, 50) * 0.1)
        response = requests.get(url, headers=headers).json()

        entries = response.get('data')
        if not entries:
            print('data 为空,到达底部')
            break
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            # The JSON contains filler records; keep only those that have
            # both a title and a source. Some filler records are exact
            # duplicates of each other and are not removed here.
            video_title = entry.get('title')
            video_source = entry.get('source')
            if not (video_title and video_source):
                continue
            print([video_title, video_source])
            num += 1
        offset += 20
    print("爬取了{}条视频".format(num))

def set_comments(tmp):
    """Insert the documents in ``tmp`` into the ``weibo.comments`` collection.

    Args:
        tmp: list of dicts to insert. An empty list is a no-op.

    Database errors are printed rather than raised, so a failed insert does
    not abort the crawl. The client is always closed before returning.
    """
    if not tmp:
        # insert_many rejects an empty document list; nothing to store.
        return
    # NOTE(review): the host string looks truncated/redacted - confirm the
    # real MongoDB address before running.
    connection = pymongo.MongoClient('81.69.')
    try:
        connection.weibo.comments.insert_many(tmp)
    except Exception as e1:
        # Best-effort storage: report the failure and carry on.
        print(e1)
    finally:
        # Release the sockets; a new client is created on every call.
        connection.close()
# Script entry point: running the file directly starts the video crawl only.
if __name__ == "__main__":
    get_vedio()

本文地址:https://blog.csdn.net/qq_43751489/article/details/110943688