今日头条爬取新闻视频用户
程序员文章站
2024-01-09 23:34:52
import requests; import time; import random; import pymongo; headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36" }; base_url = 'https://www.toutiao.com/api/search/co...
import requests
import time
import random
import pymongo
# Request headers carrying a desktop-browser User-Agent string.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
}

# Search endpoint template; the two ``{}`` slots are filled through
# ``str.format`` with the result offset and the search keyword, in that order.
base_url = "https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset={}&format=json&keyword={}&"

# Extra query parameters used by the video search and the user search.
video_url = "&from=video&pd=video"
users_url = "&from=media&pd=user"

# Set the keyword to crawl here.
keyword = '新闻'
def get_user():
    """Crawl the user search results for ``keyword`` and store them.

    Requests the search API page by page (the offset advances by 20 per
    request) until the response carries no ``data``.  For every result the
    ``name`` and ``description`` fields are collected into a dict.  Collected
    dicts are handed to ``set_comments`` whenever more than 200 are pending,
    and once more after the last page for whatever is left.

    Results that lack either field are reported with ``print`` and skipped;
    they no longer abort the rest of the page.
    """
    offset = 0
    pending = []
    while True:
        url = base_url.format(offset, keyword) + users_url
        # Random pause of 1.0-5.0 seconds between requests.
        time.sleep(random.randint(10, 50) * 0.1)
        # Send the same headers as the news and video crawlers do.
        response = requests.get(url, headers=headers).json()
        items = response.get('data')
        if items is None:
            break
        # Visit every result, including the last one of the page.
        for item in items:
            try:
                pending.append({
                    'name': item['name'],
                    'description': item['description'],
                })
            except (KeyError, TypeError) as e:
                print(e)
        # Always advance, even if some items were malformed; otherwise the
        # same page would be requested again without end.
        offset += 20
        if len(pending) > 200:
            set_comments(pending)
            pending = []
    if pending:
        set_comments(pending)
def get_new():
    """Crawl the news search results for ``keyword`` and print them.

    Requests the search API page by page (the offset advances by 20 per
    request) until the response carries no ``data``.  Every result that has
    a non-empty ``title`` and ``source`` is printed as ``[title, source]``
    and counted; the total is printed when the crawl ends.
    """
    offset = 0
    num = 0
    while True:
        url = base_url.format(offset, keyword)
        # Random pause of 1.0-5.0 seconds between requests.
        time.sleep(random.randint(10, 50) * 0.1)
        response = requests.get(url, headers=headers).json()
        items = response.get('data')
        if items is None:
            print('data 为空,到达底部')
            break
        # Visit every result, including the last one of the page.
        for item in items:
            # The JSON contains placeholder entries; they are recognised by
            # a missing or empty title/source and skipped silently.
            try:
                title = item['title']
                source = item['source']  # origin of the article
            except (KeyError, TypeError):
                continue
            if not (title and source):
                continue
            print([title, source])
            num += 1
        offset += 20
    print("爬取了{}条新闻".format(num))
def get_vedio():
    """Crawl the video search results for ``keyword`` and print them.

    Pages through the search API in steps of 20 until a response has no
    ``data``.  Each entry with a non-empty ``title`` and ``source`` is
    printed as ``[title, source]`` and counted; any entry that raises while
    being processed is skipped.  The total count is printed at the end.
    """
    start = 0
    found = 0
    while True:
        # The video parameters are substituted together with the keyword.
        page_url = base_url.format(start, keyword + video_url)
        pause = random.randint(10, 50) * 0.1
        time.sleep(pause)
        payload = requests.get(page_url, headers=headers).json()
        if payload['data'] is None:
            print('data 为空,到达底部')
            break
        total = len(payload['data'])
        for entry in payload['data']:
            try:
                # The JSON contains placeholder entries (some of them exact
                # duplicates); empty or missing fields weed them out.
                video_title = entry['title']
                if not video_title:
                    continue
                video_source = entry['source']  # origin of the video
                if not video_source:
                    continue
                print([video_title, video_source])
                found += 1
            except Exception:
                continue
        start += 20
    print("爬取了{}条视频".format(found))
def set_comments(tmp):
    """Insert the documents in ``tmp`` into MongoDB.

    The documents go into the ``comments`` collection of the ``weibo``
    database.  Any error raised by the insert is printed rather than
    propagated, and the client connection is closed before returning.

    Args:
        tmp: list of dicts to insert; an empty list is ignored.
    """
    if not tmp:
        # Nothing to store; insert_many does not accept an empty list.
        return
    # NOTE(review): the host string looks truncated - confirm the real address.
    connection = pymongo.MongoClient('81.69.')
    try:
        connection.weibo.comments.insert_many(tmp)
    except Exception as e1:
        print(e1)
    finally:
        # A new client is created on every call, so release it here.
        connection.close()
# Script entry point: running the file directly starts the video crawl.
if __name__ == "__main__":
    get_vedio()
本文地址:https://blog.csdn.net/qq_43751489/article/details/110943688