# python: 爬取新浪新闻的内容 (scrape the content of Sina news)
# Source: 程序员文章站 (programmer article site)
# 2022-05-02 20:50:04
import requests
import json
from bs4 import BeautifulSoup
import re
import pandas
import sqlite3
# JSONP endpoint of the Sina comment API; the '{}' placeholder is filled
# with the article's news id (see getCommentCounts).
commenturl = (
    'https://comment.sina.com.cn/page/info?version=1&format=json'
    '&channel=gn&newsid=comos-{}&group=undefined&compress=0&'
    'ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread'
    '=1&callback=jsonp_1543748934208'
)
# Get the comment count of an article
def getCommentCounts(newsurl):
    """Return the total comment count for one Sina news article.

    Args:
        newsurl: Article URL containing a ``doc-i<id>.shtml`` segment.

    Returns:
        int: total comment count reported by the comment API.

    Raises:
        AttributeError: if ``newsurl`` contains no news id, or the API
            response is not the expected JSONP wrapper.
    """
    # Extract the news id from the article URL (dot escaped: the original
    # pattern 'doc-i(.*).shtml' let '.' match any character).
    m = re.search(r'doc-i(.*)\.shtml', newsurl)
    newsid = m.group(1)
    # Fill the {} placeholder in the comment-API URL with the news id.
    comments = requests.get(commenturl.format(newsid))
    # The API answers with JSONP: jsonp_xxx({...}); pull the JSON payload
    # out of the callback parentheses. (str.strip(chars) removes a *set*
    # of characters, not a prefix/suffix, so the old strip() call only
    # worked by accident.)
    payload = re.search(r'\((.*)\)', comments.text, re.S).group(1)
    jd = json.loads(payload)
    # Total number of comments for this article.
    return jd['result']['count']['total']
# Extract the body of each news article
def getNewsDetail(newsurl):
    """Scrape one Sina news article page into a dict of fields.

    Args:
        newsurl: URL of the article page.

    Returns:
        dict with keys ``title``, ``time``, ``source``, ``article``,
        ``editor`` and ``comment`` (comment count, via the comment API).

    Raises:
        IndexError: if the page does not contain the expected elements.
    """
    result = {}
    rsp = requests.get(newsurl)
    # Article pages are UTF-8; set it explicitly so .text decodes correctly.
    rsp.encoding = 'utf-8'
    soup = BeautifulSoup(rsp.text, 'html.parser')
    # Title
    result['title'] = soup.select('.main-title')[0].text
    # Publication date
    result['time'] = soup.select('.date')[0].text
    # News source
    result['source'] = soup.select('.source')[0].text
    # Body text: join all paragraphs except the last (the editor line).
    result['article'] = ' '.join(p.text.strip() for p in soup.select('#article p')[:-1])
    # Editor. NOTE: str.lstrip('责任编辑:') strips a *character set*, which
    # would also eat the first characters of editor names that happen to be
    # in that set — remove the exact label prefix instead.
    result['editor'] = re.sub('^责任编辑:', '', soup.select('.show_author')[0].text)
    # Comment count from the comment API.
    result['comment'] = getCommentCounts(newsurl)
    return result
# Get the article links on one feed page
def parseListLinks(url):
    """Fetch one page of the Sina roll-news feed and scrape every article.

    Args:
        url: Feed-API URL for a single page (already formatted).

    Returns:
        list[dict]: one detail dict per article (see getNewsDetail).

    Raises:
        AttributeError: if the response is not the expected JSONP wrapper.
    """
    newsdetails = []
    rsp = requests.get(url)
    # The feed answers with JSONP of the form
    #   try{feedCardJsonpCallback({...}) ;}catch(e){};
    # The previous lstrip/rstrip char-set tricks destroyed the payload's own
    # braces and patched them back by hand ('{' + ... + '}}'); extracting the
    # callback argument with a regex keeps the JSON intact.
    m = re.search(r'feedCardJsonpCallback\((.*)\)\s*;?\s*\}\s*catch', rsp.text, re.S)
    jd = json.loads(m.group(1))
    # Scrape every article linked from this page.
    for ent in jd['result']['data']:
        newsdetails.append(getNewsDetail(ent['url']))
    return newsdetails
# Feed endpoint of the Sina roll-news list; '{}' is the page number.
url = 'https://feed.sina.com.cn/api/roll/' \
      'get?pageid=121&lid=1356&num=20&versionNumber=1.2.4' \
      '&page={}&encode=utf-8&callback=feedCardJsonpCallback&_'

def main(pages=2):
    """Crawl ``pages`` pages of the news feed and save them to news.xlsx.

    Args:
        pages: number of feed pages to crawl (set this yourself; the
            original script crawled pages 1 and 2).
    """
    news_total = []
    # Feed pages are 1-indexed.
    for i in range(1, pages + 1):
        # Fill the {} placeholder with the page number.
        news_total.extend(parseListLinks(url.format(i)))
    # Normalize the scraped dicts into a table with pandas.
    df = pandas.DataFrame(news_total)
    # Save as an xlsx file.
    df.to_excel('news.xlsx')

# Guard the crawl so importing this module has no side effects.
if __name__ == '__main__':
    main()
# 上一篇: 一封求复合的情书 (blog-site navigation footer — not part of the script)