Python: Scraping Sina News
This code could certainly be optimized, but I was too lazy and only wrote a rough first draft.
Below is a code summary of the knowledge points used:
#http://news.sina.com.cn/c/nd/2017-07-07/doc-ifyhwehx5324387.shtml
import requests
from bs4 import BeautifulSoup
import json
#for converting time strings
from datetime import datetime
r=requests.get('http://news.sina.com.cn/c/nd/2017-07-07/doc-ifyhwehx5324387.shtml')
r.encoding=r.apparent_encoding
data=r.text
soup=BeautifulSoup(data,'html.parser')
line=soup.select('#artibodyTitle')
print (line[0].text)
#print(soup.prettify())
#.contents is a list of the tag's direct children
times=soup.select(".time-source")[0].contents[0].strip()
#parse the string into a datetime with strptime (dt.strftime formats it back)
dt=datetime.strptime(times,'%Y年%m月%d日%H:%M')
print(soup.select(".time-source span a")[0].text)
c=soup.select('#artibody p')[:-1]
print(c)
for text_news in soup.select('#artibody p'):
print(text_news.text)
print(times)
print(soup.select('#commentCount1'))
#''.join(seq) joins the elements of seq with the separator string, e.g. ','.join(['a','b']) == 'a,b'
r1=requests.get('http://comment5.news.sina.com.cn/page/info?version=1&'
                'format=js&channel=gn&newsid=comos-fyhwehx5324387&group=&'
                'compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20')
r1.encoding=r1.apparent_encoding
#json
'''
load and loads both deserialize JSON; the difference (in Python) is:
loads works on an in-memory string, turning JSON text back into built-in Python objects.
E.g. d_json=json.dumps({'a':1, 'b':2}) serializes a dict into the string '{"a": 1, "b": 2}',
and d=json.loads(d_json) deserializes that string back into a dict.
load works on a file handle:
if there is a local JSON file a.json, then d=json.load(open('a.json')).
Correspondingly, dump serializes a built-in type to JSON and writes it to a file.
'''
#str.strip('var data=') strips a *set* of characters from both ends, not the literal prefix, so remove it explicitly
count=r1.text.replace('var data=','',1)
countnum=json.loads(count)
print(countnum['result']['count']['total'])
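To make the loads/dumps and load/dump pairing above concrete, here is a minimal standalone round-trip demo (the file name demo.json is just an example, not part of the scraper):

import json

obj = {'a': 1, 'b': 2}
s = json.dumps(obj)              # dict -> JSON string
assert json.loads(s) == obj      # JSON string -> dict

with open('demo.json', 'w') as f:
    json.dump(obj, f)            # dict -> JSON file
with open('demo.json') as f:
    assert json.load(f) == obj   # JSON file -> dict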
Here is the full source code of the program:
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import re

def getHtml(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        text = r.text.strip()
        # the roll-news API returns JSONP: newsloadercallback({...});
        # peel off the wrapper explicitly (str.lstrip would strip a character set, not a prefix)
        if text.startswith('newsloadercallback('):
            text = text[len('newsloadercallback('):].rstrip(');')
        return text
    except requests.RequestException:
        return 'Connection timed out; please check your network'

def getnewslist(html, newslist):
    try:
        json_lines = json.loads(html)['result']['data']
        for lines in json_lines:
            news_lines = lines['title']
            news_href = lines['url']
            newslist.append([news_lines, news_href])
    except (json.JSONDecodeError, KeyError):
        return 'Failed to parse the news list'

def printNewsContent(url):
    html_content = getHtml(url)
    soup = BeautifulSoup(html_content, 'html.parser')
    news_title = soup.select('#artibodyTitle')[0].text
    news_strtime = soup.select('#navtimeSource')[0].contents[0]
    news_time = datetime.strptime(news_strtime.strip(), '%Y年%m月%d日%H:%M')
    print(news_title, news_time)
    for contents_lists in soup.select('#artibody p'):
        news_contents = contents_lists.text
        print(news_contents)

def news_comments(url, commentlist):
    # e.g. http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-fyhwefp0253530&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20&
    html = getHtml(url)
    # the comment API returns "var data={...}"; drop the prefix before parsing
    data = json.loads(html.replace('var data=', '', 1))
    num = int(data['result']['count']['total'])
    print('Current comment count:', num)
    if num != 0:
        for comments in data['result']['cmntlist']:
            comment_area = comments['area']
            comment_content = comments['content']
            commentlist.append([comment_area, comment_content])
    else:
        print('No comments yet')

def main():
    commentlist = []
    newslist = []
    start_url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback'
    num = input('Which page do you want to query? ')
    count_lines = 1
    if num.isdigit():
        url = start_url.format(int(num))
        html = getHtml(url)
        getnewslist(html, newslist)
        for newslines in newslist:
            print(count_lines, newslines[0])
            count_lines += 1
    else:
        print('Please enter a valid page number')
    page_num = input('Which article number do you want to read? ')
    if page_num.isdigit():
        page_url = newslist[int(page_num) - 1][1]
        printNewsContent(page_url)
        # extract the news id from the article URL to build the comment-API query
        c_url = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20&'
        m = re.search('doc-i(.+).shtml', page_url)
        newid = m.group(1)
        newid_url = c_url.format(newid)
        news_comments(newid_url, commentlist)
        print(commentlist)
    else:
        print('Please enter a valid article number')

main()
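Both APIs wrap their JSON payloads (newsloadercallback(...) for the news list, var data=... for the comments). As an alternative to the prefix handling in getHtml and news_comments, a single regex-based unwrapper could handle both cases; this is only a sketch, and the helper name unwrap_jsonp is mine rather than part of the original program:

import re
import json

def unwrap_jsonp(text):
    # grab the outermost {...} (or [...]) from a JSONP or "var data=" response
    m = re.search(r'\{.*\}|\[.*\]', text, re.S)
    return json.loads(m.group(0)) if m else None

print(unwrap_jsonp('newsloadercallback({"result": {"data": []}});'))
print(unwrap_jsonp('var data={"result": {"count": {"total": 0}}};'))

Greedy matching from the first brace to the last works here because each response contains exactly one top-level JSON value.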