欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

python_爬取新浪新闻

程序员文章站 2022-05-02 22:04:11
...

这段代码还可以优化,但我太懒了,就只写出个雏形。

下面是所用到的知识点的代码小结:

# Scrape one Sina news article: title, publish time, source, body paragraphs,
# and the article's total comment count.
# Example article: http://news.sina.com.cn/c/nd/2017-07-07/doc-ifyhwehx5324387.shtml
import requests
from bs4 import BeautifulSoup
import json
# For converting the Chinese timestamp string into a datetime object.
from datetime import datetime

r = requests.get('http://news.sina.com.cn/c/nd/2017-07-07/doc-ifyhwehx5324387.shtml')
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, 'html.parser')

# The article title lives in the element with id "artibodyTitle".
line = soup.select('#artibodyTitle')
print(line[0].text)
# print(soup.prettify())

# .contents lists a tag's direct children; the first child of .time-source
# is the bare timestamp text, e.g. "2017年07月07日11:21".
times = soup.select(".time-source")[0].contents[0].strip()
# Parse the Chinese-formatted timestamp into a datetime.
dt = datetime.strptime(times, '%Y年%m月%d日%H:%M')
# The originating news outlet is the first link inside .time-source.
print(soup.select(".time-source span a")[0].text)

# Body paragraphs; the last <p> is the editor byline, hence the [:-1] slice.
c = soup.select('#artibody p')[:-1]
print(c)
for text_news in soup.select('#artibody p'):
    print(text_news.text)
print(times)
print(soup.select('#commentCount1'))

# Comment-count API.  NOTE: the original code had a stray backslash inside
# the URL literal ('\compress' -> a literal "\" in the URL); the pieces below
# concatenate into the correct query string.
r1 = requests.get(
    'http://comment5.news.sina.com.cn/page/info?version=1&'
    'format=js&channel=gn&newsid=comos-fyhwehx5324387&group=&'
    'compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20')
r1.encoding = r1.apparent_encoding
# json quick notes (translated):
#   json.loads -- deserialize from a string in memory
#       e.g. d_json = json.dumps({'a': 1, 'b': 2}) gives '{"b": 2, "a": 1}',
#       and  d = json.loads(d_json) turns it back into a dict.
#   json.load  -- deserialize from an open file handle,
#       e.g. d = json.load(open('a.json'))
#   json.dump / json.dumps are the corresponding serializers.
#
# The API returns a JS assignment "var data={...}", not bare JSON.
# str.strip("var data=") removes a *character set* from both ends, not the
# prefix, and only worked by accident -- slice out the JSON object instead.
payload = r1.text
count = payload[payload.find('{'):payload.rfind('}') + 1]
countnum = json.loads(count)
print(countnum['result']['count']['total'])

下面是程序的完整源码:

import  requests
from bs4 import BeautifulSoup
import  json
from datetime import datetime
import re
def getHtml(url):
    """Fetch *url* and return its body text, ready for json.loads or bs4.

    If the response is a JSONP payload wrapped in ``newsloadercallback(...)``
    the wrapper is removed.  On any network failure a Chinese error-message
    string is returned instead of raising (original interface -- callers
    print/inspect the return value).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        text = r.text.strip()
        # NOTE: the original used lstrip(' newsloadercallback('), but
        # str.lstrip removes a *character set*, not a prefix -- it only
        # worked by accident.  Strip the JSONP wrapper explicitly, and only
        # when it is really present (this function also fetches plain HTML).
        prefix = 'newsloadercallback('
        if text.startswith(prefix):
            text = text[len(prefix):]
            if text.endswith(';'):
                text = text[:-1]
            if text.endswith(')'):
                text = text[:-1]
        return text
    except requests.RequestException:
        return '连接超时请检查网络'
def getnewslist(html, newslist):
    """Parse the news-roll JSON *html* and append [title, url] pairs.

    html     -- JSON text whose items live at result.data (a list of dicts
                with at least 'title' and 'url' keys).
    newslist -- list mutated in place; one [title, url] pair per item.

    Returns None on success; returns the string "抓取异常" when the JSON is
    malformed or the expected keys are missing (original interface).
    """
    try:
        for item in json.loads(html)['result']['data']:
            newslist.append([item['title'], item['url']])
    # Narrowed from a bare except: ValueError covers bad JSON,
    # KeyError/TypeError cover a payload with an unexpected shape.
    except (ValueError, KeyError, TypeError):
        return "抓取异常"


def printNewsContent(url):
    """Fetch one article page and print its title, publish time, and body.

    url -- Sina article page URL (e.g. .../doc-ifyhwehx5324387.shtml).
    Raises IndexError/ValueError if the page does not use the expected
    Sina article layout.
    """
    html_content = getHtml(url)  # fixed the 'hmtl_content' typo
    soup = BeautifulSoup(html_content, 'html.parser')
    news_title = soup.select('#artibodyTitle')[0].text
    # First child of #navtimeSource is the bare timestamp text,
    # e.g. "2017年07月07日11:21".
    news_strtime = soup.select('#navtimeSource')[0].contents[0]
    news_time = datetime.strptime(news_strtime.strip(), '%Y年%m月%d日%H:%M')
    print(news_title, news_time)
    for paragraph in soup.select('#artibody p'):
        print(paragraph.text)



def news_comments(url, commentlist):
    """Fetch the comment feed for one article and collect [area, content] pairs.

    url         -- comment5.news.sina.com.cn "info" API URL for the article,
                   e.g. ...info?version=1&format=js&channel=gn&newsid=comos-...
    commentlist -- list mutated in place; one [area, content] pair per comment.

    Prints the total comment count; prints a notice when there are none.
    """
    html = getHtml(url)
    # The API returns a JS assignment "var data={...}", not bare JSON.
    # str.lstrip('var data=') removes a *character set*, not the prefix,
    # so slice from the first '{' instead.  Parse once and reuse the result
    # (the original called json.loads twice on the same text).
    result = json.loads(html[html.find('{'):])['result']
    num = int(result['count']['total'])
    print('当前评论数', num)
    if num != 0:
        for comment in result['cmntlist']:
            commentlist.append([comment['area'], comment['content']])
    else:
        print('当前还没有评论')
def main():
    """Interactive driver: list one page of headlines from Sina's news roll,
    then print a chosen article's body and its comments."""
    commentlist = []
    newslist = []
    start_url = ('http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&'
                 'cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&'
                 'show_all=1&show_num=22&tag=1&format=json&page={}&'
                 'callback=newsloadercallback')
    num = input('你要查询的页数')
    if not num.isdigit():
        # Early return: the original fell through and then indexed an empty
        # newslist, crashing on the next prompt.
        print('请输入对应页数')
        return
    html = getHtml(start_url.format(int(num)))
    getnewslist(html, newslist)
    for count_lines, newslines in enumerate(newslist, start=1):
        print(count_lines, newslines[0])

    page_num = input('请输入你要查看的文章内容')
    # Guard both non-numeric input AND an out-of-range index -- the original
    # raised IndexError for e.g. "99", and duplicated this isdigit() check.
    if not (page_num.isdigit() and 1 <= int(page_num) <= len(newslist)):
        print('请输入正确的新闻')
        return
    page_url = newslist[int(page_num) - 1][1]
    printNewsContent(page_url)

    c_url = ('http://comment5.news.sina.com.cn/page/info?version=1&format=js&'
             'channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&'
             'page=1&page_size=20&')
    # Escape the dot: the original pattern 'doc-i(.+).shtml' let '.' match
    # any character before "shtml".
    m = re.search(r'doc-i(.+)\.shtml', page_url)
    if m:
        news_comments(c_url.format(m.group(1)), commentlist)
        print(commentlist)


main()