Python: Scraping Sina News
This code could certainly be optimized, but I was too lazy and only wrote a rough first draft.
Below is a code summary of the knowledge points used:
#http://news.sina.com.cn/c/nd/2017-07-07/doc-ifyhwehx5324387.shtml
import requests
from bs4 import BeautifulSoup
import json
#for converting time strings
from datetime import datetime
r=requests.get('http://news.sina.com.cn/c/nd/2017-07-07/doc-ifyhwehx5324387.shtml')
r.encoding=r.apparent_encoding
data=r.text
soup=BeautifulSoup(data,'html.parser')
line=soup.select('#artibodyTitle')
print (line[0].text)
#print(soup.prettify())
#.contents is a list of the tag's direct children
times=soup.select(".time-source")[0].contents[0].strip()
#parse the string into a datetime with strptime (dt.strftime formats it back)
dt=datetime.strptime(times,'%Y年%m月%d日%H:%M')
print(soup.select(".time-source span a")[0].text)
c=soup.select('#artibody p')[:-1]
print(c)
for text_news in soup.select('#artibody p'):
print(text_news.text)
print(times)
print(soup.select('#commentCount1'))
#''.join(seq) joins the elements of seq with the separator string, e.g. ','.join(['a','b']) == 'a,b'
r1=requests.get('http://comment5.news.sina.com.cn/page/info?version=1&'
                'format=js&channel=gn&newsid=comos-fyhwehx5324387&group=&'
                'compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20')
r1.encoding=r1.apparent_encoding
#json
'''
load and loads both deserialize JSON; the difference (in Python) is:
loads works on an in-memory string, turning JSON text back into built-in Python objects.
E.g. d_json=json.dumps({'a':1, 'b':2}) serializes a dict into the string '{"a": 1, "b": 2}',
and d=json.loads(d_json) deserializes that string back into a dict.
load works on a file handle:
if there is a local JSON file a.json, then d=json.load(open('a.json')).
Correspondingly, dump serializes a built-in type to JSON and writes it to a file.
'''
#str.strip('var data=') strips a *set* of characters from both ends, not the literal prefix, so remove it explicitly
count=r1.text.replace('var data=','',1)
countnum=json.loads(count)
print(countnum['result']['count']['total'])
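To make the loads/dumps and load/dump pairing above concrete, here is a minimal standalone round-trip demo (the file name demo.json is just an example, not part of the scraper):

import json

obj = {'a': 1, 'b': 2}
s = json.dumps(obj)              # dict -> JSON string
assert json.loads(s) == obj      # JSON string -> dict

with open('demo.json', 'w') as f:
    json.dump(obj, f)            # dict -> JSON file
with open('demo.json') as f:
    assert json.load(f) == obj   # JSON file -> dict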
Here is the full source code of the program:
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import re

def getHtml(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        text = r.text.strip()
        # the roll-news API returns JSONP: newsloadercallback({...});
        # peel off the wrapper explicitly (str.lstrip would strip a character set, not a prefix)
        if text.startswith('newsloadercallback('):
            text = text[len('newsloadercallback('):].rstrip(');')
        return text
    except requests.RequestException:
        return 'Connection timed out; please check your network'

def getnewslist(html, newslist):
    try:
        json_lines = json.loads(html)['result']['data']
        for lines in json_lines:
            news_lines = lines['title']
            news_href = lines['url']
            newslist.append([news_lines, news_href])
    except (json.JSONDecodeError, KeyError):
        return 'Failed to parse the news list'

def printNewsContent(url):
    html_content = getHtml(url)
    soup = BeautifulSoup(html_content, 'html.parser')
    news_title = soup.select('#artibodyTitle')[0].text
    news_strtime = soup.select('#navtimeSource')[0].contents[0]
    news_time = datetime.strptime(news_strtime.strip(), '%Y年%m月%d日%H:%M')
    print(news_title, news_time)
    for contents_lists in soup.select('#artibody p'):
        news_contents = contents_lists.text
        print(news_contents)

def news_comments(url, commentlist):
    # e.g. http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-fyhwefp0253530&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20&
    html = getHtml(url)
    # the comment API returns "var data={...}"; drop the prefix before parsing
    data = json.loads(html.replace('var data=', '', 1))
    num = int(data['result']['count']['total'])
    print('Current comment count:', num)
    if num != 0:
        for comments in data['result']['cmntlist']:
            comment_area = comments['area']
            comment_content = comments['content']
            commentlist.append([comment_area, comment_content])
    else:
        print('No comments yet')

def main():
    commentlist = []
    newslist = []
    start_url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback'
    num = input('Which page do you want to query? ')
    count_lines = 1
    if num.isdigit():
        url = start_url.format(int(num))
        html = getHtml(url)
        getnewslist(html, newslist)
        for newslines in newslist:
            print(count_lines, newslines[0])
            count_lines += 1
    else:
        print('Please enter a valid page number')
    page_num = input('Which article number do you want to read? ')
    if page_num.isdigit():
        page_url = newslist[int(page_num) - 1][1]
        printNewsContent(page_url)
        # extract the news id from the article URL to build the comment-API query
        c_url = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20&'
        m = re.search('doc-i(.+).shtml', page_url)
        newid = m.group(1)
        newid_url = c_url.format(newid)
        news_comments(newid_url, commentlist)
        print(commentlist)
    else:
        print('Please enter a valid article number')

main()
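Both APIs wrap their JSON payloads (newsloadercallback(...) for the news list, var data=... for the comments). As an alternative to the prefix handling in getHtml and news_comments, a single regex-based unwrapper could handle both cases; this is only a sketch, and the helper name unwrap_jsonp is mine rather than part of the original program:

import re
import json

def unwrap_jsonp(text):
    # grab the outermost {...} (or [...]) from a JSONP or "var data=" response
    m = re.search(r'\{.*\}|\[.*\]', text, re.S)
    return json.loads(m.group(0)) if m else None

print(unwrap_jsonp('newsloadercallback({"result": {"data": []}});'))
print(unwrap_jsonp('var data={"result": {"count": {"total": 0}}};'))

Greedy matching from the first brace to the last works here because each response contains exactly one top-level JSON value.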