
Python Crawler Mini-Project 2: Scraping Sina News Content by Keyword
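
The script below searches Sina News (search.sina.com.cn) for a keyword, collects the URL and title of every matching result, downloads each article body with requests and BeautifulSoup, and writes everything to an Excel file with xlwt.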

# coding:utf8
import requests
from bs4 import BeautifulSoup
import re
import xlwt

class News:
    def __init__(self,searchName,searchArea='news'):
        self.head = {
            'User-Agent' :"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/61.0.3163.79 Safari/537.36",
            'Cookie' : "UOR=,k.sina.com.cn,; SINAGLOBAL=171.217.92.127_1558922890.497388; Apache=171.217.92.127_"
                       "1558922890.497390; ULV=1558922899005:2:2:2:171.217.92.127_1558922890.497390:1558922890476; "
                       "U_TRS1=0000007f.8da65be9.5ceb5bda.d813a4df; U_TRS2=0000007f.8db85be9.5ceb5bda.5119306e; "
                       "WEB2_OTHER=ea4b3c7f7d9d067e32500238120cd1b6; SUB=_2AkMrt9TAf8NxqwJRmP0TxGvqZIh2zwjEieKd6yUbJRM"
                       "yHRl-yD83qh0gtRB6ADf6Lx2uvK1-lDL0cQSEo7_kxmqyVI1u; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFSsFZ"
                       "ZQkpLzpzZDRWlR4_j; ULOGIN_IMG=gz-1ce8d0f8e61941477ad657acb3495d473d56; UM_distinctid=16af7db4ddc"
                       "287-0dc58dcf9bf925-1781c36-100200-16af7db4ddd29d; lxlrttp=1556243090"
        }
        self.searchName = searchName
        self.searchArea = searchArea
        self.UrlData = []    # scraped news URLs
        self.TitleData = []  # scraped news titles
        self.NewsData = []   # scraped news article bodies
        self.pageCount = 0

    def get_page_count(self):
        url = 'https://search.sina.com.cn/?q={}&c={}&ie=utf-8'  # search URL for the keyword
        response = requests.get(url.format(self.searchName, self.searchArea), headers=self.head)
        html = response.text
        # parse the results page and check whether any news was found
        soup = BeautifulSoup(html, 'lxml')
        # the element with class l_v2 holds the result count; if it is missing, there are no results
        try:
            page = soup.select('.l_v2')[0].text
        except Exception as e:
            page = ''
            print(e)
        if page != '':
            # the count may contain separators, so join every digit run into one number
            purl = ''.join(re.findall(r'[0-9]\d*', page))
            print(purl)
            self.pageCount = int(purl) // 20 + 1  # total pages, 20 results per page
        else:
            self.pageCount = 0
        return self.pageCount
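    # Illustrative walk-through of the count parsing above (hypothetical result text):
    #   page = '共找到1,234条相关新闻' -> re.findall(r'[0-9]\d*', page) == ['1', '234']
    #   ''.join(...) == '1234'       -> self.pageCount == 1234 // 20 + 1 == 62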

    # fetch every results page and collect the news URLs and titles
    def get_news_data(self):
        url = 'https://search.sina.com.cn/?q={}&c={}&ie=utf-8&sort=time&page={}'
        count = input('Found {} pages. Enter the number of pages to scrape, or press q (or just Enter) to scrape them all: '.format(self.pageCount))
        if count == 'q' or count.strip() == '':
            count = self.pageCount
        print('Scraping started...')

        for x in range(1, int(count) + 1):  # pages are 1-based, so include the last page
            responses = requests.get(url.format(self.searchName, self.searchArea, x), headers=self.head)
            html = responses.text
            soup = BeautifulSoup(html, 'lxml')
            # each result is an <h2><a href="...">title</a></h2>; read href and text directly
            # rather than re-parsing the tag string with regular expressions, so titles that
            # contain commas or nested tags stay intact and URLs stay aligned with titles
            for a in soup.select('h2 a'):
                href = a.get('href')
                if href:
                    self.UrlData.append(href)
                    self.TitleData.append(a.get_text().strip())
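    # Example of what one results page contributes (illustrative markup, not real data):
    #   <h2><a href="https://news.sina.com.cn/.../doc-xxxx.shtml" target="_blank">...</a></h2>
    #   -> the href is appended to UrlData and the anchor text to TitleData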

    def get_news_content(self, url):
        # fetch the body of one news page from its URL
        responses = requests.get(url, headers=self.head)
        responses.encoding = 'utf-8'
        html = responses.text
        soup = BeautifulSoup(html, 'lxml')
        reg = soup.select('p')
        newsData = []   # paragraphs of one article, one sub-list per match (2-D)
        newsData1 = []  # the same paragraphs flattened to 1-D
        # special-case pages whose paragraphs are wrapped as <p><font>...</font></p>
        if '<p><font>' in str(reg):
            reg = soup.select('p font')
            for x in reg:
                if len(x) > 0 and (self.searchName in str(x)):
                    data = re.findall('<font>(.*?)</font>', str(x), re.S)
                    newsData.append(data)
        else:
            # default handling for plain <p> paragraphs; get_text() strips the markup
            for x in reg:
                if len(x) > 0 and (self.searchName in str(x)):
                    newsData.append([x.get_text()])

        # flatten the 2-D buffer into 1-D (keeping every paragraph, not len - 1 of them)
        # and store it as this article's content
        for item in newsData:
            if item and item[0] != '':
                newsData1.append(item[0])
        self.NewsData.append(newsData1)
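    # Illustrative flattening step (hypothetical paragraphs):
    #   newsData  == [['para one'], ['para two']]
    #   newsData1 == ['para one', 'para two']   -> appended to self.NewsData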


    # wrapper: fetch the body of every collected URL, then save everything to Excel
    def final_func(self):
        for x in range(len(self.UrlData)):
            self.get_news_content(self.UrlData[x])
        self.save_data_excel()


    # save the collected data to an Excel file
    def save_data_excel(self):
        urldata = self.UrlData
        newsdata = self.NewsData
        titledata = self.TitleData
        ExcelTitle = ['No.', 'Title', 'URL', 'Content']
        row = 0
        book = xlwt.Workbook(encoding='utf-8')
        # cell_overwrite_ok must be a real boolean, not the string 'True'
        sheet = book.add_sheet('News containing "' + self.searchName + '"', cell_overwrite_ok=True)
        # write the header row first
        print('Writing data to Excel...')
        for i in range(len(ExcelTitle)):
            sheet.write(row, i, ExcelTitle[i])

        for j in range(len(self.TitleData)):
            if len(newsdata[j]) != 0:
                row += 1
                sheet.write(row, 0, row)
                sheet.write(row, 1, titledata[j])
                sheet.write(row, 2, urldata[j])
                # xlwt cannot write a list, so join the paragraphs into one string
                sheet.write(row, 3, '\n'.join(newsdata[j]))
        # xlwt produces the legacy .xls format, so save with an .xls extension
        book.save('D:\\test\\data.xls')
        print('Done writing!')


if __name__ == '__main__':
    news = News('人', 'news')  # keyword to search for and the channel to search in
    news.get_page_count()
    news.get_news_data()
    news.final_func()

Hope this helps!
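
As a side note, xlwt can only produce the legacy .xls format. If you would rather get a modern .xlsx file, the save step can be swapped for openpyxl. The sketch below is a minimal, illustrative alternative assuming openpyxl is installed (pip install openpyxl); save_data_xlsx and its output path are hypothetical names, not part of the original script.

# a minimal sketch of an .xlsx writer, assuming openpyxl is installed;
# save_data_xlsx and the output path are illustrative, not part of the original script
from openpyxl import Workbook

def save_data_xlsx(titles, urls, contents, keyword, path='D:\\test\\data.xlsx'):
    wb = Workbook()
    ws = wb.active
    ws.title = 'News containing ' + keyword
    ws.append(['No.', 'Title', 'URL', 'Content'])   # header row
    row = 0
    for title, url, paragraphs in zip(titles, urls, contents):
        if paragraphs:                              # skip articles with no extracted body
            row += 1
            ws.append([row, title, url, '\n'.join(paragraphs)])
    wb.save(path)

# usage with the class above:
#   save_data_xlsx(news.TitleData, news.UrlData, news.NewsData, news.searchName)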