Python crawler mini-project 2: crawling Sina News for articles that contain a keyword
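The script below hits Sina's news search at https://search.sina.com.cn/?q=<keyword>&c=news&ie=utf-8, works out how many result pages exist, collects the title and URL of every hit, then downloads each article, keeps the paragraphs that contain the keyword, and writes everything to an Excel file. As a quick feel for the first step, here is a minimal sketch (it assumes, just like the full script, that the search page reports the total hit count in an element with class l_v2 and shows 20 results per page):

# Minimal sketch (assumption: the hit count sits in an element with class "l_v2"
# and each result page holds 20 hits, the same assumptions the full script makes).
import re
import requests
from bs4 import BeautifulSoup

keyword = '人'  # sample keyword, the same one used in the __main__ block below
url = 'https://search.sina.com.cn/?q={}&c=news&ie=utf-8'.format(keyword)
soup = BeautifulSoup(requests.get(url).text, 'lxml')
hits_text = soup.select('.l_v2')[0].text            # text that contains the total hit count
hits = int(''.join(re.findall(r'\d+', hits_text)))  # keep only the digits
pages = hits // 20 + 1                              # 20 hits per result page
print('{} hits -> {} result pages'.format(hits, pages))

The full script follows.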
# coding:utf8
import requests
from bs4 import BeautifulSoup
import re
import xlwt
class News:
    def __init__(self, searchName, searchArea='news'):
        self.head = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/61.0.3163.79 Safari/537.36",
            'Cookie': "UOR=,k.sina.com.cn,; SINAGLOBAL=171.217.92.127_1558922890.497388; Apache=171.217.92.127_"
                      "1558922890.497390; ULV=1558922899005:2:2:2:171.217.92.127_1558922890.497390:1558922890476; "
                      "U_TRS1=0000007f.8da65be9.5ceb5bda.d813a4df; U_TRS2=0000007f.8db85be9.5ceb5bda.5119306e; "
                      "WEB2_OTHER=ea4b3c7f7d9d067e32500238120cd1b6; SUB=_2AkMrt9TAf8NxqwJRmP0TxGvqZIh2zwjEieKd6yUbJRM"
                      "yHRl-yD83qh0gtRB6ADf6Lx2uvK1-lDL0cQSEo7_kxmqyVI1u; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFSsFZ"
                      "ZQkpLzpzZDRWlR4_j; ULOGIN_IMG=gz-1ce8d0f8e61941477ad657acb3495d473d56; UM_distinctid=16af7db4ddc"
                      "287-0dc58dcf9bf925-1781c36-100200-16af7db4ddd29d; lxlrttp=1556243090"
        }
        self.searchName = searchName
        self.searchArea = searchArea
        self.UrlData = []    # URLs of the crawled articles
        self.TitleData = []  # titles of the crawled articles
        self.NewsData = []   # contents of the crawled articles
        self.pageCount = 0
    def get_page_count(self):
        url = 'https://search.sina.com.cn/?q={}&c={}&ie=utf-8'  # search URL for the keyword
        response = requests.get(url.format(self.searchName, self.searchArea))
        # response.encoding = 'utf-8'
        html = response.text
        # parse the result page and check whether any news was found
        soup = BeautifulSoup(html, 'lxml')
        # print(soup)
        # the element with class l_v2 carries the hit count; if it is missing there are no results
        try:
            page = soup.select('.l_v2')[0].text
        except Exception as e:
            page = ''
            print(e)
        if page != '':
            purl = ''
            pageCount = re.findall(r'[0-9]\d*', page)  # pull the digits out of the hit-count text
            for x in pageCount:
                purl = purl + x
            print(purl)
            self.pageCount = int(purl) // 20 + 1  # total number of result pages (20 hits per page)
        else:
            self.pageCount = 0
        return self.pageCount
    # fetch the requested number of search-result pages
    def get_news_data(self):
        url = 'https://search.sina.com.cn/?q={}&c={}&ie=utf-8&sort=time&page={}'
        count = input('Found {} pages of results. Enter how many pages to crawl, or press q to crawl them all: '.format(self.pageCount))
        if count == 'q':
            count = self.pageCount
        print('Crawling started......')
        for x in range(1, int(count) + 1):
            responses = requests.get(url.format(self.searchName, self.searchArea, x), headers=self.head)
            # print(url.format(self.searchName, self.searchArea, x))
            html = responses.text
            soup = BeautifulSoup(html, 'lxml')
            reg = soup.select('h2 a')
            # print(reg)
            newsUrl = re.findall('<a href="(.*?)" target="_blank">.*?</a>', str(reg), re.S)    # article URLs
            newsTitle = re.findall('<a href=".*?" target="_blank">(.*?)</a>', str(reg), re.S)  # article titles
            newsTitle = re.sub('<.*?>', '', str(newsTitle))
            newsTitle = newsTitle[1:len(newsTitle) - 1].replace("'", '')
            titleData = newsTitle.split(',')  # article titles
            for i in range(len(titleData)):
                self.TitleData.append(titleData[i])
            # for i in range(len(titleData)):
            #     for j in range(len(titleData[i])):
            #         self.TitleData.append(titleData[i][j])
            for i in range(len(newsUrl)):
                self.UrlData.append(newsUrl[i])
    def get_news_content(self, url):
        # fetch the article page behind a result URL and extract its content
        responses = requests.get(url, headers=self.head)
        responses.encoding = 'utf-8'
        html = responses.text
        soup = BeautifulSoup(html, 'lxml')
        reg = soup.select('p')
        # regTitle = soup.select('h1')  # grab the page title, e.g. <h1 class="main-title">杨鸣:打3X3选郭少大韩做队友 颜值靠才华支撑</h1>
        # title = re.findall(r'.*?>(.*?)<.*?', str(regTitle[0]), re.S)
        # if len(title) > 1:
        #     for i in range(len(title) - 1):
        #         if title[i] != '':
        #             self.TitleData.append(title[i])
        #             break
        # else:
        #     self.TitleData.append(title[0])
        newsData = []   # holds one article's content, two-dimensional
        newsData1 = []  # holds one article's content, one-dimensional
        # special handling for the <p><font> markup
        if '<p><font>' in str(reg):
            reg = soup.select('p font')
            # print(reg)
            for x in reg:
                if len(x) > 0 and (self.searchName in str(x)):
                    data = re.findall('<font>(.*?)</font>', str(x), re.S)
                    newsData.append(data)
        else:
            # default handling for plain <p> markup
            for x in reg:
                if len(x) > 0 and (self.searchName in str(x)):
                    data = re.findall(r'<p(.*?)</p>', str(x), re.S)
                    newsData.append(data)
        # flatten the two-dimensional list into NewsData (article content)
        if len(newsData) == 0:
            newsData1 = []
        else:
            # print('newsData1 = {}'.format(newsData))
            for i in range(len(newsData)):
                if newsData[i][0] != '':
                    newsData1.append(newsData[i][0])
                else:
                    continue
        self.NewsData.append(newsData1)
    # wrapper: fetch every article, then write the data out
    def final_func(self):
        for x in range(len(self.UrlData)):
            self.get_news_content(self.UrlData[x])
        self.save_data_excel()

    # write the collected data to an Excel file
    def save_data_excel(self):
        urldata = self.UrlData
        newsdata = self.NewsData
        titledata = self.TitleData
        ExcelTitle = ['No.', 'Title', 'URL', 'Content']
        row = 0
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('News containing "' + self.searchName + '"', cell_overwrite_ok=True)
        # write the header row first
        print('Writing data to Excel....')
        for i in range(len(ExcelTitle)):
            sheet.write(row, i, ExcelTitle[i])
        for j in range(len(self.TitleData)):
            if len(newsdata[j]) != 0:
                row += 1
                sheet.write(row, 0, row)
                sheet.write(row, 1, titledata[j])
                sheet.write(row, 2, urldata[j])
                sheet.write(row, 3, '\n'.join(newsdata[j]))  # xlwt cannot write a list directly, so join the paragraphs
        book.save('D:\\test\\数据.xls')  # xlwt writes the binary .xls format, so use an .xls extension
        print('Done writing data!')
if __name__ == '__main__':
    news = News('人', 'news')
    news.get_page_count()
    news.get_news_data()
    news.final_func()
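To sanity-check the output, the workbook can be read straight back with xlrd; a minimal sketch, assuming the file was saved to the same path used in save_data_excel above:

# Minimal read-back sketch (assumption: the workbook was saved to D:\test\数据.xls as above).
import xlrd

book = xlrd.open_workbook('D:\\test\\数据.xls')
sheet = book.sheet_by_index(0)
print('rows written:', sheet.nrows)
for r in range(min(sheet.nrows, 5)):  # header row plus the first few records
    print(sheet.row_values(r))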
Hope you find this useful.