Python爬虫抓取豆瓣算法类书籍综合排名导出为XLS文件
程序员文章站
2022-08-17 13:27:20
Python爬虫抓取豆瓣“算法”标签书籍的综合排名并导出为XLS文件。运行环境:Python 3.7.4;依赖:requests 2.22.0、bs4 0.0.1、xlwt 1.3.0、urllib3 1.24.2 以及标准库 re。下文依次给出初始化、抓取网页、获取总页数、提取数据和导出XLS的实现,文末附完整代码。
Python爬虫抓取豆瓣算法类书籍综合排名导出为XLS文件
环境
Python 3.7.4
requests==2.22.0
bs4==0.0.1
xlwt==1.3.0
urllib3==1.24.2
re
初始化
def __init__(self):
    """Initialize URL templates, request headers, compiled regexes, and the XLS workbook."""
    # URL prefix/suffix; the tag segment is the URL-encoded word "算法" (algorithm)
    self.urlPrefix = 'https://book.douban.com/tag/%E7%AE%97%E6%B3%95?start='
    self.urlSuffix = '&type=T'
    # Browser-like User-Agent header so the request is not rejected as a bot
    self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}
    # Accumulates one field-list per scraped book
    self.books = []
    # Pre-compiled regular expressions for extracting each book field from the HTML
    self.detailURLPattern = re.compile(r'<a href="(.*?)".*>')
    self.titlePattern = re.compile(r'<a href=.*title="(.*?)">')
    self.imagePattern = re.compile(r'<img class="" src="(.*?)" .*>')
    self.publisherPattern = re.compile(r'<div class="pub">(.*?)</div>', re.S)
    self.ratingPattern = re.compile(r'<span class="rating_nums">(.*?)</span>')
    self.evaluatorsPattern = re.compile(r'<span class="pl">.*?(\d*?)人评价.*?</span>', re.S)
    self.introductionPattern = re.compile(r'<p>(.*?)</p>', re.S)
    self.purchaseLinkPattern = re.compile(r'<a href="(.*?)".*?</a>', re.S)
    self.pagesPattern = re.compile(r'<a href="/tag/.*?&type=T">(\d*?)</a>')
    # State for the XLS export (workbook, sheet, output path)
    self.workbook = xlwt.Workbook(encoding='utf-8')
    self.sheet = self.workbook.add_sheet('Books')
    self.xlsPath = './Books.xls'
抓取网页
def crawlPage(self, url):
    """Fetch *url* and return the HTML decoded as UTF-8, or None on failure."""
    # Build the request with the spoofed browser headers
    request = urllib.request.Request(headers = self.headers, url = url)
    page = None
    try:
        # Fetch the HTML page and decode it
        response = urllib.request.urlopen(request)
        page = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # Best-effort error handling: report the failure; page stays None
        print("Get page fail!")
        print(e)
    return page
获取总页数
def getTotalPages(self):
    """Return the highest page number found in the paginator of the first listing page."""
    # Build the first page's URL (start offset 0)
    url = self.urlPrefix + str(0) + self.urlSuffix
    # Download the page
    page = self.crawlPage(url)
    # Parse with the built-in HTML parser
    beautifulSoup = BeautifulSoup(page, 'html.parser')
    pageNumbers = []
    for subject in beautifulSoup.find_all('div', class_ = 'paginator'):
        subject = str(subject)
        # Extract the page-number links with the pre-compiled regex
        pageNumbers = re.findall(self.pagesPattern, subject)
    totalPageNumber = 0
    # Return the largest page number seen (0 when no paginator was found)
    for pageNumber in pageNumbers:
        totalPageNumber = max(totalPageNumber, int(pageNumber))
    return totalPageNumber
提取数据
def extractData(self):
    """Crawl every listing page and append one per-book field list to self.books."""
    totalPages = self.getTotalPages()
    for i in range(0, totalPages):
        # Build the page URL; the listing shows 20 books per page, so the
        # start offset advances in steps of 20
        url = self.urlPrefix + str(i * 20) + self.urlSuffix
        # Download the page
        page = self.crawlPage(url)
        # Parse with the built-in HTML parser
        beautifulSoup = BeautifulSoup(page, 'html.parser')
        for subject in beautifulSoup.find_all('li', class_ = 'subject-item'):
            subject = str(subject)
            ## Extract each field with the pre-compiled regular expressions
            book = []
            title = re.findall(self.titlePattern, subject)
            if len(title) > 0:
                title = title[0]
            else:
                title = 'Nothing'
            book.append(title)
            # NOTE(review): the unguarded [0] lookups below raise IndexError
            # if a book entry lacks the field — unlike title/rating above
            detailURL = re.findall(self.detailURLPattern, subject)[0]
            book.append(detailURL)
            imageURL = re.findall(self.imagePattern, subject)[0]
            book.append(imageURL)
            publisher = str(re.findall(self.publisherPattern, subject)[0]).replace(' ', '').replace('\n', '')
            book.append(publisher)
            rating = re.findall(self.ratingPattern, subject)
            if len(rating) > 0:
                rating = rating[0]
            else:
                rating = 'None'
            book.append(rating)
            evaluators = re.findall(self.evaluatorsPattern, subject)[0]
            book.append(evaluators)
            introduction = re.findall(self.introductionPattern, subject)
            if len(introduction) > 0:
                introduction = introduction[0]
            else:
                introduction = 'Nothing'
            book.append(introduction)
            purchaseLink = re.findall(self.purchaseLinkPattern, subject)
            if len(purchaseLink) > 1:
                # Index 1: the first <a> match is the detail link, the second the buy link
                purchaseLink = purchaseLink[1]
            else:
                purchaseLink = 'Nothing'
            book.append(purchaseLink)
            ## Append this book's fields to the result list
            self.books.append(book)
导出为XLS文件
def exportXLS(self):
    """Write a header row plus one row per collected book, then save to self.xlsPath."""
    if (len(self.books) == 0):
        # No data collected yet: warn and return without writing a file
        print("Get data first")
        return
    # Column headers, in the same order the fields were appended in extractData
    columns = ['title', 'detailURL', 'imageURL', 'publisher', 'rating', 'evaluators', 'introduction', 'purchaseLink']
    # Write the header row
    for column in range(0, len(columns)):
        self.sheet.write(0, column, columns[column])
    # Write one row per book (sheet rows start at 1; self.books is 0-based)
    for i in range(1, len(self.books) + 1):
        for column in range(0, len(columns)):
            self.sheet.write(i, column, self.books[i - 1][column])
    # Save the workbook as an XLS file
    self.workbook.save(self.xlsPath)
实现代码
# -*- coding:utf-8 -*-
import re
import urllib
# Explicit submodule imports: `import urllib` alone does NOT make
# urllib.request / urllib.error available in a clean interpreter.
import urllib.request
import urllib.error
from re import S, sub

from bs4 import BeautifulSoup
import xlwt
class Spider():
    """Scrape Douban's "algorithm" book tag listing and export the results to XLS.

    Typical use: ``Spider().extractData()`` followed by ``exportXLS()``.
    """

    # Douban paginates the tag listing 20 books per page.
    BOOKS_PER_PAGE = 20

    def __init__(self):
        # URL template: prefix + start offset + suffix; the tag segment is
        # the URL-encoded word "算法" (algorithm).
        self.urlPrefix = 'https://book.douban.com/tag/%E7%AE%97%E6%B3%95?start='
        self.urlSuffix = '&type=T'
        # Browser-like User-Agent so the request is not rejected as a bot.
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}
        # One field-list per scraped book.
        self.books = []
        # Pre-compiled regexes for per-book field extraction.
        self.detailURLPattern = re.compile(r'<a href="(.*?)".*>')
        self.titlePattern = re.compile(r'<a href=.*title="(.*?)">')
        self.imagePattern = re.compile(r'<img class="" src="(.*?)" .*>')
        self.publisherPattern = re.compile(r'<div class="pub">(.*?)</div>', re.S)
        self.ratingPattern = re.compile(r'<span class="rating_nums">(.*?)</span>')
        self.evaluatorsPattern = re.compile(r'<span class="pl">.*?(\d*?)人评价.*?</span>', re.S)
        self.introductionPattern = re.compile(r'<p>(.*?)</p>', re.S)
        self.purchaseLinkPattern = re.compile(r'<a href="(.*?)".*?</a>', re.S)
        self.pagesPattern = re.compile(r'<a href="/tag/.*?&type=T">(\d*?)</a>')
        # XLS export state (workbook, sheet, output path).
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('Books')
        self.xlsPath = './Books.xls'

    @staticmethod
    def _firstMatch(pattern, text, default='Nothing', index=0):
        """Return capture *index* of *pattern* in *text*, or *default* when absent.

        Replaces the original unguarded ``re.findall(...)[0]`` lookups, which
        raised IndexError whenever a book entry lacked a field.
        """
        matches = re.findall(pattern, text)
        return matches[index] if len(matches) > index else default

    def crawlPage(self, url):
        """Fetch *url* and return its HTML decoded as UTF-8, or None on failure."""
        request = urllib.request.Request(headers=self.headers, url=url)
        page = None
        try:
            response = urllib.request.urlopen(request)
            page = response.read().decode('utf-8')
        except urllib.error.URLError as e:
            # Best-effort: report the failure; the caller receives None.
            print("Get page fail!")
            print(e)
        return page

    def getTotalPages(self):
        """Return the highest page number in the paginator of the first page (0 on failure)."""
        url = self.urlPrefix + str(0) + self.urlSuffix
        page = self.crawlPage(url)
        if page is None:
            # Download failed; the original passed None to BeautifulSoup and crashed.
            return 0
        beautifulSoup = BeautifulSoup(page, 'html.parser')
        totalPageNumber = 0
        for paginator in beautifulSoup.find_all('div', class_='paginator'):
            for pageNumber in re.findall(self.pagesPattern, str(paginator)):
                totalPageNumber = max(totalPageNumber, int(pageNumber))
        return totalPageNumber

    def extractData(self):
        """Crawl every listing page and append one per-book field list to self.books."""
        totalPages = self.getTotalPages()
        for i in range(0, totalPages):
            url = self.urlPrefix + str(i * self.BOOKS_PER_PAGE) + self.urlSuffix
            page = self.crawlPage(url)
            if page is None:
                # Skip pages that failed to download instead of crashing.
                continue
            beautifulSoup = BeautifulSoup(page, 'html.parser')
            for subject in beautifulSoup.find_all('li', class_='subject-item'):
                subject = str(subject)
                book = []
                book.append(self._firstMatch(self.titlePattern, subject))
                book.append(self._firstMatch(self.detailURLPattern, subject))
                book.append(self._firstMatch(self.imagePattern, subject))
                # Strip layout whitespace from the publisher string.
                publisher = self._firstMatch(self.publisherPattern, subject, default='').replace(' ', '').replace('\n', '')
                book.append(publisher)
                book.append(self._firstMatch(self.ratingPattern, subject, default='None'))
                book.append(self._firstMatch(self.evaluatorsPattern, subject))
                book.append(self._firstMatch(self.introductionPattern, subject))
                # The first <a> match is the detail link; the purchase link is the second.
                book.append(self._firstMatch(self.purchaseLinkPattern, subject, index=1))
                self.books.append(book)

    def exportXLS(self):
        """Write a header row plus one row per collected book, then save to self.xlsPath."""
        if (len(self.books) == 0):
            # Nothing collected yet: warn and return without writing a file.
            print("Get data first")
            return
        # Field order matches the append order in extractData.
        columns = ['title', 'detailURL', 'imageURL', 'publisher', 'rating', 'evaluators', 'introduction', 'purchaseLink']
        # Header row.
        for column, name in enumerate(columns):
            self.sheet.write(0, column, name)
        # Data rows start at sheet row 1.
        for rowIndex, book in enumerate(self.books, start=1):
            for column in range(len(columns)):
                self.sheet.write(rowIndex, column, book[column])
        self.workbook.save(self.xlsPath)
if __name__ == "__main__":
    # Script entry point: crawl all listing pages, then export to ./Books.xls.
    spider = Spider()
    spider.extractData()
    spider.exportXLS()
输出结果
最后
- 由于博主水平有限,不免有疏漏之处,欢迎读者随时批评指正,以免造成不必要的误解!
本文地址:https://blog.csdn.net/qq_44486439/article/details/107312777
上一篇: 2020年学习总结及经验分享
下一篇: AsyncTask详解