Python crawler example (1): novel downloader (this program is for testing only)
# -*- coding: UTF-8 -*-
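# Overall flow: search biqukan.com for the novel by title, open its index
# page, collect the chapter links from the 'listmain' div, then fetch each
# chapter's 'showtxt' div and save it as a .txt file under D:/爬虫下载的小说/.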
from bs4 import BeautifulSoup
import requests
import os
import time
import sys
pageserver = 'http://www.biqukan.com/'   # site root, prepended to relative links
findurl = 'https://www.biqukan.com/s.php?ie=gbk&s=2758772450457967865&q='   # search endpoint
head = {'User-Agent': 'Mozilla/5.0'}   # minimal User-Agent header for the search request
class downloader(object):
    def __init__(self, novelurl, novelname):
        self.server = pageserver
        self.textname = novelname      # novel title
        self.target = self.server + novelurl
        self.names = []                # chapter titles
        self.urls = []                 # chapter links
        self.nums = 0                  # chapter count

    def get_download_url(self):
        """Collect every chapter's link from the novel's index page."""
        req = requests.get(url=self.target)
        html = req.text
        div_bf = BeautifulSoup(html, features="html.parser")
        div = div_bf.find_all('div', class_='listmain')
        a_bf = BeautifulSoup(str(div[0]), features="html.parser")
        a = a_bf.find_all('a')
        self.nums = len(a[12:])        # drop the first 12 links (not real chapters) and count the rest
        for each in a[12:]:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))
"""
函数说明:获取章节内容
Parameters:
"""
def get_contents(self, target,num,name):
try:
req = requests.get(url=target,timeout=60)
html = req.text
bf = BeautifulSoup(html,features="html.parser")
texts = bf.find_all('div', class_='showtxt')
a=int(0)
while texts.__len__()==0:
if a==10:
pr('被抓取页面的异常,跳过该页面!','页面地址:%s,文档编号:%s,章节名:%s' % (target,num,name))
break
pr('被抓取页面异常,正在尝试第%s次重新抓取!'% str(a+1))
req = requests.get(url=target, timeout=60)
html = req.text
bf = BeautifulSoup(html, features="html.parser")
texts = bf.find_all('div', class_='showtxt')
a+=1
if texts.__len__()==0:
return '该页面因服务器异常,未能抓取到信息!'
texts = texts[0].text.replace('\xa0'*8,'\n\n')
return texts
except :
pr('被抓取页面的服务器异常,跳过该页面!','页面地址:%s,文档编号:%s,章节名:%s' % (target,num,name))
return '该页面因服务器异常,未能抓取到信息!'
"""
函数说明:将爬取的文章内容写入文件
Parameters:
name - 章节名称(string)
path - 当前路径下,小说保存名称(string)
text - 章节内容(string)
"""
def writer(self, name, path, text):
with open(path, 'w',encoding='utf-8') as f:
f.write(name + '\n')
f.write(text)
def pr(*args):
    """Print a timestamped message and append it to the download log."""
    s = ''
    for p in args:
        s += p + ' '
    st = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ' + s + '\n'
    with open('D:/爬虫下载的小说/下载日志.txt', 'a', encoding='utf-8') as f:
        f.write(st)
    print(st, end='')
def getpageurl(name):
    """Search the site for a novel by exact title; return its relative URL, or None."""
    pr('Searching for the novel you asked for, please wait...')
    req = requests.get(findurl + name, headers=head)
    html = req.text
    div_bf = BeautifulSoup(html, features="html.parser")
    div = div_bf.find_all('div', class_='type_show')
    a_bf = BeautifulSoup(str(div[0]), features="html.parser")
    ans = a_bf.find_all('a')
    for each in ans:
        if each.string == name:
            pr('Found the novel you asked for, download will start shortly:')
            return each.get('href')
    return None   # no exact match among the search results
if __name__ == '__main__':
    if not os.path.exists('D:/爬虫下载的小说/'):
        os.makedirs('D:/爬虫下载的小说/')
    pr('\n\n\n')
    pr('This program is only for testing against the biqukan site!')
    inp = input('%s Enter the title of the novel to search for:\n'
                % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    novelurl = getpageurl(inp)
    if novelurl is None:
        pr('The novel you asked for was not found on the biqukan site!')
        sys.exit(0)
    dl = downloader(novelurl, inp)
    dl.get_download_url()
    pr('The program created a folder named "爬虫下载的小说" in the root of your D: drive to hold the downloaded novels!')
    pr('Download of 《%s》 is starting:' % dl.textname)
    path = 'D:/爬虫下载的小说/%s/' % dl.textname
    if not os.path.exists(path):   # create the novel's own directory if needed
        os.makedirs(path)          # makedirs also creates intermediate directories
    for i in range(dl.nums):
        dl.writer(dl.names[i],
                  path + '%s %s.txt' % (str(i + 1), dl.names[i]),
                  dl.get_contents(dl.urls[i], i, dl.names[i]))
        pr('Downloaded: %.3f%%' % ((i + 1) / dl.nums * 100))
    pr('Download of 《%s》 finished' % dl.textname)
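The manual retry loop in get_contents only covers pages that come back without a 'showtxt' div. Transport-level failures (timeouts, 5xx responses) can instead be retried automatically with the retry support built into requests/urllib3. A minimal sketch, assuming requests and its bundled urllib3 are installed; the retry count and backoff values here are illustrative, not taken from the program above:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry up to 10 times on common transient server errors, with backoff.
retries = Retry(total=10, backoff_factor=0.5,
                status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))
# session.get(url, timeout=60) now retries failed requests transparently,
# so the calling code only keeps the content-level (empty 'showtxt') check.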