
Python web scraper example (1): novel downloader (this program is for testing only)


# -*- coding: UTF-8 -*-

from bs4 import BeautifulSoup
import requests
import os
import time
import sys

pageserver = 'http://www.biqukan.com/'
findurl = 'https://www.biqukan.com/s.php?ie=gbk&s=2758772450457967865&q='
head = {'User-Agent': 'Mozilla/5.0'}

class downloader(object):

    def __init__(self, novelurl, novelname):
        self.server = pageserver
        self.textname = novelname    # novel title
        self.target = self.server + novelurl
        self.names = []              # chapter titles
        self.urls = []               # chapter URLs
        self.nums = 0                # chapter count

"""
函数说明:获取下载链接
"""
def get_download_url(self):
    req = requests.get(url=self.target)
    html = req.text
    div_bf = BeautifulSoup(html,features="html.parser")
    div = div_bf.find_all('div', class_='listmain')
    a_bf = BeautifulSoup(str(div[0]),features="html.parser")
    a = a_bf.find_all('a')
    self.nums = len(a[12:])                                #剔除不必要的章节,并统计章节数
    for each in a[12:]:
        self.names.append(each.string)
        self.urls.append(self.server + each.get('href'))

"""
函数说明:获取章节内容
Parameters:
"""
def get_contents(self, target,num,name):
    try:
        req = requests.get(url=target,timeout=60)
        html = req.text
        bf = BeautifulSoup(html,features="html.parser")
        texts = bf.find_all('div', class_='showtxt')
        a=int(0)
        while texts.__len__()==0:
            if a==10:
                pr('被抓取页面的异常,跳过该页面!','页面地址:%s,文档编号:%s,章节名:%s' % (target,num,name))
                break
            pr('被抓取页面异常,正在尝试第%s次重新抓取!'% str(a+1))
            req = requests.get(url=target, timeout=60)
            html = req.text
            bf = BeautifulSoup(html, features="html.parser")
            texts = bf.find_all('div', class_='showtxt')
            a+=1
        if texts.__len__()==0:
            return '该页面因服务器异常,未能抓取到信息!'
        texts = texts[0].text.replace('\xa0'*8,'\n\n')
        return texts
    except :
        pr('被抓取页面的服务器异常,跳过该页面!','页面地址:%s,文档编号:%s,章节名:%s' % (target,num,name))
        return '该页面因服务器异常,未能抓取到信息!'

"""
函数说明:将爬取的文章内容写入文件
Parameters:
    name - 章节名称(string)
    path - 当前路径下,小说保存名称(string)
    text - 章节内容(string)
"""
def writer(self, name, path, text):
    with open(path, 'w',encoding='utf-8') as f:
        f.write(name + '\n')
        f.write(text)
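
The hand-rolled retry loop in get_contents can also be expressed with requests' built-in retry support, which handles connection errors and 5xx responses at the transport level. A minimal sketch, separate from the program above; the retry parameters are illustrative, not tuned:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    retries = Retry(total=10, backoff_factor=0.5,
                    status_forcelist=[500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    # session.get(url, timeout=60) now retries automatically

Note this only covers transport-level failures; a page that returns 200 without a showtxt div would still need the empty-result check used in get_contents.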

def pr(*parts):
    # timestamped logger: append one line to the download log and echo it to the console
    s = ''
    for p in parts:
        s += str(p) + ' '
    st = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ' + s + '\n'
    with open('D:/爬虫下载的小说/下载日志.txt', 'a', encoding='utf-8') as f:
        f.write(st)
    print(st, end='')
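
pr() is essentially a timestamped file-plus-console logger, which the standard logging module provides out of the box. A minimal sketch assuming the same log path; the format string is illustrative:

    import logging

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[
            logging.FileHandler('D:/爬虫下载的小说/下载日志.txt', encoding='utf-8'),
            logging.StreamHandler(),
        ],
    )
    logging.info('Downloaded: %.3f%%', 12.5)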

def getpageurl(name):
    pr('Searching for the novel you requested, please wait...')
    req = requests.get(findurl + name, headers=head)
    html = req.text
    div_bf = BeautifulSoup(html, features="html.parser")
    div = div_bf.find_all('div', class_='type_show')
    a_bf = BeautifulSoup(str(div[0]), features="html.parser")
    ans = a_bf.find_all('a')
    for each in ans:
        if each.string == name:
            pr('Found the novel you requested; the download will start now:')
            return each.get('href')
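
Since findurl passes ie=gbk, the search endpoint presumably expects a GBK-encoded query, while appending the raw title leaves the encoding up to requests. A minimal sketch of encoding the query explicitly; whether the endpoint accepts percent-encoded GBK is an assumption about the site, not something verified here:

    from urllib.parse import quote

    title = '一念永恒'                               # example title, illustrative only
    url = findurl + quote(title, encoding='gbk')    # percent-encode the query as GBK
    req = requests.get(url, headers=head)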

if __name__ == "__main__":
    if not os.path.exists('D:/爬虫下载的小说/'):
        os.makedirs('D:/爬虫下载的小说/')
    pr('\n\n\n')
    pr('This program is only for testing against the biqukan site!')
    inp = input('%s Enter the title of the novel you are looking for:\n' % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    novelurl = getpageurl(inp)
    if novelurl is None:
        pr('The novel you want was not found on the biqukan site!')
        sys.exit(0)
    dl = downloader(novelurl, inp)
    dl.get_download_url()
    pr('The program created a folder named "爬虫下载的小说" in the root of your D: drive to hold the downloaded novels!')
    pr('"%s" download starting:' % dl.textname)
    path = 'D:/爬虫下载的小说/%s/' % dl.textname
    isExists = os.path.exists(path)
    if not isExists:           # create the directory if it does not exist
        os.makedirs(path)      # creates intermediate directories as needed
    for i in range(dl.nums):
        dl.writer(dl.names[i], path + '%s %s.txt' % (str(i + 1), dl.names[i]),
                  dl.get_contents(dl.urls[i], i, dl.names[i]))
        pr('Downloaded: %.3f%%' % ((i + 1) / dl.nums * 100))
    pr('"%s" download finished' % dl.textname)
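
For reference, the chapter-index parsing in get_download_url can be sanity-checked offline against a simplified copy of the listmain markup. A minimal sketch; the HTML snippet is invented for illustration and far shorter than the real page:

    from bs4 import BeautifulSoup

    sample = '''<div class="listmain"><dl>
    <dd><a href="/1_1094/5403177.html">Chapter 1</a></dd>
    <dd><a href="/1_1094/5403178.html">Chapter 2</a></dd>
    </dl></div>'''
    soup = BeautifulSoup(sample, features="html.parser")
    div = soup.find_all('div', class_='listmain')
    for a in BeautifulSoup(str(div[0]), features="html.parser").find_all('a'):
        print(a.string, a.get('href'))

On the real index page the leading links are shortcuts rather than chapters, which is why get_download_url slices with a[12:] before counting.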
