xpath+多进程爬取八零电子书百合之恋分类下所有小说。
程序员文章站
2022-10-08 23:21:13
本文内容:代码、控制台输出、打开文件夹查看是否下载成功。done。……
代码
# Required libraries
import os
from multiprocessing import Pool

import requests
from lxml import etree

# Request headers sent with every HTTP request
headers = {
    'user-agent': 'mozilla/5.0 (windows nt 6.1; wow64) applewebkit/537.36 (khtml, like gecko) chrome/65.0.3325.181 safari/537.36'
}

# Seconds to wait for a server response before giving up on a request,
# so that one stalled connection cannot block a worker process forever.
REQUEST_TIMEOUT = 30

# Create the output directory. exist_ok avoids a race when worker processes
# re-import this module (spawn start method) and run this line concurrently.
pathname = './八零电子书/'
os.makedirs(pathname, exist_ok=True)


def _fetch_tree(url, detect_encoding=False):
    """Download *url* and return it parsed as an lxml HTML tree.

    When *detect_encoding* is true, the body is decoded with the encoding
    requests detects from the content instead of the one declared in the
    HTTP headers.
    """
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    if detect_encoding:
        response.encoding = response.apparent_encoding
    return etree.HTML(response.text)


def _list_books(url):
    """Return the book links found on one listing page ([] on failure)."""
    try:
        hrefs = _fetch_tree(url).xpath('//div[@class="book_bg"]/a/@href')
        # Plain str objects are returned so the result pickles cleanly
        # when it is sent back from a worker process.
        return [str(href) for href in hrefs]
    except Exception as err:
        print('get_booklist failed: {} ({!r})'.format(url, err))
        return []


# Fetch the list of books
def get_booklist(url):
    """Crawl every listing page of the category and download all its books.

    *url* is the first listing page (".../1.html"). The last page number is
    read from that page; pages 2..last are built from the same URL prefix,
    fetched in the module-level process pool ``pool``, and every book link
    found is then handed to ``get_book`` through the same pool.

    Must be called from the main process after ``pool`` has been created.
    Failures are reported on stdout and do not raise.
    """
    try:
        tree = _fetch_tree(url)
        last_page = int(tree.xpath('//a[@class="last"]/text()')[0])
        booklist = [str(href)
                    for href in tree.xpath('//div[@class="book_bg"]/a/@href')]

        # Remaining pages live next to the first one: <prefix>/<n>.html
        prefix = url.rsplit('/', 1)[0]
        page_urls = ['{}/{}.html'.format(prefix, i)
                     for i in range(2, last_page + 1)]

        # Workers only list links; all pool scheduling stays in the main
        # process because a pool cannot be used from inside its own workers.
        for page_books in pool.map(_list_books, page_urls):
            booklist.extend(page_books)

        pool.map(get_book, booklist)
    except Exception as err:
        print('get_booklist failed: {} ({!r})'.format(url, err))


# Fetch a single book
def get_book(url):
    """Open a book's detail page and download the book it links to.

    The second ``a#read_book`` link on the page is taken as the address of
    the table of contents. Failures are reported on stdout and do not raise.
    """
    try:
        tree = _fetch_tree(url)
        mulu = tree.xpath('//a[@id="read_book"]/@href')[1]
        get_mulu(mulu)
    except Exception as err:
        print('get_book failed: {} ({!r})'.format(url, err))


# Fetch the table of contents of a book
def get_mulu(url):
    """Download every chapter linked from the table-of-contents page *url*.

    Chapters are fetched one after another in page order. Failures are
    reported on stdout and do not raise.
    """
    try:
        tree = _fetch_tree(url)
        for chapter_url in tree.xpath('//div[@id="yulan"]/li/a/@href'):
            get_content(chapter_url)
    except Exception as err:
        print('get_mulu failed: {} ({!r})'.format(url, err))


# Fetch the content of one chapter
def get_content(url):
    """Download one chapter and append it to ``<pathname><book name>.txt``.

    The chapter title is written first, followed by a blank line and then
    each text node of the content block on its own line. Failures are
    reported on stdout and do not raise.
    """
    try:
        # Chapter pages are decoded with the detected encoding.
        tree = _fetch_tree(url, detect_encoding=True)
        book_name = tree.xpath('//p[@class="text"]/a/text()')[1]
        zhangjie = tree.xpath('//div[@class="date"]/h1/text()')[0]
        contents = tree.xpath('//div[@id="content"]/text()')
        print(zhangjie+'..正在下载')
        # The context manager closes the file even if a write fails.
        with open(pathname+book_name+'.txt', 'a+', encoding='utf-8') as f:
            f.write(zhangjie+'\n\n')
            f.writelines(con+'\n' for con in contents)
    except Exception as err:
        print('get_content failed: {} ({!r})'.format(url, err))


# Program entry point
if __name__ == '__main__':
    url = 'https://www.80txt.la/sort5/1.html'
    # Create the process pool
    pool = Pool()
    try:
        # Start crawling
        get_booklist(url)
    finally:
        # Release the worker processes whether or not crawling succeeded.
        pool.close()
        pool.join()
控制台输出
e:\anaconda\python.exe e:/练习/最后阶段/0809/八零电子书.py 1第一章 捡到个小雌性..正在下载 01 遗嘱..正在下载 第一章 捡了东西不一定能换到钱..正在下载 2第二章 摔出了地球..正在下载 02 异变..正在下载 3第三章 这是个高科技世界..正在下载 第二章 爷爷!您是我的亲爷爷..正在下载 03 手镯..正在下载 第三章 不在新手村混的新手..正在下载 4第四章 所谓杌力..正在下载 第一章 我会打架..正在下载 04长生..正在下载
打开文件夹查看是否下载成功
done。