Scraping a novel with a Python crawler and saving it to local text files
Environment: Python 3.6, PyCharm 2017.3, Windows 10 RS3, requests 2.18.4, BeautifulSoup 4.6.0
Introduction
Run result demonstration.
① book136_singleprocess.py — saving the novel's chapters with a single process
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : Woolei
# @File : book136_singleprocess.py

import os
import re
import time

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
}


# Fetch one chapter's content and append it to a text file named after the chapter.
def getChapterContent(each_chapter_dict):
    content_html = requests.get(each_chapter_dict['chapter_url'], headers=headers).text
    soup = BeautifulSoup(content_html, 'lxml')
    # Chapter body container. The original looked for a <p id="content">, but a
    # parsed <p> cannot contain other <p> tags, so the container must be the <div>.
    content_tag = soup.find('div', {'id': 'content'})
    p_tags = content_tag.find_all('p')
    print('Saving chapter --> ' + each_chapter_dict['name'])
    with open(each_chapter_dict['name'] + '.txt', 'a', encoding='utf8') as f:
        for each in p_tags:
            paragraph = each.get_text().strip()
            f.write('  ' + paragraph + '\n\n')


# Collect the name and URL of every chapter of the novel.
def getChapterInfo(novel_url):
    chapter_html = requests.get(novel_url, headers=headers).text
    soup = BeautifulSoup(chapter_html, 'lxml')
    chapter_list = soup.find_all('li')
    chapter_all_dict = {}
    for each in chapter_list:
        chapter_each = {}
        chapter_each['name'] = each.find('a').get_text()            # chapter title
        chapter_each['chapter_url'] = each.find('a')['href']        # chapter URL
        chapter_num = int(re.findall(r'\d+', each.get_text())[0])   # chapter number
        chapter_all_dict[chapter_num] = chapter_each                # index all chapters by number
    return chapter_all_dict


if __name__ == '__main__':
    start = time.perf_counter()  # record the start time (time.clock() is deprecated and was removed in Python 3.8)
    novel_url = 'https://www.136book.com/sanshengsanshimenglitaohua/'  # "Three Lives Three Worlds, Ten Miles of Peach Blossoms" as the example
    novel_info = getChapterInfo(novel_url)  # chapter index of the novel
    dir_name = '保存的小说路径'  # directory for the saved novel
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    os.chdir(dir_name)  # switch into the save directory
    for each in novel_info:
        getChapterContent(novel_info[each])
        # time.sleep(1)
    end = time.perf_counter()  # record the end time
    print('Done: saved %d chapters in %f s' % (len(novel_info), (end - start)))
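One fragility worth noting: the requests.get() calls above have no timeout, so a single stalled response can hang the whole run. A minimal hardening sketch (my addition, not part of the original script; the fetch_html name is hypothetical) that adds a timeout and simple retries:

import requests
from requests.exceptions import RequestException

def fetch_html(url, headers, retries=3, timeout=10):
    # Give the server `timeout` seconds per attempt, and retry a few times
    # before giving up, instead of hanging indefinitely.
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()  # treat HTTP 4xx/5xx as failures too
            return resp.text
        except RequestException as err:
            print('Request failed (%s), attempt %d/%d' % (err, attempt, retries))
    return None

Either getChapterContent or getChapterInfo could call a helper like this in place of the bare requests.get().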
While downloading and saving this way, I found that it takes quite a long time, so the plan is to use multiprocessing to improve efficiency.
② book136_multiprocess.py — saving the novel's chapters with multiple processes
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : Woolei
# @File : book136_2.py

import os
import re
import time

import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool

url = 'https://www.136book.com/huaqiangu/ebxeeql/'  # (left over from testing; unused below)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
}


# Fetch one chapter's content and append it to a text file named after the chapter.
def getChapterContent(each_chapter_dict):
    content_html = requests.get(each_chapter_dict['chapter_url'], headers=headers).text
    soup = BeautifulSoup(content_html, 'lxml')
    content_tag = soup.find('div', {'id': 'content'})  # chapter body container (see the note in script ①)
    p_tags = content_tag.find_all('p')
    print('Saving chapter --> ' + each_chapter_dict['name'])
    with open(each_chapter_dict['name'] + '.txt', 'a', encoding='utf8') as f:
        for each in p_tags:
            paragraph = each.get_text().strip()
            f.write('  ' + paragraph + '\n\n')


# Collect the name and URL of every chapter of the novel.
def getChapterInfo(novel_url):
    chapter_html = requests.get(novel_url, headers=headers).text
    soup = BeautifulSoup(chapter_html, 'lxml')
    chapter_list = soup.find_all('li')
    chapter_all_dict = {}
    for each in chapter_list:
        chapter_each = {}
        chapter_each['name'] = each.find('a').get_text()            # chapter title
        chapter_each['chapter_url'] = each.find('a')['href']        # chapter URL
        chapter_num = int(re.findall(r'\d+', each.get_text())[0])   # chapter number
        chapter_all_dict[chapter_num] = chapter_each                # index all chapters by number
    return chapter_all_dict


if __name__ == '__main__':
    start = time.perf_counter()
    novel_url = 'https://www.136book.com/sanshengsanshimenglitaohua/'
    novel_info = getChapterInfo(novel_url)
    dir_name = '保存的小说路径'  # directory for the saved novel
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    os.chdir(dir_name)
    pool = Pool(processes=10)  # create a pool of 10 worker processes
    pool.map(getChapterContent, [novel_info[each] for each in novel_info])
    pool.close()
    pool.join()
    end = time.perf_counter()
    print('Multiprocess run done: saved %d chapters in %f s' % (len(novel_info), (end - start)))
While the script runs, Task Manager shows the 10 child processes that were created (processes=10). Using more processes can improve throughput, but spawning too many without regard for the machine's capacity will drag overall performance down badly.
Last time I created 200 processes, and the computer crashed outright.
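A safer way to size the pool (a sketch, reusing getChapterContent and novel_info from script ② above; the cap of 10 mirrors the original) is to derive it from the machine's core count with os.cpu_count() rather than hard-coding it:

import os
from multiprocessing import Pool

if __name__ == '__main__':
    # Never spawn more workers than CPU cores (os.cpu_count() can return None).
    n_workers = min(10, os.cpu_count() or 1)
    with Pool(processes=n_workers) as pool:
        # map() blocks until every chapter is saved; the with-block then tears the pool down.
        pool.map(getChapterContent, list(novel_info.values()))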
As an improvement, the file-saving logic could be moved into a SaveFile class whose methods save chapters as .txt, .md, .html, and other formats for easier reading, as sketched below. A proper update will follow later.
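Until that update lands, here is one minimal sketch of what such a class could look like (the SaveFile name comes from the paragraph above; the method names and exact output formats are my own assumptions):

import os

class SaveFile:
    # Hypothetical sketch: persist a chapter's paragraphs as .txt, .md, or .html.
    def __init__(self, directory):
        self.directory = directory
        os.makedirs(directory, exist_ok=True)

    def _path(self, name, ext):
        return os.path.join(self.directory, name + ext)

    def save_txt(self, name, paragraphs):
        with open(self._path(name, '.txt'), 'w', encoding='utf8') as f:
            f.write('\n\n'.join(paragraphs))

    def save_md(self, name, paragraphs):
        with open(self._path(name, '.md'), 'w', encoding='utf8') as f:
            f.write('# ' + name + '\n\n' + '\n\n'.join(paragraphs))

    def save_html(self, name, paragraphs):
        body = '\n'.join('<p>%s</p>' % p for p in paragraphs)
        page = ('<html><head><meta charset="utf-8"><title>%s</title></head>'
                '<body><h1>%s</h1>%s</body></html>') % (name, name, body)
        with open(self._path(name, '.html'), 'w', encoding='utf8') as f:
            f.write(page)

getChapterContent() would then collect a chapter's paragraphs into a list and call, say, SaveFile('保存的小说路径').save_txt(each_chapter_dict['name'], paragraphs).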