基于python的文本与图片爬虫脚本

程序员文章站 2022-06-15 18:37:16

一个自动下载小说的爬虫脚本，针对笔趣网的小说爬取...

一、一个自动下载小说的爬虫脚本（附带解释）——静态网页

#一个自动下载小说的类，目前只针对笔趣网
#尚未解决的问题：重复章节等无用信息的删除、适配不同网站（编码方式等）、排版
from bs4 import BeautifulSoup
import sys, requests

class txt_downloader():
    """Download a novel chapter by chapter from a statically rendered site.

    The page markup expected here is the one used by biqukan.com: the chapter
    index lives inside ``<div class="listmain">`` and the chapter text inside
    ``<div class="showtxt">``. Duplicate chapter links are not filtered and the
    saved text is not re-formatted.
    """

    def __init__(self, Source_url, chapter_url, encoding='gbk', timeout=10):
        """Store the target URLs and initialise the (empty) chapter index.

        Args:
            Source_url: Root URL of the site; chapter hrefs are resolved against it.
            chapter_url: URL of the novel's chapter-index page.
            encoding: Text encoding forced on every downloaded page.
            timeout: Per-request timeout in seconds.
        """
        self.chapter_URL = []           # absolute URL of every chapter
        self.chapter_name = []          # chapter titles, parallel to chapter_URL
        self.nums = 0                   # number of chapters found by Get_links
        self.Source_url = Source_url    # site root
        self.chapter_url = chapter_url  # chapter-index page of one novel
        self.encoding = encoding
        self.timeout = timeout

    def _fetch_soup(self, url):
        """Fetch *url* and return the page parsed as a BeautifulSoup document."""
        response = requests.get(url=url, timeout=self.timeout)
        response.raise_for_status()
        # Override the charset guessed by requests so Chinese text is not garbled.
        response.encoding = self.encoding
        # Name the parser explicitly so results do not depend on what is installed.
        return BeautifulSoup(response.text, 'html.parser')

    def Get_links(self):
        """Populate ``chapter_name``, ``chapter_URL`` and ``nums`` from the index page.

        Each call rebuilds the lists from scratch, so calling it twice does not
        duplicate entries.

        Raises:
            IndexError: If the index page has no ``<div class="listmain">``.
        """
        from urllib.parse import urljoin

        soup = self._fetch_soup(self.chapter_url)
        container = soup.find('div', class_='listmain')
        if container is None:
            raise IndexError(
                "no <div class='listmain'> found at %s" % self.chapter_url)

        names, urls = [], []
        for anchor in container.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue  # an anchor without a target cannot be downloaded
            # get_text() always returns a str; .string is None for nested markup.
            names.append(anchor.get_text())
            # urljoin avoids the '//' that plain concatenation produces when the
            # href is already root-relative.
            urls.append(urljoin(self.Source_url, href))

        self.chapter_name = names
        self.chapter_URL = urls
        self.nums = len(urls)

    def Get_content(self, content_url):
        """Return the text of the chapter at *content_url*.

        Runs of eight non-breaking spaces in the page text are turned into
        blank-line paragraph breaks.

        Raises:
            IndexError: If the page has no ``<div class="showtxt">``.
        """
        soup = self._fetch_soup(content_url)
        body = soup.find('div', class_='showtxt')
        if body is None:
            raise IndexError("no <div class='showtxt'> found at %s" % content_url)
        return body.text.replace('\xa0' * 8, '\n\n')

    def save_txt(self, name, path, text):
        """Append one chapter to the UTF-8 text file at *path*.

        Args:
            name: Chapter title, written on its own line.
            path: Output file; created if missing, appended to otherwise.
            text: Chapter body, either a string or an iterable of strings.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            if isinstance(text, str):
                f.write(text)
            else:
                f.writelines(text)
            f.write('\n\n')


# Script entry point: download every chapter of one novel into a single text file.
if __name__ == "__main__":
    Source_url = 'https://www.biqukan.com/'
    chapter_url = 'https://www.biqukan.com/25_25963/'
    down = txt_downloader(Source_url, chapter_url)
    down.Get_links()
    print('开始下载：')
    total = down.nums
    chapters = zip(down.chapter_name, down.chapter_URL)
    for done, (name, url) in enumerate(chapters, start=1):
        down.save_txt(name, '下载.txt', down.Get_content(url))
        # Scale the completed fraction to a percentage; `done` already counts
        # the chapter just saved, so the last iteration reports 100%.
        # The trailing '\r' redraws the same console line.
        sys.stdout.write("已经下载：%.3f%%" % (100 * done / total) + '\r')
        sys.stdout.flush()
    print("下载完成")

二、一个自动下载图片的爬虫脚本——动态网页

// A code block
// NOTE(review): this looks like an editor's sample snippet rather than the
// image-crawler script the heading above announces — confirm and replace.
var foo = 'bar';

本文地址：https://blog.csdn.net/weixin_43244470/article/details/108774632

基于python的文本与图片爬虫脚本

基于python爬虫的github-exploitdb漏洞库监控与下载

基于python的文本与图片爬虫脚本

基于python的文本与图片爬虫脚本