
spider----crawling multi-level, multi-page books


The spider below walks three levels of the target site: the category index page, each category's article-list page, and the individual article pages. Every article is saved to disk under a user-named folder, and articles that span several pages are fetched page by page. The code is as follows:

import os
import urllib.request
import time
from bs4 import BeautifulSoup


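# Wrap the URL in a Request carrying a browser User-Agent header so the site serves the page normally.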
def get_request(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request


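# Parse an HTML string into a BeautifulSoup tree with the lxml parser.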
def soup_content(content):
    soup = BeautifulSoup(content, 'lxml')
    return soup


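# Download the page and decode the response body as UTF-8.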
def url_content(request):
    content = urllib.request.urlopen(request).read().decode('utf8')
    return content


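# Crawl three levels: category index -> per-category article list -> article pages.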
def main():
    url = 'https://m.feiyanqing.com/huahuo/'
    request = get_request(url)
    content = url_content(request)
    soup = soup_content(content)

    # Level 1: each <ul class="picli fix"> block on the index page is one category.
    ret = soup.select('ul[class="picli fix"]')

    for i in ret:
        # Level 2: follow the category's first link to its article-list page.
        url = 'http://m.feiyanqing.com' + i.select('a')[0].attrs['href']
        request = get_request(url)
        content = url_content(request)
        soup = soup_content(content)
        # Each <h3><a> on the list page links to one article.
        result = soup.select('h3 > a')
        # Ask once per category for a folder to hold its articles.
        dirname = input('Enter a folder name: ')
        for http in result:
            # Level 3: fetch the article page itself.
            url = 'http://m.feiyanqing.com' + http.attrs['href']
            request = get_request(url)
            content = url_content(request)
            soup = soup_content(content)
            # '.zzt > li > b' carries the pagination info; empty means a single-page article.
            pagelink = soup.select('.zzt > li > b')
            if len(pagelink) == 0:
                # The article title doubles as the file name.
                filename = soup.select('.boxcon > .wzbt')[0].text
                # Create the output directory on first use.
                if not os.path.exists(dirname):
                    os.mkdir(dirname)
                filepath = os.path.join(dirname, filename)
                print('Downloading %s......' % filename)
                with open(filepath, 'w', encoding='utf8') as fp:
                    for text1 in soup.select('.zw > p'):
                        fp.write(text1.text + '\n')
                # Pause between requests to be polite to the server.
                time.sleep(2)
                print('Download finished...')
            elif len(pagelink) >= 2:
                # The first <b> holds the total page count of the article.
                num = int(pagelink[0].text)
                for page in range(1, num + 1):
                    # Page 1 keeps the original URL; later pages insert _N before the
                    # extension, e.g. /huahuo/xxx.html -> /huahuo/xxx_2.html.
                    if page > 1:
                        url = 'http://m.feiyanqing.com' + str(http.attrs['href']).replace('.', '_%s.' % page)
                    request = get_request(url)
                    content = url_content(request)
                    soup = soup_content(content)
                    filename = soup.select('.boxcon > .wzbt')[0].text
                    if not os.path.exists(dirname):
                        os.mkdir(dirname)
                    filepath = os.path.join(dirname, filename)
                    print('Downloading %s......' % filename)
                    # Pages of one article typically share a title, so append after
                    # page 1 instead of overwriting the pages already written.
                    mode = 'w' if page == 1 else 'a'
                    with open(filepath, mode, encoding='utf8') as fp:
                        for text1 in soup.select('.zw > p'):
                            fp.write(text1.text + '\n')
                    print('Download finished...')
                    time.sleep(2)


if __name__ == '__main__':
    main()
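One caveat: the scraped title goes straight into os.path.join as the file name, so a title containing a character such as / or : would fail to open or land in an unintended subdirectory. A minimal sanitizing sketch (the sanitize_filename helper and its character set are an illustration, not part of the original script):

import re


def sanitize_filename(title):
    # Replace characters that common filesystems reject in file names (illustrative set).
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

Calling filename = sanitize_filename(filename) before building filepath would keep a stray slash in a title from being read as a directory separator.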
