spider----爬取多层多页书籍
程序员文章站
2022-05-08 10:57:03
...
代码如下:
import os
import urllib.request
import time
from bs4 import BeautifulSoup
def get_request(url):
    """Build an ``urllib.request.Request`` for *url* that carries a
    desktop-browser User-Agent header (some sites block the default
    Python UA)."""
    ua = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36')
    return urllib.request.Request(url=url, headers={'User-Agent': ua})
def soup_content(content):
    """Parse an HTML string into a BeautifulSoup tree with the lxml parser."""
    return BeautifulSoup(content, 'lxml')
def url_content(request):
    """Open *request* and return the response body decoded as UTF-8.

    Fix: the original never closed the HTTP response object; the
    ``with`` block guarantees it is closed even if ``read``/``decode``
    raises.
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf8')
def _save_article(soup, dirname, mode='w'):
    """Write one article page's paragraphs to <dirname>/<title>.

    *soup* is a parsed article page; the title comes from ``.boxcon > .wzbt``
    and the body paragraphs from ``.zw > p``.  *mode* is ``'w'`` for the
    first page and ``'a'`` for follow-up pages of a multi-page article, so
    later pages append instead of clobbering the file when every page
    carries the same title (the original always used ``'w'``).
    """
    filename = soup.select('.boxcon > .wzbt')[0].text
    filepath = os.path.join(dirname, filename)
    print('正在下载%s......' % filename)
    # `with` closes the file even if selecting/writing raises
    # (the original leaked the handle on any exception).
    with open(filepath, mode, encoding='utf8') as fp:
        for para in soup.select('.zw > p'):
            fp.write(para.text + '\n')
    print('下载完成...')
    time.sleep(2)  # throttle requests to be polite to the server


def main():
    """Crawl flower-language categories on m.feiyanqing.com and save every
    article (handling multi-page articles) as a text file in a directory
    named interactively by the user, one directory per category."""
    url = 'https://m.feiyanqing.com/huahuo/'
    soup = soup_content(url_content(get_request(url)))
    # Level 1: category blocks on the index page.
    for category in soup.select('ul[class="picli fix"]'):
        # Level 2: the category page listing individual articles.
        cat_url = 'http://m.feiyanqing.com' + category.select('a')[0].attrs['href']
        cat_soup = soup_content(url_content(get_request(cat_url)))
        article_links = cat_soup.select('h3 > a')
        dirname = input('输入文件夹名称: ')
        # makedirs(exist_ok=True) avoids the exists()/mkdir() race of the
        # original and creates intermediate directories if needed.
        os.makedirs(dirname, exist_ok=True)
        for link in article_links:
            # Level 3: the article page itself.
            article_url = 'http://m.feiyanqing.com' + link.attrs['href']
            article_soup = soup_content(url_content(get_request(article_url)))
            pagelink = article_soup.select('.zzt > li > b')
            if not pagelink:
                # Single-page article.
                _save_article(article_soup, dirname)
            elif len(pagelink) >= 2:
                # Multi-page article: the first <b> holds the page count.
                # (len(pagelink) == 1 is skipped, matching the original.)
                num = int(pagelink[0].text)
                for page in range(1, num + 1):
                    if page == 1:
                        page_soup = article_soup
                    else:
                        # Page N URL: insert "_N" before the extension,
                        # e.g. foo.html -> foo_2.html.
                        page_url = 'http://m.feiyanqing.com' + str(
                            link.attrs['href']).replace('.', '_%s.' % page)
                        page_soup = soup_content(url_content(get_request(page_url)))
                    _save_article(page_soup, dirname,
                                  mode='w' if page == 1 else 'a')


if __name__ == '__main__':
    main()
上一篇: scrapy爬取网易云音乐
下一篇: yml文件使用