多线程爬取某网站表情包
程序员文章站
2022-03-26 14:55:24
...
import concurrent.futures  # thread-pool module
import os
import re

import parsel
import requests
def change_title(title):
    """Return *title* with file-name-unsafe characters replaced by underscores.

    The characters replaced are: backslash, slash, colon, asterisk,
    question mark, double quote, angle brackets and pipe.

    Args:
        title: The raw title string.

    Returns:
        The title with every such character replaced by ``_``.
    """
    mode = re.compile(r'[\\/:*?"<>|]')
    return mode.sub("_", title)
def get_response(html_url, timeout=10):
    """Send a GET request to *html_url* and return the response.

    Args:
        html_url: URL to fetch.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without one a stalled
            connection would block the calling thread indefinitely.

    Returns:
        The ``requests.Response`` object. The HTTP status is not checked
        here; callers decide how to treat error responses.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        # NOTE(review): 'xxx' looks like a placeholder value -- confirm
        # whether a real browser User-Agent is required by the site.
        'User-Agent': 'xxx',
    }
    return requests.get(url=html_url, headers=headers, timeout=timeout)
def save(title, img_url, name):
    """Download one image and write it to ``img/<title>.<name>``.

    This is best effort: download and file-system failures are reported
    on stdout and the image is skipped, so one bad image does not stop
    the caller's loop.

    Args:
        title: File name stem (already sanitised for the file system).
        img_url: URL of the image to download.
        name: File extension without the leading dot.
    """
    try:
        response = get_response(img_url)
    except requests.RequestException as err:
        print('下载失败', img_url, err)
        return
    if not response.ok:
        # Do not write an HTTP error page to disk as if it were an image.
        print('下载失败', img_url, response.status_code)
        return

    path = os.path.join('img', title + '.' + name)
    try:
        # The output directory is not guaranteed to exist on first run.
        os.makedirs('img', exist_ok=True)
        with open(path, mode='wb') as f:
            f.write(response.content)
    except OSError as err:
        print('保存失败', path, err)
        return
    print('正在保存...', title)
def main(html_url):
    """Download every image listed on one page.

    Fetches *html_url*, selects each ``div.tagbqppdiv`` under
    ``#container``, and for every ``img`` inside reads the image URL from
    its ``data-original`` attribute and the title from its ``title``
    attribute, then saves the image under the sanitised title.

    Args:
        html_url: URL of the listing page to process.
    """
    html_data = get_response(html_url).text
    selector = parsel.Selector(html_data)
    divs = selector.css('#container div.tagbqppdiv')
    for div in divs:
        img_url = div.css('img::attr(data-original)').get()
        if not img_url:
            # .get() returns None when the attribute is absent; nothing
            # to download for this entry.
            continue
        title = div.css('img::attr(title)').get()
        if not title:
            # Fall back to the file name in the URL so the image is
            # still saved under a usable name.
            title = img_url.rsplit('/', 1)[-1].rsplit('.', 1)[0]
        # The extension is whatever follows the last dot of the URL.
        name = img_url.split('.')[-1]
        save(change_title(title), img_url, name)
if __name__ == '__main__':
    # Process listing pages 1-20 concurrently on a pool of 5 threads.
    # The ``with`` block waits for all submitted work before exiting.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as exe:
        futures = {}
        for page in range(1, 21):
            url = f'http://fabiaoqing.com/biaoqing/lists/page/{page}.html'
            futures[exe.submit(main, url)] = url
        # An exception raised inside a worker is stored on its future and
        # would otherwise go unnoticed; report each failed page.
        for future in concurrent.futures.as_completed(futures):
            err = future.exception()
            if err is not None:
                print('页面抓取失败', futures[future], err)
本文代码学习自 B 站视频,链接:https://www.bilibili.com/video/BV1dh411Q7Xo?share_source=copy_web