Scraping all of doutula's memes with coroutines
readme.txt
- Runtime environment: Python 3.x or later
- Libraries used: requests, re, urllib, queue, gevent, os, lxml
- Run from a terminal: python3 doutula_spider.py 10  # 10 means scrape ten pages of memes
doutula_spider.py
import sys
import os
import re

from gevent import monkey
monkey.patch_all()  # patch the standard library first so requests and Queue become cooperative

import gevent
import requests
from lxml import etree
from queue import Queue
from urllib import request

gQueue = Queue(20)       # page URLs waiting to be downloaded
gNext_queue = Queue(20)  # page URLs waiting to be paginated from
gQueue.put("http://www.doutula.com/photo/list/")
gNext_queue.put("http://www.doutula.com/photo/list/")

HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
}


def parse_url():
    """Consumer: take a list-page URL off gQueue and download every meme on it."""
    while True:
        url = gQueue.get()  # blocks until the producer supplies a URL
        response = requests.get(url, headers=HEADERS)  # fetch the page
        html = etree.HTML(response.text)
        # each meme sits inside one of these <a> tags
        a_list = html.xpath("//div[@class='page-content text-center']"
                            "//a[@class='col-xs-6 col-sm-3']")
        for a in a_list:
            img_url = a.xpath(".//@data-original")[0]  # image URL
            title = a.xpath(".//@alt")[0]              # caption, used as the file name
            # strip characters that are not valid in file names
            title = re.sub(r"[??!! ’“、]", "", title)
            suffix = os.path.splitext(img_url)[1]  # ".gif" or ".jpg!dta"
            suffix = re.sub(r"!dta", "", suffix)   # drop the "!dta" tail from the extension
            filename = title + suffix
            request.urlretrieve(img_url, "images/" + filename)  # download into images/
        if re.findall(r"page=(\d+)", url):
            current_number = int(re.findall(r"page=(\d+)", url)[0])
        else:
            current_number = 1  # the start URL carries no page parameter
        print("****** page %d finished downloading ******" % current_number)


def get_next_url(spider_number):
    """Producer: follow the pagination links and feed page URLs into both queues."""
    while True:
        start_url = gNext_queue.get()
        print(start_url)
        response = requests.get(start_url, headers=HEADERS)
        html = etree.HTML(response.text)
        next_url = html.xpath("//a[@rel='next']/@href")[0]  # relative next-page link
        next_url = "http://www.doutula.com" + next_url      # make it absolute
        current_number = int(re.findall(r"page=(\d+)", next_url)[0])  # current page number
        gQueue.put(next_url)       # hand the page to the downloaders
        gNext_queue.put(next_url)  # and keep it for the next pagination step
        if current_number == spider_number:  # reached the requested page count, stop
            print(next_url)
            return


def main():
    if len(sys.argv) != 2:  # the script expects exactly one argument
        print("*" * 30)
        print("Please run the script with an argument, e.g.:")
        print("python3 doutula_spider.py 10  # 10 means scrape ten pages")
        print("*" * 30)
        answer = input("Type yes to download only the first two pages, anything else to quit\n")
        if answer == "yes":
            spider_number = 2
        else:
            return
    else:
        try:
            spider_number = int(sys.argv[1])  # number of pages to scrape
        except ValueError as e:
            print("Please pass a valid number of pages: %s" % e)
            return
    os.makedirs("images", exist_ok=True)  # urlretrieve needs the target directory to exist
    gevent.joinall([
        gevent.spawn(get_next_url, spider_number),  # one pagination producer is enough
        gevent.spawn(parse_url),
        gevent.spawn(parse_url),  # four parse_url coroutines download in parallel
        gevent.spawn(parse_url),
        gevent.spawn(parse_url),
    ], timeout=30)


if __name__ == '__main__':
    main()
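The concurrency pattern the spider relies on is just one producer greenlet feeding a queue that several consumer greenlets drain. Below is a minimal, self-contained sketch of that pattern; the producer/consumer names, worker labels, and the fixed page count of 5 are stand-ins for illustration, not part of the original script.

from gevent import monkey
monkey.patch_all()  # as in the spider, patch before using the stdlib Queue

import gevent
from queue import Queue

task_queue = Queue(20)

def producer(pages):
    # stands in for get_next_url(): feeds page URLs into the queue
    for n in range(1, pages + 1):
        task_queue.put("http://www.doutula.com/photo/list/?page=%d" % n)

def consumer(name):
    # stands in for parse_url(): pulls URLs and pretends to process them
    while True:
        url = task_queue.get()  # a blocking get yields control to other greenlets
        print("%s handling %s" % (name, url))
        gevent.sleep(0)  # give the other workers a turn

gevent.joinall([
    gevent.spawn(producer, 5),
    gevent.spawn(consumer, "worker-1"),
    gevent.spawn(consumer, "worker-2"),
], timeout=3)  # the timeout ends the run once the queue has drained

Because monkey.patch_all() replaces the locks inside queue.Queue, a blocking get() suspends only the calling greenlet instead of the whole process. That is what lets the four parse_url() workers in the spider share one download loop without threads.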