scrapy抓斗鱼主播的图片
程序员文章站
2022-04-28 08:36:26
...
1.该项目通过此网站获取信息
http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0
打开是这样子的(如果显示乱码,请安装Chrome插件JSONView)。我们只取 nickname(昵称)和 vertical_src(主播照片)这两个字段的信息
2.创建项目:命令行输入
scrapy startproject Douyu
3.这时候可以用IDE工具打开项目了,然后同样是在命令行终端下,跳到spiders目录下,用命令创建爬虫文件douyu.py
scrapy genspider douyu "douyucdn.cn"
4.编写items.py文件
import scrapy
class DouyuItem(scrapy.Item):
    """Item carrying the two values scraped for each streamer."""

    # Streamer nickname.
    nickname = scrapy.Field()
    # URL of the streamer's portrait image.
    imageLink = scrapy.Field()
5.编写douyu.py文件
from Douyu.items import DouyuItem
import scrapy
import json
class DouyuSpider(scrapy.Spider):
    """Walk the vertical-room JSON API page by page, yielding one item per streamer."""

    name = 'douyu'
    allowed_domains = ['douyucdn.cn']
    baseURL = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    # Paging offset; starts at 0 and is advanced by 20 after every page.
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Parse one API page.

        Yields a ``DouyuItem`` for every entry of the JSON ``data`` list and
        then a request for the next page. Crawling stops when ``data`` is
        missing, empty, or not a list.
        """
        data_list = json.loads(response.body).get('data')
        # An empty page means we have paged past the last result; a non-list
        # payload cannot be iterated as room entries, so stop in both cases.
        if not data_list or not isinstance(data_list, list):
            return

        for data in data_list:
            item = DouyuItem()
            # "nickname" is the streamer's name, "vertical_src" the portrait URL.
            item['nickname'] = data["nickname"]
            item['imageLink'] = data["vertical_src"]
            # Hand the item over to the item pipeline.
            yield item

        # Request the next page once per page (not once per item).
        self.offset += 20
        yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)
6.改写pipelines.py文件
import os
import scrapy
# 导入settings中的IMAGES_STORE值,然后再取别名images_store
from Douyu.settings import IMAGES_STORE as images_store
from scrapy.pipelines.images import ImagesPipeline
# ImagesPipeline处理图片的类
class DouyuPipeline(ImagesPipeline):
    """Download each item's image and rename the stored file after the nickname."""

    def get_media_requests(self, item, info):
        """Yield the download request for the item's ``imageLink`` URL."""
        yield scrapy.Request(item['imageLink'])

    def item_completed(self, results, item, info):
        """Rename the downloaded image to ``full/<nickname>.jpg`` and return the item.

        ``results`` is a list of ``(ok, data)`` pairs, for example::

            [(True, {'url': 'https://rpic.douyucdn.cn/asrpic/180801/5238090_2206.jpg',
                     'path': 'full/36988844e412f3974ec7837a915ca55d404b65e4.jpg',
                     'checksum': 'a0db090bcd4b383ec995c6a9a8af5623'})]

        Raises:
            DropItem: if no image was downloaded successfully for this item.
        """
        # Local import keeps this class self-contained; scrapy is already a
        # dependency of this module.
        from scrapy.exceptions import DropItem

        # Keep only the storage paths of the downloads that succeeded.
        image_paths = [data["path"] for ok, data in results if ok]
        if not image_paths:
            raise DropItem("no image downloaded for %r" % item['nickname'])

        # A nickname containing a path separator would otherwise point
        # outside of the "full" directory.
        safe_name = item['nickname'].replace('/', '_').replace('\\', '_')
        source = os.path.join(images_store, image_paths[0])
        target = os.path.join(images_store, 'full', safe_name + '.jpg')
        # os.replace overwrites an existing target on every platform.
        os.replace(source, target)

        # The item must be returned so later pipeline stages receive it.
        return item
7.修改settings.py文件
import os
# Project name and the package that holds (and receives newly generated) spiders.
BOT_NAME = 'Douyu'
NEWSPIDER_MODULE = 'Douyu.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Absolute path of the directory containing this settings file.
project_dir = os.path.abspath(os.path.dirname(__file__))
# Images are stored in an "image" directory below it.
IMAGES_STORE = os.path.join(project_dir, 'image')

# Identify as a mobile client on every request.
USER_AGENT = (
    'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) '
    'AppleWebKit/603.3.8 (KHTML, like Gecko) '
    'Mobile/14G60 MicroMessenger/6.5.18 NetType/WIFI Language/en'
)

# robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False
同时在settings.py中取消注释并启用ITEM_PIPELINES:
# Enable the image pipeline; 300 is the value registered for it.
ITEM_PIPELINES = {'Douyu.pipelines.DouyuPipeline': 300}