python-scrapy爬虫框架中meta参数的使用
程序员文章站
2022-03-03 07:59:23
...
meta是scrapy.Request的参数,在回调函数中通过response.meta读取
作用:在不同的函数之间传递数据,需要爬取二级界面时,在spider中使用很方便~
- meta参数的数据类型是字典,即meta={'key':value}
- 在scrapy.Request中传入的meta,要在回调函数中通过response对象读取,即response.meta
- value可以是任意类型的数据,以response.meta['key']形式调用
爬取王者荣耀英雄技能信息的爬虫实例
# -*- coding: utf-8 -*-
import scrapy
import re
from WZRY.items import WzryItem
class WzrySpider(scrapy.Spider):
    """Crawl the hero list page, then each hero's detail page.

    ``parse`` creates one partially filled ``WzryItem`` per hero and hands it
    to ``parse_detail`` through ``Request.meta``. ``parse_detail`` adds the
    skin names, skin image URLs and skill fields, then yields the item.
    """

    name = 'wzry'
    allowed_domains = ['pvp.qq.com']
    start_urls = ['https://pvp.qq.com/web201605/herolist.shtml']

    # Number of skill/skill-detail field pairs written to every item.
    _SKILL_SLOTS = 5
    # Placeholder stored in a skill field that has no scraped value.
    _MISSING = 'null'

    @staticmethod
    def _pad(values, size, filler):
        """Return the first *size* entries of *values*, padded with *filler*."""
        head = list(values[:size])
        return head + [filler] * (size - len(head))

    def parse(self, response):
        """Yield one detail-page request per hero link on the list page.

        The request's ``meta`` carries the partially filled item under
        ``'item'`` and the skin-image URL prefix under ``'image_prefix'``.
        Links without a text node or without a recognisable image name are
        skipped with a warning instead of aborting the whole callback.
        """
        # In XPath, `//` matches descendants at any depth, while a single `/`
        # matches direct children only.
        hero_list = response.xpath("//ul[@class='herolist clearfix']//a")
        print('一共有'+str(len(hero_list))+'个英雄')
        for hero in hero_list:
            hero_name = hero.xpath("./text()").extract_first()
            img_src = hero.xpath("./img/@src").extract_first()
            # The id is taken as the digits right before the file extension
            # (the original code sliced a fixed 3 characters before ".jpg").
            id_match = re.search(r'(\d+)\.\w+$', img_src or '')
            if hero_name is None or id_match is None:
                self.logger.warning(
                    "Skipping hero link with missing name or image: %r",
                    img_src)
                continue
            number = id_match.group(1)

            item = WzryItem()
            item['hero'] = hero_name
            image_prefix = "https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{0}/{0}-bigskin-".format(number)
            url = "https://pvp.qq.com/web201605/herodetail/"+number+".shtml"
            yield scrapy.Request(
                url=url,
                callback=self.parse_detail,
                meta={'item': item, 'image_prefix': image_prefix},
            )

    def parse_detail(self, response):
        """Complete the item received via ``response.meta`` and yield it.

        Fills ``skins``, ``image_urls`` (one URL per skin, numbered from 1)
        and ``skill1``..``skill5`` with their ``*_detail`` counterparts.
        Skill slots without scraped data are set to ``'null'``.
        """
        item = response.meta["item"]
        image_prefix = response.meta["image_prefix"]

        sample = r'[\u4E00-\u9FA5●]+'
        # A missing attribute yields an empty string, hence no skins.
        skins = response.xpath(
            ".//ul[@class='pic-pf-list pic-pf-list3']/@data-imgname"
        ).extract_first(default='')
        item['skins'] = re.findall(sample, skins)
        item['image_urls'] = [
            image_prefix+str(i)+'.jpg'
            for i in range(1, len(item['skins'])+1)
        ]

        skills = response.xpath(".//p[@class='skill-name']/b/text()").extract()
        skills_detail = response.xpath(".//p[@class='skill-desc']/text()").extract()
        names = self._pad(skills, self._SKILL_SLOTS, self._MISSING)
        # A detail is only kept when a skill name exists for the same slot,
        # so a hero with four skills still gets 'null' for skill5_detail.
        details = self._pad(
            skills_detail[:len(skills)], self._SKILL_SLOTS, self._MISSING)
        for slot, (skill, detail) in enumerate(zip(names, details), start=1):
            item['skill%d' % slot] = skill
            item['skill%d_detail' % slot] = detail

        print(item['hero']+"爬取成功!")
        yield item