Scraping LPL Spring Split Team Information with Scrapy (Asynchronously Loaded Data)
· Python 3.6.3
· Chrome 64.0.3282.167 (Official Build) (64-bit)
· PyCharm Community Edition 2017.3.3
· Scrapy 1.4.0
I enjoy playing League of Legends and following its professional scene. Having picked up some web-scraping basics, I wanted to try grabbing and saving the information of every team in the 2018 LPL Spring Split. So I went to the official site ('http://lpl.qq.com/'), opened the teams page, and wrote a scraper with requests and BeautifulSoup, but it came back with nothing.
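For reference, the failed attempt looked roughly like this (a minimal sketch; the CSS selector is a made-up placeholder, and the point is simply that the returned HTML contains no team data):

import requests
from bs4 import BeautifulSoup

# First attempt: request the page and look for team names in the raw HTML.
# '.team-name' is a hypothetical selector; whatever we query, the static HTML
# holds no team information, so the result comes back empty.
html = requests.get('http://lpl.qq.com/').text
soup = BeautifulSoup(html, 'html.parser')
teams = [tag.get_text(strip=True) for tag in soup.select('.team-name')]
print(teams)  # prints [], because the data is loaded asynchronously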
Teams of the 2018 LPL Spring Split
Right-clicking to view the page source confirms it: the HTML contains no team information at all, only navigation links.
This means the page is not static, so the information cannot be obtained simply by parsing the HTML structure.
Right-click 'Inspect', open the 'Network' panel, filter by 'XHR', and refresh with F5: none of the team information we want shows up.
Switching to the 'JS' filter, however, one of the files, 'clublist.js', contains the information for every team.
Since there are only a handful of teams, there is no need to scrape this file with code; I simply copied the values and rewrote them as a dictionary.
TeamIDs = {
    'BLG': 57, 'EDG': 1, 'FPX': 7,
    'IG': 2, 'JDG': 29, 'LGD': 4,
    'OMG': 6, 'RNG': 8, 'RW': 422,
    'SNG': 41, 'Snake': 9, 'TOP': 42,
    'VG': 11, 'WE': 12
}
With the team IDs in hand, the next step is each team's detailed information. Open any team's detail page, for example 'BLG'.
The page URL has this form: 'http://lpl.qq.com/es/team_detail.shtml?tid=57'. The 'tid=57' is exactly the ID of 'BLG' in the dictionary we just built, so changing this value lets us reach every team's detail page.
To find the detailed data, again right-click 'Inspect', open 'Network', filter by 'XHR' and refresh with F5; the first file in the list is the one we want.
Its URL looks like this: 'http://lpl.qq.com/web201612/data/LOL_MATCH2_TEAM_TEAM57_INFO.js'. It also contains '57', so rewriting the team ID in it gives us the detail data of any other team.
Under the 'Preview' tab we find the team introduction and detailed information about its players, stored as JSON.
One parameter in it, 'MemberId', is especially important: it is what we need to build each player's detail URL.
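Before writing the spider it is easy to sanity-check this data file with a few lines of requests (a minimal sketch; the keys named in the comments are the ones the spider below relies on):

import json
import requests

# Quick check of the team data file for BLG (TeamId 57).
url = 'http://lpl.qq.com/web201612/data/LOL_MATCH2_TEAM_TEAM57_INFO.js'
data = json.loads(requests.get(url).text)
print(list(data['msg'].keys()))  # expect keys such as 'baseInfo' and 'activePlayers'
print([p.get('MemberId') for p in data['msg']['activePlayers']])  # MemberId of each active player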
This page actually holds one more valuable piece of information, the "team stats" (wins, losses, kills, barons, dragons), hidden in another js file. Unfortunately the tournament names in it are stored as numeric codes rather than strings, and I could not work out the mapping, so I skipped that part.
Next comes the players' detailed information. Pick any player, open his detail page, and locate the data the same way. (The information that is hard to decode is again skipped for now.)
The player data URL looks like this: 'http://lpl.qq.com/web201612/data/LOL_MATCH2_TEAM_MEMBER7_INFO.js'. The 'MEMBER7' part carries the ID, so replacing the number with each player's 'MemberId' gives us that player's detailed information.
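The same quick check works for a player file by formatting the ID into the URL template (a sketch; MemberId 7 is simply the example taken from the URL above):

import json
import requests

member_url = 'http://lpl.qq.com/web201612/data/LOL_MATCH2_TEAM_MEMBER{MemberId}_INFO.js'
data = json.loads(requests.get(member_url.format(MemberId=7)).text)
print(list(data['msg'].keys()))  # expect keys such as 'baseInfo', 'favoriteHeros' and 'playerAwards'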
With the pages figured out, the next step is writing the code.
Create a new project: scrapy startproject lol
Create a spider: scrapy genspider lpl lpl.qq.com
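These two commands produce the usual Scrapy project layout (abridged below); the spider code goes into the generated lol/spiders/lpl.py, the Items into lol/items.py, the pipeline into lol/pipelines.py, and the settings into lol/settings.py:

lol/
    scrapy.cfg
    lol/
        items.py
        pipelines.py
        settings.py
        spiders/
            lpl.py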
# Write the spider
# -*- coding: utf-8 -*-
from scrapy import Spider, Request
from lol.items import ActivePlayersItem, TeamBaseInfoItem, BaseInfoItem, PlayerAwardsItem, FavoriteHerosItem
import json


class LplSpider(Spider):
    name = 'lpl'
    allowed_domains = ['lpl.qq.com']
    start_urls = ['http://lpl.qq.com/']
    team_url = 'http://lpl.qq.com/web201612/data/LOL_MATCH2_TEAM_TEAM{TeamId}_INFO.js'  # URL template for a team's detail data
    member_url = 'http://lpl.qq.com/web201612/data/LOL_MATCH2_TEAM_MEMBER{MemberId}_INFO.js'  # URL template for a player's detail data
    TeamIDs = {  # team IDs copied from clublist.js
        'BLG': 57, 'EDG': 1, 'FPX': 7,
        'IG': 2, 'JDG': 29, 'LGD': 4,
        'OMG': 6, 'RNG': 8, 'RW': 422,
        'SNG': 41, 'Snake': 9, 'TOP': 42,
        'VG': 11, 'WE': 12
    }

    def start_requests(self):  # build each team's full detail URL and hand it to parse_team
        for k, TeamID in self.TeamIDs.items():
            yield Request(self.team_url.format(TeamId=TeamID), self.parse_team)

    def parse_team(self, response):  # parse the team information into Items
        datas = json.loads(response.text)
        item1 = ActivePlayersItem()
        item2 = TeamBaseInfoItem()
        if 'msg' in datas.keys():
            data2 = datas['msg']['baseInfo']
            item2['TeamDesc'] = data2.get('TeamDesc')
            item2['TeamEnName'] = data2.get('TeamEnName')
            item2['TeamId'] = data2.get('TeamId')
            item2['TeamLogo'] = data2.get('TeamLogo')
            item2['TeamName'] = data2.get('TeamName')
            yield item2
            for data1 in datas['msg']['activePlayers']:
                item1['GameName'] = data1.get('GameName')
                item1['MemberId'] = data1.get('MemberId')
                item1['NickName'] = data1.get('NickName')
                item1['Place'] = data1.get('Place')
                item1['RealName'] = data1.get('RealName')
                item1['UserIcon'] = data1.get('UserIcon')
                yield item1
                yield Request(self.member_url.format(MemberId=data1.get('MemberId')), self.parse_member)  # build the player detail URL; callback is parse_member

    def parse_member(self, response):  # parse the player information into Items
        results = json.loads(response.text)
        item3 = BaseInfoItem()
        item4 = FavoriteHerosItem()
        item5 = PlayerAwardsItem()
        if 'msg' in results.keys():
            data3 = results['msg']['baseInfo']
            item3['EnName'] = data3.get('EnName')
            item3['GameDate'] = data3.get('GameDate')
            item3['GameHero'] = data3.get('GameHero')
            item3['GameName'] = data3.get('GameName')
            item3['GamePlace'] = data3.get('GamePlace')
            item3['MemberDesc'] = data3.get('MemberDesc')
            item3['MemberId'] = data3.get('MemberId')
            item3['NickName'] = data3.get('NickName')
            item3['RealName'] = data3.get('RealName')
            item3['TeamId'] = data3.get('TeamId')
            item3['TeamName'] = data3.get('TeamName')
            item3['UserIcon'] = data3.get('UserIcon')
            yield item3
            for data4 in results['msg']['favoriteHeros']:
                item4['HeroId'] = data4.get('HeroId')
                item4['UseNum'] = data4.get('UseNum')
                item4['WinNum'] = data4.get('WinNum')
                item4['sUpdated'] = data4.get('sUpdated')
                yield item4
            try:  # career honours; some players are rookies and may not have any
                for data5 in results['msg']['playerAwards']:
                    item5['AwardDesc'] = data5.get('AwardDesc')
                    item5['RankName'] = data5.get('RankName')
                    item5['sGameName'] = data5.get('sGameName')
                    yield item5
            except TypeError:
                return None
# Write the Items
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy import Item, Field


class ActivePlayersItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    GameName = Field()
    GamePlace = Field()
    MemberId = Field()
    NickName = Field()
    Place = Field()
    RealName = Field()
    UserIcon = Field()


class TeamBaseInfoItem(Item):
    TeamDesc = Field()
    TeamEnName = Field()
    TeamId = Field()
    TeamLogo = Field()
    TeamName = Field()


class BaseInfoItem(Item):
    EnName = Field()
    GameDate = Field()
    GameHero = Field()
    GameName = Field()
    GamePlace = Field()
    MemberDesc = Field()
    MemberId = Field()
    NickName = Field()
    RealName = Field()
    TeamId = Field()
    TeamName = Field()
    UserIcon = Field()


class FavoriteHerosItem(Item):
    HeroId = Field()
    UseNum = Field()
    WinNum = Field()
    sUpdated = Field()


class PlayerAwardsItem(Item):
    AwardDesc = Field()
    RankName = Field()
    sGameName = Field()
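Scrapy Items behave like dictionaries, which is what lets the pipeline below call dict(item) before inserting a document into MongoDB. A tiny illustration with made-up values:

item = TeamBaseInfoItem()
item['TeamEnName'] = 'BLG'  # hypothetical values, just to show the dict conversion
item['TeamId'] = 57
print(dict(item))  # {'TeamEnName': 'BLG', 'TeamId': 57}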
# Write the Item Pipeline
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class MongoPipeline(object):
    collection_name = 'lpl'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read the MongoDB connection settings from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(dict(item))
        return item
# Settings
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Referer': 'http://lpl.qq.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
    'Host': 'lpl.qq.com'
}
ROBOTSTXT_OBEY = False
# Key names must match what MongoPipeline.from_crawler reads
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB = 'lpl'
ITEM_PIPELINES = {
    'lol.pipelines.MongoPipeline': 300,
}
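Finally, with a local MongoDB instance running, start the crawl from the project directory (standard Scrapy usage; the -o option is only needed if you also want a JSON copy of the items):

scrapy crawl lpl
scrapy crawl lpl -o lpl.json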