Scrapy crawler: scraping job listings from Lagou.com
1. Writing the Item
import scrapy

class LagouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    positionId = scrapy.Field()         # position ID, used as the dedup key when writing to the database
    city = scrapy.Field()               # city
    positionName = scrapy.Field()       # job title
    salary = scrapy.Field()             # salary
    workYear = scrapy.Field()           # experience required
    education = scrapy.Field()          # education level
    businessZones = scrapy.Field()      # work location (e.g. 科技园), a single-element list
    companyShortName = scrapy.Field()   # company short name
    companyFullName = scrapy.Field()    # company full name
    companySize = scrapy.Field()        # company headcount
    industryField = scrapy.Field()      # company industry
    positionAdvantage = scrapy.Field()  # position highlights (one sentence)
    createTime = scrapy.Field()         # posting time
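For reference, a populated item behaves like a dict, which is exactly what the pipelines below rely on. A minimal sketch with made-up values:

item = LagouItem()
item['positionId'] = 12345678  # made-up value
item['city'] = '深圳'
item['positionName'] = 'Python开发工程师'
print(dict(item))  # {'positionId': 12345678, 'city': '深圳', 'positionName': 'Python开发工程师'}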
2. Writing the Spider
Watch out for Lagou's anti-scraping measures: the request headers must carry a cookie, which can be obtained without logging in (without it, requests get blocked as too frequent after only four or five pages). You also need to disable Scrapy's built-in cookie handling in settings.
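For reference, one way to obtain such a cookie without logging in is to hit the search list page once with requests and reuse the anonymous session cookies. This is a sketch, not part of the original spider; the URL matches the Referer used below:

import requests

list_url = 'https://www.lagou.com/jobs/list_python?city=%E6%B7%B1%E5%9C%B3'
session = requests.Session()
session.get(list_url, headers={'User-Agent': 'Mozilla/5.0'})
# Join the cookie jar into the single header string the spider expects
cookie = '; '.join('%s=%s' % (k, v) for k, v in session.cookies.items())
print(cookie)  # paste this into the spider's cookie variable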
import scrapy
from Lagou.items import LagouItem
import json, time, random

class LagouspiderSpider(scrapy.Spider):
    name = "lagouspider"
    allowed_domains = ["www.lagou.com"]
    url = 'https://www.lagou.com/jobs/positionAjax.json?'  # city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false
    page = 1
    allpage = 0
    cookie = ''  # paste the cookie string obtained without logging in here
    headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
               'Referer': 'https://www.lagou.com/jobs/list_python?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=',
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
               'cookie': cookie}

    def start_requests(self):
        yield scrapy.FormRequest(self.url, headers=self.headers, formdata={
            'first': 'true', 'pn': str(self.page), 'kd': 'python', 'city': '深圳'}, callback=self.parse)

    def parse(self, response):
        # print(response.text)
        item = LagouItem()
        data = json.loads(response.text)
        totalCount = data['content']['positionResult']['totalCount']  # total number of positions
        resultSize = data['content']['positionResult']['resultSize']  # positions per page
        result = data['content']['positionResult']['result']          # a list of 15 position dicts
        for each in result:
            for field in item.fields:
                if field in each.keys():
                    item[field] = each.get(field)
            yield item
        time.sleep(random.randint(5, 10))  # crude throttling; see the settings note below
        if int(resultSize):
            self.allpage = int(totalCount) // int(resultSize) + 1
            if self.page < self.allpage:
                self.page += 1
                yield scrapy.FormRequest(self.url, headers=self.headers, formdata={
                    'first': 'false', 'pn': str(self.page), 'kd': 'python', 'city': '深圳'}, callback=self.parse)
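To make the parsing logic easier to follow, this is the rough shape of the JSON returned by positionAjax.json (the values here are illustrative, not real data):

{
    "content": {
        "positionResult": {
            "totalCount": 450,   # total matching positions
            "resultSize": 15,    # positions per page
            "result": [          # keys of each dict match the Item fields
                {"positionId": 12345678, "city": "深圳", "positionName": "...", "salary": "15k-25k", ...},
                ...
            ]
        }
    }
}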
3. Writing the Pipelines
import json
import pymongo
from scrapy.utils.project import get_project_settings  # scrapy.conf was removed in newer Scrapy versions

class JsonPipeline(object):
    def __init__(self):
        self.file = open('job.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()

class MongoPipeline(object):
    def __init__(self):
        settings = get_project_settings()
        mongo_uri = settings.get('MONGO_URI')  # localhost:27017
        mongo_db = settings.get('MONGO_DB')    # database name
        self.client = pymongo.MongoClient(mongo_uri)
        self.db = self.client[mongo_db]

    def process_item(self, item, spider):
        # self.db['拉钩关键词招聘信息表'].insert_one(dict(item))  # a plain insert would create duplicates
        # replace_one with upsert=True deduplicates on positionId (the legacy update(..., True) call is gone in pymongo 4)
        self.db['拉钩关键词招聘信息表'].replace_one({'positionId': item['positionId']}, dict(item), upsert=True)
        return item

    def close_spider(self, spider):
        self.client.close()
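After a run, a quick pymongo check (assuming the MONGO_URI and MONGO_DB values from the settings below) confirms that the upsert deduplicates on positionId:

import pymongo

client = pymongo.MongoClient('localhost:27017')
coll = client['lagou']['拉钩关键词招聘信息表']
print(coll.count_documents({}))          # total stored positions
print(len(coll.distinct('positionId')))  # equals the count above when dedup works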
4. Settings (additions)
COOKIES_ENABLED = False
ITEM_PIPELINES = {
'Lagou.pipelines.JsonPipeline': 100,
'Lagou.pipelines.MongoPipeline': 300
}
MONGO_URI = 'localhost:27017'
MONGO_DB = 'lagou'
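One caveat worth knowing: the time.sleep() call in parse() blocks Scrapy's event loop and stalls all concurrent requests. The idiomatic way to throttle is through settings, e.g.:

DOWNLOAD_DELAY = 5
RANDOMIZE_DOWNLOAD_DELAY = True  # actual delay varies between 0.5x and 1.5x DOWNLOAD_DELAY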
Key points:
1. The headers.
2. The cookie inside the headers.
3. The formdata.
4. Disable Scrapy's default cookie handling in settings; every request instead carries its own cookie inside the headers.
5. Of the two MongoDB write methods shown in the pipeline, the upsert is the key one: it deduplicates, provided the field used for matching is unique (see the index sketch below).
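Building on point 5, a unique index on positionId enforces the dedup at the database level. A one-time setup sketch using pymongo:

import pymongo

# Run once, e.g. in a setup script; the upsert in the pipeline still works as before
client = pymongo.MongoClient('localhost:27017')
client['lagou']['拉钩关键词招聘信息表'].create_index('positionId', unique=True)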