Scrapy crawler: scraping job postings from Lagou


1. Writing the Item

import scrapy


class LagouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    positionId = scrapy.Field()        # position ID; used as the identifying field when writing to the database
    city = scrapy.Field()              # city
    positionName = scrapy.Field()      # job title
    salary = scrapy.Field()            # salary
    workYear = scrapy.Field()          # required experience
    education = scrapy.Field()         # required education

    businessZones = scrapy.Field()     # work area (e.g. a tech park); a single-element list
    companyShortName = scrapy.Field()  # company short name
    companyFullName = scrapy.Field()   # company full name
    companySize = scrapy.Field()       # company headcount
    industryField = scrapy.Field()     # company's industry / line of business
    positionAdvantage = scrapy.Field() # one-sentence selling point of the position
    createTime = scrapy.Field()        # time the posting was published
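
The spider in the next section fills these fields by intersecting item.fields with the keys of each JSON record, so only declared fields are kept. A minimal sketch of that mechanism in isolation (the sample record below is made up):

item = LagouItem()
record = {'positionId': 123456, 'city': '深圳', 'salary': '15k-25k', 'unknownKey': 'ignored'}  # illustrative values

for field in item.fields:          # item.fields lists every declared Field
    if field in record:
        item[field] = record[field]

print(dict(item))                  # only the declared fields survive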

2. Writing the Spider

Mind Lagou's anti-crawling measures here: the request headers carry a cookie that can be obtained without logging in (without it the crawl gets blocked for "too frequent requests" after only four or five pages), and Scrapy's built-in cookie handling has to be turned off in settings (COOKIES_ENABLED = False) so the cookie in the headers is the one actually sent.
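
A minimal sketch of one way to build that cookie string programmatically instead of copying it from the browser; it assumes the third-party requests library and that a single anonymous visit to the list page yields a usable cookie (Lagou's rules may change):

import requests

def fetch_lagou_cookie():
    # Visit the job-list page once without logging in and join its cookies
    # into a single Cookie header string, mirroring a copy-paste from the browser.
    session = requests.Session()
    session.get(
        'https://www.lagou.com/jobs/list_python?city=%E6%B7%B1%E5%9C%B3',
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                               'AppleWebKit/537.36 (KHTML, like Gecko) '
                               'Chrome/65.0.3325.181 Safari/537.36'},
        timeout=10,
    )
    return '; '.join('{}={}'.format(k, v) for k, v in session.cookies.get_dict().items())

The spider then puts that string (or one copied from the browser) into its headers: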

import scrapy
from Lagou.items import LagouItem
import json,time,random

class LagouspiderSpider(scrapy.Spider):
    name = "lagouspider"
    allowed_domains = ["www.lagou.com"]

    url = 'https://www.lagou.com/jobs/positionAjax.json?'  # full query also takes city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false
    page = 1
    allpage = 0

    cookie = ''  # paste a cookie obtained without logging in (copied from the browser, or built as sketched above)
    headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
               'Referer': 'https://www.lagou.com/jobs/list_python?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=',
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
               'cookie': cookie}

    def start_requests(self):
        yield scrapy.FormRequest(self.url, headers=self.headers, formdata={
            'first': 'true', 'pn': str(self.page), 'kd': 'python', 'city': '深圳'}, callback=self.parse)

    def parse(self, response):
        # print(response.text)
        data = json.loads(response.text)

        totalCount = data['content']['positionResult']['totalCount']   # total number of postings
        resultSize = data['content']['positionResult']['resultSize']   # postings per page

        result = data['content']['positionResult']['result']           # list of up to 15 postings on this page
        for each in result:
            item = LagouItem()                                         # fresh item per record, so stale fields never carry over
            for field in item.fields:
                if field in each.keys():
                    item[field] = each.get(field)
            yield item

        time.sleep(random.randint(5, 10))

        if int(resultSize):
            # ceiling division, so an exact multiple does not request an extra empty page
            self.allpage = (int(totalCount) + int(resultSize) - 1) // int(resultSize)
            if self.page < self.allpage:
                self.page += 1
                yield scrapy.FormRequest(self.url, headers=self.headers, formdata={
                    'first': 'false', 'pn': str(self.page), 'kd': 'python', 'city': '深圳'}, callback=self.parse)
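
A quick sanity check of the pagination arithmetic used in parse(), with made-up counts (it assumes the ceiling-division form shown above):

def total_pages(total_count, result_size):
    # ceiling division: a final partial page still counts, an exact multiple adds nothing
    return (total_count + result_size - 1) // result_size

assert total_pages(451, 15) == 31   # 30 full pages plus one partial page
assert total_pages(450, 15) == 30   # exact multiple: no extra empty page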

3. Writing the Pipelines

import json
import pymongo
from scrapy.utils.project import get_project_settings

class JsonPipeline(object):
    def __init__(self):
        self.file = open('job.json','w',encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item),ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self,spider):
        self.file.close()

class MongoPipeline(object):
    def __init__(self):
        settings = get_project_settings()
        mongo_uri = settings.get('MONGO_URI')  # e.g. localhost:27017
        mongo_db = settings.get('MONGO_DB')    # database name
        self.client = pymongo.MongoClient(mongo_uri)
        self.db = self.client[mongo_db]

    def process_item(self, item, spider):
        # self.db['拉钩关键词招聘信息表'].insert_one(dict(item))  # a plain insert would store duplicates across runs
        # upsert keyed on positionId, so re-crawling updates the existing document instead of duplicating it
        self.db['拉钩关键词招聘信息表'].update_one(
            {'positionId': item['positionId']}, {'$set': dict(item)}, upsert=True)
        return item

    def close_spider(self, spider):
        self.client.close()

4. Writing settings (additions)

COOKIES_ENABLED = False
ITEM_PIPELINES = {
    'Lagou.pipelines.JsonPipeline': 100,
    'Lagou.pipelines.MongoPipeline': 300
}

MONGO_URI = 'localhost:27017'
MONGO_DB = 'lagou'
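
Since the ban risk comes from request frequency, the time.sleep in the spider can also be complemented (or replaced) by Scrapy's own throttling settings; the values below are only illustrative and not tuned against Lagou:

DOWNLOAD_DELAY = 5                 # seconds to wait between requests
RANDOMIZE_DOWNLOAD_DELAY = True    # jitter the delay between 0.5x and 1.5x
CONCURRENT_REQUESTS = 1            # one request at a time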

Points to note:

1. the headers;
2. the cookie inside the headers;
3. the formdata;
4. turning off Scrapy's default cookie handling in settings, so every request carries the cookie in its own headers instead;
5. the two ways of writing to MongoDB: a plain insert versus the upsert-style update used here, which is the key one because it deduplicates (the field matched on, positionId, should be unique).
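
For point 5, a minimal standalone pymongo sketch of the difference (connection string and values are illustrative):

import pymongo

client = pymongo.MongoClient('localhost:27017')
col = client['lagou']['拉钩关键词招聘信息表']

job = {'positionId': 123456, 'positionName': 'python', 'city': '深圳'}  # made-up record

# A plain insert stores a new document on every crawl, so re-running duplicates data:
# col.insert_one(job)

# The upsert keyed on positionId overwrites the existing document instead,
# so each position appears at most once (positionId must be unique per posting):
col.update_one({'positionId': job['positionId']}, {'$set': job}, upsert=True)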