欢迎您访问程序员文章站,本站旨在为大家分享程序员计算机编程知识!
您现在的位置是: 首页

利用Pyspider爬取BOSS相关职位信息

程序员文章站 2022-05-09 21:16:45
...

利用Pyspider爬取BOSS职位信息

pyspider初步代码


#!/usr/bin/env python
#-*- encoding: utf-8 -*-
#Created on 2019-04-23 16:01:48
#Project: boss

import random
import sys
import time

from pyspider.libs.base_handler import *
from pyspider.libs.WebRequest import *
from pyspider.database.mysql.mysqldb import SQL

# Python 2 only: force UTF-8 as the default string encoding so Chinese
# text from the crawled pages survives implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')


class Handler(BaseHandler):
    """Pyspider handler that crawls BOSS Zhipin (zhipin.com) job listings
    and persists each parsed job row into the ``boss_original`` MySQL table.

    Flow: on_start schedules one listing URL per (city, position,
    experience bucket) -> index_page fans out to pages 1..10 ->
    detail_page extracts job fields -> on_result inserts rows.
    """

    # Search keywords (job titles) to crawl.
    position_list = ['大数据']

    # City codes: Beijing, Shanghai, Guangzhou, Shenzhen, Chengdu
    # ['c101010100','c101020100','c101280100','c101280600','c101270100']
    city_list = ['c101270100']

    # Proxy pool; a random entry is picked for every request.
    proxy = get_home_proxy()

    crawl_config = {
        # Request headers (User-Agent etc.) shared by all crawls.
        "headers": header()
    }

    @every(minutes=72 * 60)
    def on_start(self):
        """Entry point: schedule one listing page per combination of
        city, position keyword and experience bucket (e_102..e_107)."""
        for city in self.city_list:
            for position in self.position_list:
                for work_year in range(102, 108):
                    crawl_url = ('https://www.zhipin.com/{city}/e_{workyear}/'
                                 '?query={position}').format(
                        city=city, workyear=work_year, position=position)
                    time.sleep(1)  # throttle scheduling to reduce ban risk
                    self.crawl(url=crawl_url, callback=self.index_page,
                               proxy=random.choice(self.proxy))

    def index_page(self, response):
        """Fan out one listing URL to its first 10 result pages."""
        base_url = response.url
        for page_num in range(1, 11):
            time.sleep(1)  # throttle scheduling to reduce ban risk
            self.crawl(url=base_url + "&page=" + str(page_num),
                       callback=self.detail_page,
                       proxy=random.choice(self.proxy))

    @config(priority=2)
    def detail_page(self, response):
        """Parse one result page and return a list of job dicts.

        Each <li> under the job list yields one dict with position name,
        salary, city/experience/education, company info and a crawl
        timestamp. Companies without a funding round expose only two
        fields in their info <p>, handled by the length check below.
        """
        page = response.etree
        job_list = []
        for each in page.xpath("//div[@class='job-list']/ul/li"):
            primary = "./div[@class='job-primary']/div[@class='info-primary']"
            company_text = ("./div[@class='job-primary']"
                            "/div[@class='info-company']"
                            "/div[@class='company-text']")

            # Position title and salary from the job header.
            position_name = each.xpath(
                primary + "/h3[@class='name']/a/div[@class='job-title']/text()")[0]
            salary = each.xpath(
                primary + "/h3[@class='name']/a/span/text()")[0]

            # The info <p> holds [city, experience, education] in order.
            info_texts = each.xpath(primary + "/p//text()")
            city = info_texts[0]
            experience = info_texts[1]
            education = info_texts[2]

            company = each.xpath(company_text + "/h3[@class='name']/a/text()")[0]

            # Company <p> is [industry, funding round, scale] when the
            # company lists a funding round, else [industry, scale].
            company_texts = each.xpath(company_text + "/p//text()")
            if len(company_texts) == 3:
                industry_field = company_texts[0]
                rounds = company_texts[1]
                scale = company_texts[2]
            else:
                industry_field = company_texts[0]
                rounds = ''
                scale = company_texts[1]

            job_list.append({
                "city": city,
                "position_name": position_name,
                "salary": salary,
                "experience": experience,
                "education": education,
                "company": company,
                "industry_field": industry_field,
                "rounds": rounds,
                "scale": scale,
                # Fixed: original format "%H%:%M:%S" had a stray '%' that
                # breaks strftime; correct form is "%H:%M:%S".
                "crawl_date": time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime()),
            })
        return job_list

    def on_result(self, result):
        """Insert every parsed job dict into the ``boss_original`` table.

        Empty/None results (e.g. pages with no job list) are skipped.
        """
        if not result:
            return
        sql = SQL()
        for res in result:
            sql.insert('boss_original', **res)