
Scraping job listings from 51job

# -*- coding: utf-8 -*-
import scrapy
from ..items import JobspiderItem


class JobSpider(scrapy.Spider):
    name = 'job'
    allowed_domains = ['51job.com']
    start_urls = ['http://search.51job.com/list/171900%252C020000%252C010000%252C030200%252C040000%252C00,000000,0000,00,0,08,Python,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
                  'http://search.51job.com/list/171900%252C020000%252C010000%252C030200%252C040000%252C00,000000,0000,00,0,08,PHP,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
                  'http://search.51job.com/list/171900%252C020000%252C010000%252C030200%252C040000%252C00,000000,0000,00,0,08,html5,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']
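
    # The long path segment encodes the search itself: the %252C runs are
    # double-URL-encoded commas separating city codes, the search keyword
    # (Python / PHP / html5) sits near the end of the path, and the last
    # number before ".html" appears to be the page number.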

    def parse(self, response):
        # Parse the first page.
        # Method 1: add the following six lines so the first page itself
        # is also sent through parse_job_info.
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_job_info,
            meta={},
            dont_filter=True
        )
        # Parse the next page.
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            meta={},
            dont_filter=True
        )
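        # Note: dont_filter=True is load-bearing here. Scrapy has already
        # fetched response.url to produce this response, so without it the
        # built-in duplicate filter would drop both requests above.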

    def parse_next_page(self, response):
        """
        解析下一页
        :param response:
        :return:
        """
        next_page = response.xpath("//li[@class='bk'][2]/a/@href").extract_first('')
        # Proceed only if a next-page link exists
        if next_page:
            yield scrapy.Request(
                # response.url would re-parse the first page's address
                # Method 2: change the url here to the first page's address
                # url=response.url,
                url=next_page,
                callback=self.parse_job_info,
                meta={},
                dont_filter=True
            )
            # Loop: the callback schedules itself again for the page after this one
            yield scrapy.Request(
                url=next_page,
                callback=self.parse_next_page,
                meta={},
                dont_filter=True
            )
            """
            递归:如果一个函数内部自己调用自己,这种形式就叫做递归
            """

    def parse_job_info(self, response):
        """
        解析工作信息
        :param response:
        :return:
        """
        job_div_list = response.xpath("//div[@id='resultList']/div[@class='el']")
        for job_div in job_div_list:
            job_name = job_div.xpath("p/span/a/@title").extract_first(
                "no job name").replace(",", "/")
            job_company_name = job_div.xpath("span[@class='t2']/a/@title").extract_first(
                "no company name").strip()
            job_place = job_div.xpath("span[@class='t3']/text()").extract_first("no location").strip()
            job_salary = job_div.xpath("span[@class='t4']/text()").extract_first("salary negotiable").strip()
            # The posting date lives in the t5 column; the original used t4,
            # which would just repeat the salary.
            job_time = job_div.xpath("span[@class='t5']/text()").extract_first("no posting date").strip()
            job_type = "51job" if "51job.com" in response.url else "other"
            print(job_type, job_name, job_company_name, job_place, job_salary, job_time)

            """
            数据清洗:负责清除数据两端的空行,空格,特殊符号等,常用操作一般是strip()
            还包括清除哪些无效数据,例如数据格式不完整的数据,以及重复数据"""
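            # A hypothetical guard in the spirit of the cleaning note above
            # (not in the original): drop rows whose required fields fell
            # back to their defaults.
            if job_name == "no job name" or job_company_name == "no company name":
                continue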

            item = JobspiderItem()
            item['job_name'] = job_name
            item['fan_kui_lv'] = "no response rate"
            item['job_company_name'] = job_company_name
            item['job_salary'] = job_salary
            item['job_place'] = job_place
            item['job_type'] = job_type
            # Use the extracted value; the original hardcoded a placeholder
            # here and never used the job_time variable.
            item['job_time'] = job_time
            yield item
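
For reference, the spider imports JobspiderItem from items.py, which the article does not show. A minimal sketch of what that class would need to look like, inferred from the field names assigned above:

# items.py -- inferred from the fields the spider assigns
import scrapy


class JobspiderItem(scrapy.Item):
    job_name = scrapy.Field()
    fan_kui_lv = scrapy.Field()  # response rate (反馈率)
    job_company_name = scrapy.Field()
    job_salary = scrapy.Field()
    job_place = scrapy.Field()
    job_type = scrapy.Field()
    job_time = scrapy.Field()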