
Scraping job listings from 51job and Zhilian


Spider code:

# -*- coding: utf-8 -*-
import scrapy
from ..items import JOBspiderItem

class A51jobSpider(scrapy.Spider):
    name = '51job'
    allowed_domains = ['51job.com']
    start_urls = [
        'http://search.51job.com/list/020000%252C080200%252C180200%252C040000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/020000%252C080200%252C180200%252C040000,000000,0000,00,9,99,java,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/020000%252C080200%252C180200%252C040000,000000,0000,00,9,99,php,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    ]

    def parse(self, response):
        '''
        Entry point: hand each start URL over to parse_job_info.
        :param response: response for one of the start URLs
        :return: a Request for the same URL with parse_job_info as callback
        '''
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_job_info,
            meta={},
            dont_filter=True,  # the URL was just fetched, so bypass the duplicate filter
        )

    def parse_next_page(self, response):
        '''
        Parse the "next page" link and follow it.
        :param response: the current result-list page
        :return: a Request for the next page, if one exists
        '''
        # extract_first('') returns a single string, or '' when the node is missing,
        # so the if-check below actually stops on the last page
        next_page = response.xpath('//li[@class="bk"][2]/a/@href').extract_first('')
        if next_page:
            yield scrapy.Request(
                url=response.urljoin(next_page),  # also handles relative hrefs
                callback=self.parse_job_info,
                meta={},
                dont_filter=True,
            )
            # "Recursion": parse_job_info and parse_next_page keep handing pages
            # to each other until no next-page link is found
    def parse_job_info(self, response):
        '''
        Parse one search-result page and yield an item for every job listing.
        :param response: a 51job search-result page
        :return: JOBspiderItem objects, then a Request that looks for the next page
        '''
        job_div_list = response.xpath('//div[@id="resultList"]/div[@class="el"]')
        for job_div in job_div_list:
            # Data cleaning: strip() removes whitespace and newlines at both ends
            job_name = job_div.xpath('p/span/a/@title').extract_first('无工作').strip()
            job_company_name = job_div.xpath('span[@class="t2"]/a/@title').extract_first('无公司信息').strip()
            job_place = job_div.xpath('span[@class="t3"]/text()').extract_first('无工作地点').strip()
            job_salary = job_div.xpath('span[@class="t4"]/text()').extract_first('无薪资信息').strip()
            job_time = job_div.xpath('span[@class="t5"]/text()').extract_first('无时间信息').strip()
            job_type = '51job' if '51job.com' in response.url else '其他'
            print(job_type, job_name, job_company_name, job_place, job_salary, job_time)
            item = JOBspiderItem()
            item['job_name'] = job_name
            item['job_company_name'] = job_company_name
            item['job_place'] = job_place
            item['job_salary'] = job_salary
            item['job_time'] = job_time
            item['job_type'] = job_type
            item['fan_kui_lv'] = '没有反馈率'
            yield item
        # After all jobs on this page are yielded, go look for the next page
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            meta={},
            dont_filter=True,
        )
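With the spider above plus the pipeline and item definitions below, the crawl is normally started from the project root with "scrapy crawl 51job". It can also be launched programmatically; the following is a minimal sketch, assuming it lives in a file such as run_51job.py in the project root (the file name is an assumption, not part of the original project):

# run_51job.py -- a minimal sketch; the file name and its location in the
# Scrapy project root are assumptions, not part of the original project.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():
    # get_project_settings() loads the surrounding project's settings.py,
    # including the ITEM_PIPELINES registration shown further below
    process = CrawlerProcess(get_project_settings())
    process.crawl('51job')   # reference the spider by its name attribute
    process.start()          # blocks until the crawl is finished


if __name__ == '__main__':
    main()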



pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Pipeline: receives the item data yielded by the spider
class JobspiderPipeline(object):
    def process_item(self, item, spider):
        # default pipeline from the project template; passes the item through unchanged
        return item


class ToCsvPipeline(object):
    def process_item(self, item, spider):
        # gb18030 keeps the Chinese text readable when the CSV is opened in Excel
        with open("job.csv", "a", encoding="gb18030") as f:
            job_name = item['job_name']
            fan_kui_lv = item['fan_kui_lv']
            job_company_name = item['job_company_name']
            job_salary = item['job_salary']
            job_place = item['job_place']
            job_time = item['job_time']
            job_type = item['job_type']
            job_info = [job_name, fan_kui_lv, job_company_name, job_salary, job_place, job_time, job_type]
            f.write(",".join(job_info) + "\n")
        # Pass the item on so the next pipeline can process it
        return item
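As the template comment above says, these pipelines only run once they are registered in ITEM_PIPELINES. A minimal settings.py excerpt could look like the sketch below; 'JOBspider' is an assumed package name, so substitute the actual name of your Scrapy project package:

# settings.py (excerpt) -- 'JOBspider' is an assumed package name.
# Lower numbers run earlier; each process_item must return the item
# so the next pipeline in the chain receives it.
ITEM_PIPELINES = {
    'JOBspider.pipelines.JobspiderPipeline': 300,
    'JOBspider.pipelines.ToCsvPipeline': 400,
}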

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class JOBspiderItem(scrapy.Item):
    job_name = scrapy.Field()
    job_company_name = scrapy.Field()
    job_place = scrapy.Field()
    job_salary = scrapy.Field()
    job_time = scrapy.Field()
    job_type = scrapy.Field()
    fan_kui_lv = scrapy.Field()
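scrapy.Item behaves like a dict, which is what the item['...'] assignments in parse_job_info and the lookups in ToCsvPipeline rely on. A small, self-contained illustration follows; the field values and the 'JOBspider' package name are invented for the example:

# A quick illustration of the dict-like Item API; the values are invented
# examples and 'JOBspider' is an assumed package name.
from JOBspider.items import JOBspiderItem

item = JOBspiderItem(job_name='Python开发工程师', job_salary='1-1.5万/月')
item['job_type'] = '51job'                    # assignment works like a dict
print(item.get('job_place', '无工作地点'))      # unset fields fall back to the default
print(dict(item))                             # convert to a plain dict for export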
