CrawlSpider: crawl 51job NLP positions in the Beijing area and save them to MongoDB
Approach: open the 51job site and search with the keywords NLP + Beijing. The results span 5 pages in total, and the page URLs follow a regular pattern.
The spider's start_urls therefore holds these 5 page links, built with format().
Regex for the rules: the extracted job-detail URLs also follow a pattern, containing either beijing/<digits> or beijing-<three lowercase letters>/<digits>, so two Rules are defined.
Note that the listing pages also contain some positions outside Beijing.
The extracted job detail usually has 5 fields, sometimes 4, and in rare cases 6 or 7. This is a pitfall; a small standalone sketch of the splitting logic is shown below.
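Since this field-count issue drives most of the branching in parse_item, here is a minimal standalone sketch of that splitting logic; the sample values passed in at the bottom are made up purely for illustration, not actual scraped strings:

def split_detail(detail):
    """Map the detail list to named fields.

    Returns None for lengths other than 4 or 5, so the caller can fall
    back to storing the raw list, as parse_item does below.
    """
    items = [d.strip() for d in detail]
    if len(items) == 5:
        area, experience, education, total_num, published = items
    elif len(items) == 4:
        # 4-field listings omit the education requirement
        area, experience, total_num, published = items
        education = '无要求'
    else:
        return None
    return {'area': area, 'experience': experience, 'education': education,
            'total_num': total_num, 'published': published}

print(split_detail([' 北京 ', '3-4年经验', '本科', '招1人', '05-09发布']))
print(split_detail([' 北京 ', '3-4年经验', '招1人', '05-09发布']))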
The spider code follows:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class JobSpider(CrawlSpider):
    name = 'job'
    allowed_domains = ['51job.com']
    # The 5 result pages for keyword NLP in Beijing (area code 010000)
    start_urls = ['https://search.51job.com/list/010000,000000,0000,00,9,99,NLP,2,{}.html'.format(num)
                  for num in range(1, 6)]

    rules = (
        # Pagination (not needed: the 5 pages are already in start_urls)
        # Rule(LinkExtractor(allow='https://search.51job.com/beijing.*\d+'), follow=True),
        # Job detail pages: beijing-<three lowercase letters>/<digits>.html
        Rule(LinkExtractor(allow=r'jobs.51job.com/beijing-[a-z]{3}/\d+.html'),
             callback='parse_item', follow=False),
        # Rule(LinkExtractor(restrict_xpaths='//div[@class="el"]/p[@class="t1"]/span/a/href'), callback='parse_item', follow=True),
        # Job detail pages: beijing/<digits>.html
        Rule(LinkExtractor(allow=r'jobs.51job.com/beijing/\d+.html'),
             callback='parse_item', follow=False),
        # Rule(LinkExtractor(allow=r'http://search.51job.com/list'), follow=True),
    )

    def parse_item(self, response):
        # Custom stats counter: number of job-detail pages parsed
        self.crawler.stats.inc_value('inc_newreq_num')
        title = response.xpath('//h1/text()').extract_first().strip()
        company = response.xpath('//p[@class="cname"]/a[1]/text()').extract_first().strip()
        pay = response.xpath('//div[@class="cn"]/strong/text()').extract_first()
        benefit = ''.join(response.xpath('//div[@class="t1"]').xpath('string(.)').extract()).strip().replace('\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t', '-')
        # Work address is missing on some pages
        if len(response.xpath('//div[@class="bmsg inbox"]/p[@class="fp"]')) != 0:
            address = response.xpath('//div[@class="bmsg inbox"]/p[@class="fp"]').xpath(
                'string(.)').extract_first().strip().replace('上班地址:', '')
        else:
            address = '无'
        detail = response.xpath('//p[@class="msg ltype"]/text()').extract()
        if len(detail) == 5:
            area, experience, education, total_num, published = [d.strip() for d in detail]
            yield {
                'title': title,
                'company': company,
                'pay': pay,
                'area': area,
                'experience': experience,
                'education': education,
                'total_num': total_num,
                'published': published,
                'benefit': benefit,
                'address': address,
            }
        elif len(detail) == 4:
            # 4-field listings omit the education requirement
            area, experience, total_num, published = [d.strip() for d in detail]
            yield {
                'title': title,
                'company': company,
                'pay': pay,
                'area': area,
                'experience': experience,
                'education': '无要求',
                'total_num': total_num,
                'published': published,
                'benefit': benefit,
                'address': address,
            }
        else:
            # Unexpected field count (6 or 7 items): keep the raw list
            yield {
                'title': title,
                'company': company,
                'pay': pay,
                'detail': detail,
                'benefit': benefit,
                'address': address,
            }
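With the pipeline and settings shown below in place, the spider is started with the standard Scrapy command from the project directory: scrapy crawl job. The custom inc_newreq_num counter then shows up in the stats printed by the pipeline's close_spider.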
The pipeline code follows:
import pymongo


class MongoPipeline(object):
    def open_spider(self, spider):
        # Connect to the local MongoDB instance (default host/port)
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # Write each job dict into the "job" database, "nlp" collection
        self.client.job.nlp.insert_one(item)
        return item

    def close_spider(self, spider):
        # Print the crawl stats (including the custom inc_newreq_num counter)
        log_stats = str(spider.crawler.stats.get_stats())
        self.client.close()
        print(log_stats)
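For reference, a common variant is to read the connection details from settings.py instead of hard-coding them. The sketch below assumes two settings named MONGO_URI and MONGO_DB, which are not part of the original project; behaviour is otherwise the same as MongoPipeline above.

import pymongo


class MongoSettingsPipeline(object):
    """Sketch only: MongoPipeline, but configured via settings."""

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # MONGO_URI / MONGO_DB are assumed setting names, with local defaults
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DB', 'job'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        self.db['nlp'].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()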
The settings code follows:
from fake_useragent import UserAgent
BOT_NAME = 'job51'
SPIDER_MODULES = ['job51.spiders']
NEWSPIDER_MODULE = 'job51.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Note: .random picks one random User-Agent when settings are loaded; it is not rotated per request
USER_AGENT = UserAgent(verify_ssl=False).random
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'job51.middlewares.Job51SpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'job51.middlewares.Job51DownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'job51.pipelines.MongoPipeline': 300,
}
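Finally, to confirm that the items actually landed in MongoDB, a quick check such as the following can be run; this is a sketch assuming a local MongoDB on the default port, matching the MongoPipeline above:

import pymongo

client = pymongo.MongoClient()          # local MongoDB, default port
collection = client.job.nlp             # database/collection used by MongoPipeline

print(collection.count_documents({}))   # how many jobs were stored
print(collection.find_one())            # inspect one stored job dict
client.close()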