
CrawlSpider: crawling 51job NLP positions in Beijing and saving them to MongoDB


Approach: start from the 51job site and search with the keyword "nlp" in the Beijing area. There are 5 result pages in total, and their URLs follow a regular pattern.

In the spider, start_urls is built with format() in a list comprehension to produce the links for these 5 pages.
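
A quick sketch of how the five listing-page URLs are produced (same template as in the spider below; the page number is the last path segment before .html):

base = 'https://search.51job.com/list/010000,000000,0000,00,9,99,NLP,2,{}.html'
start_urls = [base.format(num) for num in range(1, 6)]
# -> ...,NLP,2,1.html up to ...,NLP,2,5.html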

The regexes in rules: the extracted job-detail URLs follow a pattern (they all contain beijing/<digits> or beijing-<three lowercase letters>/<digits>), so two Rules are used.

Note that some non-Beijing positions also appear on the listing pages, so the patterns are deliberately restricted to Beijing URLs.
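
To illustrate, a minimal sketch of how the two link patterns behave (the URLs below are made-up examples, not real job IDs):

import re

patterns = [
    r'jobs\.51job\.com/beijing-[a-z]{3}/\d+\.html',   # with a three-letter district suffix
    r'jobs\.51job\.com/beijing/\d+\.html',            # without a district suffix
]

urls = [
    'https://jobs.51job.com/beijing-abc/123456789.html',   # hypothetical, matches the first pattern
    'https://jobs.51job.com/beijing/123456789.html',       # hypothetical, matches the second pattern
    'https://jobs.51job.com/shanghai/123456789.html',      # non-Beijing, matched by neither, so skipped
]

for url in urls:
    print(url, any(re.search(p, url) for p in patterns))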

The extracted job details usually have 5 fields, sometimes 4, and in rare cases 6 or 7. This is a pitfall to watch out for; an illustration follows below.
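
For illustration, the detail list extracted from //p[@class="msg ltype"]/text() looks roughly like the lists below. The values are made-up examples; the field order matches the unpacking in parse_item, and education is the field that is sometimes missing:

# hypothetical 5-field case: area, experience, education, headcount, publish date
detail_5 = ['北京-海淀区', '3-4年经验', '本科', '招2人', '01-08发布']
# hypothetical 4-field case: the education field is absent
detail_4 = ['北京-朝阳区', '2年经验', '招1人', '01-08发布']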

The spider code is as follows:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class JobSpider(CrawlSpider):

    name = 'job'
    allowed_domains = ['51job.com']
    start_urls = ['https://search.51job.com/list/010000,000000,0000,00,9,99,NLP,2,{}.html'.format(num) for num in range(1, 6)]
    rules = (
        # pagination
        # Rule(LinkExtractor(allow=r'https://search.51job.com/beijing.*\d+'), follow=True),
        # job detail pages: beijing-<district>/<id>.html and beijing/<id>.html
        Rule(LinkExtractor(allow=r'jobs\.51job\.com/beijing-[a-z]{3}/\d+\.html'), callback='parse_item', follow=False),
        # Rule(LinkExtractor(restrict_xpaths='//div[@class="el"]/p[@class="t1"]/span/a/href'), callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'jobs\.51job\.com/beijing/\d+\.html'), callback='parse_item', follow=False),
        # Rule(LinkExtractor(allow=r'http://search.51job.com/list'), follow=True),
    )


    def parse_item(self, response):
        self.crawler.stats.inc_value('inc_newreq_num')  # custom stat: count detail pages parsed
        title = response.xpath('//h1/text()').extract_first().strip()
        company = response.xpath('//p[@class="cname"]/a[1]/text()').extract_first().strip()
        pay = response.xpath('//div[@class="cn"]/strong/text()').extract_first()
        # join the benefit tags into one string and collapse the page's long
        # whitespace run into a '-' separator
        benefit = ''.join(response.xpath('//div[@class="t1"]').xpath('string(.)').extract()).strip().replace('\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t', '-')
        # the work-address block is missing on some pages
        if len(response.xpath('//div[@class="bmsg inbox"]/p[@class="fp"]')) != 0:
            address = response.xpath('//div[@class="bmsg inbox"]/p[@class="fp"]').xpath(
                'string(.)').extract_first().strip().replace('上班地址:', '')
        else:
            address = '无'
        # area / experience / education / headcount / publish date; field count varies
        detail = response.xpath('//p[@class="msg ltype"]/text()').extract()



        if len(detail) == 5:
            # 5 fields: area, experience, education, headcount, publish date
            area, experience, education, total_num, published = [d.strip() for d in detail]
            yield {
                'title': title,
                'company': company,
                'pay': pay,
                # 'detail':detail,
                'area': area,
                'experience': experience,
                'education': education,
                'total_num': total_num,
                'published': published,
                'benefit': benefit,
                'address': address
            }

        elif len(detail) == 4:
            # 4 fields: the education field is missing, so fill in a default below
            area, experience, total_num, published = [d.strip() for d in detail]
        #area,experience,education,total_num,published=response.xpath('//p[@class="msg ltype"]/text()').extract_first().split('|',4)

            yield {
                'title':title,
                'company':company,
                'pay':pay,
                #'detail':detail,
                'area':area,
                'experience':experience,
                'education':'无要求',
                'total_num':total_num,
                'published':published,
                'benefit':benefit,
                'address':address
            }
        else:
            # rare 6/7-field pages: yield the raw detail list as-is
            yield {
            'title': title,
            'company': company,
            'pay': pay,
            'detail': detail,
            'benefit': benefit,
            'address': address
            }
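
With a standard Scrapy project layout (project module job51, as in the settings below), the spider can be launched from the command line or with a small run script. A minimal sketch, assuming a run.py placed next to scrapy.cfg:

from scrapy.cmdline import execute

# equivalent to running `scrapy crawl job` in a shell at the project root
execute(['scrapy', 'crawl', 'job'])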

The pipeline code is as follows:

import pymongo

class MongoPipeline(object):
    def open_spider(self, spider):
        # connect to a local MongoDB on the default host/port
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # database "job", collection "nlp"
        self.client.job.nlp.insert_one(item)
        return item

    def close_spider(self, spider):
        # dump the crawl stats before closing the connection
        log_stats = str(spider.crawler.stats.get_stats())
        self.client.close()
        print(log_stats)
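
A quick way to check what was saved (a sketch; it assumes MongoDB is running locally on the default port, matching the MongoClient() call above):

import pymongo

client = pymongo.MongoClient()
collection = client.job.nlp                 # same database/collection as the pipeline
print(collection.count_documents({}))       # number of stored positions
print(collection.find_one({}, {'_id': 0, 'title': 1, 'pay': 1, 'area': 1}))
client.close()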

The settings code is as follows:

from fake_useragent import UserAgent

BOT_NAME = 'job51'

SPIDER_MODULES = ['job51.spiders']
NEWSPIDER_MODULE = 'job51.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = UserAgent(verify_ssl=False).random
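# Note: .random is evaluated once when the settings module is loaded, so the
# whole crawl runs with a single randomly chosen user agent.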

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'job51.middlewares.Job51SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'job51.middlewares.Job51DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'job51.pipelines.MongoPipeline': 300,
}