CrawlSpider: crawl 51job NLP positions in the Beijing area and save them to MongoDB
Approach: open the 51job site and search with the keywords NLP + Beijing. The results span 5 pages in total, and the page URLs follow a regular pattern.
The spider's start_urls therefore holds these 5 page links, built with format().
Regex for the rules: the extracted job-detail URLs also follow a pattern, containing either beijing/<digits> or beijing-<three lowercase letters>/<digits>, so two Rules are defined.
Note that the listing pages also contain some positions outside Beijing.
The extracted job detail usually has 5 fields, sometimes 4, and in rare cases 6 or 7. This is a pitfall; a small standalone sketch of the splitting logic is shown below.
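Since this field-count issue drives most of the branching in parse_item, here is a minimal standalone sketch of that splitting logic; the sample values passed in at the bottom are made up purely for illustration, not actual scraped strings:

def split_detail(detail):
    """Map the detail list to named fields.

    Returns None for lengths other than 4 or 5, so the caller can fall
    back to storing the raw list, as parse_item does below.
    """
    items = [d.strip() for d in detail]
    if len(items) == 5:
        area, experience, education, total_num, published = items
    elif len(items) == 4:
        # 4-field listings omit the education requirement
        area, experience, total_num, published = items
        education = '无要求'
    else:
        return None
    return {'area': area, 'experience': experience, 'education': education,
            'total_num': total_num, 'published': published}

print(split_detail([' 北京 ', '3-4年经验', '本科', '招1人', '05-09发布']))
print(split_detail([' 北京 ', '3-4年经验', '招1人', '05-09发布']))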
The spider code follows:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class JobSpider(CrawlSpider):
    name = 'job'
    allowed_domains = ['51job.com']
    # The 5 result pages for keyword NLP in Beijing (area code 010000)
    start_urls = ['https://search.51job.com/list/010000,000000,0000,00,9,99,NLP,2,{}.html'.format(num)
                  for num in range(1, 6)]

    rules = (
        # Pagination (not needed: the 5 pages are already in start_urls)
        # Rule(LinkExtractor(allow='https://search.51job.com/beijing.*\d+'), follow=True),
        # Job detail pages: beijing-<three lowercase letters>/<digits>.html
        Rule(LinkExtractor(allow=r'jobs.51job.com/beijing-[a-z]{3}/\d+.html'),
             callback='parse_item', follow=False),
        # Rule(LinkExtractor(restrict_xpaths='//div[@class="el"]/p[@class="t1"]/span/a/href'), callback='parse_item', follow=True),
        # Job detail pages: beijing/<digits>.html
        Rule(LinkExtractor(allow=r'jobs.51job.com/beijing/\d+.html'),
             callback='parse_item', follow=False),
        # Rule(LinkExtractor(allow=r'http://search.51job.com/list'), follow=True),
    )

    def parse_item(self, response):
        # Custom stats counter: number of job-detail pages parsed
        self.crawler.stats.inc_value('inc_newreq_num')
        title = response.xpath('//h1/text()').extract_first().strip()
        company = response.xpath('//p[@class="cname"]/a[1]/text()').extract_first().strip()
        pay = response.xpath('//div[@class="cn"]/strong/text()').extract_first()
        benefit = ''.join(response.xpath('//div[@class="t1"]').xpath('string(.)').extract()).strip().replace('\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t', '-')
        # Work address is missing on some pages
        if len(response.xpath('//div[@class="bmsg inbox"]/p[@class="fp"]')) != 0:
            address = response.xpath('//div[@class="bmsg inbox"]/p[@class="fp"]').xpath(
                'string(.)').extract_first().strip().replace('上班地址:', '')
        else:
            address = '无'
        detail = response.xpath('//p[@class="msg ltype"]/text()').extract()
        if len(detail) == 5:
            area, experience, education, total_num, published = [d.strip() for d in detail]
            yield {
                'title': title,
                'company': company,
                'pay': pay,
                'area': area,
                'experience': experience,
                'education': education,
                'total_num': total_num,
                'published': published,
                'benefit': benefit,
                'address': address,
            }
        elif len(detail) == 4:
            # 4-field listings omit the education requirement
            area, experience, total_num, published = [d.strip() for d in detail]
            yield {
                'title': title,
                'company': company,
                'pay': pay,
                'area': area,
                'experience': experience,
                'education': '无要求',
                'total_num': total_num,
                'published': published,
                'benefit': benefit,
                'address': address,
            }
        else:
            # Unexpected field count (6 or 7 items): keep the raw list
            yield {
                'title': title,
                'company': company,
                'pay': pay,
                'detail': detail,
                'benefit': benefit,
                'address': address,
            }
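With the pipeline and settings shown below in place, the spider is started with the standard Scrapy command from the project directory: scrapy crawl job. The custom inc_newreq_num counter then shows up in the stats printed by the pipeline's close_spider.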
The pipeline code follows:
import pymongo


class MongoPipeline(object):
    def open_spider(self, spider):
        # Connect to the local MongoDB instance (default host/port)
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # Write each job dict into the "job" database, "nlp" collection
        self.client.job.nlp.insert_one(item)
        return item

    def close_spider(self, spider):
        # Print the crawl stats (including the custom inc_newreq_num counter)
        log_stats = str(spider.crawler.stats.get_stats())
        self.client.close()
        print(log_stats)
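For reference, a common variant is to read the connection details from settings.py instead of hard-coding them. The sketch below assumes two settings named MONGO_URI and MONGO_DB, which are not part of the original project; behaviour is otherwise the same as MongoPipeline above.

import pymongo


class MongoSettingsPipeline(object):
    """Sketch only: MongoPipeline, but configured via settings."""

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # MONGO_URI / MONGO_DB are assumed setting names, with local defaults
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DB', 'job'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        self.db['nlp'].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()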
The settings code follows:
from fake_useragent import UserAgent
BOT_NAME = 'job51'
SPIDER_MODULES = ['job51.spiders']
NEWSPIDER_MODULE = 'job51.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Note: .random picks one random User-Agent when settings are loaded; it is not rotated per request
USER_AGENT = UserAgent(verify_ssl=False).random
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'job51.middlewares.Job51SpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'job51.middlewares.Job51DownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'job51.pipelines.MongoPipeline': 300,
}
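Finally, to confirm that the items actually landed in MongoDB, a quick check such as the following can be run; this is a sketch assuming a local MongoDB on the default port, matching the MongoPipeline above:

import pymongo

client = pymongo.MongoClient()          # local MongoDB, default port
collection = client.job.nlp             # database/collection used by MongoPipeline

print(collection.count_documents({}))   # how many jobs were stored
print(collection.find_one())            # inspect one stored job dict
client.close()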