Scraping job listings from 51job and Zhilian
Spider code:
# -*- coding: utf-8 -*-
import scrapy

from ..items import JOBspiderItem


class A51jobSpider(scrapy.Spider):
    name = '51job'
    allowed_domains = ['51job.com']
    # The three start URLs are identical except for the search keyword
    # (python, java, php).
    start_urls = [
        'http://search.51job.com/list/020000%252C080200%252C180200%252C040000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/020000%252C080200%252C180200%252C040000,000000,0000,00,9,99,java,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/020000%252C080200%252C180200%252C040000,000000,0000,00,9,99,php,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    ]

    def parse(self, response):
        '''
        Entry point: hand the first listing page to parse_job_info.
        :param response:
        :return:
        '''
        # Delegate to parse_job_info on the same response instead of
        # re-requesting the page we already have.
        yield from self.parse_job_info(response)

    def parse_next_page(self, response):
        '''
        Extract the next-page link and follow it.
        :param response:
        :return:
        '''
        # extract_first('') returns the first match as a string, or the
        # default '' (falsy) when there is no next page.
        next_page = response.xpath('//li[@class="bk"][2]/a/@href').extract_first('')
        if next_page:
            yield scrapy.Request(
                # urljoin handles both absolute and relative links
                url=response.urljoin(next_page),
                callback=self.parse_job_info,
                dont_filter=True,
            )

    def parse_job_info(self, response):
        '''
        Parse the job listings on one result page.
        :param response:
        :return:
        '''
        job_div_list = response.xpath('//div[@id="resultList"]/div[@class="el"]')
        for job_div in job_div_list:
            # Data cleaning: strip() removes whitespace and special
            # characters from both ends of each field.
            job_name = job_div.xpath('p/span/a/@title').extract_first('no job name').strip()
            job_company_name = job_div.xpath('span[@class="t2"]/a/@title').extract_first('no company').strip()
            job_place = job_div.xpath('span[@class="t3"]/text()').extract_first('no location').strip()
            job_salary = job_div.xpath('span[@class="t4"]/text()').extract_first('no salary').strip()
            job_time = job_div.xpath('span[@class="t5"]/text()').extract_first('no date').strip()
            job_type = '51job' if '51job.com' in response.url else 'other'
            print(job_type, job_name, job_company_name, job_place, job_salary, job_time)
            item = JOBspiderItem()
            item['job_name'] = job_name
            item['job_company_name'] = job_company_name
            item['job_place'] = job_place
            item['job_salary'] = job_salary
            item['job_time'] = job_time
            item['job_type'] = job_type
            item['fan_kui_lv'] = 'no feedback rate'
            yield item
        # Recursion: parse_job_info and parse_next_page call each other
        # until no next-page link is found.
        yield from self.parse_next_page(response)
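The XPath logic can be checked offline before running the spider. Below is a minimal sketch using scrapy.Selector on a hand-written HTML fragment that mimics the 51job result-list markup; the fragment and all its values are made up for illustration:

from scrapy import Selector

# Hand-written fragment mimicking the 51job result list (illustrative only).
html = '''
<div id="resultList">
  <div class="el">
    <p class="t1"><span><a title="Python developer">Python developer</a></span></p>
    <span class="t2"><a title="Some Company">Some Company</a></span>
    <span class="t3">Shanghai</span>
    <span class="t4">10-15k/month</span>
    <span class="t5">03-02</span>
  </div>
</div>
'''

sel = Selector(text=html)
for job_div in sel.xpath('//div[@id="resultList"]/div[@class="el"]'):
    print(
        job_div.xpath('p/span/a/@title').extract_first(''),
        job_div.xpath('span[@class="t2"]/a/@title').extract_first(''),
        job_div.xpath('span[@class="t3"]/text()').extract_first(''),
    )

The spider itself is started with `scrapy crawl 51job` from the project root; if the live 51job page layout has changed since this was written, the XPaths will need adjusting against the real markup.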
pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


# Pipeline: receives the items returned by the spider.
class JobspiderPipeline(object):
    def process_item(self, item, spider):
        return item


class ToCsvPipeline(object):
    def process_item(self, item, spider):
        # gb18030 keeps the Chinese text readable when the CSV is
        # opened in Excel on a Chinese-locale Windows machine.
        with open("job.csv", "a", encoding="gb18030") as f:
            job_info = [
                item['job_name'],
                item['fan_kui_lv'],
                item['job_company_name'],
                item['job_salary'],
                item['job_place'],
                item['job_time'],
                item['job_type'],
            ]
            f.write(",".join(job_info) + "\n")
        # Pass the item on to the next pipeline.
        return item
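As the generated comment notes, a pipeline only runs once it is registered in ITEM_PIPELINES in settings.py. A minimal sketch, assuming the project package is named jobspider (the actual dotted path depends on how the project was created):

# settings.py -- the module path 'jobspider.pipelines' is an assumption;
# substitute your own project package name.
ITEM_PIPELINES = {
    'jobspider.pipelines.ToCsvPipeline': 300,  # lower number = runs earlier
}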
items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class JOBspiderItem(scrapy.Item):
    job_name = scrapy.Field()
    job_company_name = scrapy.Field()
    job_place = scrapy.Field()
    job_salary = scrapy.Field()
    job_time = scrapy.Field()
    job_type = scrapy.Field()
    fan_kui_lv = scrapy.Field()  # feedback rate
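A JOBspiderItem behaves like a dict, except that only declared fields may be set, which catches typos in field names early. A quick sketch (the import path and values are made up for illustration):

from jobspider.items import JOBspiderItem  # package name assumed

item = JOBspiderItem()
item['job_name'] = 'Python developer'  # illustrative value
print(dict(item))       # {'job_name': 'Python developer'}
# item['foo'] = 1       # would raise KeyError: undeclared field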