Scraping job listings from 51job
# -*- coding: utf-8 -*-
import scrapy

from ..items import JobspiderItem


class JobSpider(scrapy.Spider):
    name = 'job'
    allowed_domains = ['51job.com']
    # One search-result URL per keyword: Python, PHP and html5.
    start_urls = ['http://search.51job.com/list/171900%252C020000%252C010000%252C030200%252C040000%252C00,000000,0000,00,0,08,Python,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
                  'http://search.51job.com/list/171900%252C020000%252C010000%252C030200%252C040000%252C00,000000,0000,00,0,08,PHP,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
                  'http://search.51job.com/list/171900%252C020000%252C010000%252C030200%252C040000%252C00,000000,0000,00,0,08,html5,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']
    def parse(self, response):
        # Parse the first page.
        # Approach 1: add the six lines below, so the first results page
        # is also routed through parse_job_info.
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_job_info,
            meta={},
            dont_filter=True
        )
        # Then follow the pagination.
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            meta={},
            dont_filter=True
        )
    def parse_next_page(self, response):
        """
        Follow the next results page.
        :param response:
        :return:
        """
        next_page = response.xpath("//li[@class='bk'][2]/a/@href").extract_first('')
        # Only continue if a next-page link exists.
        if next_page:
            yield scrapy.Request(
                # Approach 2: instead of the extra request in parse(),
                # change this url to the first page (response.url) and let
                # this callback cover page one as well.
                # url=response.url,
                url=next_page,
                callback=self.parse_job_info,
                meta={},
                dont_filter=True
            )
            # Loop: schedule this same callback again for the page after next.
            yield scrapy.Request(
                url=next_page,
                callback=self.parse_next_page,
                meta={},
                dont_filter=True
            )

    # Recursion: a function that calls itself from inside its own body is
    # recursive. Here the effect is indirect: parse_next_page yields a new
    # Request whose callback is parse_next_page itself.
    def parse_job_info(self, response):
        """
        Parse the job postings on one results page.
        :param response:
        :return:
        """
        job_div_list = response.xpath("//div[@id='resultList']/div[@class='el']")
        for job_div in job_div_list:
            job_name = job_div.xpath("p/span/a/@title").extract_first(
                "no job title").replace(",", "/")
            job_company_name = job_div.xpath("span[@class='t2']/a/@title").extract_first(
                "no company name").strip()
            job_place = job_div.xpath("span[@class='t3']/text()").extract_first("no location").strip()
            job_salary = job_div.xpath("span[@class='t4']/text()").extract_first("salary negotiable").strip()
            # The posting date sits in the t5 column; the original selector
            # repeated t4, which is the salary column.
            job_time = job_div.xpath("span[@class='t5']/text()").extract_first("no posting date").strip()
            job_type = "51job" if "51job.com" in response.url else "other"
            print(job_type, job_name, job_company_name, job_place, job_salary, job_time)
            # Data cleaning: strip whitespace, blank lines and stray symbols
            # from both ends of each field (usually with strip()), and drop
            # invalid records such as incomplete or duplicate rows; see the
            # pipeline sketch after the spider.
            item = JobspiderItem()
            item['job_name'] = job_name
            item['fan_kui_lv'] = "no feedback rate"
            item['job_company_name'] = job_company_name
            item['job_salary'] = job_salary
            item['job_place'] = job_place
            item['job_type'] = job_type
            item['job_time'] = job_time
            yield item
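
The spider imports JobspiderItem from items.py, which the article does not show. Judging from the seven fields the spider assigns, it is presumably just a plain field declaration; a minimal sketch:

# items.py: a minimal sketch, inferred from the fields the spider assigns.
import scrapy


class JobspiderItem(scrapy.Item):
    job_name = scrapy.Field()          # job title
    fan_kui_lv = scrapy.Field()        # feedback rate (not on the list page)
    job_company_name = scrapy.Field()  # company name
    job_salary = scrapy.Field()        # salary
    job_place = scrapy.Field()         # work location
    job_type = scrapy.Field()          # source site ("51job" / "other")
    job_time = scrapy.Field()          # posting date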
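
The cleaning note in parse_job_info also mentions dropping incomplete and duplicate records. The article does not include a pipeline, but in Scrapy that kind of cleaning usually lives in pipelines.py. Here is an illustrative sketch; the class name JobCleanPipeline and the dedup key (job title plus company) are my own choices, not from the original:

# pipelines.py: an illustrative sketch, not part of the original article.
from scrapy.exceptions import DropItem


class JobCleanPipeline(object):
    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        # Trim whitespace from every string field.
        for key, value in item.items():
            if isinstance(value, str):
                item[key] = value.strip()
        # Drop records that carry only the placeholder values.
        if item['job_name'] == 'no job title' or item['job_company_name'] == 'no company name':
            raise DropItem('incomplete record: %r' % dict(item))
        # Drop duplicates, keyed on job title + company.
        dedup_key = (item['job_name'], item['job_company_name'])
        if dedup_key in self.seen:
            raise DropItem('duplicate record: %r' % dict(item))
        self.seen.add(dedup_key)
        return item

Enable it in settings.py with ITEM_PIPELINES = {'jobspider.pipelines.JobCleanPipeline': 300} (the project name jobspider is assumed), then run scrapy crawl job -o jobs.csv to export the cleaned items.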