Crawling BOSS Zhipin Job Listings with Pyspider
Initial pyspider code
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-04-23 16:01:48
# Project: boss

import random
import time

from pyspider.libs.base_handler import *
# Custom helper module providing header() and get_home_proxy()
# (not part of stock pyspider; see the sketch after the script)
from pyspider.libs.WebRequest import *
# Custom MySQL helper used in on_result (see the sketch after the script)
from pyspider.database.mysql.mysqldb import SQL
class Handler(BaseHandler):
    # Search keywords ("大数据" = "big data")
    position_list = ['大数据']
    # City codes: Beijing, Shanghai, Guangzhou, Shenzhen, Chengdu
    # ['c101010100', 'c101020100', 'c101280100', 'c101280600', 'c101270100']
    city_list = ['c101270100']
    # Proxy pool
    proxy = get_home_proxy()

    crawl_config = {
        # Request headers
        "headers": header()
    }
    # Re-run the whole crawl every 72 hours
    @every(minutes=72 * 60)
    def on_start(self):
        for city in self.city_list:
            for position in self.position_list:
                # e_102 .. e_107 appear to be the site's experience-level
                # filter codes (hence the work_year variable name)
                for work_year in range(102, 108):
                    crawl_url = 'https://www.zhipin.com/{city}/e_{workyear}/?query={position}'.format(
                        city=city, workyear=work_year, position=position)
                    time.sleep(1)
                    self.crawl(url=crawl_url, callback=self.index_page,
                               proxy=random.choice(self.proxy))
    def index_page(self, response):
        # Fan out to the first 10 result pages of each search URL
        url = response.url
        page_num = 1
        while page_num <= 10:
            crawl_url = url + "&page=" + str(page_num)
            time.sleep(1)
            self.crawl(url=crawl_url, callback=self.detail_page,
                       proxy=random.choice(self.proxy))
            page_num += 1
    @config(priority=2)
    def detail_page(self, response):
        page = response.etree
        job_list = []
        # One <li> per job posting in the result list
        content_list = page.xpath("//div[@class='job-list']/ul/li")
        for each in content_list:
            # Job title
            position_name = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/h3[@class='name']/a/div[@class='job-title']/text()")[0]
            # Salary
            salary = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/h3[@class='name']/a/span/text()")[0]
            # Location, required experience, and required education share one <p>
            primary_info = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/p//text()")
            city = primary_info[0]
            experience = primary_info[1]
            education = primary_info[2]
            # Company name
            company = each.xpath("./div[@class='job-primary']/div[@class='info-company']/div[@class='company-text']/h3[@class='name']/a/text()")[0]
            company_info = each.xpath("./div[@class='job-primary']/div[@class='info-company']/div[@class='company-text']/p//text()")
            if len(company_info) == 3:
                # Industry field / funding round / company size
                industry_field, rounds, scale = company_info
            else:
                # Some companies omit the funding round
                industry_field = company_info[0]
                rounds = ''
                scale = company_info[1]
            job = {
                "city": city,
                "position_name": position_name,
                "salary": salary,
                "experience": experience,
                "education": education,
                "company": company,
                "industry_field": industry_field,
                "rounds": rounds,
                "scale": scale,
                "crawl_date": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            }
            job_list.append(job)
        return job_list
    # Write the crawled results into the database
    def on_result(self, result):
        if not result:
            return
        sql = SQL()
        for res in result:
            sql.insert('boss_original', **res)
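Note that pyspider.libs.WebRequest is not part of stock pyspider; it is a helper module the author dropped into the pyspider source tree to supply request headers and a proxy pool. The module itself does not appear in the post, so the following is only a minimal sketch of what header() and get_home_proxy() might look like; the User-Agent strings and proxy addresses are placeholder assumptions.

# WebRequest.py -- hypothetical sketch, not the author's actual module
import random

# Sample desktop User-Agent strings (placeholders; use your own pool)
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
]


def header():
    """Build request headers with a randomly chosen User-Agent."""
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Referer": "https://www.zhipin.com/",
    }


def get_home_proxy():
    """Return a list of proxy addresses for self.crawl(..., proxy=...).

    These are placeholders; substitute a real proxy pool.
    """
    return [
        "127.0.0.1:8888",
        "127.0.0.1:8889",
    ]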
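Likewise, pyspider.database.mysql.mysqldb is a user-added module, a pattern common in pyspider-to-MySQL tutorials: on_result calls SQL().insert(table, **row), which maps the dict keys to column names. Below is a minimal sketch using pymysql, assuming a local boss database whose boss_original table has one column per key in the job dict (city, position_name, salary, and so on); the connection parameters are assumptions.

# mysqldb.py -- sketch of the tutorial-style MySQL helper, assuming pymysql
import pymysql


class SQL:
    def __init__(self):
        # Connection parameters are assumptions; adjust to your environment
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='root', db='boss',
                                    charset='utf8mb4')

    def insert(self, table, **values):
        """Insert one row; column names come from the keyword arguments."""
        columns = ', '.join(values)
        placeholders = ', '.join(['%s'] * len(values))
        sql = 'INSERT INTO {} ({}) VALUES ({})'.format(
            table, columns, placeholders)
        with self.conn.cursor() as cursor:
            cursor.execute(sql, tuple(values.values()))
        self.conn.commit()

With a helper like this, sql.insert('boss_original', **res) in on_result expands each crawled record into a parameterized INSERT statement.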