Scraping 前程无忧 (51job) and 应届生 (yingjiesheng) with Scrapy: data collection + analysis
一、Overall requirements
Write a crawler in Python that scrapes job postings from recruitment sites, store the data in MongoDB, clean the stored data, analyse it, and finally visualise the results.
二、Environment
PyCharm, MongoDB, Python 3.6
三、Fields to scrape
1、Required fields: position name, salary, hiring company, work location, work experience, education requirement, job description (responsibilities), and job requirements (skills).
(1) Create a new project: scrapy startproject pa*job
(2) Generate a spider file: scrapy genspider * *.com
The generated project structure looks like this:
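A freshly generated Scrapy project typically has roughly this layout (the spider file name comes from whatever you passed to scrapy genspider):

pa*job/
├── scrapy.cfg              # project / deployment configuration
└── pa*job/
    ├── __init__.py
    ├── items.py            # item field definitions
    ├── middlewares.py
    ├── pipelines.py        # item pipelines (MongoDB storage, below)
    ├── settings.py         # project settings
    └── spiders/
        ├── __init__.py
        └── *.py            # the generated spider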
(3) Edit settings.py
BOT_NAME = 'pa*job'
SPIDER_MODULES = ['pa*job.spiders']
NEWSPIDER_MODULE = 'pa*job.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
DOWNLOAD_DELAY = 0.5
ITEM_PIPELINES = {
    'pa*job.pipelines.Pa*jobPipeline': 300,
}
(4) Write items.py
The code is as follows:
import scrapy


class Pa*jobItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    work_place = scrapy.Field()       # work location
    company_name = scrapy.Field()     # company name
    position_name = scrapy.Field()    # position name
    company_info = scrapy.Field()     # company information
    work_salary = scrapy.Field()      # salary
    release_date = scrapy.Field()     # publication date
    job_require = scrapy.Field()      # job description / requirements
    contact_way = scrapy.Field()      # contact information
    education = scrapy.Field()        # education requirement
    work_experience = scrapy.Field()  # work experience
(5) Write the spider
The key step is getting the XPath right. Each row of results sits in its own node on the list page, so we can loop over those nodes. You can press Ctrl+F in the browser's developer tools to check whether an XPath expression matches, or verify it interactively with scrapy shell as shown below.
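A quick sanity check with Scrapy's interactive shell (the selectors are the ones used in the spider below; they may need adjusting if 51job changes its page layout):

# in a terminal: scrapy shell "<list-page URL from start_urls>"
# then, inside the shell:
node_list = response.xpath("//div[@id='resultList']/div[@class='el']")
len(node_list)                                                      # number of job rows matched
node_list[0].xpath("./p/span/a/@title").extract_first()             # first position name
node_list[0].xpath("./span[@class='t4']/text()").extract_first()    # first salary string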
(6) The detail-page URL and the next-page URL are both extracted from the list page with XPath; both selectors appear in the spider code below.
The spider code is as follows:
# -*- coding: utf-8 -*-
import scrapy
from pa*job.items import Pa*jobItem


class *Spider(scrapy.Spider):
    name = '*'
    allowed_domains = ['51job.com']
    start_urls = ['https://search.51job.com/list/000000,000000,0130%252C7501%252C7506%252C7502,01%252C32%252C38,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']

    def parse(self, response):
        # XPath of the node that holds each row of results
        node_list = response.xpath("//div[@id='resultList']/div[@class='el']")
        # once the for loop finishes, the current page is done and we move on to the next page
        for node in node_list:
            item = Pa*jobItem()
            # position name
            item["position_name"] = node.xpath("./p/span/a/@title").extract_first()
            # company name
            item["company_name"] = node.xpath("./span[@class='t2']/a/@title").extract_first()
            # work location
            item["work_place"] = node.xpath("./span[@class='t3']/text()").extract_first()
            # salary
            item["work_salary"] = node.xpath("./span[@class='t4']/text()").extract_first()
            # publication date
            item["release_date"] = node.xpath("./span[@class='t5']/text()").extract_first()
            # detail-page URL
            detail_url = node.xpath("./p/span/a/@href").extract_first()
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item})
        # next page
        next_url = response.xpath("//div[@class='p_in']//li[@class='bk'][2]/a/@href").extract_first()
        # if there is no next-page URL, stop here
        if not next_url:
            return
        yield scrapy.Request(url=next_url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta["item"]
        # job description / requirements
        item["job_require"] = response.xpath("//div[@class='bmsg job_msg inbox']/p/text()").extract()
        # contact information
        item["contact_way"] = response.xpath("//div[@class='bmsg inbox']/a/text()").extract()
        # company information
        item["company_info"] = response.xpath("//div[@class='tmsg inbox']/text()").extract()
        # education requirement
        item["education"] = response.xpath("//div[@class='tHeader tHjob']/div/div/p[2]/text()").extract()[2]
        # work experience
        item["work_experience"] = response.xpath("//div[@class='tHeader tHjob']/div/div/p[2]/text()").extract()[1]
        yield item
(7) To store the data in MongoDB, we write pipelines.py
The code is as follows:
from pymongo import MongoClient


class Pa*jobPipeline(object):
    def open_spider(self, spider):
        # connect to the local MongoDB instance and database
        self.db = MongoClient('localhost', 27017).pa*job_db
        # connect to (or create) the collection
        self.collection = self.db.pa*job_collection

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item
Here I stored the data-collection, data-development and data-analysis jobs in separate databases/collections, one per keyword; one way of wiring that up is sketched below.
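One way to keep the per-keyword routing in a single pipeline is to let each spider declare its target database and collection and read them here. This is only a sketch; the mongo_db / mongo_collection attribute names are illustrative, not part of the original project:

from pymongo import MongoClient


class Pa*jobPipeline(object):
    def open_spider(self, spider):
        client = MongoClient('localhost', 27017)
        # each spider declares which database/collection its items go to, e.g.
        # mongo_db = 'pa*jobcj_db', mongo_collection = 'pa*jobcj_collection'
        self.collection = client[spider.mongo_db][spider.mongo_collection]

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item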
Run the spider: scrapy crawl *
Now let's look at what ended up in MongoDB and what the documents in each collection look like. In total, roughly 20,000+ records were scraped.
四、Analysis
(1) Compute the average, highest and lowest salary for positions such as “数据分析” (data analysis), “大数据开发工程师” (big data development engineer) and “数据采集” (data collection), and present the results as a bar chart.
Here I query the data directly in MongoDB: first compute the highest, lowest and average salary for the data-collection, data-analysis and data-development collections, then draw the chart.
The code is as follows:
import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client['pa*jobsjfx_db']
table = db['pa*jobsjfx']
# keep only postings whose salary is quoted as "万/月" so the ranges are comparable
data = table.find({"$and": [{"position_name": {"$regex": "大数据分析"}},
                            {"work_salary": {"$regex": "万/月"}}]},
                  {"work_salary"})
li = [i["work_salary"].split("-") for i in data]  # e.g. "1.5-2万/月" -> ["1.5", "2万/月"]
max_salary = []
min_salary = []
avg_salary = []
for f in li:
    low = float(f[0])                       # lower bound of the range
    high = float(f[1].replace("万/月", ""))  # upper bound of the range
    min_salary.append(low)
    max_salary.append(high)
    avg_salary.append((low + high) / 2)
print("max={}".format(max(max_salary)))
print("min={}".format(min(min_salary)))
print("avg={:.2f}".format(sum(avg_salary) / len(avg_salary)))
import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client['pa*jobkaifa_db']
table = db['pa*jobkaifa_collection']
data = table.find({"$and": [{"position_name": {"$regex": "数据开发"}},
                            {"work_salary": {"$regex": "万/月"}}]},
                  {"work_salary"})
li = [i["work_salary"].split("-") for i in data]
max_salary = []
min_salary = []
avg_salary = []
for f in li:
    low = float(f[0])
    high = float(f[1].replace("万/月", ""))
    min_salary.append(low)
    max_salary.append(high)
    avg_salary.append((low + high) / 2)
print("max={}".format(max(max_salary)))
print("min={}".format(min(min_salary)))
print("avg={:.2f}".format(sum(avg_salary) / len(avg_salary)))
import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client['pa*jobcj_db']
table = db['pa*jobcj_collection']
data = table.find({"$and": [{"position_name": {"$regex": "数据采集"}},
                            {"work_salary": {"$regex": "万/月"}}]},
                  {"work_salary"})
li = [i["work_salary"].split("-") for i in data]
max_salary = []
min_salary = []
avg_salary = []
for f in li:
    low = float(f[0])
    high = float(f[1].replace("万/月", ""))
    min_salary.append(low)
    max_salary.append(high)
    avg_salary.append((low + high) / 2)
print("max={}".format(max(max_salary)))
print("min={}".format(min(min_salary)))
print("avg={:.2f}".format(sum(avg_salary) / len(avg_salary)))
Drawing the chart:
# coding=gbk
from pyecharts import options as opts
from pyecharts.charts import Bar

post_list = ["数据分析", "大数据开发工程师", "数据采集"]
avg_list = [0.96, 1.28, 1.10]
max_list = [2.5, 6, 1.2]
min_list = [1.2, 1, 1]
bar = (
    Bar(
        init_opts=opts.InitOpts(width="1600px", height="900px"),
    )
    .set_global_opts(
        # chart title and subtitle (the salary figures above are in 万/月)
        title_opts=opts.TitleOpts(title="行业薪资", subtitle="单位 万/月"),
        # rotate the x-axis labels
        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 30}),
        # show the toolbox
        toolbox_opts=opts.ToolboxOpts()
    )
    # bind the data
    .add_xaxis(post_list)             # categories on the x axis
    .add_yaxis('平均工资', avg_list)  # average salary
    .add_yaxis('最高工资', max_list)  # highest salary
    .add_yaxis('最低工资', min_list)  # lowest salary
)
bar.render("数据行业薪资.html")
(2) Count how many “数据分析”, “大数据开发工程师”, “数据采集” and other big-data-related postings there are in Chengdu (成都), Beijing (北京), Shanghai (上海), Guangzhou (广州) and Shenzhen (深圳), and present the results as a pie chart.
First count the 数据采集 (data collection) postings in each city, then draw the chart.
The code is as follows:
import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client['pa*jobcj_db']
tab = db['pa*jobcj_collection']
data = tab.find({"$and": [{"position_name": {"$regex": "数据采集"}},
                          {"work_place": {"$regex": "北京"}}]},
                {"work_place"})
li = [i["work_place"] for i in data]
print("岗位数={}".format(len(li)))
The other keyword/city combinations work the same way; a sketch that loops over all five cities is shown below. Once the counts are in hand, draw the pie chart:
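Rather than editing and re-running the query for each city by hand, the five counts can be collected in one loop. This is a sketch that assumes the same database and collection names as the query above (count_documents requires pymongo 3.7 or newer):

import pymongo

client = pymongo.MongoClient('localhost', 27017)
tab = client['pa*jobcj_db']['pa*jobcj_collection']
cities = ['成都', '北京', '上海', '广州', '深圳']
counts = []
for city in cities:
    # postings whose title mentions 数据采集 and whose location mentions the city
    n = tab.count_documents({"$and": [{"position_name": {"$regex": "数据采集"}},
                                      {"work_place": {"$regex": city}}]})
    counts.append(n)
    print("{}: {}".format(city, n))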
# run in a Jupyter notebook
%matplotlib inline
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']   # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False
sd = [5, 24, 24, 19, 14]                       # posting counts per city
labels = ['成都', '北京', '上海', '广州', '深圳']
plt.pie(x=sd, labels=labels, autopct='%.1f%%')
plt.title("数据采集")
plt.show()
(3) Compute the salary level (average, highest and lowest) of big-data-related positions that ask for 1-3 years of experience, and present it as a bar chart; a sketch of the query is shown below.
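The salary aggregation above can be reused by adding a condition on work_experience. This is only a sketch; it assumes the experience strings scraped from the detail pages contain the literal text "1-3年", which may need adjusting to the values actually stored in your collection:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
tab = client['pa*jobkaifa_db']['pa*jobkaifa_collection']
data = tab.find({"$and": [{"work_experience": {"$regex": "1-3年"}},
                          {"work_salary": {"$regex": "万/月"}}]},
                {"work_salary"})
ranges = [i["work_salary"].split("-") for i in data]
lows = [float(f[0]) for f in ranges]
highs = [float(f[1].replace("万/月", "")) for f in ranges]
avgs = [(lo + hi) / 2 for lo, hi in zip(lows, highs)]
print("max={}".format(max(highs)))
print("min={}".format(min(lows)))
print("avg={:.2f}".format(sum(avgs) / len(avgs)))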
(7) Optional task (choose one of two): build a word cloud from the skill requirements of the “数据采集” positions.
First export the job_require field from MongoDB with the following command:
mongoexport -d pa*jobkaifa_db -c pa*jobkaifa_collection -f job_require --type=json -o D:\cjdata1.json
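The export is line-delimited JSON; before feeding it to the word cloud it helps to pull out just the requirement text and drop duplicates. A small script can do that (a sketch; the input and output file names D:\cjdata1.json and w.txt are the ones used in this post, and it assumes job_require was stored as a list of strings):

import json

lines = set()
# read the mongoexport output (one JSON document per line)
with open(r"D:\cjdata1.json", "r", encoding="utf-8") as f:
    for row in f:
        doc = json.loads(row)
        value = doc.get("job_require", [])
        if isinstance(value, str):
            value = [value]
        for text in value:
            lines.add(text.strip())

# write the deduplicated requirement lines to the file used by the word-cloud script
with open("w.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(lines))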
Alternatively, dedupe by hand in Notepad++ and save the result as a txt file. The word-cloud code is as follows:
import jieba
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

text_road = input('path of the text file (e.g. w.txt): ')
picture_road = input('path of the mask image (e.g. 1.jpg): ')
# load the text to analyse
text = open(text_road, 'r', encoding='utf-8').read()
# segment the Chinese text with jieba
wordlist_after_jieba = jieba.cut(text, cut_all=False)
wl_space_split = " ".join(wordlist_after_jieba)
# read the mask image and convert it to a numpy array
mask = np.array(Image.open(picture_road))
# stopwords that should not appear in the word cloud
stopwords = set(STOPWORDS)
# more stopwords can be added here
stopwords.add("<br/>")
# create the word-cloud object
wc = WordCloud(
    background_color="white",
    font_path=r'C:\Windows\Fonts\simfang.ttf',
    max_words=1000,        # maximum number of words shown
    mask=mask,
    stopwords=stopwords,
    max_font_size=100      # maximum font size
)
# generate the word cloud from the segmented text
wc.generate(wl_space_split)
# build a colour scheme from the mask image
image_colors = ImageColorGenerator(mask)
# recolour the word cloud with that scheme
wc.recolor(color_func=image_colors)
# display the word cloud
plt.imshow(wc, interpolation='bilinear')
# hide the axes
plt.axis("off")
plt.show()
# save the word cloud to a file
wc.to_file('词云图.png')
The resulting word cloud looks like this:
That covers 前程无忧 (51job); now let's do the same for 应届生 (yingjiesheng.com).
As before, create a project first:
scrapy startproject payingjiesheng
scrapy genspider yingjiesheng yingjiesheng.com
The fields are the same as for 51job above.
Now write items.py; the code is as follows:
import scrapy


class PayingjieshengItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    work_info = scrapy.Field()        # short position description
    company_name = scrapy.Field()     # company name
    work_place = scrapy.Field()       # work location
    work_type = scrapy.Field()        # position type
    work_come = scrapy.Field()        # source of the posting
    position_name = scrapy.Field()    # position name
    job_require = scrapy.Field()      # job requirements
    company_info = scrapy.Field()     # company information
    release_date = scrapy.Field()     # publication date
    work_experience = scrapy.Field()  # work experience
Next, the spider; the code is as follows:
# -*- coding: utf-8 -*-
import scrapy
from payingjiesheng.items import PayingjieshengItem


class YingjieshengSpider(scrapy.Spider):
    name = 'yingjiesheng'
    allowed_domains = ['yingjiesheng.com']
    start_urls = ['https://s.yingjiesheng.com/search.php?area=&word=%E5%A4%A7%E6%95%B0%E6%8D%AE%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88&jobterm=']

    def parse(self, response):
        node_list = response.xpath("//div[@id='container']/div/ul/li")
        # once the for loop finishes, the current page is done and we move on to the next page
        for node in node_list:
            item = PayingjieshengItem()
            # item["work_info"] = node.xpath("./div/text()").extract_first()
            detail_url = node.xpath("./div/h3/a/@href").extract_first()
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item})
        next_url = response.xpath("//div[@id='container']/div[3]/a[4]/@href").extract_first()
        if not next_url:
            return
        yield scrapy.Request(url=next_url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta["item"]
        # company name
        item["company_name"] = response.xpath("//div[@class='main clearfix']/div/h1/text()").extract()
        # publication date
        item["release_date"] = response.xpath("//div[@class='info clearfix']/ol/li/u/text()").extract()
        # work location
        item["work_place"] = response.xpath("//div[@class='info clearfix']/ol/li[2]/u/text()").extract()
        # position type
        item["work_type"] = response.xpath("//div[@class='info clearfix']/ol/li[3]/u/text()").extract()
        # source of the posting
        item["work_come"] = response.xpath("//div[@class='info clearfix']/ol/li[4]/a/text()").extract()
        # position name
        item["position_name"] = response.xpath("//div[@class='info clearfix']/ol/li[5]/u/text()").extract()
        # job requirements
        item["job_require"] = response.xpath("//div[@id='wordDiv']/div/div/p/text()").extract()[1:]
        # company information
        item["company_info"] = response.xpath("//div[@class='jobIntro'][2]/text()").extract()
        yield item
Run it: scrapy crawl yingjiesheng
Then check MongoDB to confirm the data was scraped:
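A quick way to spot-check the result from Python (a sketch; the database and collection names here are hypothetical, so adjust them to whatever your pipelines.py actually writes to):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
coll = client['payingjiesheng_db']['payingjiesheng_collection']  # hypothetical names
print(coll.count_documents({}))   # how many postings were stored
print(coll.find_one())            # inspect one document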