python网络爬虫案例
程序员文章站
2022-06-05 18:46:41
...
python爬取51Job招聘信息
# -*-coding:utf-8-*-
import requests
import pandas as pd
from pprint import pprint
from lxml import etree
import time
import warnings
import json
warnings.filterwarnings("ignore")
"""
这个文件是最完整的代码,我们需要进行翻页操作,进行所有数据的爬取。
这里的翻页操作很简单,观察下面的url,不同页面只有一个地方不同。
https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE,2,1.html?
https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE,2,2.html?
https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE,2,3.html?
"""
# Crawl 1500 result pages from 51job. For every listing on a page we also
# visit its job-detail page, then append that page's rows to job_info.csv
# immediately, so a crash mid-run loses at most one page of data.
LIST_URL_TEMPLATE = (
    "https://search.51job.com/list/000000,000000,0000,00,9,99,"
    "%25E6%2595%25B0%25E6%258D%25AE,2,{page}.html?"
)
# Browser-like User-Agent so the site does not reject the request outright.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}


def _fetch_dom(url):
    """GET *url*, decode the response as gbk (51job's page encoding) and
    return the parsed lxml DOM.

    A timeout is set so one hung connection cannot stall the whole crawl.
    """
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.encoding = "gbk"
    return etree.HTML(resp.text)


def _scrape_detail(detail_url):
    """Scrape one job-detail page.

    Returns a 5-tuple:
      (experience/education blob, job description, company type,
       company size, industry)
    The first two have all whitespace stripped out; proper splitting of the
    combined experience/education field is deferred to a later cleaning step.
    """
    dom = _fetch_dom(detail_url)
    # Experience and education share one field on the page; clean later.
    random_all = dom.xpath('//div[@class="tHeader tHjob"]//div[@class="cn"]/p[@class="msg ltype"]/text()')
    job_describe = dom.xpath('//div[@class="tBorderTop_box"]//div[@class="bmsg job_msg inbox"]/p/text()')
    company_type = dom.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[1]/@title')
    company_size = dom.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[2]/@title')
    industry = dom.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[3]/@title')
    cleaned_all = ["".join(item.split()) for item in random_all]
    cleaned_describe = ["".join(item.split()) for item in job_describe]
    return cleaned_all, cleaned_describe, company_type, company_size, industry


def _scrape_page(page):
    """Scrape listing page number *page* and return a DataFrame of its jobs."""
    dom = _fetch_dom(LIST_URL_TEMPLATE.format(page=page))
    # Per-listing fields taken straight off the search-result table.
    job_name = dom.xpath('//div[@class="dw_table"]/div[@class="el"]//p/span/a[@target="_blank"]/@title')
    company_name = dom.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t2"]/a[@target="_blank"]/@title')
    address = dom.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t3"]/text()')
    salary_cells = dom.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t4"]')
    salary = [cell.text for cell in salary_cells]
    release_time = dom.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t5"]/text()')
    # Links to the second-level (detail) pages.
    deep_url = dom.xpath('//div[@class="dw_table"]/div[@class="el"]//p/span/a[@target="_blank"]/@href')

    random_all_col = []
    describe_col = []
    type_col = []
    size_col = []
    industry_col = []
    # NOTE: the original used `i` here, shadowing the page counter.
    for link in deep_url:
        blob, describe, ctype, csize, industry = _scrape_detail(link)
        random_all_col.append(blob)
        describe_col.append(describe)
        type_col.append(ctype)
        size_col.append(csize)
        industry_col.append(industry)
        # Pause between detail requests to avoid anti-crawler throttling.
        time.sleep(1)

    df = pd.DataFrame()
    df["岗位名称"] = job_name
    df["公司名称"] = company_name
    df["工作地点"] = address
    df["工资"] = salary
    df["发布日期"] = release_time
    df["经验、学历"] = random_all_col
    df["公司类型"] = type_col
    df["公司规模"] = size_col
    df["所属行业"] = industry_col
    df["岗位描述"] = describe_col
    return df


if __name__ == "__main__":
    for page in range(1, 1501):
        print("正在爬取第" + str(page) + "页的数据")
        df = _scrape_page(page)
        # Append each page as soon as it is scraped; a single failed write
        # must not abort the remaining 1499 pages.
        try:
            # mode="a": pandas needs text mode — the original "ab+" (binary)
            # raises TypeError on Python 3.
            df.to_csv("job_info.csv", mode="a", header=None, index=None, encoding="gbk")
        except Exception as e:
            print("当页数据写入失败", str(e))
        time.sleep(1)
    print("数据爬取完毕,是不是很开心!!!")
下一篇: 火遍大江南北的麦片牛奶减肥,你值得拥有