Python爬虫之爬取拉勾网职位信息
程序员文章站
2022-05-09 17:41:16
...
import json
import time
import requests
import csv
# Column titles of the output CSV, in the same order in which the data rows
# are written later on.
_CSV_HEADER = [
    "公司",
    "职位名称",
    "公司简称",
    "公司规模",
    "公司行业",
    "融资",
    "福利",
    "职位类型",
    "第二职位",
    "第三职位",
    "技能",
    "职位发布时间",
    "城市",
    "区域",
    "薪水",
    "工作年限",
    "学历",
    "职位优势",
]

# Output file shared by the whole script; newline='' leaves line-ending
# handling to the csv module.
f = open('lgposition.csv', mode='w', encoding='utf-8', newline='')
# CSV writer bound to the output file.
csv_writer = csv.writer(f)
# The first row of the file is the column header.
csv_writer.writerow(_CSV_HEADER)
def main(pages, position):
    """Fetch Lagou job listings page by page and write them to the CSV.

    For every result page a fresh session first loads the HTML search page
    so that it receives cookies, then POSTs to the Ajax endpoint. The
    ``content.positionResult.result`` list of the JSON reply is passed to
    ``extractPositionData``.

    Args:
        pages: Number of result pages to fetch; pages ``1`` through
            ``pages`` (inclusive) are requested.
        position: Search keyword, sent as the ``kd`` form field.

    Raises:
        requests.RequestException: If a request fails or times out.
        ValueError: If the Ajax reply is not valid JSON.
        KeyError: If the reply lacks the ``content`` / ``positionResult`` /
            ``result`` structure.
    """
    # HTML search page; requested first so the session picks up cookies.
    # NOTE(review): this URL and the Referer below are hard-coded to the
    # "python" search even when `position` is a different keyword -- confirm
    # whether the site needs them to match.
    url1 = 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput='
    # Ajax endpoint that returns the listings as JSON.
    url = "https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false"
    # Request headers sent with both requests.
    headers = {
        'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_python/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput=',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Host': 'www.lagou.com',
    }
    # The form data controls pagination. Page numbers start at 1 and the
    # upper bound is inclusive, so exactly `pages` pages are fetched.
    for page in range(1, pages + 1):
        data = {
            'first': 'false',
            'pn': page,
            'kd': position,
        }
        # One session per page; the context manager closes it (and its
        # connections) as soon as both requests are done.
        with requests.Session() as s:
            s.get(url=url1, headers=headers, timeout=3)
            # The session automatically re-sends the cookies it received
            # from the page request above.
            respon = s.post(url=url, headers=headers, data=data, timeout=3)
        # Wait before continuing (presumably to avoid hammering the server).
        time.sleep(3)
        results = json.loads(respon.text)['content']['positionResult']['result']
        extractPositionData(results)
def _join_labels(labels):
    """Return every label followed by a comma, or '' for None/empty input."""
    # Each label (including the last one) is followed by a comma, which
    # keeps the cell format the script has always written.
    return ''.join(label + ',' for label in (labels or ()))


def extractPositionData(results, writer=None):
    """Write one CSV row for every job posting in *results*.

    Args:
        results: Sequence of posting dicts. ``None`` or an empty sequence
            writes nothing.
        writer: Object with a ``writerow`` method that receives each row.
            Defaults to the module-level ``csv_writer``.

    Raises:
        KeyError: If a posting lacks one of the keys read below.
    """
    if not results:
        return
    if writer is None:
        writer = csv_writer
    for result in results:
        # The two list-valued fields are flattened into a single cell each;
        # a missing (None) list is treated like an empty one.
        company_labels = _join_labels(result['companyLabelList'])
        skill_labels = _join_labels(result['skillLables'])
        # Field order must match the header row written at file creation.
        writer.writerow([
            result['companyFullName'],
            result['positionName'],
            result['companyShortName'],
            result['companySize'],
            result['industryField'],
            result['financeStage'],
            company_labels,
            result['firstType'],
            result['secondType'],
            result['thirdType'],
            skill_labels,
            result['createTime'],
            result['city'],
            result['district'],
            result['salary'],
            result['workYear'],
            result['education'],
            result['positionAdvantage'],
        ])
if __name__ == '__main__':
    try:
        # Run the crawl with page argument 30 for the keyword "python".
        main(30, 'python')
    finally:
        # Always close the CSV file, even when the crawl fails part-way,
        # so the rows written so far are flushed to disk.
        f.close()
下一篇: 2.爬取智联招聘的职位信息1.0