欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页

python爬虫之爬取拉勾网职位信息

程序员文章站 2022-05-09 17:41:16
...
import json
import time
import requests
import csv

# Column titles written as the first row of the output CSV.
_HEADER_ROW = (
    "公司", "职位名称", "公司简称", "公司规模", "公司行业", "融资",
    "福利", "职位类型", "第二职位", "第三职位", "技能", "职位发布时间",
    "城市", "区域", "薪水", "工作年限", "学历", "职位优势",
)

# 1. Open the output file. newline='' leaves line endings to the csv module,
#    as the csv documentation recommends for files handed to csv.writer.
f = open('lgposition.csv', mode='w', newline='', encoding='utf-8')
# 2. Wrap the file object in a csv writer.
csv_writer = csv.writer(f)
# 3. Emit the header row.
csv_writer.writerow(_HEADER_ROW)

def main(pages: int, position: str) -> None:
    """Fetch Lagou search-result pages and hand each page to extractPositionData.

    For every page a fresh session first loads the HTML search page so that
    the site's cookies are stored in the session, then posts to the Ajax
    endpoint to obtain that page of results as JSON.

    Args:
        pages: Number of the last result page to fetch; pages 1 through
            ``pages`` (inclusive) are requested.
        position: Search keyword, sent as the ``kd`` form field.

    Raises:
        requests.RequestException: If a request fails or exceeds the
            3-second timeout.
        ValueError: If the Ajax response body is not valid JSON.
        KeyError: If the JSON lacks the ``content`` / ``positionResult`` /
            ``result`` path.
    """
    # Search page; it is only requested so the session picks up cookies.
    # NOTE(review): the keyword in this URL and in the Referer header is
    # hard-coded to "python" and does not follow `position` - confirm intent.
    url1 = 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput='
    # Ajax endpoint that returns the position list as JSON.
    url = "https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false"
    # Request headers sent with both requests.
    headers = {
        'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_python/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput=',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Host': 'www.lagou.com',
    }
    # Paging is driven by the 'pn' form field. The upper bound is inclusive so
    # that main(30, ...) really requests 30 pages.
    for page in range(1, pages + 1):
        data = {
            'first': 'false',
            'pn': page,
            'kd': position,
        }
        # A new session per page gives every Ajax call freshly issued cookies;
        # the context manager releases the session's connections afterwards.
        with requests.Session() as s:
            s.get(url=url1, headers=headers, timeout=3)
            # The session resends the cookies it just received on its own.
            respon = s.post(url=url, headers=headers, data=data, timeout=3)
        # Pause between pages to keep the request rate low.
        time.sleep(3)
        results = json.loads(respon.text)['content']['positionResult']['result']
        extractPositionData(results)

def _join_labels(labels):
    """Return the label strings joined by commas; '' for None or an empty list."""
    return ','.join(labels) if labels else ''


def extractPositionData(results, writer=None):
    """Write one CSV row per position record.

    Args:
        results: Sequence of position dicts. ``None`` or an empty sequence
            writes nothing.
        writer: Object exposing ``writerow`` (for example a ``csv.writer``).
            When omitted, the module-level ``csv_writer`` is used.

    Raises:
        KeyError: If a record lacks one of the fields read below.
    """
    if not results:
        return
    if writer is None:
        # Looked up at call time so callers that pass their own writer do not
        # depend on the module-level one.
        writer = csv_writer
    for result in results:
        # 4. Write the record; the two label lists are flattened to a single
        #    comma-separated cell (no trailing separator, None treated as empty).
        writer.writerow([
            result['companyFullName'],
            result['positionName'],
            result['companyShortName'],
            result['companySize'],
            result['industryField'],
            result['financeStage'],
            _join_labels(result['companyLabelList']),
            result['firstType'],
            result['secondType'],
            result['thirdType'],
            _join_labels(result['skillLables']),
            result['createTime'],
            result['city'],
            result['district'],
            result['salary'],
            result['workYear'],
            result['education'],
            result['positionAdvantage'],
        ])

if __name__ == '__main__':
    try:
        # Crawl the result pages for the keyword "python".
        main(30, 'python')
    finally:
        # 5. Close the file even if the crawl raises, so the rows written so
        #    far are flushed to disk.
        f.close()