# 欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
# 您现在的位置是: 首页
#
# Python爬取智联招聘信息 (Python crawler for Zhaopin job listings)
#
# 程序员文章站 2022-05-09 17:45:06
# ...
# 代码如下 (code follows)
import requests
import json
from lxml import etree



# Browser-like User-Agent so the zhaopin.com API/detail pages accept our requests.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,'
        ' like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    ),
}

def url_index(pages=12, page_size=90):
    """Build the list of paged search-API URLs.

    BUG FIX: the original formatted the raw page number (0..11) into the
    ``start`` offset while requesting ``pageSize=90``, so every URL pointed
    at an overlapping slice of the first results; the offset must advance by
    ``page_size`` for each page.

    Args:
        pages: number of result pages to request (default 12, as before).
        page_size: results per page (default 90, as before).

    Returns:
        List of ``pages`` fully-formatted URL strings.
    """
    base = 'https://fe-api.zhaopin.com/c/i/sou?start={}&pageSize={}'
    return [base.format(page * page_size, page_size) for page in range(pages)]

def _piece_to_yuan(piece):
    """Convert one salary piece like '10K' or '1.5K' to a yuan string ('10000', '1500').

    Falls back to the original literal 'K' -> '000' substitution when the piece
    is not a plain number (e.g. '10K以上').
    """
    try:
        return str(int(float(piece.replace('K', '')) * 1000))
    except ValueError:
        return piece.replace('K', '000')


def salary_index(salary):
    """Parse a salary string like '10K-15K' into a [low, high] pair in yuan.

    BUG FIX: the original expanded 'K' to '000' and then patched decimals with
    ``replace('.000', '00')``, which only matches values like '1.0K'; '1.5K'
    became '1.5000' instead of '1500'. Converting each piece numerically
    handles any decimal correctly.

    Args:
        salary: raw salary text from the API (may be empty or non-numeric,
            e.g. '面议').

    Returns:
        ``[low, high]`` as strings; ``['', '']`` when no 'K' amount is present.
        A single amount ('10K') is returned as both bounds.
    """
    if 'K' not in salary:
        return ['', '']
    pieces = salary.split('-') if '-' in salary else [salary, salary]
    return [_piece_to_yuan(piece) for piece in pieces]
def go_index(url):
    """Request one page of the search API and extract the job fields we keep.

    BUG FIX: ``requests.post(url, headers)`` passed the header dict
    positionally, which ``requests.post`` treats as the ``data=`` body — the
    User-Agent was never actually sent. It must go through ``headers=``.

    Args:
        url: a search-API URL produced by ``url_index``.

    Returns:
        A list of dicts, one per job posting, with the flattened fields
        (city, company, salary bounds, detail-page URL, ...).
    """
    response = requests.post(url, headers=headers)
    loads = json.loads(response.content.decode())  # decode JSON payload into a dict

    json_list = []
    for item in loads['data']['results']:
        # salary comes as text like '10K-15K'; split into numeric bounds
        salarys = salary_index(item.get('salary', ''))

        data = {
            'city_name': item.get('city', dict()).get('display', ''),  # city
            'com_name': item.get('company', dict()).get('name', ''),  # company name
            'com_size': item.get('company', dict()).get('size', dict()).get('name', ''),  # company headcount
            'com_type': item.get('company', dict()).get('type', dict()).get('name', ''),  # company ownership type
            'job_name': item.get('jobName', ''),  # job title
            'job_tag': item.get('jobTag', dict()).get('searchTag', ''),  # perks/benefits tags
            'timeState': item.get('timeState', ''),  # posting status: newest / recent / open
            'low_salary': salarys[0],  # salary lower bound
            'higt_salary': salarys[1],  # salary upper bound
            'positionURL': item.get('positionURL', ''),  # detail-page URL
        }
        json_list.append(data)

    return json_list
def del_json(json_list):
    """Fetch each job's detail page and add the planned hiring head-count.

    BUG FIXES:
    - ``requests.get(url, headers)`` passed the header dict positionally,
      which ``requests.get`` treats as ``params=`` — send it as ``headers=``.
    - ``html_index[3]`` raised IndexError whenever the detail page was short,
      blocked, or redirected; guard the index so ``size`` stays ''.

    Args:
        json_list: job dicts from ``go_index`` (each must have 'positionURL').

    Returns:
        The same dicts, mutated in place: 'size' added (list of text nodes, or
        '' when unavailable) and 'positionURL' removed.
    """
    end_list = []
    for item in json_list:
        url = item['positionURL']
        item['size'] = ''
        response = requests.get(url, headers=headers)
        html = etree.HTML(response.content.decode())
        html_index = html.xpath('//ul[@class="summary-plane__info"]/li')
        # 4th <li> holds the head-count; pages without it keep size == ''
        if len(html_index) > 3:
            item['size'] = html_index[3].xpath('./text()')
        del item['positionURL']
        print(item)
        end_list.append(item)
    return end_list
if __name__ == '__main__':  # main logic
    # Build the page URLs, scrape every search page, then enrich each job
    # with the head-count from its detail page.
    json_list = []
    for url in url_index():
        json_list += go_index(url)
    end_list = del_json(json_list)

    try:
        # BUG FIX: 'a+' appended a fresh JSON document on every run, leaving
        # zl2.txt with concatenated documents that no JSON parser accepts —
        # overwrite with 'w' so the file always holds one valid document.
        with open('zl2.txt', 'w', encoding='utf-8') as f:
            json.dump(end_list, f, ensure_ascii=False)
    except Exception as e:
        print(e)