
Zhaopin (智联招聘) Job Crawler

程序员文章站 2022-05-09 17:41:34
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import json
import time
import Against_Reptilia_solve  # local anti-bot helper (not shown in the article); provides get_user_agent()
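# Against_Reptilia_solve is a local module the article does not include. A
# minimal sketch of what it might contain (an assumption based solely on how
# get_user_agent() is used below) -- save as Against_Reptilia_solve.py next
# to this script:
#
#     import random
#
#     USER_AGENTS = [
#         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
#         '(KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
#         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
#         '(KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
#     ]
#
#     def get_user_agent():
#         # rotate User-Agents so successive requests look less uniform
#         return random.choice(USER_AGENTS)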

url = "https://fe-api.zhaopin.com/c/i/sou"
params = {  # query parameters for the search API
    'start': '90',            # result offset; overwritten per page in the loop below
    'pageSize': '90',         # results per page
    'cityId': '489',          # city id ('489' appears to select nationwide)
    'workExperience': '-1',   # -1 means no filter, here and below
    'education': '-1',
    'companyType': '-1',
    'employmentType': '-1',
    'jobWelfareTag': '-1',
    'kw': 'python',           # search keyword
    'kt': '3',
}
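# Quick sanity check of the response shape before running the full crawl
# (a sketch; the endpoint may have since added cookie or signature checks):
#   r = requests.get(url, params=params, headers={'User-Agent': 'Mozilla/5.0'})
#   print(r.json()['data']['results'][0].keys())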
headers_choice = Against_Reptilia_solve.get_user_agent()  # pick a random User-Agent
headers = {  # request headers
    'User-Agent': headers_choice
}
url1 = "https://jobs.zhaopin.com/%s.htm"  # detail-page URL template; %s is the posting id


class Spider:
    def __init__(self):
        self.word = input("Enter a search keyword: ")
        page = input("Enter the number of pages: ")
        # self.word = 'python'
        # page = 2
        params['kw'] = self.word  # set the search keyword
        self.df = DataFrame()  # accumulates one row per job posting
        self.columns = ['Job Title', 'Salary', 'Update Date', 'Location', 'Company',
                        'Openings', 'Education', 'Experience', 'Responsibilities']
        for i in range(0, int(page) * 90, 90):  # offsets 0, 90, 180, ... (90 results per page)
            print(i)
            params['start'] = i  # result offset for this page
            try:
                # try/except keeps the crawl going if a single request fails
                r = requests.get(url, params=params, headers=headers, timeout=10)
            except requests.RequestException:
                continue
            r.encoding = 'utf-8'  # force UTF-8 decoding
            time.sleep(0.5)  # throttle requests to reduce the risk of an IP ban
            print(time.time())  # timestamp as a rough progress indicator
            self.get_data(r.text)  # everything above issues the request; this parses the response

    # Salary parsing helper: turns a range such as '10k-15k' into ['10', '15']
    def salary_solve(self, salary):
        now_salary_list = []
        if '薪资面议' in salary:
            pass  # '薪资面议' means "salary negotiable"; no numeric range to extract
        else:
            # split on '-', e.g. '10k-15k' -> ['10k', '15k']
            salary_heng = salary.split('-')
            for salary_k in salary_heng:
                # strip the trailing 'k': '10k'.split('k') -> ['10', '']
                salary_fin = salary_k.split('k')
                now_salary_list.append(salary_fin[0])
        return now_salary_list
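    # Example behaviour (assuming the API's salary format, e.g. '10k-15k'):
    #   self.salary_solve('10k-15k')  -> ['10', '15']
    #   self.salary_solve('薪资面议')  -> []   (negotiable; nothing to extract)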

    def get_data(self, text):
        js = json.loads(text)
        print(js)  # debug: dump the raw payload
        # collected numeric salary endpoints, as strings
        salary_list = []
        for i in js['data']['results']:
            idd = i['number']  # posting id
            title = i['jobName']  # job title
            # print(title)
            salary = i['salary']  # salary range string, e.g. '10k-15k'
            print(salary)
            # parse the salary string into its numeric endpoints
            every_avg_salary = self.salary_solve(salary)
            for sl in every_avg_salary:
                if sl != '-':
                    salary_list.append(sl)  # entries are numeric strings such as '10'
            updateDate = i['endDate']  # last update date
            place = i['city']['display']  # location
            company = i['company']['name']  # company name
            recruitCount = i['recruitCount']  # number of openings
            eduLevel = i['eduLevel']['name']  # required education
            workingExp = i['workingExp']['name']  # required experience
            u1 = url1 % idd  # build the detail-page URL for this posting
            # fetch the second-level detail page
            zhize = self.get_zhize(u1)  # scrapes the job description from the detail page
            data = [title, salary, updateDate, place, company, recruitCount, eduLevel,
                    workingExp, zhize]  # one row of fields
            # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
            self.df = pd.concat([self.df, DataFrame([data], columns=self.columns)],
                                ignore_index=True)
            # print(title)  # uncomment to verify rows as they arrive
            self.df.to_excel("%s.xlsx" % self.word)  # save every row so a crash still keeps partial data
        print(salary_list)
        return salary_list
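    # The returned salary_list can feed quick statistics. A sketch, assuming
    # every entry parses as a number (true for well-formed '10k-15k' strings):
    #   nums = [float(s) for s in salary_list]
    #   if nums:
    #       print('average salary: %.1fk' % (sum(nums) / len(nums)))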

    def get_zhize(self, link):
        try:
            r = requests.get(link, headers=headers, timeout=10)  # fetch the detail page
        except requests.RequestException:
            return "N/A"
        r.encoding = 'utf-8'  # force UTF-8 decoding
        soup = BeautifulSoup(r.text, 'lxml')  # parse the HTML
        # with open('soup_test.html', 'w') as f:
        #     f.write(str(soup))  # debug: dump the fetched page
        # print(soup)
        # the class name (misspelling included) is taken from the site's own markup
        matches = soup.select('div.describtion__detail-content')
        if not matches:
            return "N/A"  # layout changed or the posting was removed
        return matches[0].text.strip()  # job description text


if __name__ == "__main__":
    Spider()  # constructing the Spider runs the entire crawl
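# Usage: run the script, then enter a keyword (e.g. python) and a page count.
# Each page requests 90 postings; results are written to <keyword>.xlsx.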