Using requests to crawl job listings from Zhaopin (智联招聘)

Initial code

# -*- encoding: utf-8 -*-


from crawl.WebRequest import *
from crawl.mysqldb import SQL
import time, json, random, math, requests, logging, hashlib

# Configure log formatting
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(filename='zhilian.log', level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)
# Build the request header
logging.info('begin to get web request header')
# Job keywords to crawl ('大数据' = big data)
positions = ['大数据']
# City ids to crawl: Beijing, Shanghai, Shenzhen, Guangzhou, Chengdu, Hangzhou, Wuhan
# city_ids = ['530', '538', '765', '763', '801', '653', '736']
city_ids = ['801']
# Work experience codes: none, under 1 year, 1-3 years, 3-5 years, 5-10 years, 10+ years
work_exps = ['0000', '0001', '0103', '0305', '0510', '1099']
# Default request header (note: this reuses the imported function's name)
header = header()
# Fetch a list of proxy IPs
proxy_list = get_home_proxy()


def main():
    logging.info('begin to send requests')
    sql = SQL()
    latest_jobNums = sql.get_latest_jobNum('zhilian_update')
    for city_id in city_ids:
        for position in positions:
            for work_exp in work_exps:
                base_url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId={cityId}&salary=0,0' \
                           '&workExperience={workExp}&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&sortType=publish' \
                           '&kw={position}&kt=3&=0&_v=' + getParam()[0] + "&x-zp-page-request-id=" + getParam()[1]
                base_url = base_url.format(cityId=city_id, workExp=work_exp, position=position)
                header = header_zhilian(city_id, work_exp, position)
                try:
                    response = requests.get(url=base_url, headers=header, )
                    data = json.loads(response.text)
                    if data['code'] == 200:
                        resultCount = data['data']['numFound']  # number of job postings found
                        total_page = int(get_page_num(resultCount))  # total number of pages
                        if total_page != 0:
                            # start crawling
                            for page in range(0, total_page):
                                logging.info(
                                    "now it's crawling position:" + position + ",city_id:" + city_id + ",work_exp:" + work_exp + ",result_count:" + str(
                                        resultCount) + ",total_page:" + str(total_page) + ",crawling page:" + str(
                                        page + 1))
                                # get the last crawled job number for this work-experience bucket (incremental crawl)
                                latest_jobNum = latest_jobNums.get(work_exp)
                                if resultCount < 90:
                                    # parse the result data we already fetched
                                    results = get_result(latest_jobNum, data['data']['results'])
                                else:
                                    # compute the start offset for this page
                                    startIndex = page * 90
                                    if startIndex == 0:
                                        results = get_result(latest_jobNum, data['data']['results'])
                                    else:
                                        # build the paginated request URL
                                        crawl_url = base_url + "&start=" + str(startIndex)
                                        res = requests.get(url=crawl_url, headers=header, )
                                        res_data = json.loads(res.text)
                                        # parse the result data
                                        results = get_result(latest_jobNum, res_data['data']['results'])
                                job_results = results[0]
                                # flag indicating whether to stop paging
                                break_flag = results[1]
                                if job_results:
                                    # save the data to the database
                                    if page == 0:
                                        # the first record of page one becomes the stop condition for the next incremental crawl
                                        job_num = job_results[0].get('job_num')
                                        # incremental update condition
                                        update_condition = (job_num, work_exp)
                                        # save the data together with the update condition
                                        save_result(job_results, update_condition)
                                        time.sleep(random.randint(1, 5))
                                    else:
                                        save_result(job_results, ())
                                        time.sleep(random.randint(1, 5))
                                if break_flag is True:
                                    break
                except Exception as e:
                    logging.error('crawl error: %s', e)


def save_result(result, update_condition):
    if not result:
        return
    sql = SQL()
    try:
        if len(update_condition) == 0:
            for res in result:
                sql.insert('zhilian_original', **res)
            logging.info("save data success")
        else:
            sql.update_jobNum(update_condition)
            for res in result:
                sql.insert('zhilian_original', **res)
            logging.info("save data success")
    except Exception as e:
        logging.error("save data failed", e)


# Format the raw results into database records
def get_result(jobNum, results):
    crawl_results = []
    flag = False
    for result in results:
        job_num = str(result['number'])
        if jobNum is not None and jobNum == job_num:
            flag = True
            break
        else:
            crawl_results.append({
                'job_num': job_num,  # job number
                'job_name': result['jobName'],  # job title
                'emp_type': result['emplType'],  # employment type
                'job_type': result['jobType']['display'],  # job category name
                'job_city': result['city']['display'],  # city
                # 'business_area': result['businessArea'],  # business district
                'working_exp': result['workingExp']['name'],  # required work experience
                'edu_level': result['eduLevel']['name'],  # education level
                'salary': result['salary'],  # salary
                'job_light': str(json.loads(result['positionLabel'])['jobLight']),  # job highlights
                'job_skill': str(json.loads(result['positionLabel'])['skill']),  # required skills
                'company_name': result['company']['name'],  # company name
                'company_size': result['company']['size']['name'],  # company size
                'company_type': result['company']['type']['name'],  # company type
                'create_date': result['createDate'],  # creation date
                'update_date': result['updateDate'],  # update date
                'end_date': result['endDate'],  # end date
                'job_tag': str(result['jobTag']['searchTag']),  # job tags
                'welfare': str(result['welfare'])  # benefits
            })
    return (crawl_results, flag)


# Compute the total number of pages to crawl
def get_page_num(resultCount):
    pageSize = 90
    res = math.ceil(resultCount / pageSize)  # 90 jobs per page, rounded up (e.g. 200 results -> 3 pages)
    return res


def getParam():
    # 1. Generate a random 32-character md5 hex id
    md5 = hashlib.md5()
    id = str(random.random())
    md5.update(id.encode('utf-8'))
    random_id = md5.hexdigest()
    # 2. Current timestamp in milliseconds
    now_time = int(time.time() * 1000)
    # 3. Random number of up to 6 digits
    randomnumb = int(random.random() * 1000000)
    # Combine the pieces into x-zp-page-request-id: '<md5 hex>-<timestamp>-<random number>'
    x_zp_page_request_id = str(random_id) + '-' + str(now_time) + '-' + str(randomnumb)
    # Generate the _v parameter
    url_v = str(round(random.random(), 8))
    return [url_v, x_zp_page_request_id]


if __name__ == '__main__':
    main()
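
Assumed helper module interfaces

The script imports two helper modules, crawl.WebRequest and crawl.mysqldb, that are not shown in the listing above. The sketch below is a minimal, hypothetical outline of what the crawler appears to expect from them, inferred from the calls in main() and save_result(); the function bodies, the connection settings, and the column names work_exp/job_num are assumptions, not the original implementation.

# crawl/WebRequest.py -- assumed interface (illustrative sketch only)

def header():
    # A generic browser-like request header; the real module may rotate User-Agents.
    return {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def header_zhilian(city_id, work_exp, position):
    # Headers for the Zhilian search API; the Referer value here is an assumption.
    h = header()
    h['Referer'] = 'https://sou.zhaopin.com/'
    return h

def get_home_proxy():
    # Expected to return a list of requests-style proxy dicts.
    return [{'http': 'http://127.0.0.1:8080'}]  # placeholder proxy


# crawl/mysqldb.py -- assumed interface (illustrative sketch only, using pymysql)
import pymysql

class SQL:
    def __init__(self):
        # Connection parameters are placeholders.
        self.conn = pymysql.connect(host='localhost', user='root', password='password',
                                    database='crawl', charset='utf8mb4')

    def get_latest_jobNum(self, table):
        # Returns a {work_exp: job_num} map used as the incremental stop condition.
        with self.conn.cursor() as cur:
            cur.execute('SELECT work_exp, job_num FROM ' + table)
            return {work_exp: job_num for work_exp, job_num in cur.fetchall()}

    def insert(self, table, **fields):
        # Builds an INSERT statement from keyword arguments such as job_num, job_name, ...
        cols = ', '.join(fields.keys())
        marks = ', '.join(['%s'] * len(fields))
        sql = 'INSERT INTO {} ({}) VALUES ({})'.format(table, cols, marks)
        with self.conn.cursor() as cur:
            cur.execute(sql, list(fields.values()))
        self.conn.commit()

    def update_jobNum(self, condition):
        # condition is the (job_num, work_exp) tuple built in main().
        with self.conn.cursor() as cur:
            cur.execute('UPDATE zhilian_update SET job_num = %s WHERE work_exp = %s', condition)
        self.conn.commit()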