Crawling Lagou job listings with requests


Initial code

# -*- encoding: utf-8 -*-


from com.lagou.crawl.WebRequest import *
from com.lagou.crawl.mysqldb import SQL
import time, json, random, math, requests, logging


# Configure log formatting
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(filename='lagou.log', level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)
# Build the request headers
logging.info('begin to get web request header')
crawl_position = '大数据'  # job keyword to crawl
crawl_city = '成都'  # city to crawl
headers = header_lagou(crawl_position, crawl_city)
base_url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&city={}&needAddtionalResult=false'.format(
    crawl_city)  # Ajax endpoint for the job list
proxy_list = get_home_proxy()

def index_page():
    logging.info('begin to send request')
    data = None
    try:
        response = requests.post(url=base_url, headers=headers, data={
            'first': 'true',
            'pn': '1',
            'kd': crawl_position
        }, proxies=random.choice(proxy_list))
        response.encoding = "utf-8"
        data = json.loads(response.text)
    except Exception as e:
        logging.error('crawl error: %s', e)
    all_results = []  # accumulated results from all pages
    if data is not None and data['msg'] is None:
        # first_results = data['content']['positionResult']['result']  # positions on the first page
        # all_results += first_results
        resultSize = data['content']['positionResult']['resultSize']  # positions per page
        totalCount = data['content']['positionResult']['totalCount']  # total number of positions
        total_page = get_page_num(totalCount, resultSize)  # total number of pages
        logging.info("result_size is %s, total_count is %s, total_page is %s",
                     resultSize, totalCount, total_page)
        # Read the creation time of the newest record in the database to decide between a full or incremental crawl
        logging.info('begin to get latest update time from database')
        latest_time = SQL().get_latest_time('lagou_original', crawl_city)
        page_num = 1
        logging.info('begin to crawl data')
        while page_num <= total_page:
            is_first = 'false'
            if page_num == 1:
                is_first = 'true'
            time.sleep(random.randint(1, 10))
            logging.info("total pages: {}, crawling page {}".format(total_page, page_num))
            page_result = requests_post(page_num, latest_time, is_first)
            # Stop paging once no record on the page is newer than the last update time
            if page_result[1] is False:
                break
            all_results += page_result[0]
            # on_result(page_result)
            page_num += 1
        on_result(get_result(all_results))
    else:
        time.sleep(random.randint(1, 3))
        logging.info('retry to crawl data')
        index_page()


# Request a single page of results with requests
def requests_post(page_num, latest_time, is_first):
    retry_num = 0
    while retry_num <= 2:
        try:
            result = requests.post(url=base_url, headers=headers, data={
                'first': is_first,
                'pn': str(page_num),
                'kd': crawl_position
            }, proxies=random.choice(proxy_list))
            result.encoding = 'utf-8'
            if result.status_code == 200:
                req_result = json.loads(result.text)
                if req_result['msg'] is None:
                    page_result = req_result['content']['positionResult']['result']  # positions on this page
                    if latest_time is None:
                        print("crawled page {}, retries: {}".format(page_num, retry_num))
                        return (page_result, True)
                    else:
                        page_res = []
                        new_count = 0
                        for info in page_result:
                            create_time = info['createTime']
                            # This filter is not fully precise; duplicates may still reach the database and are cleaned later
                            if create_time is None or latest_time <= create_time:
                                page_res.append(info)
                                new_count += 1
                        print("crawled page {}, retries: {}".format(page_num, retry_num))
                        # If nothing on this page is newer than latest_time, tell the caller to stop paging
                        return (page_res, new_count > 0)
                else:
                    # Blocked or rate limited: wait, then let the while loop retry
                    time.sleep(random.randint(1, 10))
                    retry_num += 1
            else:
                retry_num += 1
        except Exception as e:
            logging.error('request for page %s failed: %s', page_num, e)
            retry_num += 1
            continue
    # All retries exhausted: return an empty page and stop paging
    return ([], False)


# Format the crawled records before saving
def get_result(results):
    crawl_results = []
    for result in results:
        crawl_results.append({
            'position_id': result['positionId'],
            'position_name': result['positionName'],
            'job_nature': result['jobNature'],
            'education': result['education'],
            'work_year': result['workYear'],
            'salary': result['salary'],
            'city': result['city'],
            'position_advantage': result['positionAdvantage'],
            'position_lables': ";".join(result['positionLables']),
            'skill_lables': ";".join(result['skillLables']),
            'is_school_job': result['isSchoolJob'],
            'create_time': result['createTime'],
            'company_full_name': result['companyFullName'],
            'company_short_name': result['companyShortName'],
            'finance_stage': result['financeStage'],
            'company_size': result['companySize'],
            'company_label_list': ";".join(result['companyLabelList']),
            'district': result['district'],
            'industry_field': result['industryField']
        })
    return crawl_results


# Compute the number of pages to crawl
def get_page_num(totalCount, resultSize):
    res = math.ceil(totalCount / resultSize)  # round up; resultSize (15) positions per page
    # Lagou shows at most 30 pages of results
    return min(res, 30)


# Save the results into the database
def on_result(result):
    if not result:
        return
    sql = SQL()
    for res in result:
        sql.insert('lagou_original', **res)


if __name__ == '__main__':
    index_page()
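
Helper module sketch (assumed)

The listing above imports two helper modules that the article does not show: com.lagou.crawl.WebRequest (providing header_lagou and get_home_proxy) and com.lagou.crawl.mysqldb (providing SQL). The sketch below is an assumption added only to make the listing self-contained: Lagou's positionAjax.json endpoint typically rejects a POST unless it carries cookies obtained from the search page together with a matching Referer and User-Agent, get_home_proxy is assumed to return proxy dicts in the format requests expects, and SQL is assumed to wrap a MySQL table via pymysql. Every host name, credential, and column name here is a placeholder.

# Hypothetical sketch of com/lagou/crawl/WebRequest.py (not part of the original article)
import requests
from urllib.parse import quote


def header_lagou(position, city):
    # Visit the search page first so the Ajax endpoint accepts the session cookies
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36')
    referer = 'https://www.lagou.com/jobs/list_{}?px=new&city={}'.format(quote(position), quote(city))
    session = requests.Session()
    session.get(referer, headers={'User-Agent': user_agent}, timeout=10)
    cookie = '; '.join('{}={}'.format(k, v) for k, v in session.cookies.get_dict().items())
    return {
        'User-Agent': user_agent,
        'Referer': referer,
        'Cookie': cookie,
        'X-Requested-With': 'XMLHttpRequest',
    }


def get_home_proxy():
    # Placeholder proxies; replace with real addresses or a proxy pool
    return [{'http': 'http://127.0.0.1:8888', 'https': 'http://127.0.0.1:8888'}]


# Hypothetical sketch of com/lagou/crawl/mysqldb.py (not part of the original article)
import pymysql


class SQL:
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root', password='root',
                                    database='lagou', charset='utf8mb4')

    def get_latest_time(self, table, city):
        # Creation time of the newest record stored for this city, or None for a full crawl;
        # returned as a string so it compares directly with Lagou's createTime strings
        with self.conn.cursor() as cursor:
            cursor.execute('SELECT MAX(create_time) FROM {} WHERE city = %s'.format(table), (city,))
            row = cursor.fetchone()
            return str(row[0]) if row and row[0] is not None else None

    def insert(self, table, **fields):
        # Insert one position record; keyword arguments map directly to column names
        columns = ', '.join(fields.keys())
        placeholders = ', '.join(['%s'] * len(fields))
        sql = 'INSERT INTO {} ({}) VALUES ({})'.format(table, columns, placeholders)
        with self.conn.cursor() as cursor:
            cursor.execute(sql, tuple(fields.values()))
        self.conn.commit()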