Crawling Lagou Job Listings with requests
Initial code
# -*- encoding: utf-8 -*-
from com.lagou.crawl.WebRequest import *
from com.lagou.crawl.mysqldb import SQL
import time, json, random, math, requests, logging

# Configure log formatting
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(filename='lagou.log', level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)
# Fetch the request headers for the Ajax endpoint
logging.info('begin to get web request header')
crawl_position = '大数据'  # position keyword to crawl
crawl_city = '成都'  # city to crawl
headers = header_lagou(crawl_position, crawl_city)
base_url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&city={}&needAddtionalResult=false'.format(
    crawl_city)  # job-list endpoint
proxy_list = get_home_proxy()
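The helpers header_lagou and get_home_proxy come from the author's com.lagou.crawl.WebRequest module, which the post does not show. The sketch below is only a guess at their shape, not the author's code: Lagou's positionAjax.json endpoint is known to reject POSTs that lack the cookies set by the matching search page, so the header helper visits that page first. The proxy address is a placeholder.

# Hypothetical sketch of the helpers imported from WebRequest above; names
# and the proxy entry are assumptions, not the author's implementation.
import requests
from urllib.parse import quote

def header_lagou(position, city):
    """Build headers for positionAjax.json, carrying the search page's cookies."""
    referer = 'https://www.lagou.com/jobs/list_{}?px=new&city={}'.format(
        quote(position), quote(city))
    session = requests.Session()
    # Visiting the list page first sets the cookies the Ajax endpoint checks
    session.get(referer, headers={'User-Agent': 'Mozilla/5.0'})
    cookie = '; '.join('{}={}'.format(k, v) for k, v in session.cookies.items())
    return {
        'User-Agent': 'Mozilla/5.0',
        'Referer': referer,
        'Cookie': cookie,
        'X-Requested-With': 'XMLHttpRequest',
    }

def get_home_proxy():
    """Return proxies in the dict format that requests expects."""
    return [{'http': 'http://127.0.0.1:8888'}]  # placeholder address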
def index_page():
    logging.info('begin to send request')
    data = None
    try:
        response = requests.post(url=base_url, headers=headers, data={
            'first': 'true',
            'pn': '1',
            'kd': crawl_position
        }, proxies=random.choice(proxy_list))
        response.encoding = 'utf-8'
        data = json.loads(response.text)
    except Exception as e:
        logging.error('crawl error: %s', e)
    all_results = []  # accumulated records from every page
    if data is not None and data['msg'] is None:
        # first_results = data['content']['positionResult']['result']  # records on the first page
        # all_results += first_results
        resultSize = data['content']['positionResult']['resultSize']  # records per page
        totalCount = data['content']['positionResult']['totalCount']  # total number of positions
        total_page = get_page_num(totalCount, resultSize)  # total number of pages
        logging.info("result_size is %s, total_count is %s, total_page is %s",
                     resultSize, totalCount, total_page)
        # Read the create time of the newest record already stored, to choose
        # between a full and an incremental crawl
        logging.info('begin to get latest update time from database')
        latest_time = SQL().get_latest_time('lagou_original', crawl_city)
        page_num = 1
        logging.info('begin to crawl data')
        while page_num <= total_page:
            is_first = 'true' if page_num == 1 else 'false'
            time.sleep(random.randint(1, 10))
            logging.info("total pages: {}, crawling page {}".format(total_page, page_num))
            page_result = requests_post(page_num, latest_time, is_first)
            # Stop as soon as a page holds nothing newer than the last update time
            if page_result[1] is False:
                break
            all_results += page_result[0]
            # on_result(page_result)
            page_num += 1
        on_result(get_result(all_results))
    else:
        # The request failed or Lagou returned an anti-crawler message
        # (msg is not None); wait briefly and retry the whole run
        time.sleep(random.randint(1, 3))
        logging.info('retry to crawl data')
        index_page()
# Request one page of results via requests
def requests_post(page_num, latest_time, is_first):
    retry_num = 0
    while retry_num <= 2:
        try:
            result = requests.post(url=base_url, headers=headers, data={
                'first': is_first,
                'pn': str(page_num),
                'kd': crawl_position
            }, proxies=random.choice(proxy_list))
            result.encoding = 'utf-8'
            if result.status_code == 200:
                req_result = json.loads(result.text)
                if req_result['msg'] is None:
                    page_result = req_result['content']['positionResult']['result']  # records on this page
                    if latest_time is None:
                        # Full crawl: keep every record
                        print("number of retries crawl pagenum:" + str(page_num), retry_num)
                        return (page_result, True)
                    page_res = []
                    new_count = 0
                    for info in page_result:
                        create_time = info['createTime']
                        # This check is not exact, so the table may end up with
                        # duplicates; they are cleaned in a later step
                        if create_time is None or latest_time <= create_time:
                            page_res.append(info)
                            new_count += 1
                    print("number of retries crawl pagenum:" + str(page_num), retry_num)
                    # Nothing on this page is newer than the stored data,
                    # so the caller can stop paging
                    if new_count == 0:
                        return (page_res, False)
                    return (page_res, True)
            # Anti-crawler message or bad status code: back off and retry
            time.sleep(random.randint(1, 10))
            retry_num += 1
        except Exception:
            retry_num += 1
            continue
    # All retries failed; return no records and tell the caller to stop
    return ([], False)
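requests_post hands back a (records, keep_going) tuple: keep_going flips to False once a page contains no record newer than latest_time, or once every retry has failed, and that is the signal index_page uses to stop paging during an incremental run.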
# Normalize the raw records into database rows
def get_result(results):
    crawl_results = []
    for result in results:
        crawl_results.append({
            'position_id': result['positionId'],
            'position_name': result['positionName'],
            'job_nature': result['jobNature'],
            'education': result['education'],
            'work_year': result['workYear'],
            'salary': result['salary'],
            'city': result['city'],
            'position_advantage': result['positionAdvantage'],
            'position_lables': ";".join(result['positionLables']),
            'skill_lables': ";".join(result['skillLables']),
            'is_school_job': result['isSchoolJob'],
            'create_time': result['createTime'],
            'company_full_name': result['companyFullName'],
            'company_short_name': result['companyShortName'],
            'finance_stage': result['financeStage'],
            'company_size': result['companySize'],
            'company_label_list': ";".join(result['companyLabelList']),
            'district': result['district'],
            'industry_field': result['industryField']
        })
    return crawl_results
# Work out how many pages to crawl
def get_page_num(totalCount, resultSize):
    res = math.ceil(totalCount / resultSize)  # 15 positions per page, rounded up
    # Lagou shows at most 30 pages of results
    if res > 30:
        return 30
    else:
        return res
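A quick sanity check of the paging math, with hypothetical counts:

assert get_page_num(42, 15) == 3    # math.ceil(42 / 15) rounds the partial page up
assert get_page_num(450, 15) == 30  # exactly at the 30-page cap
assert get_page_num(500, 15) == 30  # math.ceil(500 / 15) == 34, clamped to 30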
# Write the result rows to the database
def on_result(result):
    if not result:
        return
    sql = SQL()
    for res in result:
        sql.insert('lagou_original', **res)
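The SQL helper from com.lagou.crawl.mysqldb is also not shown. Below is a minimal sketch of the two methods this script relies on, assuming a pymysql backend and the table layout implied by get_result; the host, credentials, and database name are placeholders.

# Hypothetical sketch of the SQL helper imported from mysqldb above.
import pymysql

class SQL:
    def __init__(self):
        # Placeholder connection settings
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='***', db='lagou', charset='utf8mb4')

    def get_latest_time(self, table, city):
        """Return the newest create_time stored for this city, or None."""
        with self.conn.cursor() as cursor:
            cursor.execute(
                'SELECT MAX(create_time) FROM {} WHERE city = %s'.format(table),
                (city,))
            row = cursor.fetchone()
        return row[0] if row else None

    def insert(self, table, **fields):
        """Insert one record; column names are taken from the keyword arguments."""
        columns = ', '.join(fields.keys())
        placeholders = ', '.join(['%s'] * len(fields))
        with self.conn.cursor() as cursor:
            cursor.execute(
                'INSERT INTO {} ({}) VALUES ({})'.format(table, columns, placeholders),
                tuple(fields.values()))
        self.conn.commit()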
if __name__ == '__main__':
    index_page()