# 51job crawler (51job-爬虫)
# Scraped from 程序员文章站, 2022-03-10 22:42:50.
import requests
import urllib3
from fake_useragent import UserAgent
from lxml import etree
import csv
import time
# Requests below use verify=False; silence the resulting InsecureRequestWarning spam.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def get_html(page, keyword, header):
    """Fetch one page of 51job search results.

    Args:
        page: 1-based page number of the search results.
        keyword: job keyword substituted into the search URL.
        header: dict of HTTP request headers (includes a random User-Agent).

    Returns:
        The decoded HTML text on HTTP 200, otherwise None.
    """
    url = f'https://search.51job.com/list/010000,000000,0000,00,9,99,{keyword},2,{page}.html?'
    # verify=False matches the module-level urllib3 warning suppression;
    # timeout prevents the crawler from hanging forever on a dead connection.
    response = requests.get(url, verify=False, headers=header, timeout=10)
    if response.status_code == 200:
        # 51job pages are often gbk-encoded; let requests sniff the charset.
        response.encoding = response.apparent_encoding
        return response.text
    return None  # explicit: caller must handle a failed fetch
def parse_html(html):
    """Extract job listings from one 51job search-result page.

    Args:
        html: HTML text of a search-result page.

    Returns:
        A list of rows [title, company, region, salary, date]; salary is
        '面议' (negotiable) when the posting lists none.
    """
    data = etree.HTML(html)
    table_list = data.xpath("//div[@class='dw_table']/div[@class='el']")
    datas = []
    for info in table_list:
        name = info.xpath('p//a/text()')                  # job title
        comn = info.xpath('span/a/text()')                # company name
        region = info.xpath('span[@class="t3"]/text()')   # location
        salary = info.xpath('span[@class="t4"]/text()')   # salary
        times = info.xpath('span[@class="t5"]/text()')    # posting date
        # Skip incomplete rows entirely: the original guarded only `name`,
        # so a row missing company/region/date raised IndexError.
        if not (name and comn and region and times):
            continue
        rest = [
            name[0].strip(),
            comn[0].strip(),
            region[0].strip(),
            # Salary is the only legitimately-optional field.
            salary[0].strip() if salary else '面议',
            times[0].strip(),
        ]
        # Fix: the original printed the raw `salary` list instead of the
        # resolved value stored in the row.
        print(*rest)
        datas.append(rest)
    return datas
def save_data(data, filename='python'):
    """Append job rows to '<filename>.csv' as UTF-8 CSV.

    Args:
        data: iterable of row lists, as produced by parse_html.
        filename: base name of the output file, without extension.
    """
    # Fix: the filename parameter was ignored (the f-string placeholder was
    # mangled to a literal); callers pass the search keyword here.
    with open(f'{filename}.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data)
if __name__ == '__main__':
    # Interactive entry point: crawl the requested number of result pages
    # for a keyword and append the rows to '<keyword>.csv'.
    ua = UserAgent()
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "search.51job.com",
        "Referer": "https://www.51job.com/",
        "User-Agent": ua.random,
    }
    keyword = input('请输入您要查询的岗位:')
    page = int(input('请输入要获取的页数:'))
    for i in range(1, page + 1):
        print(f'开始爬取第{i}页')
        html = get_html(i, keyword, header)
        time.sleep(1.1)  # throttle between requests to avoid getting blocked
        if html is None:
            # get_html returns None on a non-200 response; skip the page
            # instead of crashing in etree.HTML(None).
            continue
        datas = parse_html(html)
        save_data(datas, keyword)
# Original article: https://blog.csdn.net/weixin_40594668/article/details/107271239