# 51job crawler (51job-爬虫)
# Scraped from 程序员文章站, 2022-03-10 22:42:50.
import requests
import urllib3
from fake_useragent import UserAgent
from lxml import etree
import csv
import time
# Requests below use verify=False; silence the resulting InsecureRequestWarning spam.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def get_html(page, keyword, header):
    """Fetch one page of 51job search results.

    Args:
        page: 1-based page number of the search results.
        keyword: job keyword substituted into the search URL.
        header: dict of HTTP request headers (includes a random User-Agent).

    Returns:
        The decoded HTML text on HTTP 200, otherwise None.
    """
    url = f'https://search.51job.com/list/010000,000000,0000,00,9,99,{keyword},2,{page}.html?'
    # verify=False matches the module-level urllib3 warning suppression;
    # timeout prevents the crawler from hanging forever on a dead connection.
    response = requests.get(url, verify=False, headers=header, timeout=10)
    if response.status_code == 200:
        # 51job pages are often gbk-encoded; let requests sniff the charset.
        response.encoding = response.apparent_encoding
        return response.text
    return None  # explicit: caller must handle a failed fetch
def parse_html(html):
    """Extract job listings from one 51job search-result page.

    Args:
        html: HTML text of a search-result page.

    Returns:
        A list of rows [title, company, region, salary, date]; salary is
        '面议' (negotiable) when the posting lists none.
    """
    data = etree.HTML(html)
    table_list = data.xpath("//div[@class='dw_table']/div[@class='el']")
    datas = []
    for info in table_list:
        name = info.xpath('p//a/text()')                  # job title
        comn = info.xpath('span/a/text()')                # company name
        region = info.xpath('span[@class="t3"]/text()')   # location
        salary = info.xpath('span[@class="t4"]/text()')   # salary
        times = info.xpath('span[@class="t5"]/text()')    # posting date
        # Skip incomplete rows entirely: the original guarded only `name`,
        # so a row missing company/region/date raised IndexError.
        if not (name and comn and region and times):
            continue
        rest = [
            name[0].strip(),
            comn[0].strip(),
            region[0].strip(),
            # Salary is the only legitimately-optional field.
            salary[0].strip() if salary else '面议',
            times[0].strip(),
        ]
        # Fix: the original printed the raw `salary` list instead of the
        # resolved value stored in the row.
        print(*rest)
        datas.append(rest)
    return datas
def save_data(data, filename='python'):
    """Append job rows to '<filename>.csv' as UTF-8 CSV.

    Args:
        data: iterable of row lists, as produced by parse_html.
        filename: base name of the output file, without extension.
    """
    # Fix: the filename parameter was ignored (the f-string placeholder was
    # mangled to a literal); callers pass the search keyword here.
    with open(f'{filename}.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data)
if __name__ == '__main__':
    # Interactive entry point: crawl the requested number of result pages
    # for a keyword and append the rows to '<keyword>.csv'.
    ua = UserAgent()
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "search.51job.com",
        "Referer": "https://www.51job.com/",
        "User-Agent": ua.random,
    }
    keyword = input('请输入您要查询的岗位:')
    page = int(input('请输入要获取的页数:'))
    for i in range(1, page + 1):
        print(f'开始爬取第{i}页')
        html = get_html(i, keyword, header)
        time.sleep(1.1)  # throttle between requests to avoid getting blocked
        if html is None:
            # get_html returns None on a non-200 response; skip the page
            # instead of crashing in etree.HTML(None).
            continue
        datas = parse_html(html)
        save_data(datas, keyword)
# Original article: https://blog.csdn.net/weixin_40594668/article/details/107271239