欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

51job-爬虫

程序员文章站 2022-03-10 22:42:50
import requestsimport urllib3from fake_useragent import UserAgentfrom lxml import etreeimport csvimport timeurllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)def get_html(page, keyword, header): url = f'https://search.51job.co...
import requests
import urllib3
from fake_useragent import UserAgent
from lxml import etree
import csv
import time
# Silence urllib3's InsecureRequestWarning, which would otherwise be emitted
# because get_html() sends its requests with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def get_html(page, keyword, header, timeout=10):
    """Fetch one page of 51job search results and return its HTML text.

    Args:
        page: Page number substituted into the search URL.
        keyword: Search keyword substituted into the search URL as-is.
        header: Dict of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The decoded response body when the server answers with status 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on
            connection errors or when the timeout expires.
    """
    url = f'https://search.51job.com/list/010000,000000,0000,00,9,99,{keyword},2,{page}.html?'
    # An explicit timeout keeps a stalled connection from blocking the crawl.
    response = requests.get(url, verify=False, headers=header, timeout=timeout)
    if response.status_code != 200:
        return None
    # Let requests guess the charset from the body before decoding the text.
    response.encoding = response.apparent_encoding
    return response.text


def _first_text(nodes, default=''):
    """Return the first XPath text result stripped, or *default* if none."""
    return nodes[0].strip() if nodes else default


def parse_html(html):
    """Extract job postings from a 51job search result page.

    Args:
        html: HTML text of the result page. ``None`` or an empty string is
            accepted and yields an empty result.

    Returns:
        A list of rows, each ``[job title, company, region, salary, date]``.
        Rows without a job title are skipped. A missing salary is recorded
        as '面议'; any other missing field is recorded as ''.
    """
    if not html:
        return []
    data = etree.HTML(html)
    if data is None:
        return []
    table_list = data.xpath("//div[@class='dw_table']/div[@class='el']")
    datas = []
    for info in table_list:
        name = info.xpath('p//a/text()')  # job title
        if not name:
            # Not a job posting row; skipping it avoids blank CSV lines.
            continue
        comn = info.xpath('span/a/text()')  # company name
        region = info.xpath('span[@class="t3"]/text()')  # region
        salary = info.xpath('span[@class="t4"]/text()')  # salary
        times = info.xpath('span[@class="t5"]/text()')  # posting date
        rest = [
            name[0].strip(),
            _first_text(comn),
            _first_text(region),
            _first_text(salary, '面议'),
            _first_text(times),
        ]
        print(*rest)
        datas.append(rest)
    return datas


def save_data(data, filename='python'):
    """Append rows to the CSV file ``<filename>.csv``.

    Args:
        data: Iterable of rows; each row is an iterable of cell values.
        filename: Path of the output file without the ``.csv`` suffix.
    """
    target = f'{filename}.csv'
    # Append mode, so repeated calls accumulate rows in the same file.
    with open(target, 'a', encoding='utf-8', newline='') as csv_file:
        csv.writer(csv_file).writerows(data)



if __name__ == '__main__':
    # Script entry point: prompt for a keyword and a page count, then fetch,
    # parse and save each result page into "<keyword>.csv".
    ua = UserAgent()
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "search.51job.com",
        "Referer": "https://www.51job.com/",
        "User-Agent": ua.random
        }
    keyword = input('请输入您要查询的岗位:')
    page = int(input('请输入要获取的页数:'))
    for i in range(1, page+1):
        print(f'开始爬取第{i}页')
        html = get_html(i, keyword, header)
        # Pause between requests, whether or not this page succeeded.
        time.sleep(1.1)
        if not html:
            # get_html returns None on a non-200 response; skip the page
            # rather than hand None to the parser.
            print(f'第{i}页获取失败,已跳过')
            continue
        datas = parse_html(html)
        save_data(datas, keyword)

本文地址:https://blog.csdn.net/weixin_40594668/article/details/107271239