欢迎您访问程序员文章站,本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页

爬取腾讯招聘信息写入TXT文件

程序员文章站 2022-05-08 10:57:21
...
"""
    Goal: scrape Tencent job postings (job title, job category, headcount, location, job requirements, job responsibilities)
    url = "http://hr.tencent.com/position.php?&start="
"""
import requests
from lxml import etree

# Site root that relative detail-page hrefs are appended to in order to build
# absolute URLs.
BASE_DOMAIN = "http://hr.tencent.com/"
# Request headers sent with every HTTP request made by this script; the
# User-Agent is a desktop Chrome 69 string.
HEADERS = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'
}
# Listing URL for the "python" keyword search with start=0.
# NOTE(review): presumably start=0 is the first results page - confirm.
BASE_URL = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start=0"


def parse_detail_page(url):
    """Fetch one job-detail page and extract its fields.

    Args:
        url: Absolute URL of the detail page to download.

    Returns:
        A dict with the keys ``work_name``, ``work_place``,
        ``work_category`` and ``work_lack_number`` (each the first text node
        matched by its XPath) and ``work_duty`` / ``work_require`` (each a
        list of every text node found under the first and second
        ``<ul class="squareli">`` element respectively).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page does not contain one of the expected
            elements.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl;
    # raise_for_status turns an error page into an explicit HTTP error
    # instead of an opaque IndexError from the XPath lookups below.
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    html = etree.HTML(response.text)

    # The three summary cells live in the same table row, in this order.
    summary_row = "//tr[@class='c bottomline']/td[{}]/text()"
    # First list is read as the duties, second as the requirements.
    more_info = html.xpath("//ul[@class='squareli']")

    return {
        'work_name': html.xpath("//tr[@class='h']/td/text()")[0],
        'work_place': html.xpath(summary_row.format(1))[0],
        'work_category': html.xpath(summary_row.format(2))[0],
        'work_lack_number': html.xpath(summary_row.format(3))[0],
        'work_duty': more_info[0].xpath(".//text()"),
        'work_require': more_info[1].xpath(".//text()"),
    }


def get_detail_urls(url):
    """Collect the absolute detail-page URLs linked from one listing page.

    Args:
        url: URL of the listing (search results) page to download.

    Returns:
        A list of absolute detail-page URLs, built by prefixing each matched
        ``href`` with ``BASE_DOMAIN``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Fetch the page that was asked for. The previous code ignored ``url``
    # and always requested BASE_URL, so every page yielded the same links.
    response = requests.get(url=url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    html = etree.HTML(response.text)
    # NOTE(review): only rows with class "even" are matched; if the listing
    # also uses another row class (e.g. "odd") those rows are skipped -
    # confirm against the page markup.
    hrefs = html.xpath("//tr[@class='even']//a/@href")
    # A list (rather than a lazy ``map``) can be iterated more than once.
    return [BASE_DOMAIN + href for href in hrefs]


def spider(pages=4, output_path='tecentRecruit.txt'):
    """Crawl the listing pages and append every position to a text file.

    For each listing page the detail URLs are collected with
    ``get_detail_urls``, each detail page is parsed with
    ``parse_detail_page``, the resulting dict is printed, and its fields are
    appended to ``output_path`` one per line, followed by three blank lines.

    Args:
        pages: Number of listing pages to crawl, starting from the first.
            Defaults to 4. (The original code carried a "# 43" note next to
            this value - presumably the total page count; confirm.)
        output_path: File the records are appended to (UTF-8).

    Returns:
        The list of position dicts that were scraped, in crawl order.
    """
    base_url = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a"
    # These fields hold a list of text nodes, so they are written through
    # ``format`` rather than by string concatenation.
    list_fields = ('work_duty', 'work_require')
    positions = []
    for page in range(pages):
        # The ``start`` query parameter advances by 10 per page.
        url = base_url.format(page * 10)
        for detail_url in get_detail_urls(url):
            position = parse_detail_page(detail_url)
            positions.append(position)
            print(position)

            lines = []
            for key, value in position.items():
                if key in list_fields:
                    lines.append('{} :{}\n'.format(key, value))
                else:
                    lines.append(key + ":" + value + '\n')

            # Append per position so records already scraped are on disk
            # even if a later request raises.
            with open(output_path, 'a', encoding='utf-8') as f:
                f.writelines(lines)
                f.write('\n' * 3)
    return positions

    # print(positions)


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    spider()

 

相关标签: Spider