爬取腾讯招聘信息写入TXT文件
程序员文章站
2022-05-08 10:57:21
...
"""
需求:爬取腾讯招聘数据(职位名称,职位类别,人数,地点, 工作要求,工作职责)
url = "http://hr.tencent.com/position.php?&start="
"""
import requests
from lxml import etree
# Site root that relative links scraped from the listing are prefixed with.
BASE_DOMAIN = "http://hr.tencent.com/"

# Request headers sent with every HTTP call; the User-Agent is a desktop
# Chrome identification string.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3497.92 Safari/537.36'
    ),
}

# Search-results URL for the keyword "python", first page (start=0).
BASE_URL = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start=0"
def parse_detail_page(url, timeout=10):
    """Fetch one job-detail page and extract its fields.

    Args:
        url: Absolute URL of the detail page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        A dict with, in this order, the keys ``work_name``, ``work_place``,
        ``work_category`` and ``work_lack_number`` (each the first matching
        text node, or ``''`` when the node is missing), followed by
        ``work_duty`` and ``work_require`` (each a list of text nodes, empty
        when the corresponding list is missing).

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
    """

    def first_text(tree, path):
        # Return the first XPath match, or '' so a page with an unexpected
        # layout yields an empty field instead of an IndexError.
        matches = tree.xpath(path)
        return matches[0] if matches else ''

    response = requests.get(url, headers=HEADERS, timeout=timeout)
    html = etree.HTML(response.text)

    # The first "squareli" list is read as the duties and the second as the
    # requirements; either may be absent.
    sections = html.xpath("//ul[@class='squareli']")
    work_duty = sections[0].xpath(".//text()") if len(sections) > 0 else []
    work_require = sections[1].xpath(".//text()") if len(sections) > 1 else []

    # Key order matters: callers iterate the dict to write fields in order.
    return {
        'work_name': first_text(html, "//tr[@class='h']/td/text()"),
        'work_place': first_text(
            html, "//tr[@class='c bottomline']/td[1]/text()"),
        'work_category': first_text(
            html, "//tr[@class='c bottomline']/td[2]/text()"),
        'work_lack_number': first_text(
            html, "//tr[@class='c bottomline']/td[3]/text()"),
        'work_duty': work_duty,
        'work_require': work_require,
    }
def get_detail_urls(url, timeout=10):
    """Collect the detail-page URLs linked from one listing page.

    Args:
        url: URL of the listing page to download. Previously this argument
            was ignored and the first page was always fetched.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        An iterator of absolute detail-page URLs, built by prefixing each
        scraped ``href`` with ``BASE_DOMAIN``.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    html = etree.HTML(response.text)
    # NOTE(review): only rows with class 'even' are matched; if the listing
    # alternates 'even'/'odd' rows, half the postings are skipped - confirm.
    hrefs = html.xpath("//tr[@class='even']//a/@href")
    return (BASE_DOMAIN + href for href in hrefs)
def _append_position(path, position):
    """Append one scraped position to the text file at *path*."""
    with open(path, 'a', encoding='utf-8') as f:
        for key, value in position.items():
            if key in ('work_duty', 'work_require'):
                # These two fields hold lists; format() writes the list repr.
                line = '{} :{}'.format(key, value)
            else:
                line = key + ":" + value
            f.write(line + '\n')
        # Three blank lines separate consecutive records.
        f.write('\n' * 3)


def spider(page_count=4, output_path='tecentRecruit.txt'):
    """Crawl the listing pages and append every position to a text file.

    Each position is printed and appended to the file as soon as it is
    parsed, so records scraped before a failure are already on disk.

    Args:
        page_count: Number of listing pages to crawl; each page advances the
            ``start`` query parameter by 10. (The original code carried a
            "# 43" note here, presumably the total page count - confirm.)
        output_path: File the records are appended to, UTF-8 encoded.

    Returns:
        The list of position dicts, in the order they were scraped.
    """
    base_url = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a"
    positions = []
    for start in range(0, page_count * 10, 10):
        for detail_url in get_detail_urls(base_url.format(start)):
            position = parse_detail_page(detail_url)
            positions.append(position)
            print(position)
            _append_position(output_path, position)
    return positions
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    spider()