"""Crawl Tencent recruitment postings and store them in a MongoDB database.

Source: 程序员文章站 (programmer article site), 2022-05-08 10:57:51
File: SpiderTencent.py
"""
import requests
from lxml import etree
import time
import pymongo
import random
class SpiderTencent(object):
    """Crawl Tencent job postings page by page and store them in MongoDB."""

    def __init__(self):
        """Initialize the base url, the paging offset, the crawl switch and
        the in-memory buffer for scraped postings."""
        self.url = "http://hr.tencent.com/position.php?&start="
        self.index = 0           # paging offset appended to the url (step 10)
        self.switch = True       # crawl loop keeps running while True
        self.tencent_data = []   # buffer holding one dict per scraped posting

    def con_mongodb(self):
        """Connect to the local MongoDB instance and write all buffered
        postings into the ``py3.tencent`` collection."""
        client = pymongo.MongoClient(host="localhost", port=27017)
        db = client.py3
        collection = db.tencent
        if self.tencent_data:
            # insert_many replaces the deprecated Collection.insert (removed
            # in PyMongo 4) and writes the whole buffer in one round trip.
            collection.insert_many(self.tencent_data)
        print("已将数据全部存入到mongodb中!")

    def get_html(self, url):
        """Fetch *url* with a randomly chosen User-Agent and return the page
        parsed into an lxml element tree.

        The original code re-issued the request unconditionally in a
        ``finally`` clause, doubling every fetch; here the request is only
        retried once, and only when the first attempt raises.
        """
        headers_list = [
            {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0"},
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32"},
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"},
            {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"},
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"},
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"},
        ]
        headers = random.choice(headers_list)
        try:
            response = requests.get(url=url, headers=headers, timeout=20)
        except requests.RequestException as err:
            # Narrow except instead of a bare one; retry a single time.
            print("request failed (%s), retrying once" % err)
            response = requests.get(url=url, headers=headers, timeout=20)
        html = response.text
        content = etree.HTML(html)
        return content

    def load_page(self, url):
        """Extract title / category / headcount / location from one listing
        page via XPath and append them as dicts to ``self.tencent_data``."""
        content = self.get_html(url)
        job_title = content.xpath('(//tr[@class="even"] | //tr[@class="odd"])//a/text()')  # job title
        job_category = content.xpath('//tr[@class="even"]//td[2]//text() | //tr[@class="odd"]//td[2]//text()')  # job category
        number = content.xpath('//tr[@class="even"]//td[3]//text() | //tr[@class="odd"]//td[3]//text()')  # headcount
        location = content.xpath('//tr[@class="even"]//td[4]//text() | //tr[@class="odd"]//td[4]//text()')  # location
        # zip the four parallel column lists into one record per posting;
        # the original printed the zip object after exhausting it, which
        # only showed "<zip object ...>" — that useless print is dropped.
        for title, category, count, city in zip(job_title, job_category, number, location):
            self.tencent_data.append({
                "job_title": title,
                "job_category": category,
                "number": count,
                "location": city,
            })
        print("正在获取数据" + "-" * 10)

    def start_switch(self):
        """Main crawl loop: walk the paging offset in steps of 10 up to
        2500, sleeping between requests, then flush results to MongoDB."""
        while self.switch:
            tencent_url = self.url + str(self.index)  # build the page url
            self.load_page(tencent_url)
            time.sleep(5)  # throttle: be polite to the server
            if self.index < 2500:  # 2500 is assumed to be the last page offset
                self.index += 10
            else:
                self.switch = False
                self.con_mongodb()  # persist everything to MongoDB
                print("程序结束")
if __name__ == '__main__':
    # Entry point: build the spider and run the crawl loop.
    spider = SpiderTencent()
    spider.start_switch()