欢迎您访问程序员文章站,本站旨在为大家分享程序员计算机编程知识!
您现在的位置是: 首页

爬取腾讯招聘信息存入mongodb数据库

程序员文章站 2022-05-08 10:57:51
...

SpiderTencent.py

import requests
from lxml import etree
import time
import pymongo
import random


class SpiderTencent(object):
    """Crawl Tencent recruitment listings page by page and store them in MongoDB.

    Pages are fetched from ``self.url`` with an increasing ``start=`` offset
    (10 rows per page); parsed rows are accumulated in ``self.tencent_data``
    and flushed to MongoDB once the last page has been processed.
    """

    # Pool of User-Agent strings; one is picked at random per request to
    # reduce the chance of the server blocking repeated identical clients.
    HEADERS_LIST = [
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0"},
        {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32"},
        {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"},
        {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"},
        {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"},
    ]

    def __init__(self, max_index=2500):
        """Initialize crawl state.

        :param max_index: largest ``start=`` offset to fetch before stopping
            (default 2500, matching the original hard-coded page ceiling).
        """
        self.url = "http://hr.tencent.com/position.php?&start="
        self.index = 0          # current start= offset appended to self.url
        self.switch = True      # crawl loop keeps running while True
        self.max_index = max_index
        self.tencent_data = []  # accumulated job dicts, flushed to MongoDB at the end

    def con_mongodb(self):
        """Connect to the local MongoDB and bulk-insert all collected records.

        Writes ``self.tencent_data`` into database ``py3``, collection
        ``tencent``. Uses ``insert_many`` — ``Collection.insert`` was
        deprecated and removed in pymongo 4.
        """
        client = pymongo.MongoClient(host="localhost", port=27017)
        db = client.py3
        collection = db.tencent
        # insert_many raises on an empty document list, so guard against it.
        if self.tencent_data:
            collection.insert_many(self.tencent_data)
        print("已将数据全部存入到mongodb中!")

    def get_html(self, url):
        """Fetch *url* and return it parsed as an lxml HTML document.

        Retries once on a network error; re-raises if the retry also fails.
        (The original version had a bare ``except`` plus a ``finally`` that
        unconditionally re-issued the request, so every call fetched the
        page twice and a failed retry left ``response`` unbound.)

        :param url: fully-built page URL to download.
        :returns: ``lxml.etree._Element`` root of the parsed page.
        :raises requests.RequestException: if both attempts fail.
        """
        headers = random.choice(self.HEADERS_LIST)
        last_error = None
        for attempt in range(2):  # one retry on transient network failure
            try:
                response = requests.get(url=url, headers=headers, timeout=20)
                break
            except requests.RequestException as err:
                last_error = err
        else:
            raise last_error
        return etree.HTML(response.text)

    def load_page(self, url):
        """Extract job rows from one listing page into ``self.tencent_data``.

        Rows alternate between ``tr.even`` and ``tr.odd``; the four XPath
        queries pull the parallel columns which are then zipped back into
        per-job dicts.

        :param url: listing-page URL to scrape.
        """
        content = self.get_html(url)
        job_title = content.xpath('(//tr[@class="even"] | //tr[@class="odd"])//a/text()')
        job_category = content.xpath('//tr[@class="even"]//td[2]//text() | //tr[@class="odd"]//td[2]//text()')
        number = content.xpath('//tr[@class="even"]//td[3]//text() | //tr[@class="odd"]//td[3]//text()')
        location = content.xpath('//tr[@class="even"]//td[4]//text() | //tr[@class="odd"]//td[4]//text()')
        for title, category, count, place in zip(job_title, job_category, number, location):
            self.tencent_data.append({
                "job_title": title,
                "job_category": category,
                "number": count,
                "location": place,
            })
        # NOTE: the original printed the zip object itself, which was already
        # exhausted by the loop — that useless repr print has been removed.
        print("正在获取数据" + "-" * 10)

    def start_switch(self):
        """Drive the crawl loop: fetch each page, then flush to MongoDB.

        Advances ``self.index`` by 10 (one page) per iteration, sleeping 5s
        between requests to be polite, until ``self.max_index`` is reached.
        """
        while self.switch:
            tencent_url = self.url + str(self.index)  # build the page URL
            self.load_page(tencent_url)
            time.sleep(5)  # throttle requests
            if self.index < self.max_index:  # not yet at the last page
                self.index += 10
            else:
                self.switch = False
                self.con_mongodb()  # persist everything collected
                print("程序结束")


if __name__ == '__main__':
    # Script entry point: build the spider and run the full crawl.
    spider = SpiderTencent()
    spider.start_switch()

 

相关标签: spider