欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

(67)-- 多线程爬取腾讯招聘并存入数据库

程序员文章站 2022-04-26 22:29:34
...

# 多线程爬取腾讯招聘职位信息并存入数据库 

# mydb.py

import pymysql

class Mydb:
    """Small wrapper around one PyMySQL connection and cursor for write statements.

    If the connection cannot be opened, the error is printed and both
    ``conn`` and ``cursor`` stay ``None``; ``execute`` then reports a failure
    and returns ``None`` instead of raising ``AttributeError``.
    """

    def __init__(self, host='127.0.0.1', user='root', password='123456',
                 database='han', charset='utf8'):
        """Open the database connection and create a cursor.

        Args:
            host: MySQL server address.
            user: Login user name.
            password: Login password.
            database: Name of the database to use.
            charset: Connection character set.

        The defaults reproduce the values that were previously hard-coded,
        so ``Mydb()`` keeps connecting to the same database.
        """
        # Pre-set both handles so a failed connect leaves the object in a
        # well-defined state rather than without these attributes.
        self.conn = None
        self.cursor = None
        try:
            # Keyword arguments: newer PyMySQL releases accept connection
            # parameters by keyword only.
            self.conn = pymysql.connect(
                host=host,
                user=user,
                password=password,
                database=database,
                charset=charset,
            )
            self.cursor = self.conn.cursor()
        except Exception as e:
            print(e)

    def execute(self, sql, data):
        """Run one parameterized write statement and commit it.

        Args:
            sql: Statement text with ``%s`` placeholders.
            data: Parameter values bound to the placeholders.

        Returns:
            The value returned by ``cursor.execute`` (the affected row count)
            on success, or ``None`` when the statement failed or no
            connection is available.
        """
        if self.conn is None or self.cursor is None:
            # The connection was never established; nothing to roll back.
            print('执行增删改失败')
            return None
        try:
            row = self.cursor.execute(sql, data)
            self.conn.commit()
            return row  # affected row count
        except Exception as e:
            print('执行增删改失败')
            print(e)
            self.conn.rollback()
            return None

if __name__ == '__main__':
    # Manual smoke test: insert one sample row and print what execute() returns.
    db = Mydb()
    insert_sql = (
        'insert into py07_58friend(`name`,`age`,`height`,`edu`,`img`) '
        'VALUES(%s,%s,%s,%s,%s)'
    )
    sample_row = ("大美", 16, 170, '博士', '')
    print(db.execute(insert_sql, sample_row))


# paqu.py

import threading
import requests
from bs4 import BeautifulSoup
import time
from mydb import Mydb
from queue import Queue

class MyThread(threading.Thread):
    """Worker thread that downloads listing pages and stores each table row.

    Each worker pulls URLs from a shared queue until it is empty, parses the
    ``<tr>`` rows of the returned page and inserts one record per row through
    the shared ``mydb`` object, serialized by ``lock``.
    """

    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }

    def __init__(self, task_q, mydb, lock, timeout=10):
        """Store the shared resources used by ``run``.

        Args:
            task_q: Queue of page URLs to fetch.
            mydb: Object whose ``execute(sql, data)`` performs the insert.
            lock: Lock held around every ``mydb.execute`` call.
            timeout: Seconds passed to ``requests.get`` as its timeout so a
                stalled server cannot block the worker indefinitely.
        """
        super().__init__()
        self.task_q = task_q
        self.mydb = mydb
        self.lock = lock
        self.timeout = timeout

    def run(self):
        """Process URLs from the queue until none are left.

        Called by ``Thread.start()``. A page that cannot be fetched is logged
        and skipped; rows without the expected cells are ignored.
        """
        # Local import: the file-level imports only bring in ``Queue``.
        from queue import Empty

        sql = 'insert into py07_location(p_name,p_type,p_num,p_loc,p_date) values(%s,%s,%s,%s,%s)'

        while True:
            # Take a task without blocking. Checking empty() and then calling
            # a blocking get() lets another worker grab the last item in
            # between, leaving this thread stuck in get() forever.
            try:
                fullurl = self.task_q.get_nowait()
            except Empty:
                break
            print(fullurl)

            try:
                response = requests.get(
                    fullurl, headers=self.headers, timeout=self.timeout
                )
                response.raise_for_status()
            except requests.RequestException as e:
                # One bad page must not terminate the whole worker.
                print(e)
                continue

            soup = BeautifulSoup(response.text, 'lxml')

            # Skip the first row and the last two rows (presumably the table
            # header and footer/pagination rows; confirm against the page).
            for tr in soup.select('tr')[1:-2]:
                cells = tr.select('td')
                links = tr.select('td a')
                if len(cells) < 5 or not links:
                    # Not a data row in the expected layout.
                    continue

                data = (
                    links[0].text,   # position name
                    cells[1].text,   # position category
                    cells[2].text,   # number of openings
                    cells[3].text,   # location
                    cells[4].text,   # publish date
                )

                # All workers share one database object, so serialize access.
                with self.lock:
                    self.mydb.execute(sql, data)




if __name__ == '__main__':
    # One database wrapper and one lock are shared by every worker.
    db = Mydb()
    db_lock = threading.Lock()
    print(time.ctime())  # start time

    # Queue one URL per listing page: start offsets 0, 10, ..., 3000.
    pending = Queue()
    url_template = 'https://hr.tencent.com/position.php?start={}'
    offset = 0
    while offset <= 3000:
        pending.put(url_template.format(offset))
        offset += 10

    # Launch 20 workers, then wait for all of them to finish.
    workers = []
    for _ in range(20):
        worker = MyThread(pending, db, db_lock)
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()

    print(time.ctime())  # end time

# 爬取结果如下:

(67)-- 多线程爬取腾讯招聘并存入数据库



(67)-- 多线程爬取腾讯招聘并存入数据库


兄弟连学python


Python学习交流、资源共享QQ群:563626388


相关标签: 腾讯招聘