(67)-- Multi-threaded crawling of Tencent job postings and storing them in a database
# Multi-threaded crawl of Tencent recruitment job listings, stored in a database
# mydb.py
import pymysql

class Mydb:
    def __init__(self):
        try:
            # connect to the local MySQL database "han"
            self.conn = pymysql.connect(host='127.0.0.1', user='root',
                                        password='123456', database='han',
                                        charset='utf8')
            self.cursor = self.conn.cursor()
        except Exception as e:
            print(e)

    def execute(self, sql, data):
        try:
            row = self.cursor.execute(sql, data)
            self.conn.commit()
            return row  # number of affected rows
        except Exception as e:
            print('Insert/update/delete failed')
            print(e)
            self.conn.rollback()

if __name__ == '__main__':
    mydb = Mydb()
    sql = 'insert into py07_58friend(`name`,`age`,`height`,`edu`,`img`) VALUES(%s,%s,%s,%s,%s)'
    data = ('大美', 16, 170, '博士', '')
    row = mydb.execute(sql, data)
    print(row)
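The INSERT statements above assume that the tables py07_58friend and py07_location already exist in the han database; the original post does not show their definitions. Below is a minimal setup sketch that reuses the connection settings from mydb.py, with column names taken from the INSERT statements and column types that are only guesses:

# create_tables.py -- hypothetical schema; the column types are assumptions
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                       database='han', charset='utf8')
with conn.cursor() as cursor:
    # table used by the self-test in mydb.py
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS py07_58friend (
            id       INT PRIMARY KEY AUTO_INCREMENT,
            `name`   VARCHAR(50),
            `age`    INT,
            `height` INT,
            `edu`    VARCHAR(20),
            `img`    VARCHAR(255)
        ) DEFAULT CHARSET=utf8
    ''')
    # table the crawler below writes into
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS py07_location (
            id     INT PRIMARY KEY AUTO_INCREMENT,
            p_name VARCHAR(100),
            p_type VARCHAR(50),
            p_num  VARCHAR(10),
            p_loc  VARCHAR(50),
            p_date VARCHAR(20)
        ) DEFAULT CHARSET=utf8
    ''')
conn.commit()
conn.close()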
# paqu.py
import threading
import requests
from bs4 import BeautifulSoup
import time
from mydb import Mydb
from queue import Queue

class MyThread(threading.Thread):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }

    def __init__(self, task_q, mydb, lock):
        super(MyThread, self).__init__()
        self.task_q = task_q
        self.mydb = mydb
        self.lock = lock

    # run() is invoked automatically once the thread is started
    def run(self):
        while not self.task_q.empty():
            # take one listing-page URL from the task queue
            fullurl = self.task_q.get()
            print(fullurl)
            response = requests.get(fullurl, headers=self.headers)
            html = BeautifulSoup(response.text, 'lxml')
            # skip the table header row and the two footer rows
            tr_list = html.select('tr')[1:-2]
            for tr in tr_list:
                position_name = tr.select('td a')[0].text
                position_cls = tr.select('td')[1].text
                position_num = tr.select('td')[2].text
                position_loc = tr.select('td')[3].text
                position_time = tr.select('td')[4].text
                sql = 'insert into py07_location(p_name,p_type,p_num,p_loc,p_date) values(%s,%s,%s,%s,%s)'
                data = (position_name, position_cls, position_num, position_loc, position_time)
                # the database connection is shared by all threads, so serialize
                # writes; "with lock" is equivalent to lock.acquire()/lock.release()
                with self.lock:
                    self.mydb.execute(sql, data)

if __name__ == '__main__':
    mydb = Mydb()
    lock = threading.Lock()
    print(time.ctime())
    task_q = Queue()
    base_url = 'https://hr.tencent.com/position.php?start={}'
    # each listing page shows 10 positions, so advance "start" by 10
    for i in range(0, 3000 + 1, 10):
        fullurl = base_url.format(i)
        task_q.put(fullurl)
    thread_list = []
    # start 20 worker threads that drain the queue concurrently
    for i in range(20):
        t = MyThread(task_q, mydb, lock)
        t.start()
        thread_list.append(t)
    for t in thread_list:
        t.join()
    print(time.ctime())
# The crawl results are as follows:
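After the run finishes, you can confirm that the workers actually wrote to the database by counting the stored rows. A quick sanity-check sketch, again reusing the connection settings from mydb.py and assuming the py07_location table sketched earlier:

# check_rows.py -- count the rows written by the crawler
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                       database='han', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('SELECT COUNT(*) FROM py07_location')
    print('rows stored:', cursor.fetchone()[0])
conn.close()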