Python3爬取网页数据存入MySQL
程序员文章站
2022-07-12 22:13:09
...
不太会用这个编辑器,就把 Word 截图贴过来了。
from bs4 import BeautifulSoup
import urllib.request
import ssl #导入ssl认证东西
import time
import random
import mysql.connector
# --- MySQL setup -------------------------------------------------------
# Connect to the local server and (re)create the comments table.
# NOTE(review): credentials and the system `mysql` schema are hard-coded;
# move them to config/environment variables and a dedicated database.
print('connect to mysql...')
# charset='utf8mb4' (was 'utf8'): MySQL's 'utf8' is a 3-byte subset and
# rejects 4-byte characters (emoji etc.) that scraped comments may contain.
conn = mysql.connector.connect(host='localhost', user='root', passwd='dongxue0123',
                               db='mysql', port=3306, charset='utf8mb4')
print("connected!")
cursor = conn.cursor()

# Start from a clean table on every run.
cursor.execute("DROP TABLE IF EXISTS COMMM")
# VARCHAR (was CHAR): the columns hold variable-length scraped text, and
# CHAR(255) space-pads every value to full width.
sql = """CREATE TABLE COMMM(
school_name VARCHAR(255) NOT NULL ,
teacher_name VARCHAR(255) NOT NULL ,
comm_date VARCHAR(255),
commm VARCHAR(255),
index(teacher_name))"""
cursor.execute(sql)
# Globally disable HTTPS certificate verification — the site's certificate
# fails validation otherwise.  NOTE(review): this turns off TLS security
# for every request the process makes; scope it down if possible.
ssl._create_default_https_context = ssl._create_unverified_context

url = "https://www.mysupervisor.org/viewforum.php?f=115&sid=9867c9c03c1efefa23dafda9e7d61d07"

# Download the forum index, retrying once a second until it succeeds.
keep_request = True  # retry flag: loop until the page downloads
while keep_request:
    try:
        page = urllib.request.urlopen(url, timeout=10).read()
        keep_request = False
        main = BeautifulSoup(page, "html.parser")
    except Exception:  # was a bare `except:` — that also swallowed Ctrl-C
        print("reconnect to web..")
        time.sleep(1)
# --- Crawl: forum index -> school pages -> teacher threads -> comments --
for school_list in main.find_all('li', class_="row"):
    # Random 0–10 s pause between schools to avoid hammering the server.
    time.sleep(10 * random.random())

    # Each <li class="row"> links to one school's sub-forum; the href is
    # relative ("./viewforum...") so strip the leading dot.
    half_school_link = school_list.a.get('href')
    school_link = "https://www.mysupervisor.org" + half_school_link.strip('.')

    # Fetch the school page, retrying until it succeeds.
    keep_request = True
    while keep_request:
        try:
            page1 = urllib.request.urlopen(school_link, timeout=20).read()
            keep_request = False
            school = BeautifulSoup(page1, "html.parser")
        except Exception:  # narrowed from bare `except:` — keep Ctrl-C working
            time.sleep(1)

    # One <dl class="icon"> per teacher thread on the school page.
    for teacher_list in school.find_all('dl', class_="icon"):
        count = teacher_list.dd.get_text()  # reply-count text for the thread
        if count[0] != '0':  # only visit teachers that have comments
            half_name_link = teacher_list.a.get('href')
            name_link = "https://www.mysupervisor.org" + half_name_link.strip('.')

            # Fetch the teacher's thread page, same retry pattern.
            keep_requestt = True
            while keep_requestt:
                try:
                    page2 = urllib.request.urlopen(name_link, timeout=20).read()
                    keep_requestt = False
                    soup2 = BeautifulSoup(page2, "html.parser")
                except Exception:
                    time.sleep(1)

            # One <div class="inner"> per post; pull the date and the body.
            for k in soup2.find_all('div', class_='inner'):
                # BUGFIX: was `or` — the body below calls .get_text() on BOTH
                # sub-elements, so a post with only one of them raised
                # AttributeError on None and killed the whole crawl.
                if k.find(class_="postprofile") and k.find(class_="content"):
                    datee = k.find(class_="postprofile").get_text().strip()
                    # Per the author's note, `datee` is "<guest-label> + date";
                    # slicing at 11 keeps only the date part — TODO confirm
                    # the prefix is always exactly 11 characters.
                    date = datee[11:]
                    comment = k.find(class_="content").get_text().strip()
                    print(school_list.a.get_text(), teacher_list.a.get_text(), date, comment)

                    # Truncate to fit the 255-char column.
                    if len(comment) > 255:
                        comment = comment[:255]
                    # FIX: reuse the module-level connection/cursor.  The
                    # original opened a new MySQL connection per comment and
                    # never closed the previous one — a connection leak.
                    insert_commm = ("insert into COMMM(school_name,teacher_name,comm_date,commm)"
                                    "VALUE (%s,%s,%s,%s)")
                    data_commm = (school_list.a.get_text(), teacher_list.a.get_text(), date, comment)
                    cursor.execute(insert_commm, data_commm)
                    conn.commit()

# Release the database resources once the crawl finishes.
cursor.close()
conn.close()