Python3爬取网页数据存入MySQL
程序员文章站
2022-07-12 22:13:09
...
不太会用这个编辑器,就把 Word 截图贴过来了。
from bs4 import BeautifulSoup
import urllib.request
import ssl #导入ssl认证东西
import time
import random
import mysql.connector
# --- MySQL setup -------------------------------------------------------
# Connect to the local server and (re)create the comments table.
# NOTE(review): credentials and the system `mysql` schema are hard-coded;
# move them to config/environment variables and a dedicated database.
print('connect to mysql...')
# charset='utf8mb4' (was 'utf8'): MySQL's 'utf8' is a 3-byte subset and
# rejects 4-byte characters (emoji etc.) that scraped comments may contain.
conn = mysql.connector.connect(host='localhost', user='root', passwd='dongxue0123',
                               db='mysql', port=3306, charset='utf8mb4')
print("connected!")
cursor = conn.cursor()

# Start from a clean table on every run.
cursor.execute("DROP TABLE IF EXISTS COMMM")
# VARCHAR (was CHAR): the columns hold variable-length scraped text, and
# CHAR(255) space-pads every value to full width.
sql = """CREATE TABLE COMMM(
school_name VARCHAR(255) NOT NULL ,
teacher_name VARCHAR(255) NOT NULL ,
comm_date VARCHAR(255),
commm VARCHAR(255),
index(teacher_name))"""
cursor.execute(sql)
# Globally disable HTTPS certificate verification — the site's certificate
# fails validation otherwise.  NOTE(review): this turns off TLS security
# for every request the process makes; scope it down if possible.
ssl._create_default_https_context = ssl._create_unverified_context

url = "https://www.mysupervisor.org/viewforum.php?f=115&sid=9867c9c03c1efefa23dafda9e7d61d07"

# Download the forum index, retrying once a second until it succeeds.
keep_request = True  # retry flag: loop until the page downloads
while keep_request:
    try:
        page = urllib.request.urlopen(url, timeout=10).read()
        keep_request = False
        main = BeautifulSoup(page, "html.parser")
    except Exception:  # was a bare `except:` — that also swallowed Ctrl-C
        print("reconnect to web..")
        time.sleep(1)
# --- Crawl: forum index -> school pages -> teacher threads -> comments --
for school_list in main.find_all('li', class_="row"):
    # Random 0–10 s pause between schools to avoid hammering the server.
    time.sleep(10 * random.random())

    # Each <li class="row"> links to one school's sub-forum; the href is
    # relative ("./viewforum...") so strip the leading dot.
    half_school_link = school_list.a.get('href')
    school_link = "https://www.mysupervisor.org" + half_school_link.strip('.')

    # Fetch the school page, retrying until it succeeds.
    keep_request = True
    while keep_request:
        try:
            page1 = urllib.request.urlopen(school_link, timeout=20).read()
            keep_request = False
            school = BeautifulSoup(page1, "html.parser")
        except Exception:  # narrowed from bare `except:` — keep Ctrl-C working
            time.sleep(1)

    # One <dl class="icon"> per teacher thread on the school page.
    for teacher_list in school.find_all('dl', class_="icon"):
        count = teacher_list.dd.get_text()  # reply-count text for the thread
        if count[0] != '0':  # only visit teachers that have comments
            half_name_link = teacher_list.a.get('href')
            name_link = "https://www.mysupervisor.org" + half_name_link.strip('.')

            # Fetch the teacher's thread page, same retry pattern.
            keep_requestt = True
            while keep_requestt:
                try:
                    page2 = urllib.request.urlopen(name_link, timeout=20).read()
                    keep_requestt = False
                    soup2 = BeautifulSoup(page2, "html.parser")
                except Exception:
                    time.sleep(1)

            # One <div class="inner"> per post; pull the date and the body.
            for k in soup2.find_all('div', class_='inner'):
                # BUGFIX: was `or` — the body below calls .get_text() on BOTH
                # sub-elements, so a post with only one of them raised
                # AttributeError on None and killed the whole crawl.
                if k.find(class_="postprofile") and k.find(class_="content"):
                    datee = k.find(class_="postprofile").get_text().strip()
                    # Per the author's note, `datee` is "<guest-label> + date";
                    # slicing at 11 keeps only the date part — TODO confirm
                    # the prefix is always exactly 11 characters.
                    date = datee[11:]
                    comment = k.find(class_="content").get_text().strip()
                    print(school_list.a.get_text(), teacher_list.a.get_text(), date, comment)

                    # Truncate to fit the 255-char column.
                    if len(comment) > 255:
                        comment = comment[:255]
                    # FIX: reuse the module-level connection/cursor.  The
                    # original opened a new MySQL connection per comment and
                    # never closed the previous one — a connection leak.
                    insert_commm = ("insert into COMMM(school_name,teacher_name,comm_date,commm)"
                                    "VALUE (%s,%s,%s,%s)")
                    data_commm = (school_list.a.get_text(), teacher_list.a.get_text(), date, comment)
                    cursor.execute(insert_commm, data_commm)
                    conn.commit()

# Release the database resources once the crawl finishes.
cursor.close()
conn.close()