汽车用户消费投诉数据爬取分析（Python爬虫）

程序员文章站 2022-04-16 16:23:21

"""name: 汽车用户消费投诉_品牌url爬取，已完成; author: xiaoyu""" import random; import re; import time; import pandas as pd; import requests; from bs4 import BeautifulSoup; from selenium import webdriver; from sqlalchemy import create_engine; def get_url_for_all_brand(): "...

多线程爬虫代码

"""
name:汽车用户消费投诉_品牌url爬取，已完成
author:zhangxiaoyu
"""
import _thread
import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from sqlalchemy import create_engine


def new_headers():
    """
    Build request headers carrying a randomized User-Agent.

    The User-Agent is a fixed Chrome 83 string whose trailing ``Safari/``
    version is made of two random integers (1-999, then 1-99).

    :return: dict with a single ``'User-Agent'`` entry
    """
    major = random.randint(1, 999)
    minor = random.randint(1, 99)

    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/83.0.4103.116 Safari/'
        + '{}.{}'.format(major, minor)
    )
    return {'User-Agent': user_agent}


def get_url_for_all_brand():
    """
    Scrape the brand listing page and append every brand name/URL pair to MySQL.

    The page is opened in Chrome through Selenium with a randomized
    User-Agent, the brand links are extracted from the page source with a
    regular expression, and the result is appended to the ``品牌url汇总``
    table.

    :return: None
    """
    # Randomized User-Agent handed to the browser
    headers = new_headers()
    options = webdriver.ChromeOptions()
    options.add_argument('user-agent=' + headers['User-Agent'])

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url='http://tousu.315che.com/tousulist/serial/93/')
        source = driver.page_source
    finally:
        # quit() also ends the chromedriver process; the finally block makes
        # sure the browser is released even when loading the page fails.
        driver.quit()

    # One pass captures both the link and its text, so the two columns are
    # always the same length and stay paired row by row.
    pairs = re.findall(
        '<a href="(http://tousu.315che.com/tousulist/serial/.{1,7})/">(.{1,40})</a>',
        source)
    car_href = [href for href, _ in pairs]
    car_name = [name for _, name in pairs]

    for name in car_name:
        print(name)
    for href in car_href:
        print(href)

    data = pd.DataFrame({
        'car_name': car_name,
        'car_href': car_href
    })

    # Connection string: mysql+pymysql://user:password@host:port/database?charset
    con = create_engine('mysql+pymysql://root:123456789@127.0.0.1:3306/python爬虫?charset=utf8')
    # Rows are appended, so running this twice stores every brand twice
    data.to_sql('品牌url汇总', con=con, if_exists='append')

    print("成功")


def download_url_for_all_brand():
    """
    Load the stored brand name/URL pairs from MySQL.

    :return: DataFrame holding the ``car_name`` and ``car_href`` columns of
        the ``品牌url汇总`` table
    """
    # SQLAlchemy engine for the local crawler database
    engine = create_engine('mysql+pymysql://root:123456789@127.0.0.1:3306/python爬虫?charset=utf8')

    # Read the whole table, then keep only the two columns callers use
    brand_table = pd.read_sql(sql="SELECT * FROM 品牌url汇总", con=engine)
    return brand_table[['car_name', 'car_href']]


def get_brand_detail_url(brand, brand_url):
    """
    Collect every complaint URL listed for one brand and scrape each of them.

    The brand page is opened once in Chrome to read the page count and the
    final URL; the individual listing pages are then fetched with
    ``requests``. Each distinct complaint URL found is passed to
    ``get_feedback``. Failures on a single page or complaint are reported
    and skipped so one bad page does not stop the brand.

    :param brand: brand name, passed through to ``get_feedback``
    :param brand_url: URL of the brand's complaint listing
    :return: None
    """
    headers = new_headers()

    options = webdriver.ChromeOptions()
    options.add_argument('user-agent=' + headers['User-Agent'])

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url=brand_url)
        source = driver.page_source
        base_url = driver.current_url
    finally:
        # The browser is only needed for the first page; release it right
        # away instead of leaving one Chrome instance open per brand.
        driver.quit()

    page_num = re.findall('<span class="pag-tip">共(.+)页</span>', source)
    print("页数:{}".format(page_num))

    if not page_num:
        return

    all_detail_url_list = []
    for page in range(1, int(page_num[0]) + 1):
        brand_url_page = base_url + "/0/0/0/" + str(page) + ".htm"
        print(brand_url_page)

        try:
            # A timeout keeps a stalled connection from blocking the worker forever
            response = requests.get(url=brand_url_page, headers=headers, timeout=30)
            response.encoding = 'utf-8'

            # Pull the complaint links out of the listing container
            soup = BeautifulSoup(response.text, 'lxml')
            soup1 = soup.find_all(class_="tousu-filter-list")
            detail_url_list = re.findall('<a href="(.+)" target="_blank">', str(soup1))
            print(detail_url_list)
            all_detail_url_list.extend(detail_url_list)
            time.sleep(1)
        except Exception as exc:
            # Best effort: report the failed page and carry on with the next one
            print(brand_url_page + "出错！！！", exc)

    # Drop duplicates while keeping the order in which URLs were found
    all_detail_url_list = list(dict.fromkeys(all_detail_url_list))

    print(all_detail_url_list)
    for feedback_url in all_detail_url_list:
        try:
            get_feedback(brand, feedback_url)
            print(feedback_url + "成功")
        except Exception as exc:
            print(feedback_url + "出错！！！", exc)


def _first_or_none(values):
    """Return the first item of *values*, or None when *values* is empty."""
    return values[0] if values else None


def get_feedback(brand, feedback_url):
    """
    Scrape one complaint page and append it as a single row to MySQL.

    Fields that cannot be found on the page are stored as NULL instead of
    making the whole row fail, except for the complaint number, which is
    required.

    :param brand: car brand the complaint belongs to
    :param feedback_url: URL of the complaint detail page
    :return: None
    :raises ValueError: if no complaint number is found on the page
    """
    # A timeout keeps a stalled connection from blocking the worker forever
    response = requests.get(url=feedback_url, headers=new_headers(), timeout=30)
    response.encoding = 'utf-8'
    source = response.text

    # Complaint number
    feedback_no = re.findall('<p class="">单号：(.+)</p>', source)
    print(feedback_no)
    if not feedback_no:
        raise ValueError("no complaint number found at " + feedback_url)

    # Brand and model
    brand_model = re.findall('<p class="highlight">品牌车型：(.+)</p>', source)

    # Problem the customer wants resolved
    feedback_question = re.findall('<p class="">诉求问题：(.+)</p>', source)
    print(feedback_question)

    # Complaint timestamp
    feedback_time = re.findall('<p class="">投诉时间：(....-.{0,3}-.{0,3} ..:..:..)</p>', source)

    # Dealer
    shop = re.findall('<p class="">经销商：(.+)</p>', source)

    # Complaint text: first "describe" element, minus its first and last character
    soup = BeautifulSoup(source, 'lxml')
    describe = soup.find_all(class_="describe")
    mark = describe[0].get_text()[1:-1] if describe else None

    # Complaint status
    # NOTE(review): only the "article-tag unsolved" class is searched, so a
    # page that renders its status with another class stores NULL here —
    # confirm against the site's markup.
    soup2 = soup.find_all(class_="article-tag unsolved")
    status = re.findall('<span class=".+">(.+)</span>', str(soup2))

    data = pd.DataFrame([{
        'feedback_no': feedback_no[0],
        'brand': brand,
        'brand_model': _first_or_none(brand_model),
        'feedback_question': _first_or_none(feedback_question),
        'mark': mark,
        'feedback_time': _first_or_none(feedback_time),
        'shop': _first_or_none(shop),
        'status': _first_or_none(status),
        'feedback_url': feedback_url,
    }])

    # Connection string: mysql+pymysql://user:password@host:port/database?charset
    con = create_engine('mysql+pymysql://root:123456789@127.0.0.1:3306/python爬虫?charset=utf8')
    data.to_sql('汽车用户消费投诉多线程', con=con, if_exists='append')
    time.sleep(0.5)


# 多线程
def print_time(start, end):
    """
    Thread worker: scrape the brands stored at row positions ``[start, end)``.

    The brand table is downloaded from the database on every call. Rows are
    taken by position, so a range that reaches past the end of the table is
    simply cut short instead of raising.

    :param start: first row position to process (inclusive)
    :param end: row position to stop at (exclusive)
    :return: None
    """
    url_brand = download_url_for_all_brand()

    # Positional slice plus named columns replaces the chained .loc[i][0] lookup
    chunk = url_brand.iloc[start:end]
    for brand, brand_url in zip(chunk['car_name'], chunk['car_href']):
        print(brand, brand_url)
        get_brand_detail_url(brand, brand_url)
    print("一个线程结束")


if __name__ == '__main__':
    import threading

    # Refresh the brand table before the workers read it
    get_url_for_all_brand()

    # Same split as before: 25 brands per worker over rows 0..556,
    # i.e. (0, 25), (25, 50), ..., (525, 550), (550, 557).
    total_brands = 557
    chunk_size = 25

    workers = [
        threading.Thread(
            target=print_time,
            args=(begin, min(begin + chunk_size, total_brands)),
        )
        for begin in range(0, total_brands, chunk_size)
    ]
    for worker in workers:
        worker.start()

    # Block until every worker is done; this replaces the endless busy loop,
    # so the script no longer burns a CPU core and exits when scraping ends.
    for worker in workers:
        worker.join()

简单的数据清洗

# Data cleaning: drop duplicated complaint rows

import pandas as pd
import matplotlib.pyplot as plt

# Load the raw export
raw = pd.read_excel(r'D:\Desktop\汽车用户消费投诉.xlsx')

# Keep only the first occurrence of each fully identical row
data = raw.drop_duplicates()

# Save the de-duplicated table
data.to_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

投诉最多的二十大车型

# Top 20 most-complained-about car models

import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned complaint data
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Count complaints per model. The grouping key must be 'brand_model' itself:
# counts indexed by model cannot be aligned onto a frame indexed by brand.
result = data.groupby(by='brand_model')[['brand_model']].count()
result = result.rename(columns={'brand_model': '数量'})
result = result.sort_values(by='数量', ascending=False)

# Keep the 20 models with the most complaints
result = result.iloc[:20]

# 1. Create the canvas
plt.figure(figsize=(20, 8), dpi=300)

# 2. Bar chart of complaint counts per model
plt.bar(result.index, result['数量'], width=0.5)

# 3. Show the figure
plt.show()

投诉最多的十大品牌

# Top 10 most-complained-about brands

import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned complaint data
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Count complaints per brand (one groupby is enough) and sort descending
result = data.groupby(by='brand')[['brand']].count()
result = result.rename(columns={'brand': '数量'})
result = result.sort_values(by='数量', ascending=False)

# Keep the 10 brands with the most complaints
result = result.iloc[:10]

# 1. Create the canvas
plt.figure(figsize=(20, 8), dpi=300)

# 2. Bar chart of complaint counts per brand
plt.bar(result.index, result['数量'], width=0.5)

# 3. Show the figure
plt.show()

最不靠谱的10大经销商

# The 10 dealers with the most complaints

import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned complaint data
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Count complaints per dealer (one groupby is enough) and sort descending
result = data.groupby(by='shop')[['shop']].count()
result = result.rename(columns={'shop': '数量'})
result = result.sort_values(by='数量', ascending=False)

# NOTE(review): the most frequent 'shop' value (position 0) is skipped —
# presumably it is a placeholder rather than a real dealer; confirm against
# the data.
result = result.iloc[1:11]

# 1. Create the canvas
plt.figure(figsize=(20, 8), dpi=300)

# 2. Bar chart of complaint counts per dealer
plt.bar(result.index, result['数量'], width=0.5)

# 3. Show the figure
plt.show()

最近一年本网站接到的投诉数据趋势

# Complaint volume trend over roughly the last year

import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned complaint data
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Derive a year-month key (e.g. '202007') from the complaint timestamp
data['投诉年月'] = pd.DatetimeIndex(data['feedback_time']).strftime('%Y%m')

# Count complaints per month (one groupby is enough)
result = data.groupby(by='投诉年月')[['投诉年月']].count()
result = result.rename(columns={'投诉年月': '数量'})

# NOTE(review): this keeps 11 months and drops the final one — presumably
# because the latest month is incomplete; use iloc[-13:-1] if a full
# 12-month window is intended.
result = result.iloc[-12:-1]

# 1. Create the canvas
plt.figure(figsize=(20, 8), dpi=300)

# 2. Line chart of monthly complaint counts, drawn on the primary axes
#    (a twin axis with nothing on the primary one leaves an empty y-axis)
plt.plot(result.index, result['数量'], color='r')

# 3. Show the figure
plt.show()

本文地址：https://blog.csdn.net/qq_29537269/article/details/107386998

汽车用户消费投诉数据爬取分析（Python爬虫）

多线程爬虫代码

简单的数据清洗

投诉最多的二十大车型

投诉最多的十大品牌

最不靠谱的10大经销商

最近一年本网站接到的投诉数据趋势

php爬虫：百万级别知乎用户数据爬取与分析

PHP爬虫之百万级别知乎用户数据爬取与分析

网易云歌单信息爬取及数据分析（python爬虫）

汽车用户消费投诉数据爬取分析（Python爬虫）

php爬虫：百万级别知乎用户数据爬取与分析

Python爬虫+数据分析实战--爬取并分析中国天气网的温度信息

python爬虫之Appium爬取手机App数据及模拟用户手势

PHP爬虫之百万级别知乎用户数据爬取与分析

网易云歌单信息爬取及数据分析（python爬虫）

Python爬虫实战，爬取A股公司数据，简单分析A股公司并生成词云