欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

汽车用户消费投诉数据爬取分析(Python爬虫)

程序员文章站 2022-04-16 16:23:21
"""name:汽车用户消费投诉_品牌url爬取,已完成author:xiaoyu"""import randomimport reimport timeimport pandas as pdimport requestsfrom bs4 import BeautifulSoupfrom selenium import webdriverfrom sqlalchemy import create_enginedef get_url_for_all_brand(): "...

多线程爬虫代码

"""
name:汽车用户消费投诉_品牌url爬取,已完成
author:zhangxiaoyu
"""
import _thread
import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from sqlalchemy import create_engine


def new_headers():
    """
    Build request headers carrying a randomized User-Agent.

    The User-Agent is a fixed Chrome 83 / Windows 10 string whose trailing
    ``Safari/<a>.<b>`` version is randomized on every call, with ``a`` drawn
    from [1, 999] and ``b`` drawn from [1, 99].

    :return: dict with a single 'User-Agent' entry
    """
    major = random.randint(1, 999)
    minor = random.randint(1, 99)

    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/'
        + '{}.{}'.format(major, minor)
    )
    return {'User-Agent': user_agent}


def get_url_for_all_brand():
    """
    Scrape every brand's name and complaint-list URL and store them in MySQL.

    Opens one brand page in Chrome (with a randomized User-Agent), extracts
    all ``<a href=".../tousulist/serial/<id>/">name</a>`` links from the
    rendered HTML and appends them to the ``品牌url汇总`` table.

    NOTE: the table is written with ``if_exists='append'``, so every call adds
    the rows again; callers that re-run this function will accumulate
    duplicates.

    :return: None
    """
    headers = new_headers()

    # Configure Chrome to send the randomized User-Agent.
    options = webdriver.ChromeOptions()
    options.add_argument('user-agent=' + headers['User-Agent'])
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url='http://tousu.315che.com/tousulist/serial/93/')
        source = driver.page_source
    finally:
        # Always end the browser session, even when loading the page fails,
        # so no Chrome/chromedriver process is left behind.
        driver.quit()

    # One pass captures (href, name) together, so the two columns are always
    # the same length and stay aligned row by row.
    links = re.findall(
        '<a href="(http://tousu.315che.com/tousulist/serial/.{1,7})/">(.{1,40})</a>',
        source,
    )
    car_href = [href for href, _ in links]
    car_name = [name for _, name in links]

    for name in car_name:
        print(name)
    for href in car_href:
        print(href)

    data = pd.DataFrame({
        'car_name': car_name,
        'car_href': car_href
    })

    # Connection URL: mysql+pymysql://user:password@host:port/database?charset
    con = create_engine('mysql+pymysql://root:123456789@127.0.0.1:3306/python爬虫?charset=utf8')
    data.to_sql('品牌url汇总', con=con, if_exists='append')

    print("成功")


def download_url_for_all_brand():
    """
    Load the stored brand names and brand URLs from MySQL.

    Reads the whole ``品牌url汇总`` table and keeps only the two columns the
    crawler needs.

    :return: DataFrame with the columns 'car_name' and 'car_href'
    """
    # Build the SQLAlchemy engine for the local crawler database.
    engine = create_engine('mysql+pymysql://root:123456789@127.0.0.1:3306/python爬虫?charset=utf8')

    all_rows = pd.read_sql(sql="SELECT * FROM 品牌url汇总", con=engine)
    return all_rows[['car_name', 'car_href']]


def get_brand_detail_url(brand, brand_url):
    """
    Crawl every complaint listed for one brand.

    Opens the brand page in Chrome to read the total page count, then fetches
    each list page with ``requests``, collects the complaint detail URLs found
    inside the ``tousu-filter-list`` elements, and calls ``get_feedback`` for
    every distinct URL. Failures on a single page or a single complaint are
    reported and skipped so the rest of the brand is still processed.

    :param brand: brand name, passed through to ``get_feedback``
    :param brand_url: URL of the brand's complaint list page
    :return: None
    """
    headers = new_headers()

    # Configure Chrome to send the randomized User-Agent.
    options = webdriver.ChromeOptions()
    options.add_argument('user-agent=' + headers['User-Agent'])
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url=brand_url)
        source = driver.page_source
        # The browser does not navigate again, so the URL can be read once.
        base_url = driver.current_url
    finally:
        # Only the first page needs the browser; end the session right away
        # so each brand does not leave a Chrome process running.
        driver.quit()

    page_num = re.findall('<span class="pag-tip">共(.+)页</span>', source)
    print("页数:{}".format(page_num))

    if not page_num:
        return

    all_detail_url_list = []
    for page in range(1, int(page_num[0]) + 1):
        brand_url_page = base_url + "/0/0/0/" + str(page) + ".htm"
        print(brand_url_page)

        try:
            # timeout keeps a stalled connection from blocking the thread forever
            response = requests.get(url=brand_url_page, headers=headers, timeout=30)
            response.encoding = 'utf-8'

            # Collect the detail-page link of every complaint on this list page.
            soup = BeautifulSoup(response.text, 'lxml')
            filter_list = soup.find_all(class_="tousu-filter-list")
            detail_url_list = re.findall('<a href="(.+)" target="_blank">', str(filter_list))
            print(detail_url_list)
            all_detail_url_list.extend(detail_url_list)
            time.sleep(1)
        except Exception as exc:
            # Best effort: report the failed page and carry on with the next.
            print(brand_url_page + "出错!!!", exc)

    # De-duplicate while keeping first-seen order.
    all_detail_url_list = list(dict.fromkeys(all_detail_url_list))

    # Visit every complaint detail page.
    print(all_detail_url_list)
    for feedback_url in all_detail_url_list:
        try:
            get_feedback(brand, feedback_url)
            print(feedback_url + "成功")
        except Exception as exc:
            print(feedback_url + "出错!!!", exc)


def _first_or_none(matches):
    """Return the first element of *matches*, or None when it is empty."""
    return matches[0] if matches else None


def get_feedback(brand, feedback_url):
    """
    Download one complaint page and append it as a row to MySQL.

    Fields are extracted from the page HTML with regular expressions and
    BeautifulSoup. A field that cannot be found is stored as NULL instead of
    aborting the whole record; only a missing complaint number is treated as
    an error, because then the page is not a usable complaint.

    :param brand: car brand the complaint belongs to
    :param feedback_url: URL of the complaint detail page
    :return: None
    :raises ValueError: if no complaint number is found on the page
    """
    headers = new_headers()

    # timeout keeps a stalled connection from blocking the thread forever
    response = requests.get(url=feedback_url, headers=headers, timeout=30)
    response.encoding = 'utf-8'
    source = response.text

    # Complaint number
    feedback_no = _first_or_none(re.findall('<p class="">单号:(.+)</p>', source))
    print(feedback_no)
    if feedback_no is None:
        raise ValueError('no complaint number found at ' + feedback_url)

    # Brand and model
    brand_model = _first_or_none(re.findall('<p class="highlight">品牌车型:(.+)</p>', source))

    # Issue raised by the complainant
    feedback_question = _first_or_none(re.findall('<p class="">诉求问题:(.+)</p>', source))
    print(feedback_question)

    # Complaint timestamp
    feedback_time = _first_or_none(
        re.findall('<p class="">投诉时间:(....-.{0,3}-.{0,3} ..:..:..)</p>', source))

    # Dealer
    shop = _first_or_none(re.findall('<p class="">经销商:(.+)</p>', source))

    # Complaint body: text of the first "describe" element with its first and
    # last character removed (presumably surrounding whitespace — unverified).
    soup = BeautifulSoup(source, 'lxml')
    describe_blocks = soup.find_all(class_="describe")
    mark = describe_blocks[0].get_text()[1:-1] if describe_blocks else None

    # Complaint status: only the "article-tag unsolved" element is looked up,
    # so pages without that element get a NULL status.
    unsolved_tags = soup.find_all(class_="article-tag unsolved")
    status = _first_or_none(re.findall('<span class=".+">(.+)</span>', str(unsolved_tags)))

    data = pd.DataFrame([{
        'feedback_no': feedback_no,
        'brand': brand,
        'brand_model': brand_model,
        'feedback_question': feedback_question,
        'mark': mark,
        'feedback_time': feedback_time,
        'shop': shop,
        'status': status,
        'feedback_url': feedback_url,
    }])

    # Connection URL: mysql+pymysql://user:password@host:port/database?charset
    con = create_engine('mysql+pymysql://root:123456789@127.0.0.1:3306/python爬虫?charset=utf8')
    try:
        data.to_sql('汽车用户消费投诉多线程', con=con, if_exists='append')
    finally:
        # A fresh engine is created per call; release its pooled connections.
        con.dispose()
    time.sleep(0.5)


# 多线程
def print_time(start, end):
    """
    Thread worker: crawl the brands at row positions ``start`` .. ``end - 1``.

    Loads the brand table from the database and runs
    ``get_brand_detail_url`` for each row in the requested slice. The slice is
    positional and is clamped to the table size, so an ``end`` beyond the last
    row simply processes fewer brands.

    :param start: first row position (inclusive)
    :param end: last row position (exclusive)
    :return: None
    """
    url_brand = download_url_for_all_brand()
    # Plain tuples keep the access positional: (brand name, brand URL).
    for brand, brand_url in url_brand.iloc[start:end].itertuples(index=False, name=None):
        print(brand, brand_url)
        get_brand_detail_url(brand, brand_url)
    print("一个线程结束")


if __name__ == '__main__':
    import threading

    # Refresh the brand table first, then crawl it in parallel.
    get_url_for_all_brand()

    # Row positions 0..556 are split into batches of 25, one thread per batch:
    # (0, 25), (25, 50), ..., (525, 550), (550, 557).
    BRAND_COUNT = 557
    BATCH_SIZE = 25

    workers = [
        threading.Thread(
            target=print_time,
            args=(start, min(start + BATCH_SIZE, BRAND_COUNT)),
        )
        for start in range(0, BRAND_COUNT, BATCH_SIZE)
    ]
    for worker in workers:
        worker.start()

    # Block until every worker is done instead of spinning in a busy loop;
    # the program then exits on its own.
    for worker in workers:
        worker.join()

简单的数据清洗

# 数据清洗

import pandas as pd
import matplotlib.pyplot as plt

# Load the raw scraped complaints.
raw_frame = pd.read_excel(r'D:\Desktop\汽车用户消费投诉.xlsx')
data = raw_frame
data

# Drop fully duplicated rows with the built-in DataFrame method
# (equivalent to filtering with the inverse of data.duplicated()).
data = data.drop_duplicates()
data

# Save the de-duplicated data for the analysis steps.
data.to_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

投诉最多的二十大车型

# 投诉最多的二十大车型

import pandas as pd
import matplotlib.pyplot as plt

# Load the de-duplicated complaint data.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Count complaints per vehicle model. The frame is built directly on the
# 'brand_model' groups so that the index labels and the counts refer to the
# same thing; assigning model counts to a frame indexed by 'brand' would align
# on mismatched labels and leave most rows as NaN.
result = data.groupby(by='brand_model').size().to_frame(name='数量')
result = result.sort_values(by='数量', ascending=False)

# Keep the 20 models with the most complaints.
result = result.iloc[:20]

# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=300)

# 2. Draw a bar chart of the complaint count per model.
plt.bar(result.index, result['数量'], width=0.5)

# NOTE(review): these tick positions are computed but never passed to
# plt.yticks, so they have no effect on the figure.
yticks = range(0, 2000, 100)

# 3. Show the figure.
plt.show()

投诉最多的十大品牌

# 投诉最多的十大品牌

import pandas as pd
import matplotlib.pyplot as plt

# Load the de-duplicated complaint data.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Complaints per brand; the count is kept both under 'brand' and '数量'.
brand_counts = data.groupby(by='brand')[['brand']].count()
result = brand_counts.copy()
result['数量'] = brand_counts['brand']

# Ten brands with the most complaints, largest first.
result = result.sort_values(by='数量', ascending=False).head(10)

# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=300)

# 2. Draw a bar chart of the complaint count per brand.
bar_labels = result.index
bar_heights = result['数量']
plt.bar(bar_labels, bar_heights, width=0.5)

# Tick positions for the y axis (defined here but not applied to the plot).
yticks = range(0, 2000, 100)

# 3. Show the figure.
plt.show()

最不靠谱的10大经销商

# 最不靠谱的10大经销商

import pandas as pd
import matplotlib.pyplot as plt

# Load the de-duplicated complaint data.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')
data

# Complaints per dealer; the count is kept both under 'shop' and '数量'.
shop_counts = data.groupby(by='shop')[['shop']].count()
result = shop_counts.copy()
result['数量'] = shop_counts['shop']
result = result.sort_values(by='数量', ascending=False)
result

# Rows 2..11 of the ranking: the top entry is skipped.
# NOTE(review): presumably the first row is a placeholder dealer value rather
# than a real dealer — confirm against the data.
result = result.iloc[1:11]

# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=300)

# 2. Draw a bar chart of the complaint count per dealer.
bar_labels = result.index
bar_heights = result['数量']
plt.bar(bar_labels, bar_heights, width=0.5)

# Tick positions for the y axis (defined here but not applied to the plot).
yticks = range(0, 2000, 100)

# 3. Show the figure.
plt.show()

最近一年本网站接到的投诉数据趋势

# 最近一年本网站接到的投诉数据趋势

import pandas as pd
import matplotlib.pyplot as plt

# Load the de-duplicated complaint data.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')
data

# Derive a 'YYYYMM' key from each complaint timestamp.
feedback_dates = pd.DatetimeIndex(data['feedback_time'])
data['投诉年月'] = feedback_dates.strftime('%Y%m')
data.head()

# Complaints per month; the count is kept both under '投诉年月' and '数量'.
monthly_counts = data.groupby(by='投诉年月')[['投诉年月']].count()
result = monthly_counts.copy()
result['数量'] = monthly_counts['投诉年月']

# Keep the 11 rows before the final one.
# NOTE(review): presumably the last month is dropped because it is
# incomplete — confirm that 11 rather than 12 months is intended.
result = result.iloc[-12:-1]
result

# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=300)

# 2. Draw the monthly complaint count as a red line on a twin axis.
ax = plt.twinx()
months = result.index
counts = result['数量']
ax.plot(months, counts, color='r')

# 3. Show the figure.
plt.show()

本文地址:https://blog.csdn.net/qq_29537269/article/details/107386998

相关标签: Python