汽车用户消费投诉数据爬取分析(Python爬虫)
程序员文章站
2022-04-16 16:23:21
代码摘要:"""name:汽车用户消费投诉_品牌url爬取,已完成 author:xiaoyu""" import random; import re; import time; import pandas as pd; import requests; from bs4 import BeautifulSoup; from selenium import webdriver; from sqlalchemy import create_engine; def get_url_for_all_brand(): "...
多线程爬虫代码
"""
name:汽车用户消费投诉_品牌url爬取,已完成
author:zhangxiaoyu
"""
import _thread
import random
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from sqlalchemy import create_engine
def new_headers():
    """Build a request-header dict with a randomised User-Agent.

    The User-Agent is a fixed Chrome 83 / Windows 10 string whose trailing
    ``Safari/<major>.<minor>`` version is randomised on every call
    (``major`` in 1..999, ``minor`` in 1..99).

    :return: dict containing the single key ``'User-Agent'``
    """
    major = random.randint(1, 999)
    minor = random.randint(1, 99)
    agent_prefix = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/'
    return {'User-Agent': '{}{}.{}'.format(agent_prefix, major, minor)}
def get_url_for_all_brand():
    """Scrape every brand's complaint-list URL and append them to MySQL.

    Opens the brand index page in Chrome (through Selenium) with a randomised
    User-Agent, extracts each brand link together with its display name from
    the page source, prints them, and appends them to the ``品牌url汇总`` table.

    NOTE(review): the database credentials are hard-coded in the connection
    URL below; consider moving them to configuration.

    :return: None
    """
    headers = new_headers()
    options = webdriver.ChromeOptions()
    # Make the browser send the randomised User-Agent.
    options.add_argument('user-agent=' + headers['User-Agent'])
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url='http://tousu.315che.com/tousulist/serial/93/')
        source = driver.page_source
    finally:
        # quit() ends the whole browser session even when loading the page
        # fails, so no Chrome/chromedriver process is left behind.
        driver.quit()

    # One pass capturing (href, name) per anchor keeps both columns aligned
    # by construction instead of relying on two independent regex scans.
    pairs = re.findall(
        '<a href="(http://tousu.315che.com/tousulist/serial/.{1,7})/">(.{1,40})</a>',
        source,
    )
    car_href = [href for href, _ in pairs]
    car_name = [name for _, name in pairs]
    for name in car_name:
        print(name)
    for href in car_href:
        print(href)

    data = pd.DataFrame({
        'car_name': car_name,
        'car_href': car_href,
    })
    # Connection URL: mysql+pymysql://user:password@host:port/database?charset
    con = create_engine('mysql+pymysql://root:123456789@127.0.0.1:3306/python爬虫?charset=utf8')
    try:
        data.to_sql('品牌url汇总', con=con, if_exists='append')
    finally:
        # Release the pooled connections held by this one-off engine.
        con.dispose()
    print("成功")
def download_url_for_all_brand():
    """Load the brand name / brand URL pairs stored in MySQL.

    Reads the whole ``品牌url汇总`` table and keeps only the two columns the
    scraper needs.

    :return: DataFrame with the columns ``car_name`` and ``car_href``
    """
    # Connection URL: mysql+pymysql://user:password@host:port/database?charset
    engine = create_engine('mysql+pymysql://root:123456789@127.0.0.1:3306/python爬虫?charset=utf8')
    brand_table = pd.read_sql(sql="SELECT * FROM 品牌url汇总", con=engine)
    return brand_table[['car_name', 'car_href']]
def get_brand_detail_url(brand, brand_url):
    """Scrape every complaint of one brand.

    Loads the brand's complaint-list page in Chrome to read the page count,
    downloads each list page with ``requests``, collects the complaint detail
    URLs found inside the ``tousu-filter-list`` element, de-duplicates them
    and passes each one to :func:`get_feedback`.

    Failures on a single list page or a single complaint are reported and
    skipped so that one bad page does not abort the whole brand.

    Author: zhangxiaoyu

    :param brand: brand name, stored with every complaint record
    :param brand_url: URL of the brand's complaint-list page
    :return: None
    """
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the thread

    headers = new_headers()
    options = webdriver.ChromeOptions()
    # Make the browser send the randomised User-Agent.
    options.add_argument('user-agent=' + headers['User-Agent'])
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url=brand_url)
        source = driver.page_source
        base_url = driver.current_url
    finally:
        # The browser is only needed for the first page; always shut it down
        # so that each brand does not leave a Chrome process running.
        driver.quit()

    page_num = re.findall('<span class="pag-tip">共(.+)页</span>', source)
    print("页数:{}".format(page_num))

    all_detail_url_list = []
    if page_num:
        for page in range(1, int(page_num[0]) + 1):
            brand_url_page = base_url + "/0/0/0/" + str(page) + ".htm"
            print(brand_url_page)
            try:
                response = requests.get(url=brand_url_page, headers=headers, timeout=request_timeout)
                response.encoding = 'utf-8'
                soup = BeautifulSoup(response.text, 'lxml')
                listing = soup.find_all(class_="tousu-filter-list")
                # Every complaint in the listing links to its detail page.
                detail_url_list = re.findall('<a href="(.+)" target="_blank">', str(listing))
                print(detail_url_list)
                all_detail_url_list.extend(detail_url_list)
                time.sleep(1)  # throttle requests to the site
            except Exception as exc:
                # Best effort: report the failed page and continue with the next.
                print(brand_url_page + "出错!!!", exc)

    # De-duplicate while keeping first-seen order, so runs are reproducible.
    all_detail_url_list = list(dict.fromkeys(all_detail_url_list))
    print(all_detail_url_list)

    for feedback_url in all_detail_url_list:
        try:
            get_feedback(brand, feedback_url)
            print(feedback_url + "成功")
        except Exception as exc:
            print(feedback_url + "出错!!!", exc)
def get_feedback(brand, feedback_url):
    """Download one complaint page and append its fields to MySQL.

    Each field is taken from the first regex match on the page; a field that
    is absent is stored as ``None`` rather than making the whole record fail
    (building the frame from match lists of unequal length raises in pandas).
    A page without a complaint number is treated as invalid.

    :param brand: brand name stored with the record
    :param feedback_url: URL of the complaint detail page
    :return: None
    :raises ValueError: if no complaint number is found on the page
    """
    def first(matches):
        # First regex hit, or None when the field is missing from the page.
        return matches[0] if matches else None

    # Shared helper instead of a second inline copy of the User-Agent logic.
    headers = new_headers()
    response = requests.get(url=feedback_url, headers=headers, timeout=30)
    response.encoding = 'utf-8'
    source = response.text

    # Complaint number.
    feedback_no = re.findall('<p class="">单号:(.+)</p>', source)
    print(feedback_no)
    if not feedback_no:
        raise ValueError("no complaint number found at " + feedback_url)

    # Brand / model.
    brand_model = re.findall('<p class="highlight">品牌车型:(.+)</p>', source)
    # Issue category of the complaint.
    feedback_question = re.findall('<p class="">诉求问题:(.+)</p>', source)
    print(feedback_question)
    # Complaint timestamp.
    feedback_time = re.findall('<p class="">投诉时间:(....-.{0,3}-.{0,3} ..:..:..)</p>', source)
    # Dealer.
    shop = re.findall('<p class="">经销商:(.+)</p>', source)

    # Free-text complaint body.
    soup = BeautifulSoup(source, 'lxml')
    describe_texts = [tag.get_text() for tag in soup.find_all(class_="describe")]
    # The first and last character are dropped, as in the original
    # (presumably wrapping newlines -- TODO confirm against a live page).
    mark = describe_texts[0][1:-1] if describe_texts else None

    # Complaint status. NOTE(review): only elements with class
    # "article-tag unsolved" are inspected, so status stays None for any
    # complaint rendered with a different class.
    status_tags = soup.find_all(class_="article-tag unsolved")
    status = re.findall('<span class=".+">(.+)</span>', str(status_tags))

    data = pd.DataFrame([{
        'feedback_no': first(feedback_no),
        'brand': brand,
        'brand_model': first(brand_model),
        'feedback_question': first(feedback_question),
        'mark': mark,
        'feedback_time': first(feedback_time),
        'shop': first(shop),
        'status': first(status),
        'feedback_url': feedback_url,
    }])

    # Connection URL: mysql+pymysql://user:password@host:port/database?charset
    con = create_engine('mysql+pymysql://root:123456789@127.0.0.1:3306/python爬虫?charset=utf8')
    try:
        data.to_sql('汽车用户消费投诉多线程', con=con, if_exists='append')
    finally:
        # An engine is created per call; dispose it so its connection pool
        # does not accumulate across thousands of complaints.
        con.dispose()
    time.sleep(0.5)  # throttle requests to the site
# 多线程
def print_time(start, end):
    """Thread worker: scrape the brands at row positions ``[start, end)``.

    Loads the brand table from the database and processes the requested
    slice. Positions beyond the end of the table are simply ignored, and a
    brand that fails is reported without aborting the rest of the batch.

    :param start: first row position to process (inclusive)
    :param end: last row position to process (exclusive)
    :return: None
    """
    url_brand = download_url_for_all_brand()
    # iloc slicing is positional and tolerant of out-of-range bounds, unlike
    # the label lookup .loc[i], which raises once i exceeds the table length.
    batch = url_brand.iloc[start:end]
    for row in batch.itertuples(index=False, name=None):
        brand, brand_url = row[0], row[1]
        print(brand, brand_url)
        try:
            get_brand_detail_url(brand, brand_url)
        except Exception as exc:
            print(brand_url, "出错!!!", exc)
    print("一个线程结束")
if __name__ == '__main__':
    import threading

    # Refresh the brand table, then scrape it in parallel batches.
    get_url_for_all_brand()

    # Same batches as the original hand-written calls:
    # (0, 25), (25, 50), ..., (525, 550), (550, 557).
    total_brands = 557
    batch_size = 25

    workers = []
    for batch_start in range(0, total_brands, batch_size):
        batch_end = min(batch_start + batch_size, total_brands)
        worker = threading.Thread(target=print_time, args=(batch_start, batch_end))
        worker.start()
        workers.append(worker)

    # Block until every worker has finished. This replaces the former
    # `while (1): pass` loop, which kept a CPU core busy and never exited.
    for worker in workers:
        worker.join()
简单的数据清洗
# Data cleaning: remove duplicated complaint rows from the exported data.
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw export.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉.xlsx')

# Drop rows that are exact duplicates of an earlier row.
data = data.drop_duplicates()

# index=False: the row index left over after de-duplication carries no
# information and would otherwise be written as an extra unnamed column.
data.to_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx', index=False)
投诉最多的二十大车型
# Top 20 car models by number of complaints.
import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned data.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Count complaints per model. The counts must be grouped by 'brand_model'
# from the start: assigning per-model counts into a frame indexed by brand
# aligns on the index and leaves almost every value NaN.
result = data.groupby(by='brand_model').size().to_frame('数量')
result = result.sort_values(by='数量', ascending=False)
result = result.iloc[:20]

# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=300)
# 2. Bar chart: one bar per model, height = number of complaints.
plt.bar(result.index, result['数量'], width=0.5)
# 3. Show the figure.
plt.show()
投诉最多的十大品牌
# Top 10 brands by number of complaints.
import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned data.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Count complaints per brand in a single groupby and keep the ten largest.
result = data.groupby(by='brand').size().to_frame('数量')
result = result.sort_values(by='数量', ascending=False)
result = result.iloc[:10]

# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=300)
# 2. Bar chart: one bar per brand, height = number of complaints.
plt.bar(result.index, result['数量'], width=0.5)
# 3. Show the figure.
plt.show()
最不靠谱的10大经销商
# The 10 dealers with the most complaints.
import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned data.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Count complaints per dealer in a single groupby, largest first.
result = data.groupby(by='shop').size().to_frame('数量')
result = result.sort_values(by='数量', ascending=False)
# The top-ranked row is skipped on purpose, as in the original analysis
# (presumably a placeholder / "no dealer" value -- verify against the data).
result = result.iloc[1:11]

# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=300)
# 2. Bar chart: one bar per dealer, height = number of complaints.
plt.bar(result.index, result['数量'], width=0.5)
# 3. Show the figure.
plt.show()
最近一年本网站接到的投诉数据趋势
# Monthly complaint trend over the most recent year.
import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned data.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Derive a 'YYYYMM' key per complaint; these keys sort chronologically.
data['投诉年月'] = pd.DatetimeIndex(data['feedback_time']).strftime('%Y%m')

# Complaints per month (groupby returns the months in ascending order).
result = data.groupby(by='投诉年月').size().to_frame('数量')
# Keep twelve months while still excluding the final month, as the original
# slice did (presumably because it is incomplete -- verify). The original
# [-12:-1] slice produced only eleven months.
result = result.iloc[-13:-1]

# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=300)
# 2. Line chart of complaints per month, drawn on the primary axes; a twin
#    axis on an otherwise empty figure only adds a blank second scale.
plt.plot(result.index, result['数量'], color='r')
# 3. Show the figure.
plt.show()
本文地址:https://blog.csdn.net/qq_29537269/article/details/107386998