MySQL二手车数据清洗及特征处理
程序员文章站
2024-01-24 18:53:10
...
1. 数据来源:瓜子二手车爬虫(数据部分截图如下)
数据集字段含义:
- Brand:品牌
- Name:型号
- Boarding_time:上牌时间
- Km:里程数
- Discharge:排放标准
- Sec_price:二手车价格
- New_price:新车价格
2、缺失值的查找和处理
SELECT
count(*)
FROM
used_car_analysis
WHERE
Brand IS NULL
OR Name IS NULL
OR Boarding_time IS NULL
OR Km IS NULL
OR Discharge IS NULL
OR Sec_price IS NULL
OR New_price IS NULL;
3、数据类型不一致的处理
3、1 Brand
查看“品牌”列,无异常值。合计104种品牌。
SELECT Brand, count(*) from used_car_analysis
group by Brand;
SELECT count(*) from (SELECT Brand, count(*) from used_car_analysis
group by Brand) c;
3、2 Name
“型号”列比较复杂,等下会进行特征提取。
3、3 Bording_time
“上牌时间”中有93个“暂无”数据。全部修改为当前年份和月份。
SELECT
Boarding_time,
count(*)
FROM
used_car_analysis
WHERE
Boarding_time NOT LIKE "%年%月";
UPDATE used_car_analysis
SET Boarding_time = "2020年6月"
WHERE
Boarding_time = "未上牌";
3、4 Km
对于“里程数”列,存在部分数据为“百公里内”。将这些数据全部替换为0.01(即0.01万公里);其他数据删除单位“万公里”
SELECT
*
FROM
used_car_analysis
WHERE
Km NOT LIKE "%万%"
GROUP BY
Km;
UPDATE used_car_analysis a
INNER JOIN (
SELECT
NAME,
(
CASE
WHEN Km LIKE "%万公里" THEN
LEFT (Km, LOCATE("万", Km) - 1)
ELSE
0.01
END
) Km
FROM
used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Km = b.Km;
3、5 Discharge
对此列不做处理。本次分析不会用到这一列。
3、6 Sec_price
查看是否有0。结果:无。
3、7 New_price
查看是否有不同于其他数据的数据。
SELECT
*
FROM
used_car_analysis
WHERE
New_price NOT LIKE "%万"
part 2. 特征处理
1、车龄
根据Boarding_time,抽取上牌年份和月份,计算出距离当前时间的月份数,即车龄。
ALTER TABLE used_car_analysis ADD COLUMN Boarding_year INT
UPDATE used_car_analysis a
INNER JOIN (
SELECT
NAME,
LEFT (
Boarding_time,
LOCATE("年", Boarding_time) - 1
) Boarding_year
FROM
used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Boarding_year = b.Boarding_year
#提取上牌年份
ALTER TABLE used_car_analysis ADD COLUMN Boarding_month INT
UPDATE used_car_analysis a
INNER JOIN (
SELECT
NAME,
SUBSTRING(
Boarding_time
FROM
LOCATE("年", Boarding_time) + 1 FOR (
LOCATE("月", Boarding_time) - 1 - LOCATE("年", Boarding_time)
)
) Boarding_month
FROM
used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Boarding_month = b.Boarding_month
#提取上牌月份
ALTER TABLE used_car_analysis ADD COLUMN time INT
UPDATE used_car_analysis a
INNER JOIN (
SELECT
NAME,
(
(
YEAR (CURDATE()) - Boarding_year
) * 12 + MONTH (CURDATE()) - Boarding_month
) time
FROM
used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.time = b.time
#计算车龄
ALTER TABLE used_car_analysis DROP COLUMN Boarding_month
ALTER TABLE used_car_analysis DROP COLUMN Boarding_year
#删除无用的列
2、折旧率(年限平均折旧法)
ALTER TABLE used_car_analysis ADD COLUMN Depreciation FLOAT;
UPDATE used_car_analysis a
INNER JOIN (
SELECT Name,
(case
WHEN time !=0 then ((New_price - Sec_price)*12/time)/New_price
when time = 0 then ((New_price - Sec_price)/(time+1))/New_price
end) Depreciation
from used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Depreciation= b.Depreciation ;
3、折旧率(按照行驶里程数折旧)
ALTER TABLE used_car_analysis ADD COLUMN Dep_Distance FLOAT
UPDATE used_car_analysis a
INNER JOIN (
SELECT Name,
(case
WHEN Km !=0 then ((New_price - Sec_price)*10000/Km)/New_price
ELSE 0
end) Dep_Distance
from used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Dep_Distance = b.Dep_Distance
4、款式
从Name中提取二手车的款式。
select count(*) from used_car_analysis
WHERE Name like "%款%";
#查看是否每个Name中是否都含有“款”字,为之后的款式字符抽取做准备。(结果表明每列都包含有“款”字)
ALTER TABLE used_car_analysis ADD COLUMN Style INT;
UPDATE used_car_analysis a
INNER JOIN (
SELECT
Name,right(style,4) Style from (SELECT Name,SUBSTRING_INDEX(Name,'款',1) style FROM used_car_analysis) c
) AS b ON a. NAME = b. NAME
SET a.Style = b.Style ;
#抽取每款车的款式
5、型号
从Name中提取二手车的型号。
ALTER TABLE used_car_analysis ADD COLUMN model CHAR(255);
UPDATE used_car_analysis a
INNER JOIN (
SELECT
NAME,
SUBSTRING_INDEX(model, "20", 1) model
FROM
used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.model = b.model;
#提取每款车的型号。
UPDATE used_car_analysis a
INNER JOIN (
SELECT
NAME,
SUBSTRING_INDEX(model, " ", 1) model
FROM
(
SELECT
NAME,
Style,
model
FROM
used_car_analysis
WHERE
Style NOT LIKE "20%"
) c
) AS b ON a. NAME = b. NAME
SET a.model = b.model;
#对于2000年以前的型号,另做更新。
(为方便“二八”法则的分析,特提取下列数据信息)
6、各品牌销售量累计求和
#对各品牌的销量累计求和
SET @csum := 0;
SELECT
Brand,
sale,
(@csum := @csum + sale) AS cum_sale
FROM
(
SELECT
Brand,
count(*) sale
FROM
used_car_analysis
GROUP BY
Brand
ORDER BY
sale DESC
) c;
7、各品牌销售额累计求和
SET @csum := 0;
SELECT
Brand,
sale_money,
(@csum := @csum + sale_money) cum_sale_money
FROM (
select Brand,
sum(if(Brand = Brand,Sec_price,0)) sale_money
from used_car_analysis
group by Brand
ORDER by sale_money DESC
) c;