欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

MySQL二手车数据清洗及特征处理

程序员文章站 2024-01-24 18:53:10
...

1. 数据来源:瓜子二手车爬虫(数据部分截图如下)

MySQL二手车数据清洗及特征处理

数据集字段含义:

  • Brand:品牌
  • Name:型号
  • Boarding_time:上牌时间
  • Km:里程数
  • Discharge:排放标准
  • Sec_price:二手车价格
  • New_price:新车价格

2、缺失值的查找和处理

SELECT
	count(*)
FROM
	used_car_analysis
WHERE
	Brand IS NULL
OR Name IS NULL
OR Boarding_time IS NULL
OR Km IS NULL
OR Discharge IS NULL
OR Sec_price IS NULL
OR New_price IS NULL;

3、数据类型不一致的处理

3、1 Brand

查看“品牌”列,无异常值。合计104种品牌。

SELECT Brand, count(*) from used_car_analysis
group by Brand;

SELECT count(*) from (SELECT Brand, count(*) from used_car_analysis
group by Brand) c;

3、2 Name

“型号”列比较复杂,等下会进行特征提取。

3、3 Bording_time

“上牌时间”中有93个“暂无”数据。全部修改为当前年份和月份。

SELECT
	Boarding_time,
	count(*)
FROM
	used_car_analysis
WHERE
	Boarding_time NOT LIKE "%年%月";

UPDATE used_car_analysis
SET Boarding_time = "2020年6月"
WHERE
	Boarding_time = "未上牌";

3、4 Km

对于“里程数”列,存在部分数据为“百公里内”。将这些数据全部替换为0.01(即0.01万公里);其他数据删除单位“万公里”

SELECT
	*
FROM
	used_car_analysis
WHERE
	Km NOT LIKE "%万%"
GROUP BY
	Km;

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		NAME,
		(
			CASE
			WHEN Km LIKE "%万公里" THEN
				LEFT (Km, LOCATE("万", Km) - 1)
			ELSE
				0.01
			END
		) Km
	FROM
		used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Km = b.Km;

 

3、5 Discharge

对此列不做处理。本次分析不会用到这一列。

3、6 Sec_price

查看是否有0。结果:无。

3、7 New_price

查看是否有不同于其他数据的数据。

SELECT
	*
FROM
	used_car_analysis
WHERE
	New_price NOT LIKE "%万"

part 2. 特征处理

1、车龄

根据Boarding_time,抽取上牌年份和月份,计算出距离当前时间的月份数,即车龄。

ALTER TABLE used_car_analysis ADD COLUMN Boarding_year INT 

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		NAME,
		LEFT (
			Boarding_time,
			LOCATE("年", Boarding_time) - 1
		) Boarding_year
	FROM
		used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Boarding_year = b.Boarding_year 
#提取上牌年份

ALTER TABLE used_car_analysis ADD COLUMN Boarding_month INT 

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		NAME,
		SUBSTRING(
			Boarding_time
			FROM
				LOCATE("年", Boarding_time) + 1 FOR (
					LOCATE("月", Boarding_time) - 1 - LOCATE("年", Boarding_time)
				)
		) Boarding_month
	FROM
		used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Boarding_month = b.Boarding_month 
#提取上牌月份

ALTER TABLE used_car_analysis ADD COLUMN time INT 

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		NAME,
		(
			(
				YEAR (CURDATE()) - Boarding_year
			) * 12 + MONTH (CURDATE()) - Boarding_month
		) time
	FROM
		used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.time = b.time 
#计算车龄

ALTER TABLE used_car_analysis DROP COLUMN Boarding_month 
ALTER TABLE used_car_analysis DROP COLUMN Boarding_year 
#删除无用的列

2、折旧率(年限平均折旧法)

ALTER TABLE used_car_analysis ADD COLUMN Depreciation FLOAT;

UPDATE used_car_analysis a
INNER JOIN (
SELECT Name,
(case 
WHEN time !=0 then ((New_price - Sec_price)*12/time)/New_price  
when time = 0 then ((New_price - Sec_price)/(time+1))/New_price 
end) Depreciation
from used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Depreciation= b.Depreciation ;


3、折旧率(按照行驶里程数折旧)

ALTER TABLE used_car_analysis ADD COLUMN Dep_Distance FLOAT 
UPDATE used_car_analysis a
INNER JOIN (
SELECT Name,
(case 
WHEN Km !=0 then ((New_price - Sec_price)*10000/Km)/New_price
ELSE 0
end) Dep_Distance
from used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Dep_Distance = b.Dep_Distance 

4、款式

从Name中提取二手车的款式。

select count(*) from used_car_analysis
WHERE Name like "%款%";
#查看是否每个Name中是否都含有“款”字,为之后的款式字符抽取做准备。(结果表明每列都包含有“款”字)

ALTER TABLE used_car_analysis ADD COLUMN Style INT;

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		Name,right(style,4) Style from (SELECT Name,SUBSTRING_INDEX(Name,'款',1) style FROM used_car_analysis) c
) AS b ON a. NAME = b. NAME
SET a.Style = b.Style ;
#抽取每款车的款式

5、型号

从Name中提取二手车的型号。

ALTER TABLE used_car_analysis ADD COLUMN model CHAR(255);

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		NAME,
		SUBSTRING_INDEX(model, "20", 1) model
	FROM
		used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.model = b.model;
#提取每款车的型号。

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		NAME,
		SUBSTRING_INDEX(model, " ", 1) model
	FROM
		(
			SELECT
				NAME,
				Style,
				model
			FROM
				used_car_analysis
			WHERE
				Style NOT LIKE "20%"
		) c
) AS b ON a. NAME = b. NAME
SET a.model = b.model;
#对于2000年以前的型号,另做更新。


(为方便“二八”法则的分析,特提取下列数据信息)

6、各品牌销售量累计求和

#对各品牌的销量累计求和
SET @csum := 0;
SELECT
	Brand,
	sale,
	(@csum := @csum + sale) AS cum_sale
FROM
	(
		SELECT
			Brand,
			count(*) sale
		FROM
			used_car_analysis
		GROUP BY
			Brand
		ORDER BY
			sale DESC
	) c;

7、各品牌销售额累计求和

SET @csum := 0;
SELECT
	Brand, 
  sale_money,
	(@csum := @csum + sale_money)  cum_sale_money 
FROM (
select Brand,
  sum(if(Brand = Brand,Sec_price,0)) sale_money
from used_car_analysis
group by Brand 
ORDER by sale_money DESC
) c;