欢迎访问程序员文章站!本站旨在为大家分享程序员计算机编程知识。
您现在的位置是: 首页

kaggle房价预测

程序员文章站 2024-03-22 08:10:46
...

数据预览

import pandas as pd
import warnings

warnings.filterwarnings("ignore")  # keep notebook output free of library warnings

# Remote copy of the Kaggle "House Prices" training set.
DATA_URL = 'https://labfile.oss.aliyuncs.com/courses/1363/HousePrice.csv'
train = pd.read_csv(DATA_URL)
train

Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolArea	PoolQC	Fence	MiscFeature	MiscVal	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	2	2008	WD	Normal	208500
1	2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	5	2007	WD	Normal	181500
2	3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	9	2008	WD	Normal	223500
3	4	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	2	2006	WD	Abnorml	140000
4	5	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	12	2008	WD	Normal	250000
5	6	50	RL	85.0	14115	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	MnPrv	Shed	700	10	2009	WD	Normal	143000
6	7	20	RL	75.0	10084	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	8	2007	WD	Normal	307000
7	8	60	RL	NaN	10382	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	Shed	350	11	2009	WD	Normal	200000
8	9	50	RM	51.0	6120	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	4	2008	WD	Abnorml	129900
9	10	190	RL	50.0	7420	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	1	2008	WD	Normal	118000
10	11	20	RL	70.0	11200	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	2	2008	WD	Normal	129500
11	12	60	RL	85.0	11924	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	7	2006	New	Partial	345000
12	13	20	RL	NaN	12968	Pave	NaN	IR2	Lvl	AllPub	...	0	NaN	NaN	NaN	0	9	2008	WD	Normal	144000
13	14	20	RL	91.0	10652	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	8	2007	New	Partial	279500
14	15	20	RL	NaN	10920	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	GdWo	NaN	0	5	2008	WD	Normal	157000
15	16	45	RM	51.0	6120	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	GdPrv	NaN	0	7	2007	WD	Normal	132000
16	17	20	RL	NaN	11241	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	Shed	700	3	2010	WD	Normal	149000
17	18	90	RL	72.0	10791	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	Shed	500	10	2006	WD	Normal	90000
18	19	20	RL	66.0	13695	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	6	2008	WD	Normal	159000
19	20	20	RL	70.0	7560	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	MnPrv	NaN	0	5	2009	COD	Abnorml	139000
20	21	60	RL	101.0	14215	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	11	2006	New	Partial	325300
21	22	45	RM	57.0	7449	Pave	Grvl	Reg	Bnk	AllPub	...	0	NaN	GdPrv	NaN	0	6	2007	WD	Normal	139400
22	23	20	RL	75.0	9742	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	9	2008	WD	Normal	230000
23	24	120	RM	44.0	4224	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	6	2007	WD	Normal	129900
24	25	20	RL	NaN	8246	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	MnPrv	NaN	0	5	2010	WD	Normal	154000
25	26	20	RL	110.0	14230	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	7	2009	WD	Normal	256300
26	27	20	RL	60.0	7200	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	5	2010	WD	Normal	134800
27	28	20	RL	98.0	11478	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	5	2010	WD	Normal	306000
28	29	20	RL	47.0	16321	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	12	2006	WD	Normal	207500
29	30	30	RM	60.0	6324	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	5	2008	WD	Normal	68500
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1430	1431	60	RL	60.0	21930	Pave	NaN	IR3	Lvl	AllPub	...	0	NaN	NaN	NaN	0	7	2006	WD	Normal	192140
1431	1432	120	RL	NaN	4928	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	10	2009	WD	Normal	143750
1432	1433	30	RL	60.0	10800	Pave	Grvl	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	8	2007	WD	Normal	64500
1433	1434	60	RL	93.0	10261	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	5	2008	WD	Normal	186500
1434	1435	20	RL	80.0	17400	Pave	NaN	Reg	Low	AllPub	...	0	NaN	NaN	NaN	0	5	2006	WD	Normal	160000
1435	1436	20	RL	80.0	8400	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	GdPrv	NaN	0	7	2008	COD	Abnorml	174000
1436	1437	20	RL	60.0	9000	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	GdWo	NaN	0	5	2007	WD	Normal	120500
1437	1438	20	RL	96.0	12444	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	11	2008	New	Partial	394617
1438	1439	20	RM	90.0	7407	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	MnPrv	NaN	0	4	2010	WD	Normal	149700
1439	1440	60	RL	80.0	11584	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	11	2007	WD	Normal	197000
1440	1441	70	RL	79.0	11526	Pave	NaN	IR1	Bnk	AllPub	...	0	NaN	NaN	NaN	0	9	2008	WD	Normal	191000
1441	1442	120	RM	NaN	4426	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	5	2008	WD	Normal	149300
1442	1443	60	FV	85.0	11003	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	4	2009	WD	Normal	310000
1443	1444	30	RL	NaN	8854	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	5	2009	WD	Normal	121000
1444	1445	20	RL	63.0	8500	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	11	2007	WD	Normal	179600
1445	1446	85	RL	70.0	8400	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	5	2007	WD	Normal	129000
1446	1447	20	RL	NaN	26142	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	4	2010	WD	Normal	157900
1447	1448	60	RL	80.0	10000	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	12	2007	WD	Normal	240000
1448	1449	50	RL	70.0	11767	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	GdWo	NaN	0	5	2007	WD	Normal	112000
1449	1450	180	RM	21.0	1533	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	8	2006	WD	Abnorml	92000
1450	1451	90	RL	60.0	9000	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	9	2009	WD	Normal	136000
1451	1452	20	RL	78.0	9262	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	5	2009	New	Partial	287090
1452	1453	180	RM	35.0	3675	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	5	2006	WD	Normal	145000
1453	1454	20	RL	90.0	17217	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	7	2006	WD	Abnorml	84500
1454	1455	20	FV	62.0	7500	Pave	Pave	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	10	2009	WD	Normal	185000
1455	1456	60	RL	62.0	7917	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	8	2007	WD	Normal	175000
1456	1457	20	RL	85.0	13175	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	MnPrv	NaN	0	2	2010	WD	Normal	210000
1457	1458	70	RL	66.0	9042	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	GdPrv	Shed	2500	5	2010	WD	Normal	266500
1458	1459	20	RL	68.0	9717	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	4	2010	WD	Normal	142125
1459	1460	20	RL	75.0	9937	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	6	2008	WD	Normal	147500



train.head()
train.shape
train.columns
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
color = sns.color_palette()
sns.set_style('darkgrid')

fig, ax = plt.subplots()
# 绘制散点图
ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()

kaggle房价预测

# Remove the outliers: very large houses that sold unusually cheap.
outlier_idx = train[(train['GrLivArea'] > 4000)
                    & (train['SalePrice'] < 300000)].index
train_drop = train.drop(outlier_idx)

# Re-plot to confirm the outliers are gone.
fig, ax = plt.subplots()
ax.scatter(x=train_drop['GrLivArea'], y=train_drop['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()

kaggle房价预测

# Box plot of SalePrice grouped by overall material/finish quality.
var = 'OverallQual'
data = train_drop[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000)

kaggle房价预测

import numpy as np

# Number of features (including SalePrice itself) to keep in the heat map.
k = 10

# Correlation matrix over numeric columns only. Passing numeric_only=True
# explicitly is required on pandas >= 2.0, where corr() no longer silently
# drops non-numeric columns and would raise a TypeError here.
corrmat = train_drop.corr(numeric_only=True)

# The k features most correlated with the target.
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index

# Pairwise correlations restricted to those k features.
cm = np.corrcoef(train_drop[cols].values.T)

# Annotated heat map of the k-by-k correlation sub-matrix.
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 annot_kws={'size': 10}, yticklabels=cols.values,
                 xticklabels=cols.values)
plt.show()

kaggle房价预测

# Pair-wise scatter plots for the features most correlated with SalePrice.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea',
        'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `height` replaces the old `size` keyword, which was deprecated in
# seaborn 0.9 and later removed — `size=2.5` raises on current seaborn.
sns.pairplot(train_drop[cols], height=2.5)
plt.show()

kaggle房价预测

数据预处理

# The Id column is a row identifier with no predictive value — drop it.
train_drop1 = train_drop.drop(columns="Id")
train_drop1.head()

MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	LotConfig	...	PoolArea	PoolQC	Fence	MiscFeature	MiscVal	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0	NaN	NaN	NaN	0	2	2008	WD	Normal	208500
1	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	FR2	...	0	NaN	NaN	NaN	0	5	2007	WD	Normal	181500
2	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	Inside	...	0	NaN	NaN	NaN	0	9	2008	WD	Normal	223500
3	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	Corner	...	0	NaN	NaN	NaN	0	2	2006	WD	Abnorml	140000
4	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	FR2	...	0	NaN	NaN	NaN	0	12	2008	WD	Normal	250000
5 rows × 80 columns
# Summary statistics (count, mean, std, quartiles, min/max) of the target.
train_drop1['SalePrice'].describe()
count      1458.000000
mean     180932.919067
std       79495.055285
min       34900.000000
25%      129925.000000
50%      163000.000000
75%      214000.000000
max      755000.000000


from scipy.stats import norm, skew

# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; it is kept
# here because it supports fit=norm, which its replacement histplot does not.
sns.distplot(train_drop1['SalePrice'], fit=norm)

# Fit a normal distribution to the raw target to get its mean and std dev.
(mu, sigma) = norm.fit(train_drop1['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: keeps \m and \s from being parsed as invalid escape
# sequences (a SyntaxWarning on Python >= 3.12); runtime text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

kaggle房价预测

from scipy import stats

# Q-Q plot of the raw SalePrice against a theoretical normal distribution;
# strong curvature away from the line indicates right skew.
fig = plt.figure()
res = stats.probplot(train_drop1['SalePrice'], plot=plt)
plt.show()

kaggle房价预测

# log1p compresses the right-skewed price distribution toward normality,
# which suits models that assume normally distributed residuals.
train_drop1["SalePrice"] = np.log1p(train_drop1["SalePrice"])

# Re-draw the distribution for the transformed target.
sns.distplot(train_drop1['SalePrice'], fit=norm)

# Re-fit the normal distribution on the smoothed target.
(mu, sigma) = norm.fit(train_drop1['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string avoids invalid \m and \s escape-sequence warnings; the
# rendered legend text is identical.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Q-Q plot for the transformed target.
fig = plt.figure()
res = stats.probplot(train_drop1['SalePrice'], plot=plt)
plt.show()

kaggle房价预测

kaggle房价预测

特征工程

# Count missing values per column and show the 20 worst offenders.
train_drop1.isnull().sum().sort_values(ascending=False).head(20)
PoolQC          1452
MiscFeature     1404
Alley           1367
Fence           1177
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageCond        81
GarageFinish      81
GarageQual        81
GarageYrBlt       81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
MasVnrType         8
Electrical         1
RoofMatl           0
dtype: int64
Markdown Code     

# Percentage of missing values per column. Use the outlier-filtered frame's
# own length as the denominator: the original divided by len(train), which
# still counts the dropped outlier rows and slightly understates the ratios.
train_na = (train_drop1.isnull().sum() / len(train_drop1)) * 100
train_na = train_na.drop(
    train_na[train_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio': train_na})
missing_data.head(20)

	Missing Ratio
PoolQC	99.452055
MiscFeature	96.164384
Alley	93.630137
Fence	80.616438
FireplaceQu	47.260274
LotFrontage	17.739726
GarageYrBlt	5.547945
GarageType	5.547945
GarageFinish	5.547945
GarageQual	5.547945
GarageCond	5.547945
BsmtFinType2	2.602740
BsmtExposure	2.602740
BsmtFinType1	2.534247
BsmtCond	2.534247
BsmtQual	2.534247
MasVnrArea	0.547945
MasVnrType	0.547945
Electrical	0.068493

# Bar chart of the per-feature missing-value ratios computed above.
f, ax = plt.subplots(figsize=(15, 6))
plt.xticks(rotation='90')
sns.barplot(x=train_na.index, y=train_na)
plt.title('Percent missing data by feature', fontsize=15)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)


kaggle房价预测

# Categorical features where NaN simply means "not present": fill with the
# literal string 'None' so absence becomes its own category.
none_filled = ['PoolQC', 'MiscFeature', 'Alley', 'Fence',
               'FireplaceQu', 'GarageType', 'GarageFinish',
               'GarageQual', 'GarageCond', 'BsmtQual',
               'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
               'BsmtFinType2', 'MasVnrType', 'MSSubClass']
for name in none_filled:
    train_drop1[name] = train_drop1[name].fillna('None')

# Numeric features where a missing value means "zero of that thing".
# NOTE(review): 'Electrical' is categorical, so filling it with 0 creates a
# numeric level — this matches the original notebook's behavior.
zero_filled = ['GarageYrBlt', 'GarageArea', 'GarageCars',
               'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
               'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
               'MasVnrArea', 'Electrical']
for name in zero_filled:
    train_drop1[name] = train_drop1[name].fillna(0)

# Impute each missing LotFrontage with the median frontage of the house's
# neighborhood, since frontage tracks location closely.
train_drop1["LotFrontage"] = train_drop1.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

# Fill the last categorical gaps BEFORE deriving train_drop2.
# (The stray `feature = []` the original left here was dead code.)

# MSZoning: impute with the most common zoning class.
train_drop1['MSZoning'] = train_drop1['MSZoning'].fillna(
    train_drop1['MSZoning'].mode()[0])

# Functional: treat NaN as 'Typ' (typical). This must happen before the
# drop() below — drop() returns a copy, so the original order (filling
# train_drop1 after copying it) silently left train_drop2 unfilled.
train_drop1["Functional"] = train_drop1["Functional"].fillna("Typ")

# Utilities is near-constant across rows, so it carries no signal — drop it.
train_drop2 = train_drop1.drop(['Utilities'], axis=1)

# Sanity check: confirm the remaining columns have no missing values left.
train_drop2.isnull().sum().sort_values(ascending=False)[:20]
SalePrice       0
BsmtQual        0
Exterior1st     0
Exterior2nd     0
MasVnrType      0
MasVnrArea      0
ExterQual       0
ExterCond       0
Foundation      0
BsmtCond        0
RoofStyle       0
BsmtExposure    0
BsmtFinType1    0
BsmtFinSF1      0
BsmtFinType2    0
BsmtFinSF2      0
BsmtUnfSF       0
TotalBsmtSF     0
RoofMatl        0
YearRemodAdd    0
dtype: int64

# These columns are numerically coded categories; cast them to strings so
# the label encoder below treats them as categorical, not ordinal numbers.
for col in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold'):
    train_drop2[col] = train_drop2[col].apply(str)

from sklearn.preprocessing import LabelEncoder

# Quality/condition-style categorical columns to integer-encode.
cols = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
        'YrSold', 'MoSold']

# NOTE(review): LabelEncoder assigns codes in sorted (alphabetical) order,
# which does not match the true quality ordering (e.g. 'Ex' sorts before
# 'Fa'); tree models tolerate this, linear models less so.
for c in cols:
    train_drop2[c] = LabelEncoder().fit_transform(list(train_drop2[c].values))
train_drop2[cols].head()



	FireplaceQu	BsmtQual	BsmtCond	GarageQual	GarageCond	ExterQual	ExterCond	HeatingQC	PoolQC	KitchenQual	...	LandSlope	LotShape	PavedDrive	Street	Alley	CentralAir	MSSubClass	OverallCond	YrSold	MoSold
0	3	2	4	5	5	2	4	0	3	2	...	0	3	2	1	1	1	9	4	2	4
1	5	2	4	5	5	3	4	0	3	3	...	0	3	2	1	1	1	4	7	1	7
2	5	2	4	5	5	2	4	0	3	2	...	0	0	2	1	1	1	9	4	2	11
3	2	4	1	5	5	3	4	2	3	2	...	0	0	2	1	1	1	10	4	0	4
4	5	2	4	5	5	2	4	0	3	2	...	0	0	2	1	1	1	9	4	2	3



# Total interior square footage: basement plus both above-ground floors.
train_drop2['TotalSF'] = (train_drop2['TotalBsmtSF']
                          + train_drop2['1stFlrSF']
                          + train_drop2['2ndFlrSF'])

# Every non-object (numeric) column is a candidate for skew correction.
numeric_feats = train_drop2.dtypes[train_drop2.dtypes != "object"].index

# Skewness of each numeric feature, most skewed first.
skewed_feats = train_drop2[numeric_feats].apply(
    lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness.head(10)

Skew
MiscVal	24.434913
PoolArea	15.932532
LotArea	12.560986
3SsnPorch	10.286510
LowQualFinSF	8.995688
LandSlope	4.805032
KitchenAbvGr	4.480268
BsmtFinSF2	4.247550
ScreenPorch	4.114690
BsmtHalfBath	4.095895

from scipy.special import boxcox1p

# Keep only genuinely skewed features. The original indexed the DataFrame
# with a boolean DataFrame (skewness[abs(skewness) > 0.75]), which only
# masks values to NaN and keeps EVERY row — so the filter was a no-op and
# boxcox1p was applied to all numeric columns, including the log target.
# Filtering on the 'Skew' column actually drops the low-skew rows.
skewness = skewness[abs(skewness['Skew']) > 0.75]

# Box-Cox transform (with the +1 offset handling zeros) to pull in the
# long right tails of the remaining skewed features.
lam = 0.15
for feat in skewness.index:
    train_drop2[feat] = boxcox1p(train_drop2[feat], lam)


# Separate the target from the predictors, then one-hot encode the
# remaining (string) categorical columns.
data_y = train_drop2['SalePrice']
data_X = train_drop2.drop(columns=['SalePrice'])

data_X_oh = pd.get_dummies(data_X)
print(data_X_oh.shape)
(1458, 220)

预测模型

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Work on plain NumPy arrays from here on.
data_y_v = data_y.values
data_X_v = data_X_oh.values

# Simple ordered 70/30 train/test split (rows are not shuffled).
length = int(len(data_y) * 0.7)
train_X, test_X = data_X_v[:length], data_X_v[length:]
train_y, test_y = data_y_v[:length], data_y_v[length:]

# Fit an L1-regularised linear model on the training portion.
model = Lasso()
model.fit(train_X, train_y)

# Score on the held-out rows; the target is log-scaled, so this is MSE
# in log space.
y_pred = model.predict(test_X)
mean_squared_error(test_y, y_pred)
0.00016901038226755657