kaggle房价预测
程序员文章站
2024-03-22 08:10:46
...
数据预览
# Load the Kaggle house-price training set from the course mirror.
import pandas as pd
import warnings

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")

DATA_URL = 'https://labfile.oss.aliyuncs.com/courses/1363/HousePrice.csv'
train = pd.read_csv(DATA_URL)
train
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000
5 6 50 RL 85.0 14115 Pave NaN IR1 Lvl AllPub ... 0 NaN MnPrv Shed 700 10 2009 WD Normal 143000
6 7 20 RL 75.0 10084 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 8 2007 WD Normal 307000
7 8 60 RL NaN 10382 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN Shed 350 11 2009 WD Normal 200000
8 9 50 RM 51.0 6120 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 4 2008 WD Abnorml 129900
9 10 190 RL 50.0 7420 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 1 2008 WD Normal 118000
10 11 20 RL 70.0 11200 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 129500
11 12 60 RL 85.0 11924 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 7 2006 New Partial 345000
12 13 20 RL NaN 12968 Pave NaN IR2 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 144000
13 14 20 RL 91.0 10652 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 8 2007 New Partial 279500
14 15 20 RL NaN 10920 Pave NaN IR1 Lvl AllPub ... 0 NaN GdWo NaN 0 5 2008 WD Normal 157000
15 16 45 RM 51.0 6120 Pave NaN Reg Lvl AllPub ... 0 NaN GdPrv NaN 0 7 2007 WD Normal 132000
16 17 20 RL NaN 11241 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN Shed 700 3 2010 WD Normal 149000
17 18 90 RL 72.0 10791 Pave NaN Reg Lvl AllPub ... 0 NaN NaN Shed 500 10 2006 WD Normal 90000
18 19 20 RL 66.0 13695 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 6 2008 WD Normal 159000
19 20 20 RL 70.0 7560 Pave NaN Reg Lvl AllPub ... 0 NaN MnPrv NaN 0 5 2009 COD Abnorml 139000
20 21 60 RL 101.0 14215 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 11 2006 New Partial 325300
21 22 45 RM 57.0 7449 Pave Grvl Reg Bnk AllPub ... 0 NaN GdPrv NaN 0 6 2007 WD Normal 139400
22 23 20 RL 75.0 9742 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 230000
23 24 120 RM 44.0 4224 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 6 2007 WD Normal 129900
24 25 20 RL NaN 8246 Pave NaN IR1 Lvl AllPub ... 0 NaN MnPrv NaN 0 5 2010 WD Normal 154000
25 26 20 RL 110.0 14230 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 7 2009 WD Normal 256300
26 27 20 RL 60.0 7200 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2010 WD Normal 134800
27 28 20 RL 98.0 11478 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2010 WD Normal 306000
28 29 20 RL 47.0 16321 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2006 WD Normal 207500
29 30 30 RM 60.0 6324 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 5 2008 WD Normal 68500
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1430 1431 60 RL 60.0 21930 Pave NaN IR3 Lvl AllPub ... 0 NaN NaN NaN 0 7 2006 WD Normal 192140
1431 1432 120 RL NaN 4928 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 10 2009 WD Normal 143750
1432 1433 30 RL 60.0 10800 Pave Grvl Reg Lvl AllPub ... 0 NaN NaN NaN 0 8 2007 WD Normal 64500
1433 1434 60 RL 93.0 10261 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 5 2008 WD Normal 186500
1434 1435 20 RL 80.0 17400 Pave NaN Reg Low AllPub ... 0 NaN NaN NaN 0 5 2006 WD Normal 160000
1435 1436 20 RL 80.0 8400 Pave NaN Reg Lvl AllPub ... 0 NaN GdPrv NaN 0 7 2008 COD Abnorml 174000
1436 1437 20 RL 60.0 9000 Pave NaN Reg Lvl AllPub ... 0 NaN GdWo NaN 0 5 2007 WD Normal 120500
1437 1438 20 RL 96.0 12444 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 11 2008 New Partial 394617
1438 1439 20 RM 90.0 7407 Pave NaN Reg Lvl AllPub ... 0 NaN MnPrv NaN 0 4 2010 WD Normal 149700
1439 1440 60 RL 80.0 11584 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 11 2007 WD Normal 197000
1440 1441 70 RL 79.0 11526 Pave NaN IR1 Bnk AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 191000
1441 1442 120 RM NaN 4426 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2008 WD Normal 149300
1442 1443 60 FV 85.0 11003 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 4 2009 WD Normal 310000
1443 1444 30 RL NaN 8854 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2009 WD Normal 121000
1444 1445 20 RL 63.0 8500 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 11 2007 WD Normal 179600
1445 1446 85 RL 70.0 8400 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 129000
1446 1447 20 RL NaN 26142 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 4 2010 WD Normal 157900
1447 1448 60 RL 80.0 10000 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 12 2007 WD Normal 240000
1448 1449 50 RL 70.0 11767 Pave NaN Reg Lvl AllPub ... 0 NaN GdWo NaN 0 5 2007 WD Normal 112000
1449 1450 180 RM 21.0 1533 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 8 2006 WD Abnorml 92000
1450 1451 90 RL 60.0 9000 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 9 2009 WD Normal 136000
1451 1452 20 RL 78.0 9262 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2009 New Partial 287090
1452 1453 180 RM 35.0 3675 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2006 WD Normal 145000
1453 1454 20 RL 90.0 17217 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 7 2006 WD Abnorml 84500
1454 1455 20 FV 62.0 7500 Pave Pave Reg Lvl AllPub ... 0 NaN NaN NaN 0 10 2009 WD Normal 185000
1455 1456 60 RL 62.0 7917 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 8 2007 WD Normal 175000
1456 1457 20 RL 85.0 13175 Pave NaN Reg Lvl AllPub ... 0 NaN MnPrv NaN 0 2 2010 WD Normal 210000
1457 1458 70 RL 66.0 9042 Pave NaN Reg Lvl AllPub ... 0 NaN GdPrv Shed 2500 5 2010 WD Normal 266500
1458 1459 20 RL 68.0 9717 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 4 2010 WD Normal 142125
1459 1460 20 RL 75.0 9937 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 6 2008 WD Normal 147500
# Quick structural preview: first rows, dimensions, and column names.
train.head()
train.shape
train.columns
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): IPython magic — valid only inside a Jupyter notebook,
# a plain .py script would fail to parse this line.
%matplotlib inline
color = sns.color_palette()
sns.set_style('darkgrid')
fig, ax = plt.subplots()
# Draw the scatter plot of living area vs. sale price to spot outliers.
ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()
# Remove the outliers: very large houses (GrLivArea > 4000) that
# nevertheless sold cheap (SalePrice < 300000).
outlier_idx = train[(train['GrLivArea'] > 4000)
                    & (train['SalePrice'] < 300000)].index
train_drop = train.drop(outlier_idx)

# Re-draw the scatter plot with the outliers removed.
fig, ax = plt.subplots()
ax.scatter(train_drop['GrLivArea'], train_drop['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()
# Box plot: SalePrice distribution for each overall-quality grade.
var = 'OverallQual'
data = train_drop[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000)
import numpy as np

k = 10  # number of top correlated features to display
# Correlation matrix over numeric columns only. Without
# numeric_only=True, pandas >= 2.0 raises a TypeError on the
# object-dtype columns instead of silently dropping them.
corrmat = train_drop.corr(numeric_only=True)
# Names of the k features most correlated with SalePrice.
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# Pairwise correlations restricted to those k features.
cm = np.corrcoef(train_drop[cols].values.T)
# Heat map of the reduced correlation matrix.
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values)
plt.show()
# Pair-wise scatter plots of SalePrice against its strongest predictors.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea',
        'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `height` replaces the old `size` keyword, which was renamed in
# seaborn 0.9 and removed in later releases.
sns.pairplot(train_drop[cols], height=2.5)
plt.show()
数据预处理
# The Id column is a pure row identifier with no predictive value.
train_drop1 = train_drop.drop(columns="Id")
train_drop1.head()
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub FR2 ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub Inside ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub Corner ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub FR2 ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000
5 rows × 80 columns
train_drop1['SalePrice'].describe()
count 1458.000000
mean 180932.919067
std 79495.055285
min 34900.000000
25% 129925.000000
50% 163000.000000
75% 214000.000000
max 755000.000000
from scipy.stats import norm, skew

# Histogram of SalePrice with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
# sns.histplot(..., kde=True) is the modern replacement.
sns.distplot(train_drop1['SalePrice'], fit=norm)
# Estimate the mean and standard deviation of the target.
(mu, sigma) = norm.fit(train_drop1['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Raw string so \mu and \sigma reach matplotlib's mathtext intact;
# in a plain string they are invalid escape sequences (DeprecationWarning,
# SyntaxError in future Python). The rendered text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
# Set the chart title.
plt.title('SalePrice distribution')

# Q-Q plot: sample quantiles against a theoretical normal.
from scipy import stats
fig = plt.figure()
res = stats.probplot(train_drop1['SalePrice'], plot=plt)
plt.show()
# Log-transform the target (log1p) to reduce its right skew.
train_drop1["SalePrice"] = np.log1p(train_drop1["SalePrice"])
# Re-draw the distribution after the transform.
sns.distplot(train_drop1['SalePrice'], fit=norm)
# Re-estimate mean and standard deviation on the transformed target.
(mu, sigma) = norm.fit(train_drop1['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Raw string keeps \mu and \sigma as literal mathtext escapes
# (same rendered output, no invalid-escape warning).
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
# Q-Q plot of the transformed target.
fig = plt.figure()
res = stats.probplot(train_drop1['SalePrice'], plot=plt)
plt.show()
特征工程
train_drop1.isnull().sum().sort_values(ascending=False)[:20] # 取前 20 个数据
PoolQC 1452
MiscFeature 1404
Alley 1367
Fence 1177
FireplaceQu 690
LotFrontage 259
GarageType 81
GarageCond 81
GarageFinish 81
GarageQual 81
GarageYrBlt 81
BsmtFinType2 38
BsmtExposure 38
BsmtQual 37
BsmtCond 37
BsmtFinType1 37
MasVnrArea 8
MasVnrType 8
Electrical 1
RoofMatl 0
dtype: int64
下面统计各特征缺失值所占的比例。
# Missing-value ratio per feature, as a percentage.
# Fix: divide by len(train_drop1) — the frame being inspected — not by
# len(train); the two differ because the outlier rows were dropped,
# so the original ratios were slightly understated.
train_na = (train_drop1.isnull().sum() / len(train_drop1)) * 100
# Keep only the columns that actually have missing values, worst first.
train_na = train_na.drop(
    train_na[train_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio': train_na})
missing_data.head(20)
Missing Ratio
PoolQC 99.452055
MiscFeature 96.164384
Alley 93.630137
Fence 80.616438
FireplaceQu 47.260274
LotFrontage 17.739726
GarageYrBlt 5.547945
GarageType 5.547945
GarageFinish 5.547945
GarageQual 5.547945
GarageCond 5.547945
BsmtFinType2 2.602740
BsmtExposure 2.602740
BsmtFinType1 2.534247
BsmtCond 2.534247
BsmtQual 2.534247
MasVnrArea 0.547945
MasVnrType 0.547945
Electrical 0.068493
# Bar chart of the missing-value percentages computed above.
f, ax = plt.subplots(figsize=(15, 6))
plt.xticks(rotation='90')
sns.barplot(x=train_na.index, y=train_na)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Percent of missing values', fontsize=15)
ax.set_title('Percent missing data by feature', fontsize=15)
# Categorical columns where NaN means "house has no such feature":
# fill with the explicit category 'None'.
none_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence',
             'FireplaceQu', 'GarageType', 'GarageFinish',
             'GarageQual', 'GarageCond', 'BsmtQual',
             'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
             'BsmtFinType2', 'MasVnrType', 'MSSubClass']
for col in none_cols:
    train_drop1[col] = train_drop1[col].fillna('None')

# Numeric columns where NaN means "feature absent": fill with 0.
zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars',
             'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
             'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
             'MasVnrArea', 'Electrical']
for col in zero_cols:
    train_drop1[col] = train_drop1[col].fillna(0)

# LotFrontage correlates with the neighborhood: impute each missing
# value with the median frontage of the house's neighborhood.
train_drop1["LotFrontage"] = train_drop1.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

# MSZoning: fill with the most frequent zoning class.
train_drop1['MSZoning'] = train_drop1['MSZoning'].fillna(
    train_drop1['MSZoning'].mode()[0])
# Functional: the data description says NaN means "Typ" (typical).
# Fix: fill BEFORE creating train_drop2 — the original filled
# train_drop1 after the copy was taken, so train_drop2 never saw it.
train_drop1["Functional"] = train_drop1["Functional"].fillna("Typ")
# Utilities is almost constant across the data set — drop it.
train_drop2 = train_drop1.drop(['Utilities'], axis=1)
train_drop2.isnull().sum().sort_values(ascending=False)[:20]
SalePrice 0
BsmtQual 0
Exterior1st 0
Exterior2nd 0
MasVnrType 0
MasVnrArea 0
ExterQual 0
ExterCond 0
Foundation 0
BsmtCond 0
RoofStyle 0
BsmtExposure 0
BsmtFinType1 0
BsmtFinSF1 0
BsmtFinType2 0
BsmtFinSF2 0
BsmtUnfSF 0
TotalBsmtSF 0
RoofMatl 0
YearRemodAdd 0
dtype: int64
# Numeric codes that are really categories — store them as strings so
# they are treated as categorical from here on.
feature = ['MSSubClass', 'OverallCond', 'YrSold', 'MoSold']
for col in feature:
    train_drop2[col] = train_drop2[col].apply(str)

from sklearn.preprocessing import LabelEncoder

# Integer-encode the (roughly ordinal) categorical columns in place.
cols = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
        'YrSold', 'MoSold']
for c in cols:
    lbl = LabelEncoder()
    train_drop2[c] = lbl.fit_transform(list(train_drop2[c].values))
train_drop2[cols].head()
FireplaceQu BsmtQual BsmtCond GarageQual GarageCond ExterQual ExterCond HeatingQC PoolQC KitchenQual ... LandSlope LotShape PavedDrive Street Alley CentralAir MSSubClass OverallCond YrSold MoSold
0 3 2 4 5 5 2 4 0 3 2 ... 0 3 2 1 1 1 9 4 2 4
1 5 2 4 5 5 3 4 0 3 3 ... 0 3 2 1 1 1 4 7 1 7
2 5 2 4 5 5 2 4 0 3 2 ... 0 0 2 1 1 1 9 4 2 11
3 2 4 1 5 5 3 4 2 3 2 ... 0 0 2 1 1 1 10 4 0 4
4 5 2 4 5 5 2 4 0 3 2 ... 0 0 2 1 1 1 9 4 2 3
# Total interior square footage = basement + 1st floor + 2nd floor.
train_drop2['TotalSF'] = (train_drop2['TotalBsmtSF']
                          + train_drop2['1stFlrSF']
                          + train_drop2['2ndFlrSF'])

# Measure the skewness of every numeric feature, most skewed first.
numeric_feats = train_drop2.dtypes[train_drop2.dtypes != "object"].index
skewed_feats = train_drop2[numeric_feats].apply(
    lambda col: skew(col.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness.head(10)
Skew
MiscVal 24.434913
PoolArea 15.932532
LotArea 12.560986
3SsnPorch 10.286510
LowQualFinSF 8.995688
LandSlope 4.805032
KitchenAbvGr 4.480268
BsmtFinSF2 4.247550
ScreenPorch 4.114690
BsmtHalfBath 4.095895
from scipy.special import boxcox1p

# Keep only genuinely skewed features. Fix: the original
# `skewness[abs(skewness) > 0.75]` indexes a DataFrame with a DataFrame
# mask, which NaNs the small values but keeps EVERY row — so boxcox1p
# was silently applied to all numeric features. Filtering on the 'Skew'
# column keeps only the rows with |skew| > 0.75.
skewness = skewness[abs(skewness['Skew']) > 0.75]
skewed_features = skewness.index
lam = 0.15  # fixed Box-Cox lambda, the conventional choice here
for feat in skewed_features:
    train_drop2[feat] = boxcox1p(train_drop2[feat], lam)

# Split target from features and one-hot encode the remaining
# categorical columns.
data_y = train_drop2['SalePrice']
data_X = train_drop2.drop(['SalePrice'], axis=1)
data_X_oh = pd.get_dummies(data_X)
print(data_X_oh.shape)
(1458, 220)
预测模型
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Convert to plain NumPy arrays for the estimator.
data_y_v = data_y.values
data_X_v = data_X_oh.values

# Hold out the final 30% of rows as the test split (no shuffling).
length = int(len(data_y) * 0.7)
train_y, test_y = data_y_v[:length], data_y_v[length:]
train_X, test_X = data_X_v[:length], data_X_v[length:]

# Fit a Lasso regression and report MSE on the held-out rows
# (the target is log-scaled, so this is MSE in log space).
model = Lasso()
model.fit(train_X, train_y)
y_pred = model.predict(test_X)
mean_squared_error(test_y, y_pred)
0.00016901038226755657