
Kaggle House Price Prediction

Link: link

For personal study and review.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

train_data = pd.read_csv('F:/跨媒体计算实验组/NLP/数据集/house price/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('F:/跨媒体计算实验组/NLP/数据集/house price/house-prices-advanced-regression-techniques/test.csv')
train = train_data.copy()
test = test_data.copy()


train.shape, test.shape


# Check for duplicate Ids
idsUnique = len(set(train.Id))  # a set keeps only the unique values
idsTotal = train.shape[0]
# Total rows minus unique Ids gives the number of duplicates
idsdupe = idsTotal - idsUnique
print(idsdupe)  # prints 0: every Id is unique
# Drop the Id column; it carries no predictive information
train.drop(['Id'], axis=1, inplace=True)
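The same check can be done more directly with pandas (an equivalent alternative, not from the original notebook):

# pandas one-liner: count duplicated Ids directly
print(train_data.Id.duplicated().sum())  # 0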

Exploratory visualization

# Correlation matrix of the numeric features
corrmat = train.corr()  # on pandas >= 2.0, use train.corr(numeric_only=True)
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=.8, annot=True)


# Features most correlated with the target
corrmat = train.corr()
top_corr_features = corrmat.index[abs(corrmat['SalePrice']) > 0.5]  # names of the features whose correlation with SalePrice exceeds 0.5
plt.figure(figsize=(10, 10))
g = sns.heatmap(train[top_corr_features].corr(), annot=True, cmap='RdYlGn')  # correlations among just these top features


# OverallQual is highly correlated with the sale price, so the next plot shows how it affects SalePrice
sns.barplot(x=train.OverallQual, y=train.SalePrice)  # keyword arguments required on recent seaborn


# Pairwise relationships between each of these features and the sale price
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(train[cols], height=2.5)  # 'size' was renamed to 'height' in seaborn 0.9
plt.show()


Since the ultimate goal is to predict the sale price, we first analyze that variable.

from scipy import stats
from scipy.stats import norm, skew  # norm: the normal distribution; skew: skewness, i.e. how asymmetric a density is about its mean
# Intuitively, skewness measures the relative length of the density curve's tails
sns.distplot(train['SalePrice'], fit=norm)  # histogram with a fitted normal curve (distplot is deprecated on recent seaborn; histplot is the replacement)
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])  # mu is the mean, sigma the standard deviation
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))  # {:.2f} keeps two decimal places
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)  # Q-Q plot against a normal distribution (the default) to check the fit
plt.show()
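To build intuition for skew(), here is a minimal sketch on synthetic data (not part of the original post): a right-tailed sample has clearly positive skewness, while a symmetric one sits near zero.

# Positive skew for a right-tailed sample, roughly 0 for a normal one
rng = np.random.RandomState(0)
print(skew(rng.exponential(size=10000)))  # roughly 2: long right tail
print(skew(rng.normal(size=10000)))       # close to 0: symmetric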


train.SalePrice = np.log1p(train.SalePrice)  # log-transform the target to compress its range (the inverse is np.expm1)
y = train.SalePrice
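As a quick sanity check (a sketch, not in the original), expm1 exactly inverts log1p, so predictions made on the log scale can later be mapped back to dollar prices:

# log1p(x) = log(1 + x); expm1 is its exact inverse
prices = np.array([100000.0, 200000.0])
assert np.allclose(np.expm1(np.log1p(prices)), prices)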


# Scatter plot of GrLivArea against the (log-transformed) sale price; useful for spotting outliers before further preprocessing
plt.scatter(y=train.SalePrice, x=train.GrLivArea, c='black')
plt.show()


train_nas = train.isnull().sum()  # number of missing values in each feature
train_nas = train_nas[train_nas > 0]  # keep only the features that have missing values
train_nas.sort_values(ascending=False)  # sort by missing count


# Do the same for the test set
test_nas = test.isnull().sum()
test_nas = test_nas[test_nas > 0]
test_nas.sort_values(ascending=False)


print("Find most important features relative to target")
corr = train.corr()#得到特征之间的相关性矩阵
corr.sort_values(['SalePrice'],ascending=False,inplace=True)#按照列(特征)SalePrice进行排序
print(corr.SalePrice)


# Differentiate numerical features (minus the target) from categorical features
categorical_features = train.select_dtypes(include=['object']).columns  # object-dtype (categorical) features only
categorical_features


numerical_features = train.select_dtypes(exclude = ["object"]).columns  # numeric (non-object) features
numerical_features
numerical_features = numerical_features.drop("SalePrice")  # remove the target from the numeric features
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
train_num = train[numerical_features]
train_cat = train[categorical_features]


# NOTE: the author simply used median() to fill NA values; there is actually a lot to explore
# when doing feature engineering, but this notebook aims to keep things simple (no heavy code)

## Handle remaining missing values for numerical features by using the median as replacement
print('NAs for numerical features in train: ' + str(train_num.isnull().values.sum()))
train_num = train_num.fillna(train_num.median())  # fills each column with its own median
print('Remaining NAs for numerical features in train: ' + str(train_num.isnull().values.sum()))
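The note above hints that there is much more to explore here. One common refinement on this dataset (a hedged sketch, not what the notebook does) is to impute LotFrontage from the median of its Neighborhood instead of the global median, since lot frontage varies strongly by neighborhood:

# Sketch: neighborhood-wise median imputation for LotFrontage
# (LotFrontage and Neighborhood are real columns in the Kaggle house-prices data)
lot_by_hood = train.groupby('Neighborhood')['LotFrontage'].transform('median')
train_num_alt = train_num.copy()
train_num_alt['LotFrontage'] = train['LotFrontage'].fillna(lot_by_hood)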


from scipy.stats import skew
skewness = train_num.apply(lambda x: skew(x))  # apply skew() to each numeric column
skewness.sort_values(ascending=False)


skewness = skewness[abs(skewness) > 0.5]  # keep the features whose absolute skewness exceeds 0.5
skewness.index  # the names of those features

skew_features = train_num[skewness.index]  # select those columns (from train_num, so the median-filled values are used and no NaNs sneak back in)
skew_features.columns


# We can treat the skewness of a feature with a log transformation, so we apply log1p here
skew_features = np.log1p(skew_features)  # log1p every value in the selected columns
train_num[skew_features.columns] = skew_features  # write the transformed columns back, otherwise the transform would be lost
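A quick check (again a sketch, not in the original) confirms the transform helped: recomputing the skewness of the transformed columns should give values much closer to zero than before.

print(skew_features.apply(lambda x: skew(x)).sort_values(ascending=False).head())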
train_cat.head()


str(train_cat.isnull().values.sum())  # how many missing values the categorical features contain

Modeling

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer  # make_scorer builds a scorer from a metric or loss function
import matplotlib.pyplot as plt
import seaborn as sns
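# NOTE (assumption): the scraped post appears to have lost the one-hot encoding step.
# Without it the concatenated frame below still contains string columns and the
# linear models further down would fail to fit. A minimal fix in the usual pandas way:
train_cat = pd.get_dummies(train_cat)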
train = pd.concat([train_cat, train_num], axis=1)  # recombine the preprocessed categorical and numeric frames
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.3, random_state=0)

# Cross-validated RMSE on the training and test splits
n_folds = 5
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold  # K-fold cross-validation
scorer = make_scorer(mean_squared_error, greater_is_better=False)  # defined but unused below; cross_val_score is given a scoring string instead
def rmse_CV_train(model):
    # Pass the KFold splitter itself; calling .get_n_splits() would reduce it to a plain
    # integer and silently discard the shuffling and random_state
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=kf))
    return rmse
def rmse_CV_test(model):
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X_test, y_test, scoring="neg_mean_squared_error", cv=kf))
    return rmse
#Linear model without Regularization
lr = LinearRegression()
lr.fit(X_train,y_train)
test_pre = lr.predict(X_test)
train_pre = lr.predict(X_train)
print('rmse on train',rmse_CV_train(lr).mean())
print('rmse on test', rmse_CV_test(lr).mean())


# Plot predicted values against residuals (residual = predicted value minus true value)
plt.scatter(train_pre, train_pre - y_train, c = "blue",  label = "Training data")
plt.scatter(test_pre, test_pre - y_test, c = "black",  label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
plt.show()


# Plot predictions against real values
plt.scatter(train_pre, y_train, c = "blue",  label = "Training data")
plt.scatter(test_pre, y_test, c = "black",  label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")
plt.show()


Regularization is a very useful method to handle collinearity, filter noise out of the data, and ultimately prevent overfitting. The concept behind regularization is to introduce additional information (a bias) that penalizes extreme parameter weights.
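Concretely, ridge regression adds an L2 penalty on the weights to the least-squares objective, while the lasso uses an L1 penalty (the standard formulas, stated here for reference, not taken from the original post):

    minimize over w:  ||y - Xw||^2 + alpha * ||w||_2^2    (ridge, L2 penalty)
    minimize over w:  ||y - Xw||^2 + alpha * ||w||_1      (lasso, L1 penalty)

A larger alpha shrinks the weights more aggressively; alpha = 0 recovers ordinary least squares.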

# RidgeCV is ridge regression with built-in cross-validation; by default it uses an efficient
# form of leave-one-out (generalized) cross-validation. alpha is the regularization strength.
# Ridge: for a fixed alpha, find the best w; larger alpha forces a smaller-norm w.
# RidgeCV: try several alphas, fit the best w for each, and keep the best (alpha, w) pair.
ridge = RidgeCV(alphas = [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])

ridge.fit(X_train,y_train)
alpha = ridge.alpha_  # the best alpha found in this first pass
print('best alpha',alpha)

print("Try again for more precision with alphas centered around " + str(alpha))
ridge = RidgeCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85, 
                          alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
                          alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4],cv = 5)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print("Best alpha :", alpha)
print("Ridge RMSE on Training set :", rmse_CV_train(ridge).mean())#K折交叉验证结果的均值
print("Ridge RMSE on Test set :", rmse_CV_test(ridge).mean())
y_train_rdg = ridge.predict(X_train)#岭回归的返回分数
y_test_rdg = ridge.predict(X_test)


print("Kcv RMSE on Training set :", y_train_rdg.mean())#K折交叉验证结果的均值
print("Kcv RMSE on Test set :", y_test_rdg.mean())


coef = pd.Series(ridge.coef_, index = X_train.columns)

print("Ridge picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")


# Plot residuals
plt.scatter(y_train_rdg, y_train_rdg - y_train, c = "blue",  label = "Training data")
plt.scatter(y_test_rdg, y_test_rdg - y_test, c = "black", marker = "v", label = "Validation data")
plt.title("Linear regression with Ridge regularization")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
plt.show()


# Plot predictions - Real values
plt.scatter(y_train_rdg, y_train, c = "blue",  label = "Training data")
plt.scatter(y_test_rdg, y_test, c = "black",  label = "Validation data")
plt.title("Linear regression with Ridge regularization")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")
plt.show()
