house price
Kaggle House Price Prediction
Link: link
For personal study and review.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
train_data = pd.read_csv('F:/跨媒体计算实验组/NLP/数据集/house price/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('F:/跨媒体计算实验组/NLP/数据集/house price/house-prices-advanced-regression-techniques/test.csv')
train = train_data.copy()
test = test_data.copy()
train.shape,test.shape
# check for duplicate Ids
idsUnique = len(set(train.Id))  # a set keeps only unique values
idsTotal = train.shape[0]
# the total count minus the unique count gives the number of duplicate Ids
idsdupe = idsTotal - idsUnique
print(idsdupe)  # output is 0
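The same check can be written as a pandas one-liner; a minimal equivalent:
# equivalent duplicate check using pandas directly
print(train.Id.duplicated().sum())  # 0 -> every Id is unique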
#drop id col
train.drop(['Id'],axis=1,inplace=True)
Next, some visualization.
# correlation matrix
corrmat = train.corr()
f,ax = plt.subplots(figsize=(20,9))
sns.heatmap(corrmat,vmax=.8,annot=True)
# most correlated features
corrmat = train.corr()
top_corr_features = corrmat.index[abs(corrmat['SalePrice'])>0.5]  # corrmat.index holds all feature names; keep those whose correlation with SalePrice exceeds 0.5 in absolute value
plt.figure(figsize=(10,10))
g = sns.heatmap(train[top_corr_features].corr(), annot=True, cmap='RdYlGn')  # correlations among the selected features themselves
# the next plot shows how OverallQual affects SalePrice (it is highly correlated with the target)
sns.barplot(x=train.OverallQual, y=train.SalePrice)
# below, the relationship between each of these features and SalePrice
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(train[cols], height=2.5)
plt.show()
Since the ultimate goal is to predict the sale price, we now analyze that variable.
from scipy import stats
from scipy.stats import norm, skew  # for some statistics: norm implements the normal distribution; skew measures how asymmetric a density curve is about its mean
# intuitively, skewness reflects the relative length of the tails of the density curve
sns.distplot(train['SalePrice'], fit=norm)  # histogram with a fitted normal curve
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])  # returns mu (mean) and sigma (standard deviation)
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))  # {:.2f} keeps two decimal places
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)  # Q-Q plot to check the sample against a theoretical distribution (normal by default)
plt.show()
train.SalePrice = np.log1p(train.SalePrice)  # log-transform the target to reduce skew (the inverse is expm1)
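Since the target now lives on the log scale, final predictions have to be mapped back with expm1; a minimal round-trip sketch (the price value is made up for illustration):
# log1p and expm1 are exact inverses: dollars -> log scale -> dollars
price = np.array([200000.0])
assert np.allclose(np.expm1(np.log1p(price)), price)
# so a model prediction pred_log on the log scale becomes np.expm1(pred_log) in dollars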
y = train.SalePrice
# preprocessing: scatter plot of above-ground living area (GrLivArea) against the log sale price
plt.scatter(y=train.SalePrice,x=train.GrLivArea,c='black')
plt.show()
train_nas = train.isnull().sum()  # count missing values per column
train_nas = train_nas[train_nas>0]  # keep only the columns that have missing values
train_nas.sort_values(ascending = False)  # sort by missing count
# same operation for the test set
test_nas = test.isnull().sum()
test_nas = test_nas[test_nas>0]
test_nas.sort_values(ascending = False)
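Raw counts can be hard to judge on their own; a quick sketch of the missing-value ratio per column:
# fraction of missing values per affected column, highest first
na_ratio = (train.isnull().sum() / len(train)).sort_values(ascending=False)
print(na_ratio[na_ratio > 0])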
print("Find most important features relative to target")
corr = train.corr()  # correlation matrix of the features
corr.sort_values(['SalePrice'],ascending=False,inplace=True)  # sort by correlation with SalePrice
print(corr.SalePrice)
# Differentiate numerical features (minus the target) and categorical features
categorical_features = train.select_dtypes(include=['object']).columns  # categorical features only
categorical_features
numerical_features = train.select_dtypes(exclude=['object']).columns  # numerical (non-categorical) features
numerical_features
numerical_features = numerical_features.drop('SalePrice')  # drop the target from the numerical features
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
train_num = train[numerical_features]
train_cat = train[categorical_features]
# NOTE: I simply used median() to fill NA values; there is actually a lot to explore when doing feature engineering, but this notebook aims to keep things simple (no heavy code)
## Handle remaining missing values for numerical features by using the median as replacement
print('NAs for numerical features in train:' + str(train_num.isnull().values.sum()))
train_num = train_num.fillna(train_num.median())
print('Remaining NAs for numerical features in train:'+str(train_num.isnull().values.sum()))
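As the note above says, there is a lot more to explore here. One common refinement for this dataset, shown only as a hypothetical sketch (this notebook sticks with the plain median), is to fill LotFrontage using the median within each Neighborhood:
# hypothetical refinement: impute LotFrontage from the median of houses in the same Neighborhood
lot_frontage = train_data.groupby('Neighborhood')['LotFrontage'].transform(lambda s: s.fillna(s.median()))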
from scipy.stats import skew
skewness = train_num.apply(lambda x: skew(x))  # apply skew() to every column
skewness.sort_values(ascending=False)
skewness = skewness[abs(skewness)>0.5]
skewness.index  # the feature names remaining in the DataFrame index
skew_features = train[skewness.index]  # select those features from the training set (numerical, with |skew| > 0.5)
skew_features.columns
# we can treat the skewness of a feature with the help of a log transformation, so we'll apply that here
skew_features = np.log1p(skew_features)  # log-transform every value in skew_features
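To confirm the transform actually helps, skewness can be compared before and after; a minimal check on the NA-filled numeric frame:
# sanity check: |skew| shrinks after log1p
before = train_num[skewness.index].apply(skew)
after = np.log1p(train_num[skewness.index]).apply(skew)
print(pd.concat([before, after], axis=1, keys=['before', 'after']).sort_values('before', ascending=False).head())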
train_cat.head()
str(train_cat.isnull().values.sum())  # check for missing values among the categorical features -- 0
Now we move on to modeling.
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer  # make_scorer builds a scorer from a performance metric or loss function
import matplotlib.pyplot as plt
import seaborn as sns
train_cat = pd.get_dummies(train_cat)  # one-hot encode the categorical features; the linear models below need numeric input
train = pd.concat([train_cat,train_num],axis=1)  # recombine the preprocessed categorical and numerical parts
X_train,X_test,y_train,y_test = train_test_split(train,y,test_size = 0.3,random_state= 0)
# use cross-validation to evaluate the training and test splits separately
n_folds = 5
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold  # K-fold cross-validation
scorer = make_scorer(mean_squared_error,greater_is_better = False)
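This scorer is interchangeable with the "neg_mean_squared_error" string used in the helpers below; a small usage sketch (the sign is flipped back because greater_is_better=False negates the loss):
demo_scores = cross_val_score(LinearRegression(), X_train, y_train, scoring=scorer, cv=5)
print(np.sqrt(-demo_scores).mean())  # RMSE recovered from the negated MSE scores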
def rmse_CV_train(model):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)  # pass the KFold object itself; get_n_splits() would only return the integer 5 and drop the shuffling
    rmse = np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=kf))
    return rmse
def rmse_CV_test(model):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X_test, y_test, scoring="neg_mean_squared_error", cv=kf))
    return rmse
#Linear model without Regularization
lr = LinearRegression()
lr.fit(X_train,y_train)
test_pre = lr.predict(X_test)
train_pre = lr.predict(X_train)
print('rmse on train',rmse_CV_train(lr).mean())
print('rmse on test',rmse_CV_test(lr).mean())
#plot between predicted values and residuals
plt.scatter(train_pre, train_pre - y_train, c = "blue", label = "Training data")  # a residual is the difference between prediction and true value
plt.scatter(test_pre,test_pre - y_test, c = "black", label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
plt.show()
# Plot predictions against real values
plt.scatter(train_pre, y_train, c = "blue", label = "Training data")
plt.scatter(test_pre, y_test, c = "black", label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")
plt.show()
Regularization is a very useful method to handle collinearity, filter out noise from data, and eventually prevent overfitting.
The concept behind regularization is to introduce additional information (bias) to penalize extreme parameter weights.
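One quick way to see the penalty at work is to watch the overall weight norm shrink as alpha grows; a minimal sketch with the plain Ridge estimator:
from sklearn.linear_model import Ridge
# a larger alpha means a stronger penalty, hence a smaller coefficient norm
for a in [0.01, 1, 100]:
    r = Ridge(alpha=a).fit(X_train, y_train)
    print(a, np.linalg.norm(r.coef_))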
# RidgeCV is ridge regression with built-in cross-validation; by default it performs an efficient form of leave-one-out CV. alpha is the regularization strength.
# Ridge: fix one alpha and solve for the best w; the larger alpha is, the smaller the norm of w.
# RidgeCV: try several alphas, solve for the best w under each, and keep the best w with its alpha.
ridge = RidgeCV(alphas = [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])
ridge.fit(X_train,y_train)
alpha = ridge.alpha_  # the best alpha found in this first pass
print('best alpha',alpha)
print("Try again for more precision with alphas centered around " + str(alpha))
ridge = RidgeCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85,
alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4],cv = 5)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print("Best alpha :", alpha)
print("Ridge RMSE on Training set :", rmse_CV_train(ridge).mean())#K折交叉验证结果的均值
print("Ridge RMSE on Test set :", rmse_CV_test(ridge).mean())
y_train_rdg = ridge.predict(X_train)  # ridge predictions
y_test_rdg = ridge.predict(X_test)
print("Mean prediction on Training set :", y_train_rdg.mean())  # mean of the predictions, not an RMSE
print("Mean prediction on Test set :", y_test_rdg.mean())
coef = pd.Series(ridge.coef_, index = X_train.columns)
print("Ridge picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")
# Plot residuals
plt.scatter(y_train_rdg, y_train_rdg - y_train, c = "blue", label = "Training data")
plt.scatter(y_test_rdg, y_test_rdg - y_test, c = "black", marker = "v", label = "Validation data")
plt.title("Linear regression with Ridge regularization")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
plt.show()
# Plot predictions - Real values
plt.scatter(y_train_rdg, y_train, c = "blue", label = "Training data")
plt.scatter(y_test_rdg, y_test, c = "black", label = "Validation data")
plt.title("Linear regression with Ridge regularization")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")
plt.show()
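The imports above also bring in LassoCV, which was never used. Unlike the L2 penalty, the L1 penalty can drive coefficients exactly to zero, so a Lasso variant of the same flow doubles as feature selection; a minimal sketch mirroring the ridge steps (the alpha grid is an assumption, not tuned):
lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1], max_iter=50000, cv=5)
lasso.fit(X_train, y_train)
print("Best alpha :", lasso.alpha_)
print("Lasso RMSE on Training set :", rmse_CV_train(lasso).mean())
print("Lasso RMSE on Test set :", rmse_CV_test(lasso).mean())
coef = pd.Series(lasso.coef_, index=X_train.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")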