First ML Project Procedure
大框架
对课题有个整体的了解,明确背景,情况,矛盾,目标。
明确问题
了解解决这个问题的作用,也许是为了其他工作做准备,故抓重点,明流程。
评判标准
Root Mean Square Error (RMSE):均方根误差
Mean Absolute Error (MAE):平均绝对误差
检查假设
记录所有的假设,标准,定义等。例如0-100 -> cheap, 100-2000 -> medium 等
获取数据
工具
交互式python工具 jupyter ,Anaconda Navigator
python 独立环境 virtualenv , 廖雪峰关于virtualenv的教程
下载数据
import os
import tarfile
from six.moves import urllib
# Base URL under which the dataset archive is hosted.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Relative path of the housing data; used for the remote URL below and as the
# default local directory of the fetch/load helpers.
HOUSING_PATH = "datasets/housing"
# Full URL of the compressed housing archive.
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Local directory that receives the archive and its
            extracted contents; it is created when missing.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use it with a trusted download source.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
    print("fetch_housing_data is done!")
数据大观
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
# First look at the data (notebook cells; bare expressions rely on notebook display).
housing = load_housing_data()
housing # show the whole DataFrame
housing.head() # show the first rows
housing.info() # show the data type of each column
housing["ocean_proximity"].value_counts() # show how often each value of this attribute occurs
housing.describe() # summarize all numeric columns
%matplotlib inline
import matplotlib.pyplot as plt
housing["median_income"].hist(bins=20, figsize=(15,10)) # histogram of a single attribute
# bins: number of histogram bars; figsize: width and height of the figure
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15)) # histograms of all attributes
plt.show()
划分测试集
随机划分法
import numpy as np
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a training part and a test part.

    Row positions are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * test_ratio)`` shuffled positions form the test set and
    the remaining positions form the training set.

    Args:
        data: DataFrame to split (rows are selected with ``iloc``).
        test_ratio: Fraction of the rows that goes into the test set.

    Returns:
        A ``(train, test)`` tuple of row subsets of ``data``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    test_part, train_part = order[:n_test], order[n_test:]
    return data.iloc[train_part], data.iloc[test_part]
# 80/20 random split; no random seed is fixed, so each run yields a different split.
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")
每次划分都会产生不同的测试集和训练集,以致于多次操作后,ML算法会经历所有的数据。
ID-HASH法
import hashlib
def test_set_check(identifier, test_ratio, hash):
    """Decide whether the row with ``identifier`` belongs to the test set.

    The identifier is converted to ``np.int64`` and hashed with ``hash``;
    the last byte of the digest is compared with ``256 * test_ratio``.

    Args:
        identifier: Row identifier to hash.
        test_ratio: Fraction of identifiers that should land in the test set.
        hash: Hash constructor (e.g. ``hashlib.md5``) whose result provides
            ``digest()``.

    Returns:
        True when the last digest byte is below the threshold.
    """
    digest = hash(np.int64(identifier)).digest()
    last_byte = digest[-1]
    threshold = 256 * test_ratio
    return last_byte < threshold
def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    """Split ``data`` into train/test parts based on a hash of each row's id.

    Args:
        data: DataFrame to split.
        test_ratio: Fraction passed on to ``test_set_check``.
        id_column: Name of the column that holds the row identifiers.
        hash: Hash constructor passed on to ``test_set_check``.

    Returns:
        A ``(train, test)`` tuple: rows whose id is not / is selected by
        ``test_set_check``.
    """
    def _goes_to_test(id_):
        return test_set_check(id_, test_ratio, hash)

    test_mask = data[id_column].apply(_goes_to_test)
    train_rows = data.loc[~test_mask]
    test_rows = data.loc[test_mask]
    return train_rows, test_rows
# reset_index() adds an `index` column, which is used as the row identifier here.
housing_with_id = housing.reset_index()
print(len(housing_with_id) )
print(len(housing) )
# Hash-based 80/20 split keyed on the "index" column.
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
print(len(train_set), "train +",len(test_set) , "test")
虽然该方法每次对于固定的数据来说,都会划分一致的训练集和测试集,但是也对未来新的数据有一些要求。新的数据必须在旧数据末尾追加,并且不能丢失旧数据,旧数据顺序不能改变。
对此进行改进如下:
housing_with_id = housing.reset_index() # adds an `index` column (as an id it would require new data to be appended and no old rows removed)
# Build the identifier from the coordinates instead of the row position.
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
print(len(housing_with_id) )
print(len(housing) )
# Hash-based 80/20 split keyed on the custom "id" column.
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
print(len(train_set), "train +",len(test_set) , "test")
自定义id,这样就避免了刚才提到的约束。
分层抽样法
例如某个社会有53%的男性,47%的女性。现需要对某个与性别无关的问题进行社会抽样调查100人,就应在所有男性中随机抽取53名男性,所有女性中抽取47名女性。
本案例中,对median_income进行分层。
# Bucket median_income into categories: divide by 1.5 and round up.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Merge every category of 5 and above into category 5.0. The result is
# assigned back because calling where(..., inplace=True) on a selected column
# is chained in-place mutation and is not guaranteed to update the DataFrame.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)
然后进行分层抽样
from sklearn.model_selection import StratifiedShuffleSplit
# A single stratified 80/20 shuffle split with a fixed random_state, using the
# income category as the stratification key.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
别忘了去除过程属性“income_cat”
# Share of each income category in the full dataset.
housing["income_cat"].value_counts() / len(housing)
# Remove the helper column from both splits. The loop variable is called
# `subset` because `set` would shadow the builtin for the rest of the session.
for subset in (strat_train_set, strat_test_set):
    subset.drop(["income_cat"], axis=1, inplace=True)
数据可视化与规律总结
# Work on a copy so later changes to `housing` leave strat_train_set untouched.
housing = strat_train_set.copy()
地理信息可视化
import matplotlib.pyplot as plt
housing.plot(kind="scatter", x="longitude", y="latitude") # plain scatter plot
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1) # translucent scatter plot
# Marker size comes from population / 100, marker color from median_house_value.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
s=housing["population"]/100, label="population",
c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
) # colored scatter plot; the original author notes that it may fail to run
plt.legend()
相关性
# Standard correlation coefficient between every pair of numeric attributes.
# Non-numeric columns (the text attribute ocean_proximity) are excluded
# explicitly, because recent pandas versions raise on them instead of
# silently dropping them.
corr_matrix = housing.select_dtypes(include="number").corr()
# Correlation of every attribute with median_house_value, largest first.
corr_matrix["median_house_value"].sort_values(ascending=False)
取值范围为[-1,1],-1表示负相关性,1表示正相关性。0表示无关。
对选定属性之间进行相关性检查,若有4个属性,则有16次检查。
from pandas.plotting import scatter_matrix
# Attributes to compare pairwise in the scatter matrix.
attributes = ["median_house_value", "median_income", "total_rooms",
"housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
其中
# Scatter plot of median_income against median_house_value with translucent points.
housing.plot(kind="scatter", x="median_income", y="median_house_value",
alpha=0.1)
发现
- 几乎正相关
- 500k,450k,350k,280k处有隐约的“横线”,应为outlier,移除之
属性调整
增加更“有用”的属性。这个要自己判断了。
# Derived ratio attributes computed from the existing count columns.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]
再查看相关性的变化
# Recompute the correlations now that the derived attributes exist. Only
# numeric columns are used, because recent pandas versions raise on the text
# attribute ocean_proximity instead of silently dropping it.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
处理数据
在训练集中,不需要被预测的属性
# Predictors: the stratified training set without the target column.
housing = strat_train_set.drop("median_house_value" , axis=1)
# Labels: a copy of the target column.
housing_labels = strat_train_set["median_house_value"].copy()
数据清理
某些行丢失了某些属性,有三种处理方法:
- 丢弃所有丢失数据的行。
- 丢弃某个属性。
- 以某个值填充缺失项(如0、平均数或中位数;下文使用中位数)。
# NOTE: none of these calls assigns its result or uses inplace=True, so they
# only illustrate the three options and leave `housing` as it is.
housing.dropna(subset=["total_bedrooms"]) # option 1: drop the rows that miss total_bedrooms
housing.drop("total_bedrooms", axis=1) # option 2: drop the whole attribute
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median) # option 3: fill the gaps with the median
使用Imputer策略填充器
# `Imputer` was removed from sklearn.preprocessing in newer scikit-learn
# releases; its replacement is sklearn.impute.SimpleImputer. Importing it
# under the old name keeps every later `Imputer(...)` call in this file
# working on both old and new versions.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # older scikit-learn without sklearn.impute
    from sklearn.preprocessing import Imputer
import pandas as pd

imputer = Imputer(strategy="median")  # fill missing values with the column median
housing_num = housing.drop("ocean_proximity", axis=1)  # drop the text attribute
imputer.fit(housing_num)
# The learned statistics and the medians computed by pandas should agree.
print("imputer.statistics = " ,imputer.statistics_)
print("housing_num.median().values = " , housing_num.median().values)
类型转化
X = imputer.transform(housing_num) # transform the pandas DataFrame into a NumPy array with the gaps filled
print("X = ",X)
housing_tr = pd.DataFrame(X, columns=housing_num.columns) # wrap the NumPy array in a pandas DataFrame again
文本数据
文本数据数值化,聚类分值[0,1,2,3,4……]
from sklearn.preprocessing import LabelEncoder # turns text values into integer codes
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
# Compare the original text values with their encoded counterparts.
print(housing_cat)
print(housing_cat_encoded)
print("encoder.classes = " , encoder.classes_)
one-hot-encoder[0,1,0,0,0]
from sklearn.preprocessing import OneHotEncoder # one-hot encoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1)) # turn the integer codes into 0/1 columns
housing_cat_1hot # stored as the positions of the entries equal to 1
housing_cat_1hot.toarray() # convert to a dense array
from sklearn.preprocessing import LabelBinarizer # does the previous two steps at once
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat) # turn the text values directly into 0/1 columns
housing_cat_1hot
自定义转换器
from sklearn.base import BaseEstimator, TransformerMixin
# Column positions read by CombinedAttributesAdder.transform; presumably the
# total_rooms, total_bedrooms, population and households columns — TODO confirm
# against the column order of the array that is passed in.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    ``transform`` always appends rooms-per-household and
    population-per-household; bedrooms-per-room is appended as well when
    ``add_bedrooms_per_room`` is true. Column positions are taken from the
    module-level ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and
    ``household_ix``.
    """

    def __init__(self, add_bedrooms_per_room = True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        return np.c_[tuple([X] + extra_columns)]
# Append only the two per-household ratios (bedrooms_per_room is switched off).
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
数据范围
归一化方法 normalization
min-max scaling:取值[0,1],Y(x) = (x-min)/(max-min) ,容易受到噪声的影响,如果min或者max是异常值,就很惨。
标准化方法 standardization
standardization:Y(x)=(x-average)/标准差 。结果均值为0,标准差为1;与min-max scaling不同,取值不限定在固定范围内,但受异常值的影响较小。
Pipeline
from sklearn.pipeline import Pipeline #pipeline from Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
# Numeric preprocessing chain: impute medians, add the ratio attributes, then
# standardize.
num_pipeline = Pipeline([
('imputer', Imputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),])
housing_num_tr = num_pipeline.fit_transform(housing_num)
当调用pipeline的fit函数时,会依次调用每步操作的fit_transform函数,直到最后一步只调用fit函数。
pipeline拥有最后一个estimator的方法。
FeatureUnion
pipeline的pipeline。
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects columns of a DataFrame and returns their values."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return ``.values`` of the columns named in ``attribute_names``."""
        selected = X[self.attribute_names]
        return selected.values
from sklearn.pipeline import FeatureUnion #pipeline from FeatureUnion
# Column names handled by the numeric and the categorical branch.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
# Numeric branch: select columns, impute medians, add ratio attributes, standardize.
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('imputer', Imputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
# Categorical branch: select the text column and binarize it.
# NOTE(review): LabelBinarizer is meant for labels; newer scikit-learn releases
# reportedly reject it as a Pipeline step — confirm against the installed version.
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('label_binarizer', LabelBinarizer()),
])
# Run both branches and concatenate their outputs.
full_pipeline = FeatureUnion(transformer_list=[
("num_pipeline", num_pipeline),
("cat_pipeline", cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
housing_prepared.shape
# NOTE(review): retpreparedData is not defined in the visible part of this
# file — confirm that it exists before running this line.
retpreparedData()
选择模型
def display_scores(scores):
    """Print ``scores`` followed by its mean and its standard deviation.

    Args:
        scores: Object providing ``mean()`` and ``std()`` (e.g. a NumPy array).
    """
    print("Scores:", scores)
    for label, statistic in (("Mean:", scores.mean),
                             ("Standard deviation:", scores.std)):
        print(label, statistic())
线性回归模型
from sklearn.linear_model import LinearRegression #线性回归模型
# Fit a linear regression on the prepared training data.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
# Sanity check: compare predictions and labels for the first five rows.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))
用均方根误差和交叉验证来评判优劣。
from sklearn.metrics import mean_squared_error
# RMSE of the linear model on the data it was trained on.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
from sklearn.model_selection import cross_val_score
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
scoring="neg_mean_squared_error", cv=10) # 10-fold cross-validation of the linear regression model
# The scores are negative MSE values, so negate them before taking the root.
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
判断出欠拟合:说明特征提供的信息不足,或模型不够强;可以换更强的模型、提供更好的特征,或减少对模型的约束(仅增加数据通常无法解决欠拟合)。
决策树模型
from sklearn.tree import DecisionTreeRegressor #决策树模型
# Fit a decision tree on the prepared training data and predict on the same data.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
用均方根误差和交叉验证来评判优劣。
# RMSE of the decision tree on the data it was trained on.
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
# 10-fold cross-validation of tree_reg (not lin_reg), so that the reported
# scores actually describe the decision tree evaluated in this section.
tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                              scoring="neg_mean_squared_error", cv=10)
# The scores are negative MSE values, so negate them before taking the root.
tree_rmse_scores = np.sqrt(-tree_scores)
display_scores(tree_rmse_scores)
随机森林模型
from sklearn.ensemble import RandomForestRegressor #随机森林模型
# Fit a random forest on the prepared training data and predict on the same data.
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
from sklearn.metrics import mean_squared_error
housing_predictions = forest_reg.predict(housing_prepared)
用均方根误差和交叉验证来评判优劣。
# RMSE of the random forest on the data it was trained on.
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
from sklearn.model_selection import cross_val_score # used to cross-validate the random forest model
scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
scoring="neg_mean_squared_error", cv=10)
# The scores are negative MSE values, so negate them before taking the root.
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)
微调参数
Grid Search
from sklearn.model_selection import GridSearchCV #Grid Search
# Two parameter grids: 3 x 4 combinations with the default bootstrap setting,
# and 2 x 3 combinations with bootstrap switched off.
param_grid = [
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
# 5-fold cross-validation per combination, scored by negative MSE.
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
查看该方法调参的最好结果
# Best parameter combination and the estimator built with it.
grid_search.best_params_
grid_search.best_estimator_
# The grid search is scored with negative MSE, so negate each mean score
# before taking the root to print an RMSE per parameter combination.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
查看整体效果
# Importance of every input column according to the best estimator.
feature_importances = grid_search.best_estimator_.feature_importances_
# Display the array; the name must match the assignment above (the singular
# `feature_importance` is undefined and raises NameError).
feature_importances
# Names of the columns in the order in which the pipeline emits them: numeric
# attributes, the added ratio attributes, then the one-hot categories.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# Pair each importance with its attribute name, most important first.
sorted(zip(feature_importances, attributes), reverse=True)
RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
# Sample n_estimators and max_features from integer ranges.
param_grid = {'n_estimators': sp_randint(3,30), 'max_features': sp_randint(2,10)}
forest_reg = RandomForestRegressor()
# Score with negative MSE, like the grid search: the results are later turned
# into RMSE via np.sqrt(-mean_score), which is meaningless for the default
# regressor score.
rand_search = RandomizedSearchCV(forest_reg, param_grid, n_iter=10, random_state=5,
                                 scoring='neg_mean_squared_error')
rand_search.fit(housing_prepared, housing_labels)
查看该方法调参的最好结果
# Best sampled parameter combination and the estimator built with it.
rand_search.best_params_
rand_search.best_estimator_
# NOTE(review): the printed value is an RMSE only when rand_search was scored
# with neg_mean_squared_error — confirm the `scoring` argument it was built with.
cvres = rand_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
发布ML
# Evaluate the best grid-search estimator on the held-out stratified test set.
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
# transform() only: the pipeline fitted on the training data is reused, not refitted.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse