Case study: breast cancer prediction with Logistic Regression
We use the logistic regression algorithm for breast-cancer detection, with the breast cancer dataset that ships with scikit-learn.
1 Importing the data
from sklearn.datasets import load_breast_cancer

# Load the built-in Wisconsin breast cancer dataset
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
print('data shape:{0};positive:{1},negative:{2}'.format(X.shape, y[y==1].shape, y[y==0].shape))
print('First two rows of the breast cancer data:')
print(cancer.data[0:2])
data shape:(569, 30);positive:(357,),negative:(212,)
First two rows of the breast cancer data:
[[ 1.79900000e+01 1.03800000e+01 1.22800000e+02 1.00100000e+03
1.18400000e-01 2.77600000e-01 3.00100000e-01 1.47100000e-01
2.41900000e-01 7.87100000e-02 1.09500000e+00 9.05300000e-01
8.58900000e+00 1.53400000e+02 6.39900000e-03 4.90400000e-02
5.37300000e-02 1.58700000e-02 3.00300000e-02 6.19300000e-03
2.53800000e+01 1.73300000e+01 1.84600000e+02 2.01900000e+03
1.62200000e-01 6.65600000e-01 7.11900000e-01 2.65400000e-01
4.60100000e-01 1.18900000e-01]
[ 2.05700000e+01 1.77700000e+01 1.32900000e+02 1.32600000e+03
8.47400000e-02 7.86400000e-02 8.69000000e-02 7.01700000e-02
1.81200000e-01 5.66700000e-02 5.43500000e-01 7.33900000e-01
3.39800000e+00 7.40800000e+01 5.22500000e-03 1.30800000e-02
1.86000000e-02 1.34000000e-02 1.38900000e-02 3.53200000e-03
2.49900000e+01 2.34100000e+01 1.58800000e+02 1.95600000e+03
1.23800000e-01 1.86600000e-01 2.41600000e-01 1.86000000e-01
2.75000000e-01 8.90200000e-02]]
print('The breast cancer features are:')
cancer.feature_names
The breast cancer features are:
array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error', 'fractal dimension error',
'worst radius', 'worst texture', 'worst perimeter', 'worst area',
'worst smoothness', 'worst compactness', 'worst concavity',
'worst concave points', 'worst symmetry', 'worst fractal dimension'],
dtype='<U23')
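For readability, the same data can be viewed as a labelled table (a minimal sketch; pandas is an extra dependency not otherwise used in this walkthrough):
import pandas as pd

# Attach the feature names to the raw array for easier inspection
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
print(df.head(2))                   # same two rows as above, now with named columns
print(df['target'].value_counts()) # 357 samples of class 1, 212 of class 0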
2 Model training
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing (no random_state is fixed,
# so the exact scores below will vary from run to run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('train score:{0};test score:{1}'.format(train_score, test_score))
train score:0.9538461538461539;test score:0.9649122807017544
# Predict on the test set
import numpy as np
y_pred = model.predict(X_test)
print('number of correct predictions / total test samples')
# Count how many predictions agree with the true labels
print('match:{}/{}'.format(np.sum(y_pred == y_test), y_test.shape[0]))
number of correct predictions / total test samples
match:110/114
Four of the 114 test samples are misclassified, which is consistent with test_score: model.score reports accuracy, i.e. the fraction of correctly predicted samples, and 110/114 ≈ 0.9649.
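As a quick sanity check, the score can be reproduced by hand from the match count (a minimal sketch; accuracy_score is scikit-learn's built-in accuracy metric):
from sklearn.metrics import accuracy_score

n_correct = np.sum(y_pred == y_test)   # number of matching predictions
print(n_correct / y_test.shape[0])     # fraction correct, here 110/114
print(accuracy_score(y_test, y_pred))  # same value as model.score(X_test, y_test)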
# Prediction probabilities: find the samples predicted with probability below 0.9
print('Form of the probability output')
y_pred_proba = model.predict_proba(X_test)  # predicted class probabilities for each test sample
# Print the first sample to see the data layout
print('sample of predict probability:{0}'.format(y_pred_proba[0]))
# Column 0: keep samples whose probability of being negative exceeds 0.1
result = y_pred_proba[y_pred_proba[:, 0] > 0.1]
# Within result, column 1: keep samples whose probability of being positive exceeds 0.1
result = result[result[:, 1] > 0.1]
print('All samples predicted with probability below 0.9:')
result
Form of the probability output
sample of predict probability:[ 9.99999994e-01 6.07363434e-09]
All samples predicted with probability below 0.9:
array([[ 0.1247275 , 0.8752725 ],
[ 0.81486957, 0.18513043],
[ 0.80170649, 0.19829351],
[ 0.8841406 , 0.1158594 ],
[ 0.20235829, 0.79764171],
[ 0.74137791, 0.25862209],
[ 0.70031446, 0.29968554],
[ 0.10240006, 0.89759994],
[ 0.76374429, 0.23625571],
[ 0.10171958, 0.89828042],
[ 0.19183231, 0.80816769],
[ 0.13520306, 0.86479694],
[ 0.25013441, 0.74986559],
[ 0.24004072, 0.75995928],
[ 0.65056509, 0.34943491],
[ 0.62913915, 0.37086085]])
For these 16 samples neither class reaches a predicted probability of 0.9, so the model's confidence is quite low.
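A sample's confidence is the larger of its two class probabilities, and since the two columns sum to 1, keeping rows whose maximum probability is below 0.9 is equivalent to the two-stage filter above (a minimal one-line sketch):
# max(P) < 0.9 is equivalent to both columns exceeding 0.1
low_confidence = y_pred_proba[np.max(y_pred_proba, axis=1) < 0.9]
print('low-confidence samples: {}'.format(low_confidence.shape[0]))  # 16, matching the result above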
3 Model optimization
We try to improve the model's accuracy by adding polynomial features and applying L1 or L2 regularization.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

def polynomial_model(degree=1, **kwarg):
    """Chain polynomial feature generation with logistic regression."""
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    logistic_regression = LogisticRegression(**kwarg)
    pipeline = Pipeline([('polynomial_features', polynomial_features),
                         ('logistic_regression', logistic_regression)])
    return pipeline
import time

# Second-degree polynomial features with L1 regularization
# (recent scikit-learn versions require solver='liblinear' for penalty='l1')
model = polynomial_model(degree=2, penalty='l1', solver='liblinear')
start = time.time()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('polynomial_model(degree=2),elapsed:{0:.6f};train_score:{1:.6f};test_score:{2:.6f}'.format(time.time()-start, train_score, test_score))
polynomial_model(degree=2),elapsed:0.550339;train_score:1.000000;test_score:0.982456
The results show that with second-degree polynomial features and L1 regularization, both the training-set score and the test-set score improve.
L1 regularization drives most feature weights to zero, automatically selecting the features relevant to the model.
# Inspect the learned coefficients
logistic_regression = model.named_steps['logistic_regression']
print('model_parameter_shape:{0};count of non_zero element:{1}'.format(logistic_regression.coef_.shape, np.count_nonzero(logistic_regression.coef_)))
model_parameter_shape:(1, 495);count of non_zero element:91
After adding second-degree polynomial features, the number of input features grows from 30 to 495. With L1 regularization most of them are discarded, and only 91 features keep a nonzero coefficient.
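To see which of the 495 generated features survived, the nonzero coefficients can be mapped back to the polynomial feature names (a minimal sketch; get_feature_names_out requires scikit-learn 1.0 or newer):
polynomial_features = model.named_steps['polynomial_features']
# Names of all 495 generated features, derived from the 30 original ones
feature_names = polynomial_features.get_feature_names_out(cancer.feature_names)
nonzero_idx = np.flatnonzero(logistic_regression.coef_.ravel())
print('first few retained features:')
print(feature_names[nonzero_idx][:10])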
4 Learning curves
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from matplotlib import pyplot as plt
%matplotlib inline

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot the learning curve of an estimator on the given data."""
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    # Shaded bands mark one standard deviation around each mean score
    plt.fill_between(train_sizes, train_scores_mean-train_scores_std, train_scores_mean+train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean-test_scores_std, test_scores_mean+test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o--', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
    plt.legend(loc='best')
    return plt
# Learning curves for first- and second-degree polynomials with an L1 penalty
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
title = 'learning curves(degree={0},penalty={1})'
degrees = [1, 2]
penalty = 'l1'
start = time.time()
plt.figure(figsize=(10, 4))
for i in range(len(degrees)):
    plt.subplot(1, len(degrees), i+1)
    plot_learning_curve(polynomial_model(degree=degrees[i], penalty=penalty, solver='liblinear'),
                        title.format(degrees[i], penalty), X, y, ylim=(0.8, 1.01), cv=cv)
print('elapsed:{0:.6}s'.format(time.time()-start))
elapsed:31.1267s
# Learning curves for first- and second-degree polynomials with an L2 penalty
penalty = 'l2'
start = time.time()
plt.figure(figsize=(10, 4))
for i in range(len(degrees)):
    plt.subplot(1, len(degrees), i+1)
    plot_learning_curve(polynomial_model(degree=degrees[i], penalty=penalty, solver='liblinear'),
                        title.format(degrees[i], penalty), X, y, ylim=(0.8, 1.01), cv=cv)
print('elapsed:{0:.6f}s'.format(time.time()-start))
elapsed:7.630889s
The learning curves show that the model with second-degree polynomial features and L1 regularization performs best: it achieves both the highest training score and the highest cross-validation score.
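The same comparison can be automated with a grid search over the pipeline's hyperparameters (a hedged sketch: the parameter names follow the pipeline step names defined in polynomial_model, and solver='liblinear' is assumed so that both penalties are supported):
from sklearn.model_selection import GridSearchCV

param_grid = {
    'polynomial_features__degree': [1, 2],
    'logistic_regression__penalty': ['l1', 'l2'],
}
grid = GridSearchCV(polynomial_model(solver='liblinear'), param_grid, cv=cv)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)  # expected to favor degree=2 with the L1 penalty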