[Machine Learning] Logistic Regression
Logistic regression, also called logistic regression analysis, is a generalized linear regression model commonly used in data mining, automated disease diagnosis, economic forecasting, and similar fields. A typical application is studying the risk factors for a disease and using those factors to predict the probability that the disease occurs.
The sigmoid function
The sigmoid function maps any real number to a value in the interval [0, 1], which matches the value range of a probability.
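Concretely, the function (the same one defined in the code below) is

$$\sigma(t) = \frac{1}{1 + e^{-t}}$$

which tends to 0 as $t \to -\infty$ and to 1 as $t \to +\infty$.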
import numpy as np
from matplotlib import pyplot as plt

# Define the sigmoid function
def sigmoid(t):
    return 1 / (1 + np.exp(-t))

# Plot the sigmoid curve over [-10, 10]
x = np.linspace(-10, 10, 1000)
y = sigmoid(x)
plt.plot(x, y)
plt.show()
Logistic regression loss function: it measures how much the model's predictions disagree with the true labels.
Why choose cross-entropy as the loss function? Because squared loss causes a problem during training: when the prediction is far from the true value, the parameter update should be large, but with squared loss composed with the sigmoid, the gradient can actually shrink as the error grows, so the further off the prediction, the smaller the update and the slower the training. Cross-entropy avoids this.
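Written out explicitly (this matches the J and dj functions defined in the code below): for $m$ samples with labels $y \in \{0, 1\}$ and predictions $\hat{y} = \sigma(X_b \theta)$, the cross-entropy loss is

$$J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log \hat{y}^{(i)} + (1 - y^{(i)}) \log(1 - \hat{y}^{(i)}) \right]$$

and its gradient simplifies to $\nabla J(\theta) = \frac{1}{m} X_b^T (\hat{y} - y)$, which is proportional to the prediction error, so large errors produce large parameter updates.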
from sklearn import datasets

# Load iris, keep only the first two classes and the first two features
iris = datasets.load_iris()
x = iris.data
y = iris.target
x = x[y < 2, :2]
y = y[y < 2]
plt.scatter(x[y == 0, 0], x[y == 0, 1], color='r')
plt.scatter(x[y == 1, 0], x[y == 1, 1], color='b')
plt.show()
Train/test split
from sklearn.model_selection import train_test_split

train_data, test_data, train_label, test_label = train_test_split(x, y, random_state=666)
# Define the sigmoid function
def sigmoid(t):
    return 1 / (1 + np.exp(-t))

# Define the loss function (cross-entropy)
def J(theta, Xb, y):
    y_hat = sigmoid(Xb.dot(theta))
    return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)

# Define the derivative (gradient) of the loss function
def dj(theta, Xb, y):
    y_hat = sigmoid(Xb.dot(theta))
    return (Xb.T.dot(y_hat - y)) / len(y)

# Gradient descent: iterate until the loss stops decreasing by more than epsilon
def gradient_descent(Xb, y, initial_theta, eta, epsilon=1e-8):
    theta = initial_theta
    while True:
        gradient = dj(theta, Xb, y)
        last_theta = theta
        theta = theta - eta * gradient
        if abs(J(theta, Xb, y) - J(last_theta, Xb, y)) < epsilon:
            break
    return theta
# Add the intercept column and train with gradient descent
Xb = np.hstack([np.ones((len(train_data), 1)), train_data])
initial_theta = np.zeros(Xb.shape[1])
eta = 0.001
res_theta = gradient_descent(Xb, train_label, initial_theta, eta)
print(res_theta)
# Output: [-4.25636661  5.94884505 -8.99992917]
# Predict on the test set
X_test = np.hstack([np.ones((len(test_data), 1)), test_data])
predict_test = sigmoid(X_test.dot(res_theta))
predict_test = np.array(predict_test >= 0.5, dtype='int')
print(predict_test)
# Evaluate the model
from sklearn.metrics import accuracy_score
accuracy_score(test_label, predict_test)
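For comparison, the same binary problem can be solved with scikit-learn's built-in LogisticRegression; a minimal sketch, assuming the train_data/test_data split from above:

from sklearn.linear_model import LogisticRegression

# Fit sklearn's logistic regression on the same split
log_reg = LogisticRegression()
log_reg.fit(train_data, train_label)
# Mean accuracy on the held-out test set
log_reg.score(test_data, test_label)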
Polynomial logistic regression: add polynomial features to the inputs, so that a boundary that is linear in the expanded feature space becomes a curve in the original space (see the Pipeline example at the end of this article).
Regularization in logistic regression
Hyperparameters in logistic regression:
C controls the weight of the loss term relative to the regularization term when solving the optimization problem; a larger C means weaker regularization.
penalty selects the regularization type, L1 or L2, as sketched below.
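A minimal sketch of setting these two hyperparameters with scikit-learn (the values here are illustrative, not tuned):

from sklearn.linear_model import LogisticRegression

# C weights the loss term against the regularization term; penalty picks L1/L2
log_reg_l2 = LogisticRegression(C=0.1, penalty='l2')
# L1 requires a compatible solver such as 'liblinear' or 'saga'
log_reg_l1 = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')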
Using logistic regression to solve multiclass problems
Adaptation schemes: OvR (One vs Rest)
OvO (One vs One); a sketch of both follows.
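A minimal sketch of both schemes using scikit-learn's wrappers on the full three-class iris dataset (variable names here are illustrative):

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

iris = datasets.load_iris()
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target, random_state=666)

# OvR: one binary classifier per class, each class against all the others
ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X_tr, y_tr)
ovr.score(X_te, y_te)

# OvO: one binary classifier per pair of classes, majority vote at prediction time
ovo = OneVsOneClassifier(LogisticRegression())
ovo.fit(X_tr, y_tr)
ovo.score(X_te, y_te)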
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib.colors import ListedColormap
np.random.seed(666)
x = np.random.normal(0,1,size=[200,2])
y = np.array(x[:,0]**2+x[:,1]**2<1.5,dtype='int')
plt.scatter(x[y==0,0],x[y==0,1],color ='r')
plt.scatter(x[y==1,0],x[y==1,1],color ='b')
plt.show()
# Split the dataset
train_data, test_data, train_label, test_label = train_test_split(x, y, random_state=10)
def sigmoid(t):
return 1 / (1 + np.exp(-t))
# Define the loss function (cross-entropy)
def J(theta, Xb, y):
y_hat = sigmoid(Xb.dot(theta))
return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)
# Define the derivative (gradient) of the loss function
def dj(theta, Xb, y):
y_hat = sigmoid(Xb.dot(theta))
return (Xb.T.dot(y_hat - y)) / len(y)
# Gradient descent
def gradient_descent(Xb, y, initial_theta, eta, epsilon=1e-8):
theta = initial_theta
while True:
gradient = dj(theta, Xb, y)
last_theta = theta
theta = theta - eta * gradient
if (abs(J(theta, Xb, y) - J(last_theta, Xb, y)) < epsilon):
break
return theta
def fit(train_data, train_label):
Xb = np.hstack([np.ones((len(train_data), 1)), train_data])
initial_theta = np.zeros(Xb.shape[1])
eta = 0.001
return gradient_descent(Xb, train_label, initial_theta, eta)
def predict(test_data, theta):
Xb = np.hstack([np.ones((len(test_data), 1)), test_data])
predict_test = sigmoid(Xb.dot(theta))
predict_test = np.array(predict_test >= 0.5, dtype='int')
return predict_test
theta = fit(train_data, train_label)
predict_test = predict(test_data, theta)
# Evaluate the classifier
accuracy_score(test_label, predict_test)
# Plot the decision boundary
def plot_decision_boundary(axis):
x0, x1 = np.meshgrid(
np.linspace(axis[0], axis[1], int(axis[1] - axis[0]) * 100),
np.linspace(axis[2], axis[3], int(axis[3] - axis[2]) * 100)
)
x_new = np.c_[x0.ravel(), x1.ravel()]
predict_y = predict(x_new, theta)
zz = predict_y.reshape(x0.shape)
custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
plt.contourf(x0, x1, zz, cmap=custom_cmap)
plot_decision_boundary(axis=[-4, 4, -4, 4])
plt.scatter(x[y == 0, 0], x[y == 0, 1], color='r')
plt.scatter(x[y == 1, 0], x[y == 1, 1], color='b')
plt.show()
# Polynomial logistic regression in sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression

poly_log_reg = Pipeline([('poly', PolynomialFeatures(degree=2)),
                         ('std_scaler', StandardScaler()),
                         ('log_reg', LogisticRegression())])
poly_log_reg.fit(train_data, train_label)
poly_log_reg.score(test_data, test_label)
def plot_decision_boundary2(model, axis):
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int(axis[1] - axis[0]) * 100),
        np.linspace(axis[2], axis[3], int(axis[3] - axis[2]) * 100)
    )
    x_new = np.c_[x0.ravel(), x1.ravel()]
    predict_y = model.predict(x_new)
    zz = predict_y.reshape(x0.shape)
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
plot_decision_boundary2(poly_log_reg, axis=[-4, 4, -4, 4])
plt.scatter(x[y == 0, 0], x[y == 0, 1], color='r')
plt.scatter(x[y == 1, 0], x[y == 1, 1], color='b')
plt.show()