逻辑回归
程序员文章站
2023-12-27 10:22:15
...
1. 读取数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
df_X = pd.read_csv('./logistic_x.txt', sep='\ +',header=None, engine='python') #读取X值
ys = pd.read_csv('./logistic_y.txt', sep='\ +',header=None, engine='python') #读取y值
ys = ys.astype(int)
df_X['label'] = ys[0].values #将X按照y值的结果一一打标签
ax = plt.axes()
#在二维图中描绘X点所处位置,直观查看数据点的分布情况
df_X.query('label == 0').plot.scatter(x=0, y=1, ax=ax, color='blue')
df_X.query('label == 1').plot.scatter(x=0, y=1, ax=ax, color='red')
#提取用于学习的数据
Xs = df_X[[0, 1]].values
ys = df_X['label'].values
2. 代码实现
2.1 代码
import numpy as np
from sklearn.metrics import accuracy_score
class LogisticRegression:
def __init__(self):
self.coef_ = None
self.intercept_ = None
self._theta = None
def _sigmoid(self, t):
return 1. / (1. + np.exp(-t))
def fit(self, X_train, y_train, eta=0.01, n_iters=1e4):
assert X_train.shape[0] == y_train.shape[0], \
"the size of X_train must be equal to the size of y_train"
def J(theta, X_b, y):
y_hat = self._sigmoid(X_b.dot(theta))
try:
return - np.sum(y*np.log(y_hat) + (1-y)*np.log(1-y_hat)) / len(y)
except:
return float('inf')
def dJ(theta, X_b, y):
return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(y)
def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
theta = initial_theta
cur_iter = 0
while cur_iter < n_iters:
gradient = dJ(theta, X_b, y)
last_theta = theta
theta = theta - eta * gradient
if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
break
cur_iter += 1
return theta
X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
initial_theta = np.zeros(X_b.shape[1])
self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
self.intercept_ = self._theta[0]
self.coef_ = self._theta[1:]
return self
def predict_proba(self, X_predict):
assert self.intercept_ is not None and self.coef_ is not None, \
"must fit before predict!"
assert X_predict.shape[1] == len(self.coef_), \
"the feature number of X_predict must be equal to X_train"
X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
return self._sigmoid(X_b.dot(self._theta))
def predict(self, X_predict):
assert self.intercept_ is not None and self.coef_ is not None, \
"must fit before predict!"
assert X_predict.shape[1] == len(self.coef_), \
"the feature number of X_predict must be equal to X_train"
proba = self.predict_proba(X_predict)
return np.array(proba >= 0.5, dtype='int')
def score(self, X_test, y_test):
y_predict = self.predict(X_test)
return accuracy_score(y_test, y_predict)
logistic_regression = LogisticRegression() #因为前面已经将截距项的值合并到变量中,此处参数设置不需要截距项
logistic_regression.fit(Xs, ys) #拟合
score = logistic_regression.score(Xs, ys) #结果评价
print("Coefficient: %s" % logistic_regression.coef_)
print("Score: %s" % score)
def x2(x1):
return (-logistic_regression.coef_[0] * x1 - logistic_regression.intercept_) / logistic_regression.coef_[1]
x1_plot = np.linspace(0, 8, 1000)
x2_plot = x2(x1_plot)
ax = plt.axes()
df_X.query('label == 0').plot.scatter(x=0, y=1, ax=ax, color='blue')
df_X.query('label == 1').plot.scatter(x=0, y=1, ax=ax, color='red')
plt.plot(x1_plot, x2_plot)
plt.show()
2.2 sklearn对比
Xs = df_X[[0, 1]].values
Xs = np.hstack([np.ones((Xs.shape[0], 1)), Xs])
ys = df_X['label'].values
from __future__ import print_function
import numpy as np
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(fit_intercept=False) #因为前面已经将截距项的值合并到变量中,此处参数设置不需要截距项
lr.fit(Xs, ys) #拟合
score = lr.score(Xs, ys) #结果评价
print("Coefficient: %s" % lr.coef_)
print("Score: %s" % score)
ax = plt.axes()
df_X.query('label == 0').plot.scatter(x=0, y=1, ax=ax, color='blue')
df_X.query('label == 1').plot.scatter(x=0, y=1, ax=ax, color='red')
_xs = np.array([np.min(Xs[:,1]), np.max(Xs[:,1])])
#将数据以二维图形式描点,并用学习得出的参数结果作为阈值,划分数据区域
_ys = (lr.coef_[0][0] + lr.coef_[0][1] * _xs) / (- lr.coef_[0][2])
plt.plot(_xs, _ys, lw=1)
将自己实现的代码和sklearn进行对比(准确率和绘制图),结果相差无几。