[Machine Learning] Linear Regression
Linear regression is a widely used statistical analysis method that applies regression analysis from mathematical statistics to determine the quantitative dependence between two or more variables. Its model takes the form y = w'x + e, where the error term e follows a normal distribution with mean 0.
Least Squares
In general, a linear regression can be solved with the least-squares method: for a line y = ax + b, the slope a and the intercept b can be computed in closed form, as shown below.
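Concretely, minimizing the sum of squared residuals \sum_i (y_i - a x_i - b)^2 gives the closed-form solution that the loop below implements:

a = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sum_i (x_i - \bar{x})^2}, \qquad b = \bar{y} - a \bar{x}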
import numpy as np
import matplotlib.pyplot as plt
x = np.array([1, 2, 3, 4, 5])
y = np.array([1, 2, 3, 3, 5])
plt.scatter(x, y)
plt.axis([0, 6, 0, 6])
plt.show()
x_mean = np.mean(x)
y_mean = np.mean(y)
# numerator
num = 0.0
# denominator
d = 0.0
for x_i, y_i in zip(x, y):
num += (x_i - x_mean) * (y_i - y_mean)
d += (x_i - x_mean) ** 2
a = num / d
b = y_mean - a * x_mean
print(a)
# model prediction
predict_y = a * x + b
plt.scatter(x, y)
plt.plot(x, predict_y, color='r')
plt.show()
Vectorized operations can be used to improve computational efficiency:
# vectorized computation of the same numerator and denominator
vector_num = (x - x_mean).dot(y - y_mean)
vector_d = (x - x_mean).dot(x - x_mean)
a = vector_num / vector_d
print(a)
Mean Squared Error
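The two metrics computed below are the mean squared error and its square root, the root mean squared error:

MSE = \frac{1}{m} \sum_{i=1}^{m} (y_i - \hat{y}_i)^2, \qquad RMSE = \sqrt{MSE}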
# mean squared error
mse = np.sum((y - predict_y) ** 2) / len(y)
print(mse)
# root mean squared error
rmse = np.sqrt(np.sum((y - predict_y) ** 2) / len(y))
print(rmse)
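Two further metrics that appear later in this article through sklearn (mean_absolute_error and the R^2 value returned by score) can also be computed by hand. This is a minimal sketch reusing the y and predict_y defined above:
# mean absolute error
mae = np.sum(np.abs(y - predict_y)) / len(y)
# R^2: one minus the ratio of the residual sum of squares to the total sum of squares
r2 = 1 - np.sum((y - predict_y) ** 2) / np.sum((y - np.mean(y)) ** 2)
print(mae, r2)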
Datasets in sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# note: load_boston was deprecated and removed in scikit-learn 1.2; this requires an older version
boston = datasets.load_boston()
# use only the RM feature (average number of rooms per dwelling)
x = boston.data[:, 5]
y = boston.target
plt.scatter(x, y)
plt.show()
# remove outliers (target values capped at 50)
x = x[y < 50]
y = y[y < 50]
plt.scatter(x, y)
plt.show()
x_mean = np.mean(x)
y_mean = np.mean(y)
# vectorized computation
vector_num = (x - x_mean).dot(y - y_mean)
vector_d = (x - x_mean).dot(x - x_mean)
a = vector_num / vector_d
print(a)
b = y_mean - a * x_mean
# model prediction
predict_y = a * x + b
plt.scatter(x, y)
plt.plot(x, predict_y, color='r')
plt.show()
rmse = np.sqrt(np.sum((y - predict_y) ** 2) / len(y))
print('rmse', rmse)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
boston = datasets.load_boston()
x = boston.data[:, 5]
y = boston.target
# train/test split
from sklearn.model_selection import train_test_split
train_data, test_data, train_label, test_label = train_test_split(x, y, test_size=0.2, random_state=666)
# model training: least squares on the training set
x_mean = np.mean(train_data)
y_mean = np.mean(train_label)
# vectorized computation (use the training data only, not the full x and y)
vector_num = (train_data - x_mean).dot(train_label - y_mean)
vector_d = (train_data - x_mean).dot(train_data - x_mean)
a = vector_num / vector_d
print(a)
b = y_mean - a * x_mean
# model prediction on the test set
predict_y = a * test_data + b
rmse = np.sqrt(np.sum((test_label - predict_y) ** 2) / len(test_label))
print('rmse', rmse)
plt.scatter(test_data, test_label)
plt.plot(test_data, predict_y, color='r')
plt.show()
# sklearn regression evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
mse = mean_squared_error(test_label, predict_y)
mae = mean_absolute_error(test_label, predict_y)
print('mse', mse)
print('mae', mae)
Multiple Linear Regression
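The code below fits the model by solving the normal equation directly. With X_b = [1, X] (a column of ones prepended so that the first parameter plays the role of the intercept), the least-squares solution is

\theta = (X_b^T X_b)^{-1} X_b^T y

which is why theta[0] below is the intercept and theta[1:] holds the feature coefficients.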
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
boston = datasets.load_boston()
x = boston.data
y = boston.target
# train/test split
from sklearn.model_selection import train_test_split
train_data, test_data, train_label, test_label = train_test_split(x, y, test_size=0.2, random_state=666)
# prepend a column of ones for the intercept term
xb = np.hstack([np.ones((len(train_data), 1)), train_data])
# normal equation: theta = (xb^T xb)^-1 xb^T y
theta = np.linalg.inv(xb.T.dot(xb)).dot(xb.T).dot(train_label)
# intercept
intercept = theta[0]
# coefficients (slopes)
coef = theta[1:]
# model prediction
xb_test = np.hstack([np.ones((len(test_data), 1)), test_data])
predict_label = xb_test.dot(theta)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
mse = mean_squared_error(test_label, predict_label)
print(mse)
Linear Regression in sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
boston = datasets.load_boston()
x = boston.data
y = boston.target
# train/test split
from sklearn.model_selection import train_test_split
train_data, test_data, train_label, test_label = train_test_split(x, y, test_size=0.2, random_state=666)
from sklearn.linear_model import LinearRegression
lre = LinearRegression()
lre.fit(train_data, train_label)
predict_label = lre.predict(test_data)
# R^2 score on the test set
score = lre.score(test_data, test_label)
print(score)
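As with the manual normal-equation solution, the fitted parameters are exposed on the trained model: lre.intercept_ corresponds to theta[0] and lre.coef_ to theta[1:].
# fitted intercept and coefficient vector of the trained model
print(lre.intercept_)
print(lre.coef_)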
Linear Regression in Spark
That would mean writing the program in Scala: import the packages, train the model, the same familiar workflow again. A roughly equivalent sketch is shown below.
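The Spark MLlib API can also be driven from Python through pyspark. The following is only a rough sketch under stated assumptions: a local Spark session and a hypothetical CSV file housing.csv whose label column is named target (the file name and column names are illustrative, not from the original article).
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder.appName("linear-regression").getOrCreate()

# hypothetical CSV with numeric feature columns plus a "target" label column
df = spark.read.csv("housing.csv", header=True, inferSchema=True)

# assemble all non-label columns into a single feature vector
feature_cols = [c for c in df.columns if c != "target"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(df).select("features", "target")

# train/test split and least-squares fit, mirroring the sklearn flow above
train, test = data.randomSplit([0.8, 0.2], seed=666)
lr = LinearRegression(featuresCol="features", labelCol="target")
model = lr.fit(train)

# predictions on the held-out data
predictions = model.transform(test)
predictions.select("target", "prediction").show(5)
spark.stop()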