PM2.5 Prediction
This post is the third homework of Hung-yi Lee's machine learning course: predicting PM2.5 with linear regression written by hand, without using the sklearn package. To start with the shortcomings and under-thought parts of this code: first, the features receive little preprocessing, with no outlier handling or standardization; second, the model is the simplest first-order linear model, so it may underfit; and third, the gradient descent is not well tuned and does not use Adagrad. There is still much here worth improving and thinking through, but writing the code and derivations by hand deepens one's understanding of machine learning.
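As a minimal sketch of the standardization step noted as missing above (the helper name standardize and the epsilon guard are illustrative, not part of the original code), the training features could be z-scored and the same statistics reused on the test features:

# Hypothetical z-score standardization helper (not in the original code).
# Fit mean/std on the training features, then reuse them on the test features
# so both sets live on the same scale; eps guards against zero-variance columns.
def standardize(x, mean=None, std=None, eps=1e-8):
    if mean is None:
        mean = x.mean(axis=0)
        std = x.std(axis=0)
    return (x - mean) / (std + eps), mean, std

Standardized features would also make a single global learning rate much easier to choose, which matters for the divergence issue discussed below.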
# Import the required packages
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
# Change the default working directory
import os
os.chdir(r'D:\datawale\week1')
os.getcwd()
# Read the training file
data = pd.read_csv('train.csv')
# Drop the unused column
data.drop(['stations'],inplace=True,axis = 1)
# Inspect the data
data.head()
# Select the PM2.5 rows
data1 = data[data['observation'] == 'PM2.5']
# Drop the first two irrelevant columns
data2 = data1.iloc[:,2:]
# Inspect data2
data2.head()
# Create two lists to collect the data
x_train = []
y_train = []
for i in range(15):
    # The features are 9 consecutive hourly values from each row
    x = data2.iloc[:,i:i+9]
    # Give every window the same column names so they can be concatenated later
    x.columns = np.array(range(9))
    # The 10th hour of each window is the label
    y = data2.iloc[:,i+9]
    # y is a Series, so rename it (rather than setting .columns) for the later concat
    y.name = 0
    # Collect the features and labels
    x_train.append(x)
    y_train.append(y)
# Concatenate the feature windows and labels
x_train_ts = pd.concat(x_train,axis = 0)
y_train_ts = pd.concat(y_train,axis = 0)
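As a quick sanity check (assuming the standard course data with 240 PM2.5 rows of 24 hours each), the window construction can be verified by shape, since each of the 15 window positions contributes one sample per row:

# Expected on the course data: (3600, 9) features and (3600,) labels,
# since 240 rows x 15 window positions = 3600 samples.
print(x_train_ts.shape, y_train_ts.shape)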
x_train_ts
#x_train_ts.describe()
#y_train_ts
# Convert to float for numerical computation
x_train_ts = np.array(x_train_ts,float)
y_train_ts = np.array(y_train_ts,float)
x_train_ts
# Prepend a bias (intercept) column to the features
x_train_ts = np.concatenate((np.ones((x_train_ts.shape[0],1)),x_train_ts),axis = 1)
x_train_ts
# Prepare the test set the same way
data_test = pd.read_csv('test(1).csv',header= None)
data_test.head()
data_new = data_test[data_test[1] == 'PM2.5']
x_test = data_new.iloc[:,2:]
x_test = np.array(x_test,float)
x_test = np.concatenate((np.ones((x_test.shape[0],1)),x_test),axis = 1)
y_data = pd.read_csv('answer.csv',header=0)
y_test = np.array(y_data['value'],float)
y_test
# Note: this loop was correct from the start, but an overly large learning rate
# initially caused the weights to diverge, so no usable w was obtained.
w = np.zeros(len(x_train_ts[0]))
lr = 0.0000000001
# Start with the simplest model: full-batch gradient descent on the squared error
for i in range(100000):
    y_hat = np.dot(x_train_ts,w)
    errors = y_hat - y_train_ts
    # Gradient of the squared-error loss
    delta_w = 2 * np.dot(x_train_ts.transpose(),errors)
    w = w - lr*delta_w  # update the weights
    #loss = 0.5*(errors**2).sum()
    y_mean = np.mean(y_train_ts)
    SSR = ((y_hat - y_train_ts)**2).sum()
    SST = ((y_train_ts - y_mean)**2).sum()
    R_square = 1 - SSR/SST
    if i%10000 == 0:
        print('Iteration %i: R^2 = %.4f'%(i+1,R_square))
        #plt.scatter(i, R_square, color='red')
print(w)
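For reference, the update above follows the gradient of the squared-error loss, delta_w = 2 * X^T (Xw - y). As an optional sanity check (not part of the original assignment), the closed-form least-squares solution can be compared with the weights that gradient descent converges toward:

# Closed-form ordinary least squares via numpy, for comparison only.
# np.linalg.lstsq solves min_w ||Xw - y||^2 directly.
w_ols, _, _, _ = np.linalg.lstsq(x_train_ts, y_train_ts, rcond=None)
print(w_ols)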
'''
# Adagrad gradient descent (for reference)
w = np.zeros(len(x_train_ts[0]))
lr = 10
Iteration = 10000
sum_gra = np.zeros(len(x_train_ts[0]))  # accumulated squared gradients
for i in range(Iteration):
    y_new = np.dot(x_train_ts,w)
    loss = y_new - y_train_ts
    gra = 2*np.dot(x_train_ts.transpose(),loss)  # note: x_train_ts must be transposed
    sum_gra += gra**2
    ada = np.sqrt(sum_gra)
    w = w - lr*gra/ada
    y_mean = np.mean(y_train_ts)
    SSR = ((y_new - y_train_ts)**2).sum()
    SST = ((y_train_ts - y_mean)**2).sum()
    R_square = 1 - SSR/SST
    if i%1000 == 0:
        print('Iteration %i: R^2 = %.4f'%(i+1,R_square))
w
'''
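One detail worth noting in the Adagrad reference block above: np.sqrt(sum_gra) is zero for any coordinate whose gradients have all been zero so far, so an epsilon is commonly added to the denominator. A hedged sketch of a single guarded update step (the function and the eps value are illustrative assumptions, reusing the np import from the top of the script):

def adagrad_step(w, gra, sum_gra, lr=10.0, eps=1e-8):
    # One Adagrad update: accumulate squared gradients, then scale the step
    # per coordinate; eps guards against division by zero on early steps.
    sum_gra = sum_gra + gra**2
    w = w - lr * gra / (np.sqrt(sum_gra) + eps)
    return w, sum_gra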
# Define a test function to check prediction quality
# Scored with R_square on the test set
def test(x_test,y_test):
    y_hat = np.dot(x_test,w)
    y_mean = np.mean(y_test)
    SSR = ((y_hat - y_test)**2).sum()
    SST = ((y_test - y_mean)**2).sum()
    R_square = 1 - SSR/SST
    return R_square
test(x_test,y_test)
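R_square alone hides the absolute error scale, so a small companion metric (an illustrative addition, not in the original code) is the RMSE of the same test predictions:

# RMSE on the test set, reusing the trained weights w.
rmse = np.sqrt(((np.dot(x_test, w) - y_test)**2).mean())
print('Test RMSE: %.4f' % rmse)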