欢迎您访问程序员文章站,本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页

2019全国高校大数据应用创新竞赛Baseline

程序员文章站 2022-04-01 08:41:12
...

网址 : https://ai.futurelab.tv/tournament/2

### 1. 导入需要的工具包并查看数据
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error as mse
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation
from keras.regularizers import l2
import warnings

warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)


def dateparse(dates):
    """Parse ``YYYYMMDD`` date text (a scalar or an array-like) into pandas datetimes."""
    # pd.to_datetime replaces the former pd.datetime.strptime call: the
    # `pd.datetime` alias no longer exists in current pandas releases.
    return pd.to_datetime(dates, format='%Y%m%d')


# Load the training data and index it by date.
# The `date` column is read as text and converted after loading, which avoids
# read_csv's deprecated `date_parser` argument while still producing a
# datetime index named `date`.
df = pd.read_csv('bd2019-weather-prediction-training-20190608.csv',
                 dtype={'date': str})
df['date'] = dateparse(df['date'])
df = df.set_index('date')
df.head(5)
### 2. 定义的一些方法
def getMetrics(y_true, y_pred):
    '''
    Plot predictions against the ground truth, then print R^2 and MSE.

    y_true : observed values
    y_pred : predicted values (drawn in red on top of y_true)
    '''
    # Overlay both series on one wide figure.
    plt.figure(figsize=(18,6))
    plt.plot(y_true)
    plt.plot(y_pred, color='red')
    plt.show()

    # R^2 = 1 - (residual sum of squares) / (total sum of squares)
    truth = np.array(y_true)
    residual_ss = np.sum(np.square(np.array(y_pred) - truth))
    total_ss = np.sum(np.square(truth - np.mean(y_true)))
    r2 = 1 - residual_ss / total_ss
    print('R2 拟合度为: {} , MSE: {}    '.format(r2, mse(y_true, y_pred)))

def getWinDire(wind_dire):
    '''
    Normalise a single raw ``wind_direction`` value.

    * 999999 and 999998 are replaced by the constant 165.691
      (presumably a precomputed fill value for missing readings - TODO confirm).
    * Any other value above 361 is decoded as ``(value % 100 - 1) * 22.5``
      (looks like a 1-based sector code in 22.5-degree steps - TODO confirm
      against the field definition).
    * Every remaining value is returned unchanged.
    '''
    if wind_dire in (999999, 999998):
        return 165.691
    return (wind_dire % 100 - 1) * 22.5 if wind_dire > 361 else wind_dire
# Initial missing-value handling for (temperature, humidity, rain20, rain08);
# could be improved by filling the gaps with Lagrange interpolation.
# Only temperature and humidity are actually treated below.
# NOTE(review): the means ignore every value >= 888889, but only the codes
# 999990 and 999999 are replaced afterwards - any other code in that range
# would stay in the column. Confirm against the data dictionary.
valid_temperature = df.temperature[df.temperature < 888889]
temperature_mean = np.mean(valid_temperature.values)
df['temperature'] = df.temperature.replace([999990, 999999], temperature_mean)

valid_humidity = df.humidity[df.humidity < 888889]
humidity_mean = np.mean(valid_humidity.values)
df['humidity'] = df.humidity.replace([999990, 999999], humidity_mean)
# Pre-process the wind direction according to the field definition.
df['wind_direction'] = df['wind_direction'].apply(getWinDire)
df.head(5)
### 4. Data analysis
dta = df[['temperature', 'humidity', 'station']]
station_ids = dta.station.unique()
#### 4.1 sta_temp_dict / sta_humi_dict hold each station's temperature / humidity readings
# Each value is a single-column DataFrame; keys follow the order in which
# stations first appear in the data.
sta_temp_dict = {
    sta: dta.loc[dta['station'] == sta, ['temperature']] for sta in station_ids
}
sta_humi_dict = {
    sta: dta.loc[dta['station'] == sta, ['humidity']] for sta in station_ids
}
#### 5.1  定义构建滑窗数据集函数
def getMovingWindowData(dict_, window_size=17, th_day=1, train_size=0.7):
    '''
    Build sliding-window samples from every station's series.

    Input:
        dict_       : mapping of station -> object whose ``.values.ravel()``
                      yields that station's readings as a flat sequence
        window_size : number of consecutive readings used as features
        th_day      : how many steps after the window the target reading lies
        train_size  : leading fraction of each station's windows used for training
    Return:
        train_X, train_Y, test_X, test_Y as numpy arrays; for every station the
        trailing (1 - train_size) share of its windows goes to the test set.
    '''
    # A "box" is one feature window plus the steps up to the target reading.
    span = window_size + th_day
    train_X, train_Y, test_X, test_Y = [], [], [], []

    for frame in dict_.values():
        series = list(frame.values.ravel())
        # Number of complete boxes this station can supply (<= 0 means none).
        n_boxes = len(series) - span + 1
        # Boxes starting before this position are used for training.
        cutoff = n_boxes * train_size

        for start in range(n_boxes):
            box = series[start:start + span]
            if start < cutoff:
                xs, ys = train_X, train_Y
            else:
                xs, ys = test_X, test_Y
            xs.append(box[:window_size])
            ys.append(box[-1])

    return np.array(train_X), np.array(train_Y), np.array(test_X), np.array(test_Y)


#### 5.2 Build the dataset and predict
# Next-step (th_day=1) temperature samples; the remaining arguments spell out
# the function's defaults so the window width used for the model is visible here.
train_X, train_Y, test_X, test_Y = getMovingWindowData(
    sta_temp_dict,
    window_size=17,
    th_day=1,
    train_size=0.7,
)

###### 构建ANN,并用train进行训练

def make_model(window_size=17):
    '''
    Build and compile the fully connected regression network.

    Architecture: four 60-unit ReLU layers with L2(0.2) weight regularisation
    (Dropout(0.3) after the 2nd, 3rd and 4th), followed by a single linear
    output unit. Compiled with mean-squared-error loss and the Adam optimizer.

    window_size : number of input features (the width of one sample window)
    return      : the compiled, untrained Sequential model
    '''
    def hidden_layer(**extra):
        # `kernel_initializer` is the Keras 2 name of the old `init` keyword;
        # it matches the Keras 2 `kernel_regularizer` keyword already in use.
        return Dense(60, kernel_initializer="uniform", activation="relu",
                     kernel_regularizer=l2(0.2), **extra)

    model = Sequential()
    # The first hidden layer fixes the input width and has no dropout after it.
    model.add(hidden_layer(input_dim=window_size))
    for _ in range(3):
        model.add(hidden_layer())
        model.add(Dropout(0.3))
    model.add(Dense(1))
    model.add(Activation("linear"))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model
# Build the untrained network; window_size matches the default window width
# used when train_X was built.
model = make_model(window_size=17)
# Training is left disabled here.
# NOTE: `nb_epoch` is the Keras 1 spelling; Keras 2 calls this argument `epochs`.
# model.fit(train_X,train_Y, nb_epoch=66, batch_size=1788, validation_split = .05)
model.summary()
# # Inspect the fit on the training set
# getMetrics(train_Y, model.predict(train_X).reshape(-1))
# # Inspect the performance on the test set
# getMetrics(test_Y, model.predict(test_X).reshape(-1))
#### 5.3 Prediction
##### Load the saved models
# One model file per forecast step: ./model/temp_<k>.h5 and ./model/humi_<k>.h5
# for k = 1..7. Loading them in a loop keeps the seven steps in one place
# instead of fourteen copy-pasted statements.
temp_models = [load_model('./model/temp_{}.h5'.format(day)) for day in range(1, 8)]
humi_models = [load_model('./model/humi_{}.h5'.format(day)) for day in range(1, 8)]

# Keep the original per-step names bound for any code that refers to them.
(model_temp_1, model_temp_2, model_temp_3, model_temp_4,
 model_temp_5, model_temp_6, model_temp_7) = temp_models
(model_humi_1, model_humi_2, model_humi_3, model_humi_4,
 model_humi_5, model_humi_6, model_humi_7) = humi_models

# Predict the temperature for the next 7 steps.
# Output order: station by station, and within a station step 1 through 7.
ann_temp_predicted = []
for station_temp in sta_temp_dict.values():
    # The station's most recent 17 readings, shaped as one input row.
    temp_history = np.reshape(list(station_temp['temperature'].values[-17:]), (1, -1))
    ann_temp_predicted.extend(
        step_model.predict(temp_history)[0][0] for step_model in temp_models
    )

# Predict the humidity for the next 7 steps (same layout as the temperature list).
ann_humi_predicted = []
for station_humi in sta_humi_dict.values():
    humi_history = np.reshape(list(station_humi['humidity'].values[-17:]), (1, -1))
    ann_humi_predicted.extend(
        step_model.predict(humi_history)[0][0] for step_model in humi_models
    )

import csv

# The seven forecast dates, repeated once per station.
forecast_days = [
    '20180101', '20180102', '20180103', '20180104',
    '20180105', '20180106', '20180107',
]
date = forecast_days * len(sta_humi_dict.keys())
# Every station id repeated seven times in a row, so each row pairs one
# station with one forecast date and lines up with the prediction lists.
station = np.repeat(list(sta_humi_dict.keys()), 7)
with open('./np.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(('date','station', 'temperature','humidity'))
    writer.writerows(zip(date, station, ann_temp_predicted, ann_humi_predicted))
相关标签: MyCode