资金流入流出预测 (Purchase and Redemption Forecasting) - Challenge Baseline - Rank 445 / Score 122.17


Introduction
Ant Financial has hundreds of millions of members, and its business involves large cash inflows and outflows every day. With such a huge user base, the pressure on cash management is enormous. Accurately predicting cash inflows and outflows is therefore especially important for minimizing liquidity risk while still keeping day-to-day business running. This competition, titled 资金流入流出预测 (Purchase and Redemption Forecasting), asks participants to use data such as the purchase and redemption records of Yu'e Bao users to predict the daily cash inflow and outflow. For a money-market fund, cash inflow corresponds to purchases and cash outflow to redemptions.

For details, see the competition page: 资金流入流出预测

Approach:
Compute statistical features
Use the dates as time-series features (see the sliding-window sketch right after this list)
Build a 1-D CNN model
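Before the full code, here is a minimal toy sketch (not from the original solution; the numbers are illustrative) of how the daily series is turned into supervised samples: the last `back` days form the CNN input window and the next `forward` days are the regression target. The real code below does the same thing with back=60 and forward=30.

# Toy sliding-window framing (illustrative only; the solution below uses back=60, forward=30)
import numpy as np

series = np.arange(10, dtype=float).reshape(-1, 1)   # 10 days of one daily total
back, forward = 4, 2                                 # look-back window and forecast horizon

X, Y = [], []
for i in range(len(series) - back - forward + 1):
    X.append(series[i:i + back])                     # past `back` days -> model input
    Y.append(series[i + back:i + back + forward])    # next `forward` days -> target
X, Y = np.stack(X), np.stack(Y)
print(X.shape, Y.shape)                              # (5, 4, 1) (5, 2, 1)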


Full code:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import optimizers
from keras.callbacks import EarlyStopping
from keras.layers import Input, Conv1D, MaxPooling1D, Dense, Dropout, Flatten
from keras.models import Model
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

import os

os.chdir(r'E:\项目文件\资金流入流出预测')

user_balance = pd.read_csv('user_balance_table.csv')
# user_profile = pd.read_csv('user_profile_table.csv')

df_tmp = user_balance.groupby('report_date')[['total_purchase_amt', 'total_redeem_amt']].sum()
df_tmp.index = pd.to_datetime(df_tmp.index, format='%Y%m%d')

# holidays and other special dates (yyyymmdd), used as a binary feature
holidays = ('20130813', '20130902', '20131001', '20131111', '20130919', '20131225', '20140101', '20140130', '20140131',
            '20140214', '20140405', '20140501', '20140602', '20140802', '20140901', '20140908')


def create_features(timeindex):
    # calendar features: scaled day-of-month, month and weekday, plus a holiday flag
    n = len(timeindex)
    features = np.zeros((n, 4))
    features[:, 0] = timeindex.day.values / 31
    features[:, 1] = timeindex.month.values / 12
    features[:, 2] = timeindex.weekday.values / 6
    for i in range(n):
        if timeindex[i].strftime('%Y%m%d') in holidays:
            features[i, 3] = 1
    return features


features = create_features(df_tmp.index)
# calendar features for September 2014, the month to be predicted
september = pd.to_datetime(['201409%02d' % i for i in range(1, 31)])
features_sep = create_features(september)

# scale the purchase and redemption series to [0, 1] independently
scaler_pur = MinMaxScaler()
scaler_red = MinMaxScaler()
data_pur = scaler_pur.fit_transform(df_tmp.values[:, 0:1])
data_red = scaler_red.fit_transform(df_tmp.values[:, 1:2])


def create_dataset(data, back, forward=30):
    # sliding window: each sample uses `back` past days as input and the next `forward` days as target
    n_samples = len(data) - back - forward + 1
    X, Y = np.zeros((n_samples, back, data.shape[-1])), np.zeros((n_samples, forward, data.shape[-1]))
    for i in range(n_samples):
        X[i, ...] = data[i:i + back, :]
        Y[i, ...] = data[i + back:i + back + forward, :]
    return X, Y


def build_cnn(X_trn, lr, n_outputs, dropout_rate):
    inputs = Input(X_trn.shape[1:])
    z = Conv1D(64, 14, padding='valid', activation='relu', kernel_initializer='he_uniform')(inputs)
    #     z = MaxPooling1D(2)(z)
    z = Conv1D(128, 7, padding='valid', activation='relu', kernel_initializer='he_uniform')(z)
    z = MaxPooling1D(2)(z)
    z = Conv1D(256, 3, padding='valid', activation='relu', kernel_initializer='he_uniform')(z)
    z = Conv1D(256, 3, padding='valid', activation='relu', kernel_initializer='he_uniform')(z)
    z = MaxPooling1D(2)(z)
    z = Flatten()(z)
    z = Dropout(dropout_rate)(z)
    z = Dense(128, activation='relu', kernel_initializer='he_uniform')(z)
    z = Dropout(dropout_rate)(z)
    z = Dense(84, activation='relu', kernel_initializer='he_uniform')(z)
    outputs = Dense(n_outputs)(z)
    model = Model(inputs=inputs, outputs=outputs)
    adam = optimizers.Adam(learning_rate=lr)  # 'lr' is deprecated in recent Keras versions
    model.compile(loss='mse', optimizer=adam, metrics=['mae'])
    model.summary()
    return model


back = 60
forward = 30
X_pur_data, Y_pur_data = create_dataset(data_pur, back, forward)
X_red_data, Y_red_data = create_dataset(data_red, back, forward)
X_features, Y_features = create_dataset(features, back, forward)
# pad the future calendar features with zeros to length `back` so they can be stacked channel-wise with X
Y_features = np.concatenate((Y_features, np.zeros((Y_features.shape[0], back-forward, Y_features.shape[-1]))), axis=1)
# X_pur, X_red = np.concatenate((X_pur_data, X_features, Y_features), axis=-1), np.concatenate((X_red_data, X_features, Y_features), axis=-1)
# X_pur_trn, X_pur_val, X_red_trn, X_red_val = X_pur[:-forward, ...], X_pur[-1:, ...], X_red[:-forward, ...], X_red[-1:, ...]
# Y_pur_trn, Y_pur_val, Y_red_trn, Y_red_val = Y_pur_data[:-forward, ...], Y_pur_data[-1:, ...], Y_red_data[:-forward, ...], Y_red_data[-1:, ...]
Y_fea_sep = np.concatenate((features_sep, np.zeros((back-forward, features_sep.shape[-1]))), axis=0)
# X_pur_tst = np.concatenate((data_pur[-back:, :], features[-back:, :], Y_fea_sep), axis=-1)[None, ...]
# X_red_tst = np.concatenate((data_red[-back:, :], features[-back:, :], Y_fea_sep), axis=-1)[None, ...]
X = np.concatenate((X_pur_data, X_red_data, X_features, Y_features), axis=-1)
Y = np.concatenate((Y_pur_data, Y_red_data), axis=1)[:, :, 0]  # shape (samples, 2*forward), matching the Dense output
X_trn, X_val, Y_trn, Y_val = X[:-forward, ...], X[-1:, ...], Y[:-forward, ...], Y[-1:, ...]
X_tst = np.concatenate((data_pur[-back:, :], data_red[-back:, :], features[-back:, :], Y_fea_sep), axis=-1)[None, ...]
cnn = build_cnn(X_trn, lr=0.0008, n_outputs=2 * forward, dropout_rate=0.5)
history = cnn.fit(X_trn, Y_trn, batch_size=32, epochs=1000, verbose=2,
                  validation_data=(X_val, Y_val),
                  callbacks=[EarlyStopping(monitor='val_mae', patience=200, restore_best_weights=True)])

plt.figure(figsize=(8, 5))
plt.plot(history.history['mae'], label='train mae')
plt.plot(history.history['val_mae'], label='validation mae')
plt.ylim([0, 0.2])
plt.legend()
plt.show()


def plot_prediction(y_pred, y_true):
    plt.figure(figsize=(16, 4))
    plt.plot(np.squeeze(y_pred), label='prediction')
    plt.plot(np.squeeze(y_true), label='true')
    plt.legend()
    plt.show()
    print('MAE: %.3f' % mean_absolute_error(np.squeeze(y_pred), np.squeeze(y_true)))


pred = cnn.predict(X_val)
plot_prediction(pred, Y_val)

history = cnn.fit(X, Y, batch_size=32, epochs=500, verbose=2,
                  callbacks=[EarlyStopping(monitor='mae', patience=30, restore_best_weights=True)])

plt.figure(figsize=(8, 5))
plt.plot(history.history['mae'], label='train mae')
plt.legend()
plt.show()
print(cnn.evaluate(X, Y, verbose=2))

pred_tst = cnn.predict(X_tst)
# undo the MinMax scaling to recover amounts on the original scale
pur_sep = scaler_pur.inverse_transform(pred_tst[:, :forward].transpose())
red_sep = scaler_red.inverse_transform(pred_tst[:, forward:].transpose())
test_user = pd.DataFrame({'report_date': [20140900 + i for i in range(1, 31)]})
test_user['pur'] = pur_sep.ravel().astype('int')
test_user['red'] = red_sep.ravel().astype('int')
test_user.to_csv('submission.csv', encoding='utf-8', index=False, header=False)
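As a final sanity check (not part of the original post; it assumes the 'submission.csv' written above and that the competition expects a headerless file with date, purchase and redemption columns), a snippet like the following verifies that the file has one row per day of September 2014:

# quick check of the generated submission file
import pandas as pd

sub = pd.read_csv('submission.csv', header=None, names=['report_date', 'purchase', 'redeem'])
print('rows:', len(sub))                                                  # expect 30
print('date range ok:', sub['report_date'].between(20140901, 20140930).all())
print('negative predictions:', (sub[['purchase', 'redeem']] < 0).sum().sum())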