Tianchi O2O Coupon Usage Prediction

Preparation

# imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from datetime import date
# Jupyter magic: render plots inline
%matplotlib inline
# let matplotlib display Chinese labels (SimHei font)
plt.rcParams['font.sans-serif'] = ['SimHei']
# display minus signs correctly
plt.rcParams['axes.unicode_minus'] = False
# keep wide DataFrames on a single line
pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
# pick a plotting theme
from jupyterthemes import jtplot
jtplot.style(theme='onedork')
# show every expression's result in a cell, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# offline training set: 1754884 rows x 7 columns
off_train = pd.read_csv('ccf_offline_stage1_train.csv')
off_train.columns = ['user_id', 'merchant_id', 'coupon_id',
                     'discount_rate', 'distance', 'date_received', 'date']
# offline test set: 113640 rows x 6 columns
off_test = pd.read_csv('ccf_offline_stage1_test_revised.csv')
off_test.columns = ['user_id', 'merchant_id', 'coupon_id',
                    'discount_rate', 'distance', 'date_received']
# online training set: 11429826 rows x 7 columns
on_train = pd.read_csv('ccf_online_stage1_train.csv')
on_train.columns = ['user_id', 'merchant_id', 'action',
                    'coupon_id', 'discount_rate', 'date_received', 'date']
term      source               content
dataset3  table3 (off_test)    the off_test data
dataset2  table2 (off_train)   coupons received between 20160515 and 20160615
dataset1  table2 (off_train)   coupons received between 20160414 and 20160514
feature3  table2 (off_train)   purchases dated 20160315-20160630, or coupons received in 20160315-20160630 that were never used
feature2  table2 (off_train)   purchases dated 20160201-20160514, or coupons received in 20160201-20160514 that were never used
feature1  table2 (off_train)   purchases dated 20160101-20160413, or coupons received in 20160101-20160413 that were never used

set             prediction window (labels extracted)   feature window (features extracted)
-               rows where a coupon was received       purchases + coupons received but not used
test set        dataset3                               feature3
training set 1  dataset2                               feature2
training set 2  dataset1                               feature1

Use isnull() to find NaN values here; comparisons such as == 'null' or == np.nan do not work.
Because the two date columns were not converted to datetime when reading the CSVs, plain integers such as 20160315 can be used on the right-hand side of the comparisons; if they had been converted, the right-hand side would need pd.to_datetime as well.
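A minimal illustration of both points, using a throwaway two-row frame (hypothetical data, not from the competition files):

# == np.nan never matches anything, so it cannot find missing values
demo = pd.DataFrame({'date': [20160320.0, np.nan]})
(demo.date == np.nan).sum()   # 0 -- the NaN row is missed
demo.date.isnull().sum()      # 1 -- isnull() finds it
# yyyymmdd values stored as numbers compare correctly against plain integers
((demo.date >= 20160315) & (demo.date <= 20160630)).sum()   # 1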

dataset3 = off_test
feature3 = off_train[((off_train.date >= 20160315) & (off_train.date <= 20160630)) | (
    (off_train.date.isnull()) & (off_train.date_received >= 20160315) & (off_train.date_received <= 20160630))]
dataset2 = off_train[(off_train.date_received >= 20160515)
                     & (off_train.date_received <= 20160615)]
feature2 = off_train[(off_train.date >= 20160201) & (off_train.date <= 20160514) | (
    (off_train.date.isnull()) & (off_train.date_received >= 20160201) & (off_train.date_received <= 20160514))]
dataset1 = off_train[(off_train.date_received >= 20160414)
                     & (off_train.date_received <= 20160514)]
feature1 = off_train[(off_train.date >= 20160101) & (off_train.date <= 20160413) | (
    (off_train.date.isnull()) & (off_train.date_received >= 20160101) & (off_train.date_received <= 20160413))]
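A quick sanity check on the six windows (a sketch; the printed shapes and date ranges depend on the raw files):

for name, d in [('dataset1', dataset1), ('dataset2', dataset2), ('dataset3', dataset3),
                ('feature1', feature1), ('feature2', feature2), ('feature3', feature3)]:
    print(name, d.shape, d.date_received.min(), d.date_received.max())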

Other features

column  meaning
this_month_user_receive_all_coupon_count      total number of coupons the user received in the window
this_month_user_receive_same_coupon_count     number of times the user received this row's coupon
this_month_user_receive_same_coupon_firstone  whether this row is the first receipt of a coupon the user received more than once (converted to a 1/0/-1 flag)
this_month_user_receive_same_coupon_lastone   whether this row is the last receipt of a coupon the user received more than once (converted to a 1/0/-1 flag)
this_day_user_receive_all_coupon_count        number of coupons the user received on this row's receipt date
this_day_user_receive_same_coupon_count       number of this coupon the user received on that date

dataset3

t = dataset3[['user_id']].copy()
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby(['user_id'], as_index=False).count()
t.head(2)
user_id this_month_user_receive_all_coupon_count
0 209 2
1 215 1
t1 = dataset3[['user_id', 'coupon_id']].copy()
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['user_id', 'coupon_id'], as_index=False).count()
t1.head(2)
user_id coupon_id this_month_user_receive_same_coupon_count
0 209 825 1
1 209 7557 1
# first and last date on which each user received a given coupon (for coupons received more than once)
t2 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
t2.date_received = t2.date_received.astype('str')
# agg(lambda x: ':'.join(x)) joins the receipt dates with a ':' separator; row 6 of the output below is an example
t2 = t2.groupby(['user_id', 'coupon_id'], as_index=False)[
    'date_received'].agg(lambda x: ':'.join(x))
t2.iloc[5:7]
t2['receive_number'] = t2.date_received.apply(lambda s: len(s.split(':')))
t2 = t2[t2.receive_number > 1]
t2['max_date_received'] = t2.date_received.apply(
    lambda s: max([int(d) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(
    lambda s: min([int(d) for d in s.split(':')]))
t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]
t2.head(2)
user_id coupon_id date_received
5 432 10438 20160706
6 448 10927 20160710:20160707:20160706:20160704
user_id coupon_id max_date_received min_date_received
6 448 10927 20160710 20160704
13 736 3686 20160724 20160713
t3 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
t3 = pd.merge(t3, t2, how='left', on=['user_id', 'coupon_id'])
t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - \
    t3.date_received
t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received - \
    t3.min_date_received


def is_firstlastone(x):
    if x == 0:
        return 1
    elif x > 0:
        return 0
    else:
        return -1  # those only receive once


# this_month_user_receive_same_coupon_lastone == 1 means this row is the user's last receipt of this coupon
t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(
    is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(
    is_firstlastone)
t3 = t3[['user_id', 'coupon_id', 'date_received', 'this_month_user_receive_same_coupon_lastone',
         'this_month_user_receive_same_coupon_firstone']]
t3.head(2)
user_id coupon_id date_received this_month_user_receive_same_coupon_lastone this_month_user_receive_same_coupon_firstone
0 4129537 9983 20160712 -1 -1
1 6949378 3429 20160706 -1 -1
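For intuition, is_firstlastone maps the date differences computed above onto three labels; a toy call with hand-made values (not competition data):

pd.Series([0.0, 3.0, np.nan]).apply(is_firstlastone)
# 0    1   difference of 0: this receipt is the first/last one
# 1    0   a positive gap: another receipt is earlier/later than this one
# 2   -1   NaN: the coupon was received only once, so no max/min was merged in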
# this_day_user_receive_all_coupon_count: number of coupons the user received on that day
t4 = dataset3[['user_id', 'date_received']].copy()
t4['this_day_user_receive_all_coupon_count'] = 1
t4 = t4.groupby(['user_id', 'date_received'], as_index=False).count()
t4.head(2)
user_id date_received this_day_user_receive_all_coupon_count
0 209 20160721 2
1 215 20160703 1
# this_day_user_receive_same_coupon_count: number of this coupon the user received on that day
t5 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
t5['this_day_user_receive_same_coupon_count'] = 1
t5 = t5.groupby(['user_id', 'coupon_id', 'date_received'],
                as_index=False).count()
t5.head(2)
user_id coupon_id date_received this_day_user_receive_same_coupon_count
0 209 825 20160721 1
1 209 7557 20160721 1
'''t6 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
t6.date_received = t6.date_received.astype('str')
t6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[
    'date_received'].agg(lambda x: ':'.join(x))
t6.rename(columns={'date_received': 'dates'}, inplace=True)
t6.head(2)'''
"t6 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()\nt6.date_received = t6.date_received.astype('str')\nt6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[\n    'date_received'].agg(lambda x: ':'.join(x))\nt6.rename(columns={'date_received': 'dates'}, inplace=True)\nt6.head(2)"
'''# days to the nearest earlier / later receipt of the same coupon
def get_day_gap_before(s):
    date_received, dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(
            date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    if len(gaps) == 0:
        return -1
    else:
        return min(gaps)


def get_day_gap_after(s):
    date_received, dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(
            int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    if len(gaps) == 0:
        return -1
    else:
        return min(gaps)


t7 = dataset3[['user_id', 'coupon_id', 'date_received']]
t7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')
t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id', 'coupon_id', 'date_received',
         'day_gap_before', 'day_gap_after']]
t7.head()'''
"# 前、后一张券的时间间隔\ndef get_day_gap_before(s):\n    date_received, dates = s.split('-')\n    dates = dates.split(':')\n    gaps = []\n    for d in dates:\n        this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(\n            date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days\n        if this_gap > 0:\n            gaps.append(this_gap)\n    if len(gaps) == 0:\n        return -1\n    else:\n        return min(gaps)\n\n\ndef get_day_gap_after(s):\n    date_received, dates = s.split('-')\n    dates = dates.split(':')\n    gaps = []\n    for d in dates:\n        this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(\n            int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days\n        if this_gap > 0:\n            gaps.append(this_gap)\n    if len(gaps) == 0:\n        return -1\n    else:\n        return min(gaps)\n\n\nt7 = dataset3[['user_id', 'coupon_id', 'date_received']]\nt7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')\nt7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates\nt7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)\nt7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)\nt7 = t7[['user_id', 'coupon_id', 'date_received',\n         'day_gap_before', 'day_gap_after']]\nt7.head()"
other_feature3 = pd.merge(t1, t, on='user_id')
other_feature3 = pd.merge(other_feature3, t3, on=['user_id', 'coupon_id'])
other_feature3 = pd.merge(other_feature3, t4, on=['user_id', 'date_received'])
other_feature3 = pd.merge(other_feature3, t5, on=[
                          'user_id', 'coupon_id', 'date_received'])
other_feature3.to_csv('other_feature3.csv', index=None)
other_feature3.shape
(113640, 9)

dataset2

t = dataset2[['user_id']].copy()
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('user_id', as_index=False).count()
t.head(2)
user_id this_month_user_receive_all_coupon_count
0 4 1
1 165 1
t1 = dataset2[['user_id', 'coupon_id']].copy()
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['user_id', 'coupon_id'], as_index=False).count()
t1.head(2)
user_id coupon_id this_month_user_receive_same_coupon_count
0 4 2902.0 1
1 165 7571.0 1
t2 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t2.date_received = t2.date_received.astype('int').astype('str')
t2 = t2.groupby(['user_id', 'coupon_id'], as_index=False)[
    'date_received'].agg(lambda x: ':'.join(x))
t2['receive_number'] = t2.date_received.apply(lambda s: len(s.split(':')))
t2 = t2[t2.receive_number > 1]
t2['max_date_received'] = t2.date_received.apply(
    lambda s: max([int(d) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(
    lambda s: min([int(d) for d in s.split(':')]))
t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]
t2.head(2)
user_id coupon_id max_date_received min_date_received
18 696 7571.0 20160524 20160520
31 947 12414.0 20160610 20160609
t3 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t3 = pd.merge(t3, t2, on=['user_id', 'coupon_id'], how='left')
t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - \
    t3.date_received.astype('int')
t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype(
    'int') - t3.min_date_received


def is_firstlastone(x):
    if x == 0:
        return 1
    elif x > 0:
        return 0
    else:
        return -1  # those only receive once


t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(
    is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(
    is_firstlastone)
t3 = t3[['user_id', 'coupon_id', 'date_received', 'this_month_user_receive_same_coupon_lastone',
         'this_month_user_receive_same_coupon_firstone']]
t3.head(2)
user_id coupon_id date_received this_month_user_receive_same_coupon_lastone this_month_user_receive_same_coupon_firstone
0 1439408 11002.0 20160528.0 -1 -1
1 1439408 8591.0 20160613.0 1 0
t4 = dataset2[['user_id', 'date_received']].copy()
t4['this_day_user_receive_all_coupon_count'] = 1
t4 = t4.groupby(['user_id', 'date_received'], as_index=False).count()
t4.head(2)
user_id date_received this_day_user_receive_all_coupon_count
0 4 20160607.0 1
1 165 20160525.0 1
t5 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t5['this_day_user_receive_same_coupon_count'] = 1
t5 = t5.groupby(['user_id', 'coupon_id', 'date_received'],
                as_index=False).count()
t5.head(2)
user_id coupon_id date_received this_day_user_receive_same_coupon_count
0 4 2902.0 20160607.0 1
1 165 7571.0 20160525.0 1
'''t6 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t6.date_received = t6.date_received.astype('int').astype('str')
t6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[
    'date_received'].agg(lambda x: ':'.join(x))
t6.rename(columns={'date_received': 'dates'}, inplace=True)
t6.head(2)'''
"t6 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()\nt6.date_received = t6.date_received.astype('int').astype('str')\nt6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[\n    'date_received'].agg(lambda x: ':'.join(x))\nt6.rename(columns={'date_received': 'dates'}, inplace=True)\nt6.head(2)"
'''def get_day_gap_before(s):
    date_received, dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(
            date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    if len(gaps) == 0:
        return -1
    else:
        return min(gaps)


def get_day_gap_after(s):
    date_received, dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(
            int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    if len(gaps) == 0:
        return -1
    else:
        return min(gaps)


t7 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')
t7['date_received_date'] = t7.date_received.astype(
    'int').astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id', 'coupon_id', 'date_received',
         'day_gap_before', 'day_gap_after']]
t7.head(2)'''
"def get_day_gap_before(s):\n    date_received, dates = s.split('-')\n    dates = dates.split(':')\n    gaps = []\n    for d in dates:\n        this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(\n            date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days\n        if this_gap > 0:\n            gaps.append(this_gap)\n    if len(gaps) == 0:\n        return -1\n    else:\n        return min(gaps)\n\n\ndef get_day_gap_after(s):\n    date_received, dates = s.split('-')\n    dates = dates.split(':')\n    gaps = []\n    for d in dates:\n        this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(\n            int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days\n        if this_gap > 0:\n            gaps.append(this_gap)\n    if len(gaps) == 0:\n        return -1\n    else:\n        return min(gaps)\n\n\nt7 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()\nt7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')\nt7['date_received_date'] = t7.date_received.astype(\n    'int').astype('str') + '-' + t7.dates\nt7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)\nt7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)\nt7 = t7[['user_id', 'coupon_id', 'date_received',\n         'day_gap_before', 'day_gap_after']]\nt7.head(2)"
other_feature2 = pd.merge(t1, t, on='user_id')
other_feature2 = pd.merge(other_feature2, t3, on=['user_id', 'coupon_id'])
other_feature2 = pd.merge(other_feature2, t4, on=['user_id', 'date_received'])
other_feature2 = pd.merge(other_feature2, t5, on=[
                          'user_id', 'coupon_id', 'date_received'])
other_feature2.to_csv('other_feature2.csv', index=None)
other_feature2.head()
user_id coupon_id this_month_user_receive_same_coupon_count this_month_user_receive_all_coupon_count date_received this_month_user_receive_same_coupon_lastone this_month_user_receive_same_coupon_firstone this_day_user_receive_all_coupon_count this_day_user_receive_same_coupon_count
0 4 2902.0 1 1 20160607.0 -1 -1 1 1
1 165 7571.0 1 1 20160525.0 -1 -1 1 1
2 166 9261.0 1 1 20160525.0 -1 -1 1 1
3 215 8944.0 1 1 20160524.0 -1 -1 1 1
4 236 11002.0 1 1 20160528.0 -1 -1 1 1
other_feature2.shape
(258446, 9)
dataset1.head()
user_id merchant_id coupon_id discount_rate distance date_received date
7 1832624 3381 7610.0 200:20 0.0 20160429.0 NaN
18 163606 1569 5054.0 200:30 10.0 20160421.0 NaN
43 4061024 3381 7610.0 200:20 10.0 20160426.0 NaN
52 106443 450 3732.0 30:5 NaN 20160429.0 NaN
96 114747 1569 5054.0 200:30 9.0 20160426.0 NaN
# for dataset1
t = dataset1[['user_id']].copy()
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('user_id', as_index=False).count()
t.head(2)
user_id this_month_user_receive_all_coupon_count
0 173 1
1 285 1
t1 = dataset1[['user_id', 'coupon_id']].copy()
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['user_id', 'coupon_id'], as_index=False).count()
t1.head(2)
user_id coupon_id this_month_user_receive_same_coupon_count
0 173 7610.0 1
1 285 1532.0 1
t2 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t2.date_received = t2.date_received.astype('int').astype('str')
t2 = t2.groupby(['user_id', 'coupon_id'], as_index=False)[
    'date_received'].agg(lambda x: ':'.join(x))
t2['receive_number'] = t2.date_received.apply(lambda s: len(s.split(':')))
t2 = t2[t2.receive_number > 1]
t2['max_date_received'] = t2.date_received.apply(
    lambda s: max([int(d) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(
    lambda s: min([int(d) for d in s.split(':')]))
t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]
t2.head(2)
user_id coupon_id max_date_received min_date_received
38 2227 7088.0 20160511 20160426
147 8397 8556.0 20160506 20160425
t3 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t3 = pd.merge(t3, t2, on=['user_id', 'coupon_id'], how='left')
t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - \
    t3.date_received.astype('int')
t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype(
    'int') - t3.min_date_received


def is_firstlastone(x):
    if x == 0:
        return 1
    elif x > 0:
        return 0
    else:
        return -1  # those only receive once


t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(
    is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(
    is_firstlastone)
t3 = t3[['user_id', 'coupon_id', 'date_received', 'this_month_user_receive_same_coupon_lastone',
         'this_month_user_receive_same_coupon_firstone']]
t3.head(2)
user_id coupon_id date_received this_month_user_receive_same_coupon_lastone this_month_user_receive_same_coupon_firstone
0 1832624 7610.0 20160429.0 -1 -1
1 163606 5054.0 20160421.0 -1 -1
t4 = dataset1[['user_id', 'date_received']].copy()
t4['this_day_user_receive_all_coupon_count'] = 1
t4 = t4.groupby(['user_id', 'date_received'], as_index=False).count()
t4.head(2)
user_id date_received this_day_user_receive_all_coupon_count
0 173 20160414.0 1
1 285 20160501.0 1
t5 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t5['this_day_user_receive_same_coupon_count'] = 1
t5 = t5.groupby(['user_id', 'coupon_id', 'date_received'],
                as_index=False).count()
t5.head(2)
user_id coupon_id date_received this_day_user_receive_same_coupon_count
0 173 7610.0 20160414.0 1
1 285 1532.0 20160501.0 1
'''t6 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t6.date_received = t6.date_received.astype('int').astype('str')
t6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[
    'date_received'].agg(lambda x: ':'.join(x))
t6.rename(columns={'date_received': 'dates'}, inplace=True)
t6.head(2)'''
"t6 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()\nt6.date_received = t6.date_received.astype('int').astype('str')\nt6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[\n    'date_received'].agg(lambda x: ':'.join(x))\nt6.rename(columns={'date_received': 'dates'}, inplace=True)\nt6.head(2)"
'''def get_day_gap_before(s):
    date_received, dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(
            date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    if len(gaps) == 0:
        return -1
    else:
        return min(gaps)


def get_day_gap_after(s):
    date_received, dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(
            int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    if len(gaps) == 0:
        return -1
    else:
        return min(gaps)


t7 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')
t7['date_received_date'] = t7.date_received.astype(
    'int').astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id', 'coupon_id', 'date_received',
         'day_gap_before', 'day_gap_after']]
t7.head(2)'''
"def get_day_gap_before(s):\n    date_received, dates = s.split('-')\n    dates = dates.split(':')\n    gaps = []\n    for d in dates:\n        this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(\n            date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days\n        if this_gap > 0:\n            gaps.append(this_gap)\n    if len(gaps) == 0:\n        return -1\n    else:\n        return min(gaps)\n\n\ndef get_day_gap_after(s):\n    date_received, dates = s.split('-')\n    dates = dates.split(':')\n    gaps = []\n    for d in dates:\n        this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(\n            int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days\n        if this_gap > 0:\n            gaps.append(this_gap)\n    if len(gaps) == 0:\n        return -1\n    else:\n        return min(gaps)\n\n\nt7 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()\nt7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')\nt7['date_received_date'] = t7.date_received.astype(\n    'int').astype('str') + '-' + t7.dates\nt7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)\nt7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)\nt7 = t7[['user_id', 'coupon_id', 'date_received',\n         'day_gap_before', 'day_gap_after']]\nt7.head(2)"
other_feature1 = pd.merge(t1, t, on='user_id')
other_feature1 = pd.merge(other_feature1, t3, on=['user_id', 'coupon_id'])
other_feature1 = pd.merge(other_feature1, t4, on=['user_id', 'date_received'])
other_feature1 = pd.merge(other_feature1, t5, on=[
                          'user_id', 'coupon_id', 'date_received'])
other_feature1.to_csv('other_feature1.csv', index=None)
other_feature1.head()
user_id coupon_id this_month_user_receive_same_coupon_count this_month_user_receive_all_coupon_count date_received this_month_user_receive_same_coupon_lastone this_month_user_receive_same_coupon_firstone this_day_user_receive_all_coupon_count this_day_user_receive_same_coupon_count
0 173 7610.0 1 1 20160414.0 -1 -1 1 1
1 285 1532.0 1 1 20160501.0 -1 -1 1 1
2 316 8952.0 1 1 20160430.0 -1 -1 1 1
3 377 2857.0 1 1 20160512.0 -1 -1 1 1
4 387 7610.0 1 1 20160421.0 -1 -1 1 1
other_feature1.shape
(137167, 9)

Coupon-related features

column  meaning
discount_rate   coupon discount rate; an "x:y" full-reduction offer (e.g. spend 10, get 1 off) and a plain rate 0.9 are both normalized to 0.9
distance        distance from user to store
day_of_week     day of the week the coupon was received (Monday=1 ... Sunday=7)
day_of_month    day of the month the coupon was received
days_distance   days between the receipt date and 2016-06-30 (dataset2 and dataset1 use their own reference dates)
this_day_user_receive_same_coupon_count  number of this coupon the user received that day
discount_man    the "spend x" threshold of an x:y offer
discount_jian   the "y off" amount of an x:y offer
is_man_jian     whether the coupon is a full-reduction (x:y) coupon
coupon_count    number of times the coupon appears
def calc_discount_rate(s):
    s = str(s)
    s = s.split(':')
    if len(s) == 1:
        return float(s[0])
    else:
        return 1.0-float(s[1])/float(s[0])


def get_discount_man(s):
    s = str(s)
    s = s.split(':')
    if len(s) == 1:
        return 0
    else:
        return int(s[0])


def get_discount_jian(s):
    s = str(s)
    s = s.split(':')
    if len(s) == 1:
        return 0
    else:
        return int(s[1])


def is_man_jian(s):
    s = str(s)
    s = s.split(':')
    if len(s) == 1:
        return 0
    else:
        return 1
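
A few sample calls showing how the four helpers normalize the discount_rate field:

calc_discount_rate('200:20')   # 0.9  -> spend 200, get 20 off
calc_discount_rate('0.95')     # 0.95 -> already a rate, kept as-is
get_discount_man('200:20')     # 200
get_discount_jian('200:20')    # 20
is_man_jian('0.95')            # 0 -> not a full-reduction coupon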

dataset3

dataset3['day_of_week'] = dataset3.date_received.astype('int').astype('str').apply(
    lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday()+1)
dataset3['day_of_month'] = dataset3.date_received.astype('int').astype(
    'str').apply(lambda x: int(x[6:8]))
dataset3['days_distance'] = dataset3.date_received.astype('int').astype('str').apply(
    lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))-date(2016, 6, 30)).days)
dataset3['discount_man'] = dataset3.discount_rate.apply(get_discount_man)
dataset3['discount_jian'] = dataset3.discount_rate.apply(get_discount_jian)
dataset3['is_man_jian'] = dataset3.discount_rate.apply(is_man_jian)
dataset3['discount_rate'] = dataset3.discount_rate.apply(calc_discount_rate)
dataset3.head()
user_id merchant_id coupon_id discount_rate distance date_received day_of_week day_of_month days_distance discount_man discount_jian is_man_jian
0 4129537 450 9983 0.833333 1.0 20160712 2 12 12 30 5 1
1 6949378 1300 3429 0.833333 NaN 20160706 3 6 6 30 5 1
2 2166529 7113 6928 0.900000 5.0 20160727 3 27 27 200 20 1
3 2166529 7113 1808 0.900000 5.0 20160727 3 27 27 100 10 1
4 6172162 7605 6500 0.966667 2.0 20160708 5 8 8 30 1 1
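As a spot check against the first output row (coupon received on 20160712):

date(2016, 7, 12).weekday() + 1                 # 2 -> 20160712 was a Tuesday
(date(2016, 7, 12) - date(2016, 6, 30)).days    # 12, matching days_distance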
d = dataset3[['coupon_id']].copy()
d['coupon_count'] = 1
d = d.groupby('coupon_id', as_index=False).count()
d.head(2)
coupon_id coupon_count
0 3 16
1 7 1
dataset3 = pd.merge(dataset3, d, on='coupon_id', how='left')
dataset3.to_csv('coupon3_feature.csv', index=None)
dataset3.head()
user_id merchant_id coupon_id discount_rate distance date_received day_of_week day_of_month days_distance discount_man discount_jian is_man_jian coupon_count
0 4129537 450 9983 0.833333 1.0 20160712 2 12 12 30 5 1 11586
1 6949378 1300 3429 0.833333 NaN 20160706 3 6 6 30 5 1 4887
2 2166529 7113 6928 0.900000 5.0 20160727 3 27 27 200 20 1 205
3 2166529 7113 1808 0.900000 5.0 20160727 3 27 27 100 10 1 205
4 6172162 7605 6500 0.966667 2.0 20160708 5 8 8 30 1 1 132
dataset2.head()
user_id merchant_id coupon_id discount_rate distance date_received date
1 1439408 4663 11002.0 150:20 1.0 20160528.0 NaN
4 1439408 2632 8591.0 20:1 0.0 20160613.0 NaN
6 1439408 2632 8591.0 20:1 0.0 20160516.0 20160613.0
9 2029232 450 1532.0 30:5 0.0 20160530.0 NaN
10 2029232 6459 12737.0 20:1 0.0 20160519.0 NaN

dataset2

dataset2['day_of_week'] = dataset2['date_received'].copy().astype('str').apply(
    lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday()+1)

dataset2['day_of_month'] = dataset2.date_received.copy().astype(
    'str').apply(lambda x: int(x[6:8]))
dataset2['days_distance'] = dataset2.date_received.copy().astype('str').apply(
    lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))-date(2016, 5, 14)).days)
dataset2['discount_man'] = dataset2.discount_rate.apply(get_discount_man)
dataset2['discount_jian'] = dataset2.discount_rate.apply(get_discount_jian)
dataset2['is_man_jian'] = dataset2.discount_rate.apply(is_man_jian)
dataset2['discount_rate'] = dataset2.discount_rate.apply(calc_discount_rate)
dataset2.head()
D:\anaconda3\lib\site-packages\ipykernel_launcher.py: SettingWithCopyWarning (raised once for each column assignment above):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
user_id merchant_id coupon_id discount_rate distance date_received date day_of_week day_of_month days_distance discount_man discount_jian is_man_jian
1 1439408 4663 11002.0 0.866667 1.0 20160528.0 NaN 6 28 14 150 20 1
4 1439408 2632 8591.0 0.950000 0.0 20160613.0 NaN 1 13 30 20 1 1
6 1439408 2632 8591.0 0.950000 0.0 20160516.0 20160613.0 1 16 2 20 1 1
9 2029232 450 1532.0 0.833333 0.0 20160530.0 NaN 1 30 16 30 5 1
10 2029232 6459 12737.0 0.950000 0.0 20160519.0 NaN 4 19 5 20 1 1
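One way to avoid the SettingWithCopyWarning above (a sketch, not part of the original run) is to take an explicit copy when the slice is first created, so the later column assignments work on an independent frame:

# dataset2 is a boolean-filtered slice of off_train; pandas cannot tell whether
# assignments modify a view or a copy, and .copy() removes that ambiguity
dataset2 = off_train[(off_train.date_received >= 20160515)
                     & (off_train.date_received <= 20160615)].copy()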
d = dataset2[['coupon_id']].copy()
d['coupon_count'] = 1
d = d.groupby('coupon_id', as_index=False).count()
d.head(2)
coupon_id coupon_count
0 1.0 4
1 3.0 12
dataset2 = pd.merge(dataset2, d, on='coupon_id', how='left')
dataset2.to_csv('coupon2_feature.csv', index=None)
dataset2.head()
user_id merchant_id coupon_id discount_rate distance date_received date day_of_week day_of_month days_distance discount_man discount_jian is_man_jian coupon_count
0 1439408 4663 11002.0 0.866667 1.0 20160528.0 NaN 6 28 14 150 20 1 7730
1 1439408 2632 8591.0 0.950000 0.0 20160613.0 NaN 1 13 30 20 1 1 5
2 1439408 2632 8591.0 0.950000 0.0 20160516.0 20160613.0 1 16 2 20 1 1 5
3 2029232 450 1532.0 0.833333 0.0 20160530.0 NaN 1 30 16 30 5 1 11728
4 2029232 6459 12737.0 0.950000 0.0 20160519.0 NaN 4 19 5 20 1 1 16

dataset1

dataset1['day_of_week'] = dataset1.date_received.copy().astype(
    'str').apply(lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday()+1)
dataset1['day_of_month'] = dataset1.date_received.copy().astype(
    'str').apply(lambda x: int(x[6:8]))
dataset1['days_distance'] = dataset1.date_received.copy().astype('str').apply(
    lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))-date(2016, 4, 13)).days)
dataset1['discount_man'] = dataset1.discount_rate.apply(get_discount_man)
dataset1['discount_jian'] = dataset1.discount_rate.apply(get_discount_jian)
dataset1['is_man_jian'] = dataset1.discount_rate.apply(is_man_jian)
dataset1['discount_rate'] = dataset1.discount_rate.apply(calc_discount_rate)
dataset1.head()
D:\anaconda3\lib\site-packages\ipykernel_launcher.py: SettingWithCopyWarning (raised once for each column assignment above):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
user_id merchant_id coupon_id discount_rate distance date_received date day_of_week day_of_month days_distance discount_man discount_jian is_man_jian
7 1832624 3381 7610.0 0.900000 0.0 20160429.0 NaN 5 29 16 200 20 1
18 163606 1569 5054.0 0.850000 10.0 20160421.0 NaN 4 21 8 200 30 1
43 4061024 3381 7610.0 0.900000 10.0 20160426.0 NaN 2 26 13 200 20 1
52 106443 450 3732.0 0.833333 NaN 20160429.0 NaN 5 29 16 30 5 1
96 114747 1569 5054.0 0.850000 9.0 20160426.0 NaN 2 26 13 200 30 1
d = dataset1[['coupon_id']].copy()
d['coupon_count'] = 1
d = d.groupby('coupon_id', as_index=False).count()
d.head(2)
coupon_id coupon_count
0 1.0 1
1 2.0 2
dataset1 = pd.merge(dataset1, d, on='coupon_id', how='left')
dataset1.to_csv('coupon1_feature.csv', index=None)
dataset1.head(2)
user_id merchant_id coupon_id discount_rate distance date_received date day_of_week day_of_month days_distance discount_man discount_jian is_man_jian coupon_count
0 1832624 3381 7610.0 0.90 0.0 20160429.0 NaN 5 29 16 200 20 1 22281
1 163606 1569 5054.0 0.85 10.0 20160421.0 NaN 4 21 8 200 30 1 11893

Merchant-related features

column  meaning
total_sales                    number of purchases at the merchant
sales_use_coupon               number of purchases at the merchant made after receiving a coupon
total_coupon                   number of coupons received from the merchant
coupon_rate                    sales_use_coupon / total_sales
merchant_coupon_transfer_rate  sales_use_coupon / total_coupon
merchant_min_distance          minimum user-merchant distance among coupon-redeemed purchases
merchant_max_distance          maximum distance
merchant_mean_distance         mean distance
merchant_median_distance       median distance

dataset3

merchant3 = feature3[['merchant_id', 'coupon_id',
                      'distance', 'date_received', 'date']].copy()
t = merchant3[['merchant_id']].copy()
t.drop_duplicates(inplace=True)
t.head(2)
merchant_id
1 4663
3 2632
t1 = merchant3[merchant3.date.notnull()][['merchant_id']].copy()
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id', as_index=False).count()
t1.head(2)
merchant_id total_sales
0 1 10
1 2 4
t2 = merchant3[(merchant3.date.notnull()) & (
    merchant3.coupon_id.notnull())][['merchant_id']].copy()
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id', as_index=False).count()
t2.head(2)
merchant_id sales_use_coupon
0 3 1
1 4 5
t3 = merchant3[merchant3.coupon_id.notnull()][['merchant_id']].copy()
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id', as_index=False).count()
t3.head(2)
merchant_id total_coupon
0 2 7
1 3 10
t4 = merchant3[(merchant3.date.notnull()) & (
    merchant3.coupon_id.notnull())][['merchant_id', 'distance']].copy()
t4.head(2)
merchant_id distance
6 2632 0.0
33 1361 0.0
t5 = t4.groupby('merchant_id', as_index=False).min()
t5.rename(columns={'distance': 'merchant_min_distance'}, inplace=True)

t6 = t4.groupby('merchant_id', as_index=False).max()
t6.rename(columns={'distance': 'merchant_max_distance'}, inplace=True)

t7 = t4.groupby('merchant_id', as_index=False).mean()
t7.rename(columns={'distance': 'merchant_mean_distance'}, inplace=True)

t8 = t4.groupby('merchant_id', as_index=False).median()
t8.rename(columns={'distance': 'merchant_median_distance'}, inplace=True)
t8.head(2)
merchant_id merchant_median_distance
0 3 0.0
1 4 0.0
merchant3_feature = pd.merge(t, t1, on='merchant_id', how='left')
merchant3_feature = pd.merge(
    merchant3_feature, t2, on='merchant_id', how='left')
merchant3_feature = pd.merge(
    merchant3_feature, t3, on='merchant_id', how='left')
merchant3_feature = pd.merge(
    merchant3_feature, t5, on='merchant_id', how='left')
merchant3_feature = pd.merge(
    merchant3_feature, t6, on='merchant_id', how='left')
merchant3_feature = pd.merge(
    merchant3_feature, t7, on='merchant_id', how='left')
merchant3_feature = pd.merge(
    merchant3_feature, t8, on='merchant_id', how='left')
merchant3_feature.sales_use_coupon = merchant3_feature.sales_use_coupon.replace(
    np.nan, 0)
merchant3_feature['merchant_coupon_transfer_rate'] = merchant3_feature.sales_use_coupon / \
    merchant3_feature.total_coupon
merchant3_feature['coupon_rate'] = merchant3_feature.sales_use_coupon.astype(
    'float') / merchant3_feature.total_sales
merchant3_feature.total_sales = merchant3_feature.total_sales.replace(np.nan, 0)
merchant3_feature.total_coupon = merchant3_feature.total_coupon.replace(
    np.nan, 0)
merchant3_feature.to_csv('merchant3_feature.csv', index=None)
merchant3_feature.head()
merchant_id total_sales sales_use_coupon total_coupon merchant_min_distance merchant_max_distance merchant_mean_distance merchant_median_distance merchant_coupon_transfer_rate coupon_rate
0 4663 393.0 39.0 11445.0 0.0 10.0 1.911765 1.0 0.003408 0.099237
1 2632 8.0 3.0 23.0 0.0 1.0 0.666667 1.0 0.130435 0.375000
2 3381 9579.0 416.0 46729.0 0.0 10.0 1.713580 1.0 0.008902 0.043428
3 450 7757.0 1239.0 35933.0 0.0 10.0 0.894068 0.0 0.034481 0.159727
4 6459 20.0 0.0 16.0 NaN NaN NaN NaN 0.000000 0.000000
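As a spot check of the two ratios for merchant 2632 in the second output row (total_sales=8, sales_use_coupon=3, total_coupon=23):

3 / 23   # 0.130435 -> merchant_coupon_transfer_rate
3 / 8    # 0.375    -> coupon_rate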
merchant3_feature.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8293 entries, 0 to 8292
Data columns (total 10 columns):
merchant_id                      8293 non-null int64
total_sales                      8293 non-null float64
sales_use_coupon                 8293 non-null float64
total_coupon                     8293 non-null float64
merchant_min_distance            3552 non-null float64
merchant_max_distance            3552 non-null float64
merchant_mean_distance           3552 non-null float64
merchant_median_distance         3552 non-null float64
merchant_coupon_transfer_rate    5363 non-null float64
coupon_rate                      8194 non-null float64
dtypes: float64(9), int64(1)
memory usage: 712.7 KB

dataset2

merchant2 = feature2[['merchant_id', 'coupon_id',
                      'distance', 'date_received', 'date']].copy()
t = merchant2[['merchant_id']].copy()
t.drop_duplicates(inplace=True)
t.head(2)
merchant_id
0 2632
7 3381
t1 = merchant2[merchant2.date.notnull()][['merchant_id']].copy()
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id').agg('sum').reset_index()
t1.head(2)
merchant_id total_sales
0 1 3
1 3 8
t2 = merchant2[(merchant2.date.notnull()) & (
    merchant2.coupon_id.notnull())][['merchant_id']].copy()
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id').agg('sum').reset_index()
t2.head(2)
merchant_id sales_use_coupon
0 15 11
1 17 1
t3 = merchant2[merchant2.coupon_id.notnull()][['merchant_id']].copy()
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id').agg('sum').reset_index()
t3.head(2)
merchant_id total_coupon
0 5 2
1 8 2
t4 = merchant2[(merchant2.date.notnull()) & (
    merchant2.coupon_id.notnull())][['merchant_id', 'distance']].copy()
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance': 'merchant_min_distance'}, inplace=True)

t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance': 'merchant_max_distance'}, inplace=True)

t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance': 'merchant_mean_distance'}, inplace=True)

t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance': 'merchant_median_distance'}, inplace=True)
t8.head(2)
merchant_id merchant_median_distance
0 15 0.0
1 17 1.0
merchant2_feature = pd.merge(t, t1, on='merchant_id', how='left')
merchant2_feature = pd.merge(
    merchant2_feature, t2, on='merchant_id', how='left')
merchant2_feature = pd.merge(
    merchant2_feature, t3, on='merchant_id', how='left')
merchant2_feature = pd.merge(
    merchant2_feature, t5, on='merchant_id', how='left')
merchant2_feature = pd.merge(
    merchant2_feature, t6, on='merchant_id', how='left')
merchant2_feature = pd.merge(
    merchant2_feature, t7, on='merchant_id', how='left')
merchant2_feature = pd.merge(
    merchant2_feature, t8, on='merchant_id', how='left')
merchant2_feature.sales_use_coupon = merchant2_feature.sales_use_coupon.replace(
    np.nan, 0)  # fillna with 0
merchant2_feature['merchant_coupon_transfer_rate'] = merchant2_feature.sales_use_coupon.astype(
    'float') / merchant2_feature.total_coupon
merchant2_feature['coupon_rate'] = merchant2_feature.sales_use_coupon.astype(
    'float') / merchant2_feature.total_sales
merchant2_feature.total_sales = merchant2_feature.total_sales.replace(np.nan, 0)
merchant2_feature.total_coupon = merchant2_feature.total_coupon.replace(
    np.nan, 0)
merchant2_feature.to_csv('merchant2_feature.csv', index=None)
merchant2_feature.head()
merchant_id total_sales sales_use_coupon total_coupon merchant_min_distance merchant_max_distance merchant_mean_distance merchant_median_distance merchant_coupon_transfer_rate coupon_rate
0 2632 14.0 3.0 31.0 1.0 1.0 1.000000 1.0 0.096774 0.214286
1 3381 11537.0 1980.0 56499.0 0.0 10.0 1.697002 1.0 0.035045 0.171622
2 2099 5711.0 1497.0 12215.0 0.0 10.0 1.004814 0.0 0.122554 0.262126
3 1569 493.0 91.0 25173.0 0.0 10.0 2.388235 1.0 0.003615 0.184584
4 8390 1207.0 133.0 690.0 0.0 10.0 0.865079 0.0 0.192754 0.110191

dataset1

merchant1 = feature1[['merchant_id', 'coupon_id',
                      'distance', 'date_received', 'date']].copy()
t = merchant1[['merchant_id']].copy()
t.drop_duplicates(inplace=True)
t.head(2)
merchant_id
0 2632
8 3381
t1 = merchant1[merchant1.date.notnull()][['merchant_id']].copy()
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id').agg('sum').reset_index()
t1.head(2)
merchant_id total_sales
0 1 4
1 4 12
t2 = merchant1[(merchant1.date.notnull()) & (
    merchant1.coupon_id.notnull())][['merchant_id']].copy()
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id').agg('sum').reset_index()
t2.head(2)
merchant_id sales_use_coupon
0 13 1
1 14 1
t3 = merchant1[merchant1.coupon_id.notnull()][['merchant_id']].copy()
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id').agg('sum').reset_index()
t3.head(2)
merchant_id total_coupon
0 8 2
1 13 3
t4 = merchant1[(merchant1.date.notnull()) & (
    merchant1.coupon_id.notnull())][['merchant_id', 'distance']]
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance': 'merchant_min_distance'}, inplace=True)

t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance': 'merchant_max_distance'}, inplace=True)

t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance': 'merchant_mean_distance'}, inplace=True)

t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance': 'merchant_median_distance'}, inplace=True)
t8.head(2)
merchant_id merchant_median_distance
0 13 0.0
1 14 0.0
merchant1_feature = pd.merge(t, t1, on='merchant_id', how='left')
merchant1_feature = pd.merge(
    merchant1_feature, t2, on='merchant_id', how='left')
merchant1_feature = pd.merge(
    merchant1_feature, t3, on='merchant_id', how='left')
merchant1_feature = pd.merge(
    merchant1_feature, t5, on='merchant_id', how='left')
merchant1_feature = pd.merge(
    merchant1_feature, t6, on='merchant_id', how='left')
merchant1_feature = pd.merge(
    merchant1_feature, t7, on='merchant_id', how='left')
merchant1_feature = pd.merge(
    merchant1_feature, t8, on='merchant_id', how='left')
merchant1_feature.sales_use_coupon = merchant1_feature.sales_use_coupon.replace(
    np.nan, 0)
merchant1_feature['merchant_coupon_transfer_rate'] = merchant1_feature.sales_use_coupon.astype(
    'float') / merchant1_feature.total_coupon
merchant1_feature['coupon_rate'] = merchant1_feature.sales_use_coupon.astype(
    'float') / merchant1_feature.total_sales
merchant1_feature.total_sales = merchant1_feature.total_sales.replace(
    np.nan, 0)
merchant1_feature.total_coupon = merchant1_feature.total_coupon.replace(
    np.nan, 0)
merchant1_feature.to_csv('merchant1_feature.csv', index=None)
merchant1_feature.head()
merchant_id total_sales sales_use_coupon total_coupon merchant_min_distance merchant_max_distance merchant_mean_distance merchant_median_distance merchant_coupon_transfer_rate coupon_rate
0 2632 14.0 1.0 28.0 1.0 1.0 1.000000 1.0 0.035714 0.071429
1 3381 14962.0 2168.0 100369.0 0.0 10.0 1.650457 1.0 0.021600 0.144900
2 2099 5783.0 1705.0 16824.0 0.0 10.0 0.968072 0.0 0.101343 0.294830
3 4833 522.0 116.0 8321.0 0.0 10.0 3.037736 2.0 0.013941 0.222222
4 8390 1056.0 133.0 690.0 0.0 10.0 0.865079 0.0 0.192754 0.125947

User-related features

column  meaning
count_merchant                number of distinct merchants at which the user made purchases
user_%_distance               distance features (min / max / mean / median user-merchant distance for coupon-redeemed purchases)
buy_use_coupon                number of purchases the user made with a coupon
buy_total                     number of purchases
coupon_received               number of coupons received
%_user_date_datereceived_gap  three statistics (avg / min / max) of the gap between receiving a coupon and using it
buy_use_coupon_rate           buy_use_coupon / buy_total
user_coupon_transfer_rate     buy_use_coupon / coupon_received

dataset3

user3 = feature3[['user_id', 'merchant_id', 'coupon_id',
                  'discount_rate', 'distance', 'date_received', 'date']].copy()
t = user3[['user_id']].copy()
t.drop_duplicates(inplace=True)
t.head(2)
user_id
1 1439408
7 1832624
# count_merchant: number of distinct merchants at which the user made purchases
t1 = user3[user3.date.notnull()][['user_id', 'merchant_id']].copy()
t1.drop_duplicates(inplace=True)
t1 = t1.groupby('user_id').count().reset_index()
t1.rename(columns={'merchant_id': 'count_merchant'}, inplace=True)
t1.head(2)
user_id count_merchant
0 165 2
1 209 1
# distance features
t2 = user3[(user3.date.notnull()) & (user3.coupon_id.notnull())
           ][['user_id', 'distance']].copy()
t3 = t2.groupby('user_id').min().reset_index()
t3.rename(columns={'distance': 'user_min_distance'}, inplace=True)

t4 = t2.groupby('user_id').max().reset_index()
t4.rename(columns={'distance': 'user_max_distance'}, inplace=True)

t5 = t2.groupby('user_id').mean().reset_index()
t5.rename(columns={'distance': 'user_mean_distance'}, inplace=True)

t6 = t2.groupby('user_id').median().reset_index()
t6.rename(columns={'distance': 'user_median_distance'}, inplace=True)
t6.head(2)
user_id user_median_distance
0 417 0.0
1 687 NaN
# number of purchases made with a coupon
t7 = user3[(user3.date.notnull()) & (
    user3.coupon_id.notnull())][['user_id']].copy()
t7['buy_use_coupon'] = 1
t7 = t7.groupby('user_id').count().reset_index()
t7.head(2)
user_id buy_use_coupon
0 417 1
1 687 1
# number of purchases
t8 = user3[user3.date.notnull()][['user_id']].copy()
t8['buy_total'] = 1
t8 = t8.groupby('user_id').count().reset_index()
t8.head(2)
user_id buy_total
0 165 12
1 209 1
# number of coupons received
t9 = user3[user3.coupon_id.notnull()][['user_id']].copy()
t9['coupon_received'] = 1
t9 = t9.groupby('user_id').count().reset_index()
t9.head(2)
user_id coupon_received
0 4 1
1 165 2
def get_user_date_datereceived_gap(s):
    s = s.split(':')
    return (date(int(s[0][0:4]), int(s[0][4:6]), int(s[0][6:8])) - 
            date(int(s[1][0:4]), int(s[1][4:6]), int(s[1][6:8]))).days
# gap between receiving the coupon and making the purchase
t10 = user3[(user3.date_received.notnull()) & (
    user3.date.notnull())][['user_id', 'date_received', 'date']].copy()
t10[['date_received', 'date']] = t10[[
    'date_received', 'date']].astype(str)
t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(
    get_user_date_datereceived_gap)
t10 = t10[['user_id', 'user_date_datereceived_gap']]
t10.head(2)
user_id user_date_datereceived_gap
6 1439408 28
33 1113008 6
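For reference, the helper can be called directly on a 'date:date_received' string; a toy call consistent with the first row above:

get_user_date_datereceived_gap('20160613:20160516')   # 28 days from receipt to use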
# three statistics of the receive-to-use gap
t11 = t10.groupby('user_id').mean().reset_index()
t11.rename(columns={
           'user_date_datereceived_gap': 'avg_user_date_datereceived_gap'}, inplace=True)
t12 = t10.groupby('user_id').min().reset_index()
t12.rename(columns={
           'user_date_datereceived_gap': 'min_user_date_datereceived_gap'}, inplace=True)
t13 = t10.groupby('user_id').max().reset_index()
t13.rename(columns={
           'user_date_datereceived_gap': 'max_user_date_datereceived_gap'}, inplace=True)
t13.head(2)
user_id max_user_date_datereceived_gap
0 417 14
1 687 5
user3_feature = pd.merge(t, t1, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t3, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t4, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t5, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t6, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t7, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t8, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t9, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t11, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t12, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t13, on='user_id', how='left')

user3_feature.buy_use_coupon = user3_feature.buy_use_coupon.replace(np.nan, 0)
user3_feature.count_merchant = user3_feature.count_merchant.replace(np.nan, 0)
user3_feature['buy_use_coupon_rate'] = user3_feature.buy_use_coupon / \
    user3_feature.buy_total
user3_feature['user_coupon_transfer_rate'] = user3_feature.buy_use_coupon / \
    user3_feature.coupon_received
user3_feature.buy_total = user3_feature.buy_total.replace(np.nan, 0)
user3_feature.coupon_received = user3_feature.coupon_received.replace(
    np.nan, 0)
user3_feature.to_csv('user3_feature.csv', index=None)
user3_feature.isnull().sum()
user3_feature.head()
user_id                                0
count_merchant                         0
user_min_distance                 320676
user_max_distance                 320676
user_mean_distance                320676
user_median_distance              320676
buy_use_coupon                         0
buy_total                              0
coupon_received                        0
avg_user_date_datereceived_gap    316780
min_user_date_datereceived_gap    316780
max_user_date_datereceived_gap    316780
buy_use_coupon_rate               170895
user_coupon_transfer_rate          38354
dtype: int64
user_id count_merchant user_min_distance user_max_distance user_mean_distance user_median_distance buy_use_coupon buy_total coupon_received avg_user_date_datereceived_gap min_user_date_datereceived_gap max_user_date_datereceived_gap buy_use_coupon_rate user_coupon_transfer_rate
0 1439408 1.0 0.0 0.0 0.0 0.0 1.0 2.0 4.0 28.0 28.0 28.0 0.5 0.25
1 1832624 0.0 NaN NaN NaN NaN 0.0 0.0 1.0 NaN NaN NaN NaN 0.00
2 2029232 1.0 NaN NaN NaN NaN 0.0 2.0 2.0 NaN NaN NaN 0.0 0.00
3 2747744 0.0 NaN NaN NaN NaN 0.0 0.0 1.0 NaN NaN NaN NaN 0.00
4 196342 1.0 NaN NaN NaN NaN 0.0 1.0 1.0 NaN NaN NaN 0.0 0.00

dataset2

user2 = feature2[['user_id', 'merchant_id', 'coupon_id',
                  'discount_rate', 'distance', 'date_received', 'date']].copy()
t = user2[['user_id']].copy()
t.drop_duplicates(inplace=True)
t.head(2)
user_id
0 1439408
7 1832624
# count_merchant: number of distinct merchants at which the user made purchases
t1 = user2[user2.date.notnull()][['user_id', 'merchant_id']].copy()
t1.drop_duplicates(inplace=True)
t1 = t1.groupby('user_id').count().reset_index()
t1.rename(columns={'merchant_id': 'count_merchant'}, inplace=True)
t1.head(2)
user_id count_merchant
0 165 2
1 184 1
# distance features
t2 = user2[(user2.date.notnull()) & (user2.coupon_id.notnull())
           ][['user_id', 'distance']].copy()
t3 = t2.groupby('user_id').min().reset_index()
t3.rename(columns={'distance': 'user_min_distance'}, inplace=True)

t4 = t2.groupby('user_id').max().reset_index()
t4.rename(columns={'distance': 'user_max_distance'}, inplace=True)

t5 = t2.groupby('user_id').mean().reset_index()
t5.rename(columns={'distance': 'user_mean_distance'}, inplace=True)

t6 = t2.groupby('user_id').median().reset_index()
t6.rename(columns={'distance': 'user_median_distance'}, inplace=True)
t6.head(2)
user_id user_median_distance
0 184 0.0
1 417 0.0
# number of purchases made with a coupon
t7 = user2[(user2.date.notnull()) & (
    user2.coupon_id.notnull())][['user_id']].copy()
t7['buy_use_coupon'] = 1
t7 = t7.groupby('user_id').count().reset_index()
t7.head(2)
user_id buy_use_coupon
0 184 1
1 417 1
# number of purchases
t8 = user2[user2.date.notnull()][['user_id']].copy()
t8['buy_total'] = 1
t8 = t8.groupby('user_id').count().reset_index()
t8.head(2)
user_id buy_total
0 165 8
1 184 1
# number of coupons received
t9 = user2[user2.coupon_id.notnull()][['user_id']].copy()
t9['coupon_received'] = 1
t9 = t9.groupby('user_id').count().reset_index()
t9.head(2)
user_id coupon_received
0 4 1
1 144 1
def get_user_date_datereceived_gap(s):
    s = s.split(':')
    return (date(int(s[0][0:4]), int(s[0][4:6]), int(s[0][6:8])) - date(int(s[1][0:4]), int(s[1][4:6]), int(s[1][6:8]))).days
# gap between receiving the coupon and making the purchase
t10 = user2[(user2.date_received.notnull()) & (
    user2.date.notnull())][['user_id', 'date_received', 'date']].copy()
t10[['date_received', 'date']] = t10[[
    'date_received', 'date']].astype(int).astype(str)
t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(
    get_user_date_datereceived_gap)
t10 = t10[['user_id', 'user_date_datereceived_gap']]
t10.head(2)
user_id user_date_datereceived_gap
38 2881376 8
75 114747 25
# three statistics of the receive-to-use gap
t11 = t10.groupby('user_id').mean().reset_index()
t11.rename(columns={
           'user_date_datereceived_gap': 'avg_user_date_datereceived_gap'}, inplace=True)
t12 = t10.groupby('user_id').min().reset_index()
t12.rename(columns={
           'user_date_datereceived_gap': 'min_user_date_datereceived_gap'}, inplace=True)
t13 = t10.groupby('user_id').max().reset_index()
t13.rename(columns={
           'user_date_datereceived_gap': 'max_user_date_datereceived_gap'}, inplace=True)
t13.head(2)
user_id max_user_date_datereceived_gap
0 184 30
1 417 14
user2_feature = pd.merge(t, t1, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t3, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t4, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t5, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t6, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t7, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t8, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t9, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t11, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t12, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t13, on='user_id', how='left')
user2_feature.buy_use_coupon = user2_feature.buy_use_coupon.replace(np.nan, 0)
user2_feature.count_merchant = user2_feature.count_merchant.replace(np.nan, 0)
user2_feature['buy_use_coupon_rate'] = user2_feature.buy_use_coupon / \
    user2_feature.buy_total
user2_feature['user_coupon_transfer_rate'] = user2_feature.buy_use_coupon / \
    user2_feature.coupon_received
user2_feature.buy_total = user2_feature.buy_total.replace(np.nan, 0)
user2_feature.coupon_received = user2_feature.coupon_received.replace(
    np.nan, 0)
user2_feature.to_csv('user2_feature.csv', index=None)
user2_feature.head()
user_id count_merchant user_min_distance user_max_distance user_mean_distance user_median_distance buy_use_coupon buy_total coupon_received avg_user_date_datereceived_gap min_user_date_datereceived_gap max_user_date_datereceived_gap buy_use_coupon_rate user_coupon_transfer_rate
0 1439408 1.0 NaN NaN NaN NaN 0.0 1.0 2.0 NaN NaN NaN 0.0 0.0
1 1832624 0.0 NaN NaN NaN NaN 0.0 0.0 1.0 NaN NaN NaN NaN 0.0
2 73611 0.0 NaN NaN NaN NaN 0.0 0.0 1.0 NaN NaN NaN NaN 0.0
3 163606 0.0 NaN NaN NaN NaN 0.0 0.0 1.0 NaN NaN NaN NaN 0.0
4 94107 0.0 NaN NaN NaN NaN 0.0 0.0 1.0 NaN NaN NaN NaN 0.0
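The dataset1 block below repeats this pipeline on feature1. If desired, the shared steps could be factored into a helper; a partial sketch (hypothetical name get_user_counts, covering only the count features, not the distance or gap aggregates):

def get_user_counts(feature):
    # Hypothetical helper: user-level count features for any feature window (feature1/2/3)
    user = feature[['user_id', 'merchant_id', 'coupon_id', 'date_received', 'date']].copy()
    t = user[['user_id']].drop_duplicates()
    buy = user[user.date.notnull()]
    t1 = buy[['user_id', 'merchant_id']].drop_duplicates().groupby(
        'user_id', as_index=False).count().rename(columns={'merchant_id': 'count_merchant'})
    t8 = buy.groupby('user_id').size().reset_index(name='buy_total')
    t9 = user[user.coupon_id.notnull()].groupby(
        'user_id').size().reset_index(name='coupon_received')
    return t.merge(t1, on='user_id', how='left') \
            .merge(t8, on='user_id', how='left') \
            .merge(t9, on='user_id', how='left')

get_user_counts(feature2) reproduces the count_merchant, buy_total and coupon_received columns built above.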

dataset1

user1 = feature1[['user_id', 'merchant_id', 'coupon_id',
                  'discount_rate', 'distance', 'date_received', 'date']].copy()
t = user1[['user_id']].copy()
t.drop_duplicates(inplace=True)
t.head(2)
user_id
0 1439408
8 2029232
# count_merchant: number of distinct merchants where the user made purchases
t1 = user1[user1.date.notnull()][['user_id', 'merchant_id']].copy()
t1.drop_duplicates(inplace=True)
t1 = t1.groupby('user_id').count().reset_index()
t1.rename(columns={'merchant_id': 'count_merchant'}, inplace=True)
t1.head(2)
user_id count_merchant
0 165 2
1 184 1
# Distance features (for purchases made with a coupon)
t2 = user1[(user1.date.notnull()) & (user1.coupon_id.notnull())
           ][['user_id', 'distance']].copy()
t3 = t2.groupby('user_id').min().reset_index()
t3.rename(columns={'distance': 'user_min_distance'}, inplace=True)

t4 = t2.groupby('user_id').max().reset_index()
t4.rename(columns={'distance': 'user_max_distance'}, inplace=True)

t5 = t2.groupby('user_id').mean().reset_index()
t5.rename(columns={'distance': 'user_mean_distance'}, inplace=True)

t6 = t2.groupby('user_id').median().reset_index()
t6.rename(columns={'distance': 'user_median_distance'}, inplace=True)
t6.head(2)
user_id user_median_distance
0 184 0.0
1 417 0.0
# buy_use_coupon: number of purchases made with a coupon
t7 = user1[(user1.date.notnull()) & (
    user1.coupon_id.notnull())][['user_id']].copy()
t7['buy_use_coupon'] = 1
t7 = t7.groupby('user_id').sum().reset_index()
t7.head(2)
user_id buy_use_coupon
0 184 1
1 417 1
# buy_total: total number of purchases
t8 = user1[user1.date.notnull()][['user_id']].copy()
t8['buy_total'] = 1
t8 = t8.groupby('user_id').count().reset_index()
t8.head(2)
user_id buy_total
0 165 6
1 184 1
# coupon_received: number of coupons received
t9 = user1[user1.coupon_id.notnull()][['user_id']].copy()
t9['coupon_received'] = 1
t9 = t9.groupby('user_id').count().reset_index()
t9.head(2)
user_id coupon_received
0 4 1
1 35 4
# Gap in days between receiving a coupon and consuming
t10 = user1[(user1.date_received.notnull()) & (
    user1.date.notnull())][['user_id', 'date_received', 'date']].copy()
t10[['date_received', 'date']] = t10[[
    'date_received', 'date']].astype(int).astype(str)
t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(
    get_user_date_datereceived_gap)
t10 = t10[['user_id', 'user_date_datereceived_gap']]
t10.head(2)
user_id user_date_datereceived_gap
38 2881376 8
75 114747 25
# Three aggregates of the gap: mean, min, max
t11 = t10.groupby('user_id').mean().reset_index()
t11.rename(columns={
           'user_date_datereceived_gap': 'avg_user_date_datereceived_gap'}, inplace=True)
t12 = t10.groupby('user_id').min().reset_index()
t12.rename(columns={
           'user_date_datereceived_gap': 'min_user_date_datereceived_gap'}, inplace=True)
t13 = t10.groupby('user_id').max().reset_index()
t13.rename(columns={
           'user_date_datereceived_gap': 'max_user_date_datereceived_gap'}, inplace=True)
t13.head(2)
user_id max_user_date_datereceived_gap
0 184 30
1 417 14
user1_feature = pd.merge(t, t1, on='user_id', how='left')
user1_feature = pd.merge(user1_feature, t3, on='user_id', how='left')
user1_feature = pd.merge(user1_feature, t4, on='user_id', how='left')
user1_feature = pd.merge(user1_feature, t5, on='user_id', how='left')
user1_feature = pd.merge(user1_feature, t6, on='user_id', how='left')
user1_feature = pd.merge(user1_feature, t7, on='user_id', how='left')
user1_feature = pd.merge(user1_feature, t8, on='user_id', how='left')
user1_feature = pd.merge(user1_feature, t9, on='user_id', how='left')
user1_feature = pd.merge(user1_feature, t11, on='user_id', how='left')
user1_feature = pd.merge(user1_feature, t12, on='user_id', how='left')
user1_feature = pd.merge(user1_feature, t13, on='user_id', how='left')
user1_feature.buy_use_coupon = user1_feature.buy_use_coupon.replace(np.nan, 0)
user1_feature.count_merchant = user1_feature.count_merchant.replace(np.nan, 0)
user1_feature['buy_use_coupon_rate'] = user1_feature.buy_use_coupon / \
    user1_feature.buy_total
user1_feature['user_coupon_transfer_rate'] = user1_feature.buy_use_coupon / \
    user1_feature.coupon_received
user1_feature.buy_total = user1_feature.buy_total.replace(np.nan, 0)
user1_feature.coupon_received = user1_feature.coupon_received.replace(
    np.nan, 0)
user1_feature.to_csv('user1_feature.csv', index=None)
user1_feature.head()
user_id count_merchant user_min_distance user_max_distance user_mean_distance user_median_distance buy_use_coupon buy_total coupon_received avg_user_date_datereceived_gap min_user_date_datereceived_gap max_user_date_datereceived_gap buy_use_coupon_rate user_coupon_transfer_rate
0 1439408 1.0 NaN NaN NaN NaN 0.0 1.0 2.0 NaN NaN NaN 0.0 0.0
1 2029232 0.0 NaN NaN NaN NaN 0.0 0.0 1.0 NaN NaN NaN NaN 0.0
2 2223968 0.0 NaN NaN NaN NaN 0.0 0.0 1.0 NaN NaN NaN NaN 0.0
3 73611 0.0 NaN NaN NaN NaN 0.0 0.0 1.0 NaN NaN NaN NaN 0.0
4 3273056 0.0 NaN NaN NaN NaN 0.0 0.0 1.0 NaN NaN NaN NaN 0.0

User-merchant interaction features

column meaning
user_merchant_buy_total number of user-merchant rows with a purchase
user_merchant_received number of user-merchant rows with a coupon received
user_merchant_buy_use_coupon number of rows with both a coupon received and a purchase
user_merchant_any total number of user-merchant rows
user_merchant_buy_common number of rows with a purchase but no coupon received
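The blocks below build these columns one at a time and merge them onto all_user_merchant. For reference, the same five counts can also be produced in a single groupby pass; a minimal sketch (feature stands for any of feature1/feature2/feature3; this variant yields zeros directly where the merge-then-fill approach first leaves NaN):

def user_merchant_counts(feature):
    # One-pass equivalent of the per-column counting below
    flags = feature.assign(
        user_merchant_buy_total=feature.date.notnull().astype(int),
        user_merchant_received=feature.coupon_id.notnull().astype(int),
        user_merchant_buy_use_coupon=(feature.date.notnull() & feature.date_received.notnull()).astype(int),
        user_merchant_any=1,
        user_merchant_buy_common=(feature.date.notnull() & feature.coupon_id.isnull()).astype(int),
    )
    cols = ['user_merchant_buy_total', 'user_merchant_received',
            'user_merchant_buy_use_coupon', 'user_merchant_any', 'user_merchant_buy_common']
    return flags.groupby(['user_id', 'merchant_id'], as_index=False)[cols].sum()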

dataset3

all_user_merchant = feature3[['user_id','merchant_id']].copy()
all_user_merchant.drop_duplicates(inplace=True)
# user_merchant_buy_total: number of user-merchant rows with a purchase
t = feature3[['user_id','merchant_id','date']].copy()
t = t[t.date.notnull()][['user_id','merchant_id']]
t['user_merchant_buy_total'] = 1
t = t.groupby(['user_id','merchant_id']).count().reset_index()
t.head(2)
user_id merchant_id user_merchant_buy_total
0 165 2934 6
1 165 4195 6
# user_merchant_received: number of user-merchant rows with a coupon received
t1 = feature3[['user_id','merchant_id','coupon_id']].copy()
t1 = t1[t1.coupon_id.notnull()][['user_id','merchant_id']]
t1['user_merchant_received'] = 1
t1 = t1.groupby(['user_id','merchant_id']).count().reset_index()
t1.head(2)
user_id merchant_id user_merchant_received
0 4 1469 1
1 165 2934 1
# user_merchant_buy_use_coupon: rows with both a coupon received and a purchase
t2 = feature3[['user_id','merchant_id','date','date_received']].copy()
t2 = t2[(t2.date.notnull())&(t2.date_received.notnull())][['user_id','merchant_id']]
t2['user_merchant_buy_use_coupon'] = 1
t2 = t2.groupby(['user_id','merchant_id']).count().reset_index()
t2.head(2)
user_id merchant_id user_merchant_buy_use_coupon
0 417 775 1
1 687 8594 1
# user_merchant_any: total number of user-merchant rows
t3 = feature3[['user_id','merchant_id']].copy()
t3['user_merchant_any'] = 1
t3 = t3.groupby(['user_id','merchant_id']).count().reset_index()
t3.head(2)
user_id merchant_id user_merchant_any
0 4 1469 1
1 165 2934 7
# user_merchant_buy_common: rows with a purchase but no coupon received
t4 = feature3[['user_id','merchant_id','date','coupon_id']].copy()
t4 = t4[(t4.date.notnull())&(t4.coupon_id.isnull())][['user_id','merchant_id']]
t4['user_merchant_buy_common'] = 1
t4 = t4.groupby(['user_id','merchant_id']).count().reset_index()
t4.head(2)
user_id merchant_id user_merchant_buy_common
0 165 2934 6
1 165 4195 6
user_merchant3 = pd.merge(all_user_merchant, t, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant3 = pd.merge(user_merchant3, t1, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant3 = pd.merge(user_merchant3, t2, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant3 = pd.merge(user_merchant3, t3, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant3 = pd.merge(user_merchant3, t4, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant3.user_merchant_buy_use_coupon = user_merchant3.user_merchant_buy_use_coupon.replace(
    np.nan, 0)
user_merchant3.user_merchant_buy_common = user_merchant3.user_merchant_buy_common.replace(
    np.nan, 0)
user_merchant3['user_merchant_coupon_transfer_rate'] = user_merchant3.user_merchant_buy_use_coupon / \
    user_merchant3.user_merchant_received
user_merchant3['user_merchant_coupon_buy_rate'] = user_merchant3.user_merchant_buy_use_coupon / \
    user_merchant3.user_merchant_buy_total
user_merchant3['user_merchant_rate'] = user_merchant3.user_merchant_buy_total / \
    user_merchant3.user_merchant_any
user_merchant3['user_merchant_common_buy_rate'] = user_merchant3.user_merchant_buy_common / \
    user_merchant3.user_merchant_buy_total
user_merchant3.user_merchant_received = user_merchant3.user_merchant_received.fillna(0)
user_merchant3.user_merchant_buy_total = user_merchant3.user_merchant_buy_total.fillna(0)
user_merchant3.user_merchant_any = user_merchant3.user_merchant_any.fillna(0)
user_merchant3.to_csv('user_merchant3.csv', index=None)
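user_merchant2 and user_merchant1 below go through exactly the same fill-and-rate post-processing, so it could be factored into a helper to keep the three blocks consistent. A minimal sketch (hypothetical name finalize_user_merchant, assuming the columns above):

def finalize_user_merchant(um):
    # Fill the count columns and derive the four rate features, as done for user_merchant3
    um.user_merchant_buy_use_coupon = um.user_merchant_buy_use_coupon.fillna(0)
    um.user_merchant_buy_common = um.user_merchant_buy_common.fillna(0)
    um['user_merchant_coupon_transfer_rate'] = um.user_merchant_buy_use_coupon / um.user_merchant_received
    um['user_merchant_coupon_buy_rate'] = um.user_merchant_buy_use_coupon / um.user_merchant_buy_total
    um['user_merchant_rate'] = um.user_merchant_buy_total / um.user_merchant_any
    um['user_merchant_common_buy_rate'] = um.user_merchant_buy_common / um.user_merchant_buy_total
    for col in ['user_merchant_received', 'user_merchant_buy_total', 'user_merchant_any']:
        um[col] = um[col].fillna(0)
    return um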

dataset2

all_user_merchant = feature2[['user_id','merchant_id']].copy()
all_user_merchant.drop_duplicates(inplace=True)
# user_merchant_buy_total: number of user-merchant rows with a purchase
t = feature2[['user_id','merchant_id','date']].copy()
t = t[t.date.notnull()][['user_id','merchant_id']]
t['user_merchant_buy_total'] = 1
t = t.groupby(['user_id','merchant_id']).count().reset_index()
t.head(2)
user_id merchant_id user_merchant_buy_total
0 165 2934 4
1 165 4195 4
# user_merchant_received: number of user-merchant rows with a coupon received
t1 = feature2[['user_id','merchant_id','coupon_id']].copy()
t1 = t1[t1.coupon_id.notnull()][['user_id','merchant_id']]
t1['user_merchant_received'] = 1
t1 = t1.groupby(['user_id','merchant_id']).count().reset_index()
t1.head(2)
user_id merchant_id user_merchant_received
0 4 1433 1
1 144 1553 1
# user_merchant_buy_use_coupon: rows with both a coupon received and a purchase
t2 = feature2[['user_id','merchant_id','date','date_received']].copy()
t2 = t2[(t2.date.notnull())&(t2.date_received.notnull())][['user_id','merchant_id']]
t2['user_merchant_buy_use_coupon'] = 1
t2 = t2.groupby(['user_id','merchant_id']).count().reset_index()
t2.head(2)
user_id merchant_id user_merchant_buy_use_coupon
0 184 3381 1
1 417 775 1
# user_merchant_any: total number of user-merchant rows
t3 = feature2[['user_id','merchant_id']].copy()
t3['user_merchant_any'] = 1
t3 = t3.groupby(['user_id','merchant_id']).count().reset_index()
t3.head(2)
user_id merchant_id user_merchant_any
0 4 1433 1
1 144 1553 1
# user_merchant_buy_common: rows with a purchase but no coupon received
t4 = feature2[['user_id','merchant_id','date','coupon_id']].copy()
t4 = t4[(t4.date.notnull())&(t4.coupon_id.isnull())][['user_id','merchant_id']]
t4['user_merchant_buy_common'] = 1
t4 = t4.groupby(['user_id','merchant_id']).count().reset_index()
t4.head(2)
user_id merchant_id user_merchant_buy_common
0 165 2934 4
1 165 4195 4
user_merchant2 = pd.merge(all_user_merchant, t, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant2 = pd.merge(user_merchant2, t1, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant2 = pd.merge(user_merchant2, t2, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant2 = pd.merge(user_merchant2, t3, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant2 = pd.merge(user_merchant2, t4, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant2.user_merchant_buy_use_coupon = user_merchant2.user_merchant_buy_use_coupon.replace(
    np.nan, 0)
user_merchant2.user_merchant_buy_common = user_merchant2.user_merchant_buy_common.replace(
    np.nan, 0)
user_merchant2['user_merchant_coupon_transfer_rate'] = user_merchant2.user_merchant_buy_use_coupon / \
    user_merchant2.user_merchant_received
user_merchant2['user_merchant_coupon_buy_rate'] = user_merchant2.user_merchant_buy_use_coupon / \
    user_merchant2.user_merchant_buy_total
user_merchant2['user_merchant_rate'] = user_merchant2.user_merchant_buy_total / \
    user_merchant2.user_merchant_any
user_merchant2['user_merchant_common_buy_rate'] = user_merchant2.user_merchant_buy_common / \
    user_merchant2.user_merchant_buy_total
user_merchant2.user_merchant_received = user_merchant2.user_merchant_received.fillna(0)
user_merchant2.user_merchant_buy_total = user_merchant2.user_merchant_buy_total.fillna(0)
user_merchant2.user_merchant_any = user_merchant2.user_merchant_any.fillna(0)
user_merchant2.to_csv('user_merchant2.csv', index=None)

dataset1

all_user_merchant = feature1[['user_id','merchant_id']].copy()
all_user_merchant.drop_duplicates(inplace=True)
# user_merchant_buy_total: number of user-merchant rows with a purchase
t = feature1[['user_id','merchant_id','date']].copy()
t = t[t.date.notnull()][['user_id','merchant_id']]
t['user_merchant_buy_total'] = 1
t = t.groupby(['user_id','merchant_id']).count().reset_index()
t.head(2)
user_id merchant_id user_merchant_buy_total
0 165 2934 4
1 165 4195 2
# user_merchant_received: number of user-merchant rows with a coupon received
t1 = feature1[['user_id','merchant_id','coupon_id']].copy()
t1 = t1[t1.coupon_id.notnull()][['user_id','merchant_id']]
t1['user_merchant_received'] = 1
t1 = t1.groupby(['user_id','merchant_id']).count().reset_index()
t1.head(2)
user_id merchant_id user_merchant_received
0 4 1433 1
1 35 3381 4
# user_merchant_buy_use_coupon: rows with both a coupon received and a purchase
t2 = feature1[['user_id','merchant_id','date','date_received']].copy()
t2 = t2[(t2.date.notnull())&(t2.date_received.notnull())][['user_id','merchant_id']]
t2['user_merchant_buy_use_coupon'] = 1
t2 = t2.groupby(['user_id','merchant_id']).count().reset_index()
t2.head(2)
user_id merchant_id user_merchant_buy_use_coupon
0 184 3381 1
1 417 775 1
# user_merchant_any: total number of user-merchant rows
t3 = feature1[['user_id','merchant_id']].copy()
t3['user_merchant_any'] = 1
t3 = t3.groupby(['user_id','merchant_id']).count().reset_index()
t3.head(2)
user_id merchant_id user_merchant_any
0 4 1433 1
1 35 3381 4
# user_merchant_buy_common: rows with a purchase but no coupon received
t4 = feature1[['user_id','merchant_id','date','coupon_id']].copy()
t4 = t4[(t4.date.notnull())&(t4.coupon_id.isnull())][['user_id','merchant_id']]
t4['user_merchant_buy_common'] = 1
t4 = t4.groupby(['user_id','merchant_id']).count().reset_index()
t4.head(2)
user_id merchant_id user_merchant_buy_common
0 165 2934 4
1 165 4195 2
user_merchant1 = pd.merge(all_user_merchant, t, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant1 = pd.merge(user_merchant1, t1, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant1 = pd.merge(user_merchant1, t2, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant1 = pd.merge(user_merchant1, t3, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant1 = pd.merge(user_merchant1, t4, on=[
                          'user_id', 'merchant_id'], how='left')
user_merchant1.user_merchant_buy_use_coupon = user_merchant1.user_merchant_buy_use_coupon.replace(
    np.nan, 0)
user_merchant1.user_merchant_buy_common = user_merchant1.user_merchant_buy_common.replace(
    np.nan, 0)
user_merchant1['user_merchant_coupon_transfer_rate'] = user_merchant1.user_merchant_buy_use_coupon / \
    user_merchant1.user_merchant_received
user_merchant1['user_merchant_coupon_buy_rate'] = user_merchant1.user_merchant_buy_use_coupon / \
    user_merchant1.user_merchant_buy_total
user_merchant1['user_merchant_rate'] = user_merchant1.user_merchant_buy_total / \
    user_merchant1.user_merchant_any
user_merchant1['user_merchant_common_buy_rate'] = user_merchant1.user_merchant_buy_common / \
    user_merchant1.user_merchant_buy_total
user_merchant1.user_merchant_received = user_merchant1.user_merchant_received.fillna(0)
user_merchant1.user_merchant_buy_total = user_merchant1.user_merchant_buy_total.fillna(0)
user_merchant1.user_merchant_any = user_merchant1.user_merchant_any.fillna(0)
user_merchant1.to_csv('user_merchant1.csv', index=None)

Generate the training and test sets

def trans_date(s):
    # NaN (s != s) stays as the string 'nan'; otherwise convert the float date to 'YYYYMMDD'
    if s != s:
        return str(s)
    else:
        return str(int(s))


def get_label(s):
    # s is a 'date:date_received' string; label 1 = coupon used within 15 days of receiving
    s = s.split(':')
    if s[0] == 'nan':
        # received a coupon but never consumed
        return 0
    elif s[1] == 'nan':
        # consumed without a coupon; not hit here because dataset1/2 only contain coupon-received rows
        return 1
    elif (date(int(s[0][0:4]), int(s[0][4:6]), int(s[0][6:8])) -
          date(int(s[1][0:4]), int(s[1][4:6]), int(s[1][6:8]))).days <= 15:
        return 1
    else:
        return 0
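A quick sanity check of the rule on a few hand-made 'date:date_received' strings (values chosen only for illustration):

get_label('nan:20160521')        # coupon received but never used -> 0
get_label('20160530:20160521')   # used 9 days after receiving -> 1
get_label('20160613:20160516')   # used 28 days after receiving -> 0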

Test set: dataset3

coupon3 = pd.read_csv('coupon3_feature.csv')
merchant3 = pd.read_csv('merchant3_feature.csv')
user3 = pd.read_csv('user3_feature.csv')
user_merchant3 = pd.read_csv('user_merchant3.csv')
other_feature3 = pd.read_csv('other_feature3.csv')
coupon3.shape
(113640, 13)
dataset3 = pd.merge(coupon3, merchant3, on='merchant_id', how='left')
dataset3 = pd.merge(dataset3, user3, on='user_id', how='left')
dataset3 = pd.merge(dataset3, user_merchant3, on=[
                    'user_id', 'merchant_id'], how='left')
dataset3 = pd.merge(dataset3, other_feature3, on=[
                    'user_id', 'coupon_id', 'date_received'], how='left')
dataset3.shape
(116204, 50)
dataset3.total_sales = dataset3.total_sales.fillna(0)
dataset3.sales_use_coupon = dataset3.sales_use_coupon.fillna(0)
dataset3.total_coupon = dataset3.total_coupon.fillna(0)
dataset3.user_merchant_buy_total = dataset3.user_merchant_buy_total.replace(
    np.nan, 0)
dataset3.user_merchant_received = dataset3.user_merchant_received.replace(
    np.nan, 0)
dataset3.user_merchant_buy_use_coupon = dataset3.user_merchant_buy_use_coupon.replace(
    np.nan, 0)
dataset3.user_merchant_any = dataset3.user_merchant_any.replace(np.nan, 0)
dataset3.user_merchant_buy_common = dataset3.user_merchant_buy_common.replace(
    np.nan, 0)
dataset3.count_merchant = dataset3.count_merchant.fillna(0)
dataset3.buy_use_coupon = dataset3.buy_use_coupon.fillna(0)
dataset3.buy_total = dataset3.buy_total.fillna(0)
dataset3.coupon_received = dataset3.coupon_received.fillna(0)
dataset3.isnull().sum()
user_id                                             0
merchant_id                                         0
coupon_id                                           0
discount_rate                                       0
distance                                        12230
date_received                                       0
day_of_week                                         0
day_of_month                                        0
days_distance                                       0
discount_man                                        0
discount_jian                                       0
is_man_jian                                         0
coupon_count                                        0
total_sales                                         0
sales_use_coupon                                    0
total_coupon                                        0
merchant_min_distance                            5096
merchant_max_distance                            5096
merchant_mean_distance                           5096
merchant_median_distance                         5096
merchant_coupon_transfer_rate                     898
coupon_rate                                        55
count_merchant                                      0
user_min_distance                               97093
user_max_distance                               97093
user_mean_distance                              97093
user_median_distance                            97093
buy_use_coupon                                      0
buy_total                                           0
coupon_received                                     0
avg_user_date_datereceived_gap                  95450
min_user_date_datereceived_gap                  95450
max_user_date_datereceived_gap                  95450
buy_use_coupon_rate                             47959
user_coupon_transfer_rate                       21231
user_merchant_buy_total                             0
user_merchant_received                              0
user_merchant_buy_use_coupon                        0
user_merchant_any                                   0
user_merchant_buy_common                            0
user_merchant_coupon_transfer_rate              85190
user_merchant_coupon_buy_rate                   90569
user_merchant_rate                              90569
user_merchant_common_buy_rate                   90569
this_month_user_receive_same_coupon_count           0
this_month_user_receive_all_coupon_count            0
this_month_user_receive_same_coupon_lastone         0
this_month_user_receive_same_coupon_firstone        0
this_day_user_receive_all_coupon_count              0
this_day_user_receive_same_coupon_count             0
dtype: int64
dataset3['is_weekend'] = dataset3.day_of_week.apply(
    lambda x: 1 if x in (6, 7) else 0)
weekday_dummies = pd.get_dummies(dataset3.day_of_week)
weekday_dummies.columns = ['weekday'+str(i+1)
                           for i in range(weekday_dummies.shape[1])]
dataset3 = pd.concat([dataset3, weekday_dummies], axis=1)
dataset3.drop(['merchant_id', 'day_of_week', 'coupon_count'],
              axis=1, inplace=True)
dataset3.head()
dataset3.to_csv('dataset3.csv', index=None)
user_id coupon_id discount_rate distance date_received day_of_month days_distance discount_man discount_jian is_man_jian total_sales sales_use_coupon total_coupon merchant_min_distance merchant_max_distance merchant_mean_distance merchant_median_distance merchant_coupon_transfer_rate coupon_rate count_merchant user_min_distance user_max_distance user_mean_distance user_median_distance buy_use_coupon buy_total coupon_received avg_user_date_datereceived_gap min_user_date_datereceived_gap max_user_date_datereceived_gap buy_use_coupon_rate user_coupon_transfer_rate user_merchant_buy_total user_merchant_received user_merchant_buy_use_coupon user_merchant_any user_merchant_buy_common user_merchant_coupon_transfer_rate user_merchant_coupon_buy_rate user_merchant_rate user_merchant_common_buy_rate this_month_user_receive_same_coupon_count this_month_user_receive_all_coupon_count this_month_user_receive_same_coupon_lastone this_month_user_receive_same_coupon_firstone this_day_user_receive_all_coupon_count this_day_user_receive_same_coupon_count is_weekend weekday1 weekday2 weekday3 weekday4 weekday5 weekday6 weekday7
0 4129537 9983 0.833333 1.0 20160712 12 12 30 5 1 7757.0 1239.0 35933.0 0.0 10.0 0.894068 0.0 0.034481 0.159727 0.0 NaN NaN NaN NaN 0.0 0.0 2.0 NaN NaN NaN NaN 0.0 0.0 1.0 0.0 1.0 0.0 0.0 NaN NaN NaN 1 1 -1 -1 1 1 0 0 1 0 0 0 0 0
1 6949378 3429 0.833333 NaN 20160706 6 6 30 5 1 119.0 17.0 100.0 0.0 8.0 0.666667 0.0 0.170000 0.142857 1.0 0.0 0.0 0.0 0.0 1.0 2.0 1.0 3.0 3.0 3.0 0.5 1.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN 1 1 -1 -1 1 1 0 0 0 1 0 0 0 0
2 2166529 6928 0.900000 5.0 20160727 27 27 200 20 1 2280.0 100.0 3529.0 0.0 10.0 2.592593 1.0 0.028337 0.043860 0.0 NaN NaN NaN NaN 0.0 0.0 1.0 NaN NaN NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN 1 2 -1 -1 2 1 0 0 0 1 0 0 0 0
3 2166529 1808 0.900000 5.0 20160727 27 27 100 10 1 2280.0 100.0 3529.0 0.0 10.0 2.592593 1.0 0.028337 0.043860 0.0 NaN NaN NaN NaN 0.0 0.0 1.0 NaN NaN NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN 1 2 -1 -1 2 1 0 0 0 1 0 0 0 0
4 6172162 6500 0.966667 2.0 20160708 8 8 30 1 1 1065.0 182.0 795.0 0.0 10.0 0.783439 0.0 0.228931 0.170892 1.0 NaN NaN NaN NaN 0.0 1.0 1.0 NaN NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN 1 1 -1 -1 1 1 0 0 0 0 0 1 0 0

dataset2

coupon2 = pd.read_csv('coupon2_feature.csv')
merchant2 = pd.read_csv('merchant2_feature.csv')
user2 = pd.read_csv('user2_feature.csv')
user_merchant2 = pd.read_csv('user_merchant2.csv')
other_feature2 = pd.read_csv('other_feature2.csv')
coupon2.shape
(258446, 14)
dataset2 = pd.merge(coupon2, merchant2, on='merchant_id', how='left')
dataset2 = pd.merge(dataset2, user2, on='user_id', how='left')
dataset2 = pd.merge(dataset2, user_merchant2, on=[
                    'user_id', 'merchant_id'], how='left')
dataset2 = pd.merge(dataset2, other_feature2, on=[
                    'user_id', 'coupon_id', 'date_received'], how='left')
dataset2.shape
(262240, 51)
dataset2['date'] = dataset2['date'].apply(trans_date)
dataset2['date_received'] = dataset2['date_received'].apply(trans_date)
dataset2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 262240 entries, 0 to 262239
Data columns (total 51 columns):
user_id                                         262240 non-null int64
merchant_id                                     262240 non-null int64
coupon_id                                       262240 non-null float64
discount_rate                                   262240 non-null float64
distance                                        231874 non-null float64
date_received                                   262240 non-null object
date                                            262240 non-null object
day_of_week                                     262240 non-null int64
day_of_month                                    262240 non-null int64
days_distance                                   262240 non-null int64
discount_man                                    262240 non-null int64
discount_jian                                   262240 non-null int64
is_man_jian                                     262240 non-null int64
coupon_count                                    262240 non-null int64
total_sales                                     259460 non-null float64
sales_use_coupon                                259460 non-null float64
total_coupon                                    259460 non-null float64
merchant_min_distance                           217418 non-null float64
merchant_max_distance                           217418 non-null float64
merchant_mean_distance                          217418 non-null float64
merchant_median_distance                        217418 non-null float64
merchant_coupon_transfer_rate                   240217 non-null float64
coupon_rate                                     258655 non-null float64
count_merchant                                  146649 non-null float64
user_min_distance                               21545 non-null float64
user_max_distance                               21545 non-null float64
user_mean_distance                              21545 non-null float64
user_median_distance                            21545 non-null float64
buy_use_coupon                                  146649 non-null float64
buy_total                                       146649 non-null float64
coupon_received                                 146649 non-null float64
avg_user_date_datereceived_gap                  23316 non-null float64
min_user_date_datereceived_gap                  23316 non-null float64
max_user_date_datereceived_gap                  23316 non-null float64
buy_use_coupon_rate                             112596 non-null float64
user_coupon_transfer_rate                       112753 non-null float64
user_merchant_buy_total                         73321 non-null float64
user_merchant_received                          73321 non-null float64
user_merchant_buy_use_coupon                    73321 non-null float64
user_merchant_any                               73321 non-null float64
user_merchant_buy_common                        73321 non-null float64
user_merchant_coupon_transfer_rate              36972 non-null float64
user_merchant_coupon_buy_rate                   64125 non-null float64
user_merchant_rate                              64125 non-null float64
user_merchant_common_buy_rate                   64125 non-null float64
this_month_user_receive_same_coupon_count       262240 non-null int64
this_month_user_receive_all_coupon_count        262240 non-null int64
this_month_user_receive_same_coupon_lastone     262240 non-null int64
this_month_user_receive_same_coupon_firstone    262240 non-null int64
this_day_user_receive_all_coupon_count          262240 non-null int64
this_day_user_receive_same_coupon_count         262240 non-null int64
dtypes: float64(34), int64(15), object(2)
memory usage: 104.0+ MB
dataset2.total_sales = dataset2.total_sales.fillna(0)
dataset2.sales_use_coupon = dataset2.sales_use_coupon.fillna(0)
dataset2.total_coupon = dataset2.total_coupon.fillna(0)
dataset2.user_merchant_buy_total = dataset2.user_merchant_buy_total.replace(
    np.nan, 0)
dataset2.user_merchant_received = dataset2.user_merchant_received.replace(
    np.nan, 0)
dataset2.user_merchant_buy_use_coupon = dataset2.user_merchant_buy_use_coupon.replace(
    np.nan, 0)
dataset2.user_merchant_any = dataset2.user_merchant_any.replace(np.nan, 0)
dataset2.user_merchant_buy_common = dataset2.user_merchant_buy_common.replace(
    np.nan, 0)
dataset2.count_merchant = dataset2.count_merchant.fillna(0)
dataset2.buy_use_coupon = dataset2.buy_use_coupon.fillna(0)
dataset2.buy_total = dataset2.buy_total.fillna(0)
dataset2.coupon_received = dataset2.coupon_received.fillna(0)
dataset2.isnull().sum()
user_id                                              0
merchant_id                                          0
coupon_id                                            0
discount_rate                                        0
distance                                         30366
date_received                                        0
date                                                 0
day_of_week                                          0
day_of_month                                         0
days_distance                                        0
discount_man                                         0
discount_jian                                        0
is_man_jian                                          0
coupon_count                                         0
total_sales                                          0
sales_use_coupon                                     0
total_coupon                                         0
merchant_min_distance                            44822
merchant_max_distance                            44822
merchant_mean_distance                           44822
merchant_median_distance                         44822
merchant_coupon_transfer_rate                    22023
coupon_rate                                       3585
count_merchant                                       0
user_min_distance                               240695
user_max_distance                               240695
user_mean_distance                              240695
user_median_distance                            240695
buy_use_coupon                                       0
buy_total                                            0
coupon_received                                      0
avg_user_date_datereceived_gap                  238924
min_user_date_datereceived_gap                  238924
max_user_date_datereceived_gap                  238924
buy_use_coupon_rate                             149644
user_coupon_transfer_rate                       149487
user_merchant_buy_total                              0
user_merchant_received                               0
user_merchant_buy_use_coupon                         0
user_merchant_any                                    0
user_merchant_buy_common                             0
user_merchant_coupon_transfer_rate              225268
user_merchant_coupon_buy_rate                   198115
user_merchant_rate                              198115
user_merchant_common_buy_rate                   198115
this_month_user_receive_same_coupon_count            0
this_month_user_receive_all_coupon_count             0
this_month_user_receive_same_coupon_lastone          0
this_month_user_receive_same_coupon_firstone         0
this_day_user_receive_all_coupon_count               0
this_day_user_receive_same_coupon_count              0
dtype: int64
dataset2['is_weekend'] = dataset2.day_of_week.apply(
    lambda x: 1 if x in (6, 7) else 0)
weekday_dummies = pd.get_dummies(dataset2.day_of_week)
weekday_dummies.columns = ['weekday'+str(i+1)
                           for i in range(weekday_dummies.shape[1])]
dataset2 = pd.concat([dataset2, weekday_dummies], axis=1)
dataset2['label'] = dataset2.date + ':' + dataset2.date_received
dataset2.label = dataset2.label.apply(get_label)
dataset2[['date_received','date','label']].head(30)
date_received date label
0 20160528 nan 0
1 20160613 nan 0
2 20160516 20160613 0
3 20160530 nan 0
4 20160519 nan 0
5 20160606 nan 0
6 20160606 nan 0
7 20160518 nan 0
8 20160528 nan 0
9 20160524 nan 0
10 20160523 nan 0
11 20160515 20160521 1
12 20160521 nan 0
13 20160523 nan 0
14 20160602 nan 0
15 20160523 20160605 1
16 20160605 nan 0
17 20160607 nan 0
18 20160605 nan 0
19 20160601 nan 0
20 20160530 nan 0
21 20160522 nan 0
22 20160611 nan 0
23 20160524 nan 0
24 20160526 nan 0
25 20160520 nan 0
26 20160527 nan 0
27 20160602 nan 0
28 20160526 nan 0
29 20160521 nan 0
dataset2.drop(['merchant_id', 'day_of_week', 'date', 'date_received',
               'coupon_id', 'coupon_count'], axis=1, inplace=True)
dataset2.to_csv('dataset2.csv', index=None)
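Before training it is worth a glance at the class balance of the label just created; a minimal check:

# Share of positive (used within 15 days) vs. negative samples in dataset2
dataset2.label.value_counts(normalize=True)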

dataset1

coupon1 = pd.read_csv('coupon1_feature.csv')
merchant1 = pd.read_csv('merchant1_feature.csv')
user1 = pd.read_csv('user1_feature.csv')
user_merchant1 = pd.read_csv('user_merchant1.csv')
other_feature1 = pd.read_csv('other_feature1.csv')
dataset1 = pd.merge(coupon1, merchant1, on='merchant_id', how='left')
dataset1 = pd.merge(dataset1, user1, on='user_id', how='left')
dataset1 = pd.merge(dataset1, user_merchant1, on=[
                    'user_id', 'merchant_id'], how='left')
dataset1 = pd.merge(dataset1, other_feature1, on=[
                    'user_id', 'coupon_id', 'date_received'], how='left')
dataset1.shape
(139785, 51)
dataset1.total_sales = dataset1.total_sales.fillna(0)
dataset1.sales_use_coupon = dataset1.sales_use_coupon.fillna(0)
dataset1.total_coupon = dataset1.total_coupon.fillna(0)
dataset1.user_merchant_buy_total = dataset1.user_merchant_buy_total.replace(
    np.nan, 0)
dataset1.user_merchant_received = dataset1.user_merchant_received.replace(
    np.nan, 0)
dataset1.user_merchant_buy_use_coupon = dataset1.user_merchant_buy_use_coupon.replace(
    np.nan, 0)
dataset1.user_merchant_any = dataset1.user_merchant_any.replace(np.nan, 0)
dataset1.user_merchant_buy_common = dataset1.user_merchant_buy_common.replace(
    np.nan, 0)
dataset1.count_merchant = dataset1.count_merchant.fillna(0)
dataset1.buy_use_coupon = dataset1.buy_use_coupon.fillna(0)
dataset1.buy_total = dataset1.buy_total.fillna(0)
dataset1.coupon_received = dataset1.coupon_received.fillna(0)
dataset1.isnull().sum()
user_id                                              0
merchant_id                                          0
coupon_id                                            0
discount_rate                                        0
distance                                         15492
date_received                                        0
date                                            127747
day_of_week                                          0
day_of_month                                         0
days_distance                                        0
discount_man                                         0
discount_jian                                        0
is_man_jian                                          0
coupon_count                                         0
total_sales                                          0
sales_use_coupon                                     0
total_coupon                                         0
merchant_min_distance                            33627
merchant_max_distance                            33627
merchant_mean_distance                           33627
merchant_median_distance                         33627
merchant_coupon_transfer_rate                    25176
coupon_rate                                       3950
count_merchant                                       0
user_min_distance                               131176
user_max_distance                               131176
user_mean_distance                              131176
user_median_distance                            131176
buy_use_coupon                                       0
buy_total                                            0
coupon_received                                      0
avg_user_date_datereceived_gap                  130508
min_user_date_datereceived_gap                  130508
max_user_date_datereceived_gap                  130508
buy_use_coupon_rate                              84347
user_coupon_transfer_rate                        86167
user_merchant_buy_total                              0
user_merchant_received                               0
user_merchant_buy_use_coupon                         0
user_merchant_any                                    0
user_merchant_buy_common                             0
user_merchant_coupon_transfer_rate              123040
user_merchant_coupon_buy_rate                   103555
user_merchant_rate                              103555
user_merchant_common_buy_rate                   103555
this_month_user_receive_same_coupon_count            0
this_month_user_receive_all_coupon_count             0
this_month_user_receive_same_coupon_lastone          0
this_month_user_receive_same_coupon_firstone         0
this_day_user_receive_all_coupon_count               0
this_day_user_receive_same_coupon_count              0
dtype: int64
dataset1['is_weekend'] = dataset1.day_of_week.apply(
    lambda x: 1 if x in (6, 7) else 0)
weekday_dummies = pd.get_dummies(dataset1.day_of_week)
weekday_dummies.columns = ['weekday'+str(i+1)
                           for i in range(weekday_dummies.shape[1])]
dataset1 = pd.concat([dataset1, weekday_dummies], axis=1)
dataset1['date'] = dataset1['date'].apply(trans_date)
dataset1['date_received'] = dataset1['date_received'].apply(trans_date)
dataset1['label'] = dataset1.date + ':' + dataset1.date_received
dataset1.label = dataset1.label.apply(get_label)
dataset1.drop(['merchant_id', 'day_of_week', 'date', 'date_received',
               'coupon_id', 'coupon_count'], axis=1, inplace=True)
dataset1.to_csv('dataset1.csv', index=None)

Model building and prediction

# Imports
from IPython.core.interactiveshell import InteractiveShell
from jupyterthemes import jtplot
import os
from datetime import date

import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
# Magic command
%matplotlib inline
# Enable Chinese text in matplotlib
plt.rcParams['font.sans-serif'] = ['SimHei']
# Display minus signs correctly in matplotlib
plt.rcParams['axes.unicode_minus'] = False

# Do not wrap wide DataFrame output
pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
# Pick a plotting theme matching the notebook theme
jtplot.style(theme='oceans16')
# Show the output of every expression in a cell
InteractiveShell.ast_node_interactivity = "all"
dataset1 = pd.read_csv('dataset1.csv')
dataset2 = pd.read_csv('dataset2.csv')
dataset3 = pd.read_csv('dataset3.csv')
'''dataset1.shape,dataset2.shape,dataset3.shape
dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset3.drop_duplicates(inplace=True)
dataset1.shape,dataset2.shape,dataset3.shape'''
dataset12 = pd.concat([dataset1,dataset2],axis=0)
dataset1_y = dataset1.label
# 'day_gap_before','day_gap_after' cause overfitting, 0.77
dataset1_x = dataset1.drop(
    ['user_id', 'label'], axis=1)
dataset2_y = dataset2.label
dataset2_x = dataset2.drop(
    ['user_id', 'label'], axis=1)
dataset12_y = dataset12.label
dataset12_x = dataset12.drop(
    ['user_id', 'label'], axis=1)
dataset3_preds = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset3_x = dataset3.drop(
    ['user_id', 'coupon_id', 'date_received'], axis=1)

dataset1_x.shape, dataset2_x.shape, dataset3_x.shape
((139785, 52), (262240, 52), (116204, 52))
dataset1 = xgb.DMatrix(dataset1_x,label=dataset1_y)
dataset2 = xgb.DMatrix(dataset2_x,label=dataset2_y)
dataset12 = xgb.DMatrix(dataset12_x,label=dataset12_y)
dataset3 = xgb.DMatrix(dataset3_x)
params = {'booster': 'gbtree',
          'objective': 'rank:pairwise',
          'eval_metric': 'auc',
          'gamma': 0.1,
          'min_child_weight': 1.1,
          'max_depth': 5,
          'lambda': 10,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'colsample_bylevel': 0.7,
          'eta': 0.01,
          'tree_method': 'exact',
          'seed': 0,
          'nthread': 12
          }
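Note that rank:pairwise optimizes a ranking objective, so the raw predictions are uncalibrated scores rather than probabilities, which is why they are min-max scaled before writing the submission below. If calibrated probabilities were preferred, the objective could be swapped; a hypothetical variant:

# Hypothetical alternative: probabilistic objective instead of pairwise ranking
params_logistic = dict(params, objective='binary:logistic')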

# Alternative setup with early stopping: train on dataset1, evaluate on dataset2
# watchlist = [(dataset1,'train'),(dataset2,'val')]
# model = xgb.train(params,dataset1,num_boost_round=3000,evals=watchlist,early_stopping_rounds=300)
watchlist = [(dataset12,'train')]
model = xgb.train(params,dataset12,num_boost_round=3500,evals=watchlist)
# predict test set
dataset3_preds['label'] = model.predict(dataset3)
dataset3_preds.label = MinMaxScaler().fit_transform(
    dataset3_preds.label.values.reshape(-1, 1)).ravel()
dataset3_preds.sort_values(by=['coupon_id', 'label'], inplace=True)
dataset3_preds.to_csv("xgb_preds.csv", index=None, header=None)
dataset3_preds.describe()
user_id coupon_id date_received label
count 1.162040e+05 116204.000000 1.162040e+05 116204.000000
mean 3.678026e+06 9014.669702 2.016072e+07 0.310783
std 2.122548e+06 4136.488170 9.016543e+00 0.129064
min 2.090000e+02 3.000000 2.016070e+07 0.000000
25% 1.858042e+06 5143.000000 2.016071e+07 0.229695
50% 3.666466e+06 9983.000000 2.016072e+07 0.290300
75% 5.516537e+06 13602.000000 2.016072e+07 0.373554
max 7.361024e+06 14045.000000 2.016073e+07 1.000000
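The competition metric is the AUC averaged over coupon_id, skipping coupons whose labels are all of one class. It can be estimated offline by training on dataset1 and scoring dataset2, provided coupon_id is kept in the validation frame; a minimal sketch (val_preds is a hypothetical DataFrame with columns coupon_id, label and pred):

from sklearn.metrics import roc_auc_score

def coupon_avg_auc(val_preds):
    # Average AUC per coupon; coupons with a single label class are excluded
    aucs = []
    for _, group in val_preds.groupby('coupon_id'):
        if group.label.nunique() == 2:
            aucs.append(roc_auc_score(group.label, group.pred))
    return np.mean(aucs)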