准备工作
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from datetime import date
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
from jupyterthemes import jtplot
jtplot.style(theme='onedork')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Load the three competition tables and give them uniform snake_case column
# names.  `date_received` / `date` are kept as raw yyyymmdd numbers (not
# parsed to datetime), so later window filters compare against integer
# literals such as 20160315.
off_train = pd.read_csv('ccf_offline_stage1_train.csv')
off_train.columns = ['user_id', 'merchant_id', 'coupon_id',
                     'discount_rate', 'distance', 'date_received', 'date']
off_test = pd.read_csv('ccf_offline_stage1_test_revised.csv')
off_test.columns = ['user_id', 'merchant_id', 'coupon_id',
                    'discount_rate', 'distance', 'date_received']
on_train = pd.read_csv('ccf_online_stage1_train.csv')
on_train.columns = ['user_id', 'merchant_id', 'action',
                    'coupon_id', 'discount_rate', 'date_received', 'date']
term |
来源 |
内容 |
dataset3 |
table3,off_test |
off_test数据 |
dataset2 |
table2,off_train |
领券日期在20160515-20160615之间的 |
dataset1 |
table2,off_train |
领券日期在20160414-20160514的 |
feature3 |
table2,off_train |
消费日期在20160315-20160630的,或领券日期在20160315-20160630但没有消费的 |
feature2 |
table2,off_train |
消费日期在20160201-20160514的,或领券日期在20160201-20160514但没有消费的 |
feature1 |
table2,off_train |
消费日期在20160101-20160413的,或领券日期在20160101-20160413但没有消费的 |
项目 |
预测区间(提取label) |
特征区间(提取feature) |
- |
领券了的 |
消费了的+领券了没消费的 |
测试集 |
dataset3 |
feature3 |
训练集1 |
dataset2 |
feature2 |
训练集2 |
dataset1 |
feature1 |
这里找NaN空值用isnull(),不能用=='null'或者==np.nan之类的。
读取数据集时没有把两个时间列的所有元素转换为datetime格式,不等号右边就用数字20160315这样的就行,如果转换了,不等号右边得用pd.to_datetime转换
# Sliding-window split: each "dataset" holds the label rows (coupons received
# in the prediction window) and the matching "feature" window precedes it.
# A row belongs to a feature window if it was consumed in that window, or was
# received in it but never consumed (date is NaN).  Dates are raw yyyymmdd
# numbers, so plain integer comparisons work; NaN fails every comparison and
# is only picked up through the explicit isnull() branch.
dataset3 = off_test
feature3 = off_train[((off_train.date >= 20160315) & (off_train.date <= 20160630)) | (
    (off_train.date.isnull()) & (off_train.date_received >= 20160315) & (off_train.date_received <= 20160630))]
dataset2 = off_train[(off_train.date_received >= 20160515)
                     & (off_train.date_received <= 20160615)]
feature2 = off_train[(off_train.date >= 20160201) & (off_train.date <= 20160514) | (
    (off_train.date.isnull()) & (off_train.date_received >= 20160201) & (off_train.date_received <= 20160514))]
dataset1 = off_train[(off_train.date_received >= 20160414)
                     & (off_train.date_received <= 20160514)]
feature1 = off_train[(off_train.date >= 20160101) & (off_train.date <= 20160413) | (
    (off_train.date.isnull()) & (off_train.date_received >= 20160101) & (off_train.date_received <= 20160413))]
其他特征
column |
含义 |
this_month_user_receive_all_coupon_count |
用户领取优惠券的总次数 |
this_month_user_receive_same_coupon_count |
用户领取该行中的优惠券的次数 |
this_month_user_receive_same_coupon_firstone |
用户领取某张优惠券(使用次数不止1次)的首次领取时间 |
this_month_user_receive_same_coupon_lastone |
用户领取某张优惠券(使用次数不止1次)的末次领取时间 |
this_day_user_receive_all_coupon_count |
用户领券当天领券总数量 |
this_day_user_receive_same_coupon_count |
用户领券当天领取该优惠券的数量 |
dataset3
# this_month_user_receive_all_coupon_count: total number of coupons each user
# received in the window (constant-1 column counted per user_id group).
t = dataset3[['user_id']].copy()
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby(['user_id'], as_index=False).count()
t.head(2)
|
user_id |
this_month_user_receive_all_coupon_count |
0 |
209 |
2 |
1 |
215 |
1 |
# this_month_user_receive_same_coupon_count: how many times each user received
# each specific coupon in the window.
t1 = dataset3[['user_id', 'coupon_id']].copy()
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['user_id', 'coupon_id'], as_index=False).count()
t1.head(2)
|
user_id |
coupon_id |
this_month_user_receive_same_coupon_count |
0 |
209 |
825 |
1 |
1 |
209 |
7557 |
1 |
# For (user, coupon) pairs received more than once, record the earliest and
# latest reception dates: all dates per group are first joined into one
# ':'-separated string, then split back to ints to take min/max.
t2 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
t2.date_received = t2.date_received.astype('str')
t2 = t2.groupby(['user_id', 'coupon_id'], as_index=False)[
    'date_received'].agg(lambda x: ':'.join(x))
t2.iloc[5:7]
t2['receive_number'] = t2.date_received.apply(lambda s: len(s.split(':')))
t2 = t2[t2.receive_number > 1]  # keep only coupons received at least twice
t2['max_date_received'] = t2.date_received.apply(
    lambda s: max([int(d) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(
    lambda s: min([int(d) for d in s.split(':')]))
t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]
t2.head(2)
|
user_id |
coupon_id |
date_received |
5 |
432 |
10438 |
20160706 |
6 |
448 |
10927 |
20160710:20160707:20160706:20160704 |
|
user_id |
coupon_id |
max_date_received |
min_date_received |
6 |
448 |
10927 |
20160710 |
20160704 |
13 |
736 |
3686 |
20160724 |
20160713 |
# Signed day-number gaps between this reception and the latest/earliest
# reception of the same coupon by the same user.  The left join leaves NaN
# for coupons received only once, which the categorizer below maps to -1.
t3 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
t3 = pd.merge(t3, t2, how='left', on=['user_id', 'coupon_id'])
t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - \
    t3.date_received
t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received - \
    t3.min_date_received
def is_firstlastone(x):
    """Categorize a reception-date gap.

    Returns 1 when the gap is exactly zero (this row is the first/last
    reception of the coupon), 0 for a positive gap, and -1 otherwise.
    NaN gaps (coupon received only once) fail both comparisons and fall
    through to -1.
    """
    if x > 0:
        return 0
    return 1 if x == 0 else -1
# Collapse the signed gaps to categorical flags:
# 1 = this row is the last (resp. first) reception, 0 = it is not,
# -1 = the coupon was received only once.
t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(
    is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(
    is_firstlastone)
t3 = t3[['user_id', 'coupon_id', 'date_received', 'this_month_user_receive_same_coupon_lastone',
         'this_month_user_receive_same_coupon_firstone']]
t3.head(2)
|
user_id |
coupon_id |
date_received |
this_month_user_receive_same_coupon_lastone |
this_month_user_receive_same_coupon_firstone |
0 |
4129537 |
9983 |
20160712 |
-1 |
-1 |
1 |
6949378 |
3429 |
20160706 |
-1 |
-1 |
# this_day_user_receive_all_coupon_count: coupons received by the user on
# that particular day.
t4 = dataset3[['user_id', 'date_received']].copy()
t4['this_day_user_receive_all_coupon_count'] = 1
t4 = t4.groupby(['user_id', 'date_received'], as_index=False).count()
t4.head(2)
|
user_id |
date_received |
this_day_user_receive_all_coupon_count |
0 |
209 |
20160721 |
2 |
1 |
215 |
20160703 |
1 |
# this_day_user_receive_same_coupon_count: copies of this specific coupon the
# user received on that day.
t5 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
t5['this_day_user_receive_same_coupon_count'] = 1
t5 = t5.groupby(['user_id', 'coupon_id', 'date_received'],
                as_index=False).count()
t5.head(2)
|
user_id |
coupon_id |
date_received |
this_day_user_receive_same_coupon_count |
0 |
209 |
825 |
20160721 |
1 |
1 |
209 |
7557 |
20160721 |
1 |
'''t6 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
t6.date_received = t6.date_received.astype('str')
t6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[
'date_received'].agg(lambda x: ':'.join(x))
t6.rename(columns={'date_received': 'dates'}, inplace=True)
t6.head(2)'''
"t6 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()\nt6.date_received = t6.date_received.astype('str')\nt6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[\n 'date_received'].agg(lambda x: ':'.join(x))\nt6.rename(columns={'date_received': 'dates'}, inplace=True)\nt6.head(2)"
'''# 前、后一张券的时间间隔
def get_day_gap_before(s):
date_received, dates = s.split('-')
dates = dates.split(':')
gaps = []
for d in dates:
this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(
date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
if this_gap > 0:
gaps.append(this_gap)
if len(gaps) == 0:
return -1
else:
return min(gaps)
def get_day_gap_after(s):
date_received, dates = s.split('-')
dates = dates.split(':')
gaps = []
for d in dates:
this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(
int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days
if this_gap > 0:
gaps.append(this_gap)
if len(gaps) == 0:
return -1
else:
return min(gaps)
t7 = dataset3[['user_id', 'coupon_id', 'date_received']]
t7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')
t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id', 'coupon_id', 'date_received',
'day_gap_before', 'day_gap_after']]
t7.head()'''
"# 前、后一张券的时间间隔\ndef get_day_gap_before(s):\n date_received, dates = s.split('-')\n dates = dates.split(':')\n gaps = []\n for d in dates:\n this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(\n date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days\n if this_gap > 0:\n gaps.append(this_gap)\n if len(gaps) == 0:\n return -1\n else:\n return min(gaps)\n\n\ndef get_day_gap_after(s):\n date_received, dates = s.split('-')\n dates = dates.split(':')\n gaps = []\n for d in dates:\n this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(\n int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days\n if this_gap > 0:\n gaps.append(this_gap)\n if len(gaps) == 0:\n return -1\n else:\n return min(gaps)\n\n\nt7 = dataset3[['user_id', 'coupon_id', 'date_received']]\nt7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')\nt7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates\nt7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)\nt7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)\nt7 = t7[['user_id', 'coupon_id', 'date_received',\n 'day_gap_before', 'day_gap_after']]\nt7.head()"
# Assemble the per-reception feature table for dataset3: start from the
# (user, coupon) counts and successively join the user totals, first/last
# flags, and per-day counts on their natural keys (inner joins; every key
# combination exists in all parts by construction).
other_feature3 = pd.merge(t1, t, on='user_id')
other_feature3 = pd.merge(other_feature3, t3, on=['user_id', 'coupon_id'])
other_feature3 = pd.merge(other_feature3, t4, on=['user_id', 'date_received'])
other_feature3 = pd.merge(other_feature3, t5, on=[
    'user_id', 'coupon_id', 'date_received'])
other_feature3.to_csv('other_feature3.csv', index=None)
other_feature3.shape
(113640, 9)
dataset2
# Same per-user reception total as for dataset3, now on the dataset2 window.
t = dataset2[['user_id']].copy()
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('user_id', as_index=False).count()
t.head(2)
|
user_id |
this_month_user_receive_all_coupon_count |
0 |
4 |
1 |
1 |
165 |
1 |
# Per (user, coupon) reception count on the dataset2 window.
t1 = dataset2[['user_id', 'coupon_id']].copy()
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['user_id', 'coupon_id'], as_index=False).count()
t1.head(2)
|
user_id |
coupon_id |
this_month_user_receive_same_coupon_count |
0 |
4 |
2902.0 |
1 |
1 |
165 |
7571.0 |
1 |
# Earliest/latest reception dates per (user, coupon) pair on dataset2.
# Unlike off_test, date_received here is float-typed (train slice, e.g.
# 20160528.0), so it is cast through int before str to get clean yyyymmdd.
t2 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t2.date_received = t2.date_received.astype('int').astype('str')
t2 = t2.groupby(['user_id', 'coupon_id'], as_index=False)[
    'date_received'].agg(lambda x: ':'.join(x))
t2['receive_number'] = t2.date_received.apply(lambda s: len(s.split(':')))
t2 = t2[t2.receive_number > 1]  # keep only coupons received at least twice
t2['max_date_received'] = t2.date_received.apply(
    lambda s: max([int(d) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(
    lambda s: min([int(d) for d in s.split(':')]))
t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]
t2.head(2)
|
user_id |
coupon_id |
max_date_received |
min_date_received |
18 |
696 |
7571.0 |
20160524 |
20160520 |
31 |
947 |
12414.0 |
20160610 |
20160609 |
# Signed gaps to the latest/earliest reception of the same coupon, then the
# same categorization as for dataset3 (date_received is float here, hence
# the astype('int') before subtracting).
t3 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t3 = pd.merge(t3, t2, on=['user_id', 'coupon_id'], how='left')
t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - \
    t3.date_received.astype('int')
t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype(
    'int') - t3.min_date_received
# 1 = gap of zero (first/last reception), 0 = positive gap, -1 = otherwise
# (including NaN, whose comparisons are all False).
def is_firstlastone(x):
    if x == 0:
        return 1
    elif x > 0:
        return 0
    else:
        return -1
t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(
    is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(
    is_firstlastone)
t3 = t3[['user_id', 'coupon_id', 'date_received', 'this_month_user_receive_same_coupon_lastone',
         'this_month_user_receive_same_coupon_firstone']]
t3.head(2)
|
user_id |
coupon_id |
date_received |
this_month_user_receive_same_coupon_lastone |
this_month_user_receive_same_coupon_firstone |
0 |
1439408 |
11002.0 |
20160528.0 |
-1 |
-1 |
1 |
1439408 |
8591.0 |
20160613.0 |
1 |
0 |
# Coupons received by the user on each particular day (dataset2 window).
t4 = dataset2[['user_id', 'date_received']].copy()
t4['this_day_user_receive_all_coupon_count'] = 1
t4 = t4.groupby(['user_id', 'date_received'], as_index=False).count()
t4.head(2)
|
user_id |
date_received |
this_day_user_receive_all_coupon_count |
0 |
4 |
20160607.0 |
1 |
1 |
165 |
20160525.0 |
1 |
# Copies of this specific coupon received by the user on that day.
t5 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t5['this_day_user_receive_same_coupon_count'] = 1
t5 = t5.groupby(['user_id', 'coupon_id', 'date_received'],
                as_index=False).count()
t5.head(2)
|
user_id |
coupon_id |
date_received |
this_day_user_receive_same_coupon_count |
0 |
4 |
2902.0 |
20160607.0 |
1 |
1 |
165 |
7571.0 |
20160525.0 |
1 |
'''t6 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t6.date_received = t6.date_received.astype('int').astype('str')
t6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[
'date_received'].agg(lambda x: ':'.join(x))
t6.rename(columns={'date_received': 'dates'}, inplace=True)
t6.head(2)'''
"t6 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()\nt6.date_received = t6.date_received.astype('int').astype('str')\nt6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[\n 'date_received'].agg(lambda x: ':'.join(x))\nt6.rename(columns={'date_received': 'dates'}, inplace=True)\nt6.head(2)"
'''def get_day_gap_before(s):
date_received, dates = s.split('-')
dates = dates.split(':')
gaps = []
for d in dates:
this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(
date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
if this_gap > 0:
gaps.append(this_gap)
if len(gaps) == 0:
return -1
else:
return min(gaps)
def get_day_gap_after(s):
date_received, dates = s.split('-')
dates = dates.split(':')
gaps = []
for d in dates:
this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(
int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days
if this_gap > 0:
gaps.append(this_gap)
if len(gaps) == 0:
return -1
else:
return min(gaps)
t7 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')
t7['date_received_date'] = t7.date_received.astype(
'int').astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id', 'coupon_id', 'date_received',
'day_gap_before', 'day_gap_after']]
t7.head(2)'''
"def get_day_gap_before(s):\n date_received, dates = s.split('-')\n dates = dates.split(':')\n gaps = []\n for d in dates:\n this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(\n date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days\n if this_gap > 0:\n gaps.append(this_gap)\n if len(gaps) == 0:\n return -1\n else:\n return min(gaps)\n\n\ndef get_day_gap_after(s):\n date_received, dates = s.split('-')\n dates = dates.split(':')\n gaps = []\n for d in dates:\n this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(\n int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days\n if this_gap > 0:\n gaps.append(this_gap)\n if len(gaps) == 0:\n return -1\n else:\n return min(gaps)\n\n\nt7 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()\nt7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')\nt7['date_received_date'] = t7.date_received.astype(\n 'int').astype('str') + '-' + t7.dates\nt7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)\nt7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)\nt7 = t7[['user_id', 'coupon_id', 'date_received',\n 'day_gap_before', 'day_gap_after']]\nt7.head(2)"
# Assemble the dataset2 per-reception feature table by joining the partial
# tables on their natural keys, mirroring the dataset3 assembly.
other_feature2 = pd.merge(t1, t, on='user_id')
other_feature2 = pd.merge(other_feature2, t3, on=['user_id', 'coupon_id'])
other_feature2 = pd.merge(other_feature2, t4, on=['user_id', 'date_received'])
other_feature2 = pd.merge(other_feature2, t5, on=[
    'user_id', 'coupon_id', 'date_received'])
other_feature2.to_csv('other_feature2.csv', index=None)
other_feature2.head()
|
user_id |
coupon_id |
this_month_user_receive_same_coupon_count |
this_month_user_receive_all_coupon_count |
date_received |
this_month_user_receive_same_coupon_lastone |
this_month_user_receive_same_coupon_firstone |
this_day_user_receive_all_coupon_count |
this_day_user_receive_same_coupon_count |
0 |
4 |
2902.0 |
1 |
1 |
20160607.0 |
-1 |
-1 |
1 |
1 |
1 |
165 |
7571.0 |
1 |
1 |
20160525.0 |
-1 |
-1 |
1 |
1 |
2 |
166 |
9261.0 |
1 |
1 |
20160525.0 |
-1 |
-1 |
1 |
1 |
3 |
215 |
8944.0 |
1 |
1 |
20160524.0 |
-1 |
-1 |
1 |
1 |
4 |
236 |
11002.0 |
1 |
1 |
20160528.0 |
-1 |
-1 |
1 |
1 |
other_feature2.shape
(258446, 9)
dataset1.head()
|
user_id |
merchant_id |
coupon_id |
discount_rate |
distance |
date_received |
date |
7 |
1832624 |
3381 |
7610.0 |
200:20 |
0.0 |
20160429.0 |
NaN |
18 |
163606 |
1569 |
5054.0 |
200:30 |
10.0 |
20160421.0 |
NaN |
43 |
4061024 |
3381 |
7610.0 |
200:20 |
10.0 |
20160426.0 |
NaN |
52 |
106443 |
450 |
3732.0 |
30:5 |
NaN |
20160429.0 |
NaN |
96 |
114747 |
1569 |
5054.0 |
200:30 |
9.0 |
20160426.0 |
NaN |
# Per-user reception total on the dataset1 window.
t = dataset1[['user_id']].copy()
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('user_id', as_index=False).count()
t.head(2)
|
user_id |
this_month_user_receive_all_coupon_count |
0 |
173 |
1 |
1 |
285 |
1 |
# Per (user, coupon) reception count on the dataset1 window.
t1 = dataset1[['user_id', 'coupon_id']].copy()
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['user_id', 'coupon_id'], as_index=False).count()
t1.head(2)
|
user_id |
coupon_id |
this_month_user_receive_same_coupon_count |
0 |
173 |
7610.0 |
1 |
1 |
285 |
1532.0 |
1 |
# Earliest/latest reception dates per (user, coupon) on dataset1; float
# date_received cast through int before str for clean yyyymmdd strings.
t2 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t2.date_received = t2.date_received.astype('int').astype('str')
t2 = t2.groupby(['user_id', 'coupon_id'], as_index=False)[
    'date_received'].agg(lambda x: ':'.join(x))
t2['receive_number'] = t2.date_received.apply(lambda s: len(s.split(':')))
t2 = t2[t2.receive_number > 1]  # keep only coupons received at least twice
t2['max_date_received'] = t2.date_received.apply(
    lambda s: max([int(d) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(
    lambda s: min([int(d) for d in s.split(':')]))
t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]
t2.head(2)
|
user_id |
coupon_id |
max_date_received |
min_date_received |
38 |
2227 |
7088.0 |
20160511 |
20160426 |
147 |
8397 |
8556.0 |
20160506 |
20160425 |
# Signed gaps to the latest/earliest reception of the same coupon, then the
# same flag categorization as the other windows.
t3 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t3 = pd.merge(t3, t2, on=['user_id', 'coupon_id'], how='left')
t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - \
    t3.date_received.astype('int')
t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype(
    'int') - t3.min_date_received
# 1 = gap of zero (first/last reception), 0 = positive gap, -1 = otherwise
# (including NaN, whose comparisons are all False).
def is_firstlastone(x):
    if x == 0:
        return 1
    elif x > 0:
        return 0
    else:
        return -1
t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(
    is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(
    is_firstlastone)
t3 = t3[['user_id', 'coupon_id', 'date_received', 'this_month_user_receive_same_coupon_lastone',
         'this_month_user_receive_same_coupon_firstone']]
t3.head(2)
|
user_id |
coupon_id |
date_received |
this_month_user_receive_same_coupon_lastone |
this_month_user_receive_same_coupon_firstone |
0 |
1832624 |
7610.0 |
20160429.0 |
-1 |
-1 |
1 |
163606 |
5054.0 |
20160421.0 |
-1 |
-1 |
# Coupons received by the user on each particular day (dataset1 window).
t4 = dataset1[['user_id', 'date_received']].copy()
t4['this_day_user_receive_all_coupon_count'] = 1
t4 = t4.groupby(['user_id', 'date_received'], as_index=False).count()
t4.head(2)
|
user_id |
date_received |
this_day_user_receive_all_coupon_count |
0 |
173 |
20160414.0 |
1 |
1 |
285 |
20160501.0 |
1 |
# Copies of this specific coupon received by the user on that day.
t5 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t5['this_day_user_receive_same_coupon_count'] = 1
t5 = t5.groupby(['user_id', 'coupon_id', 'date_received'],
                as_index=False).count()
t5.head(2)
|
user_id |
coupon_id |
date_received |
this_day_user_receive_same_coupon_count |
0 |
173 |
7610.0 |
20160414.0 |
1 |
1 |
285 |
1532.0 |
20160501.0 |
1 |
'''t6 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t6.date_received = t6.date_received.astype('int').astype('str')
t6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[
'date_received'].agg(lambda x: ':'.join(x))
t6.rename(columns={'date_received': 'dates'}, inplace=True)
t6.head(2)'''
"t6 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()\nt6.date_received = t6.date_received.astype('int').astype('str')\nt6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[\n 'date_received'].agg(lambda x: ':'.join(x))\nt6.rename(columns={'date_received': 'dates'}, inplace=True)\nt6.head(2)"
'''def get_day_gap_before(s):
date_received, dates = s.split('-')
dates = dates.split(':')
gaps = []
for d in dates:
this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(
date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
if this_gap > 0:
gaps.append(this_gap)
if len(gaps) == 0:
return -1
else:
return min(gaps)
def get_day_gap_after(s):
date_received, dates = s.split('-')
dates = dates.split(':')
gaps = []
for d in dates:
this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(
int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days
if this_gap > 0:
gaps.append(this_gap)
if len(gaps) == 0:
return -1
else:
return min(gaps)
t7 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')
t7['date_received_date'] = t7.date_received.astype(
'int').astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id', 'coupon_id', 'date_received',
'day_gap_before', 'day_gap_after']]
t7.head(2)'''
"def get_day_gap_before(s):\n date_received, dates = s.split('-')\n dates = dates.split(':')\n gaps = []\n for d in dates:\n this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(\n date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days\n if this_gap > 0:\n gaps.append(this_gap)\n if len(gaps) == 0:\n return -1\n else:\n return min(gaps)\n\n\ndef get_day_gap_after(s):\n date_received, dates = s.split('-')\n dates = dates.split(':')\n gaps = []\n for d in dates:\n this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(\n int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days\n if this_gap > 0:\n gaps.append(this_gap)\n if len(gaps) == 0:\n return -1\n else:\n return min(gaps)\n\n\nt7 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()\nt7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')\nt7['date_received_date'] = t7.date_received.astype(\n 'int').astype('str') + '-' + t7.dates\nt7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)\nt7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)\nt7 = t7[['user_id', 'coupon_id', 'date_received',\n 'day_gap_before', 'day_gap_after']]\nt7.head(2)"
# Assemble the dataset1 per-reception feature table by joining the partial
# tables on their natural keys, mirroring the other windows.
other_feature1 = pd.merge(t1, t, on='user_id')
other_feature1 = pd.merge(other_feature1, t3, on=['user_id', 'coupon_id'])
other_feature1 = pd.merge(other_feature1, t4, on=['user_id', 'date_received'])
other_feature1 = pd.merge(other_feature1, t5, on=[
    'user_id', 'coupon_id', 'date_received'])
other_feature1.to_csv('other_feature1.csv', index=None)
other_feature1.head()
|
user_id |
coupon_id |
this_month_user_receive_same_coupon_count |
this_month_user_receive_all_coupon_count |
date_received |
this_month_user_receive_same_coupon_lastone |
this_month_user_receive_same_coupon_firstone |
this_day_user_receive_all_coupon_count |
this_day_user_receive_same_coupon_count |
0 |
173 |
7610.0 |
1 |
1 |
20160414.0 |
-1 |
-1 |
1 |
1 |
1 |
285 |
1532.0 |
1 |
1 |
20160501.0 |
-1 |
-1 |
1 |
1 |
2 |
316 |
8952.0 |
1 |
1 |
20160430.0 |
-1 |
-1 |
1 |
1 |
3 |
377 |
2857.0 |
1 |
1 |
20160512.0 |
-1 |
-1 |
1 |
1 |
4 |
387 |
7610.0 |
1 |
1 |
20160421.0 |
-1 |
-1 |
1 |
1 |
other_feature1.shape
(137167, 9)
coupon相关特征
column |
含义 |
discount_rate |
优惠券打折率,“满x减y”或0.9这样的形式统一折算为0.9这样的折扣率 |
distance |
到店距离 |
day_of_week |
周几 |
day_of_month |
几号 |
days_distance |
领券时间距6月30号距离(dataset2和dataset1作相应变化) |
this_day_user_receive_same_coupon_count |
用户领券当天领取该优惠券的数量 |
discount_man |
“满减”折扣中满多少 |
discount_jian |
“满减”折扣中减多少 |
is_man_jian |
优惠券是不是满减形式 |
coupon_count |
优惠券出现次数 |
def calc_discount_rate(s):
    """Normalize a discount description to a discount rate.

    A full-reduction coupon '200:20' (spend 200, save 20) becomes
    1 - 20/200 = 0.9; a plain rate such as '0.95' (or the number 0.95)
    is returned as a float unchanged.
    """
    parts = str(s).split(':')
    if len(parts) == 1:
        return float(parts[0])
    return 1.0 - float(parts[1]) / float(parts[0])
def get_discount_man(s):
    """Return the spend threshold of a full-reduction coupon
    ('200:20' -> 200); 0 for plain-rate coupons."""
    text = str(s)
    return int(text.split(':')[0]) if ':' in text else 0
def get_discount_jian(s):
    """Return the discount amount of a full-reduction coupon
    ('200:20' -> 20); 0 for plain-rate coupons."""
    text = str(s)
    return int(text.split(':')[1]) if ':' in text else 0
def is_man_jian(s):
    """Return 1 if the discount is of the full-reduction 'man:jian' form
    (e.g. '200:20'), else 0."""
    return 1 if ':' in str(s) else 0
dataset3
# Coupon-context features for dataset3 (the test set): calendar position of
# the reception date plus the decomposed discount.  date_received is int-like
# here, cast to str for yyyymmdd slicing.
# NOTE(review): dataset3 aliases off_test (dataset3 = off_test above), so
# these assignments also add columns to off_test — confirm that is intended.
dataset3['day_of_week'] = dataset3.date_received.astype('int').astype('str').apply(
    lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday()+1)
dataset3['day_of_month'] = dataset3.date_received.astype('int').astype(
    'str').apply(lambda x: int(x[6:8]))
# days since 2016-06-30, the end of feature window 3
dataset3['days_distance'] = dataset3.date_received.astype('int').astype('str').apply(
    lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))-date(2016, 6, 30)).days)
dataset3['discount_man'] = dataset3.discount_rate.apply(get_discount_man)
dataset3['discount_jian'] = dataset3.discount_rate.apply(get_discount_jian)
dataset3['is_man_jian'] = dataset3.discount_rate.apply(is_man_jian)
# normalize the rate last, after man/jian were parsed from the raw string
dataset3['discount_rate'] = dataset3.discount_rate.apply(calc_discount_rate)
dataset3.head()
|
user_id |
merchant_id |
coupon_id |
discount_rate |
distance |
date_received |
day_of_week |
day_of_month |
days_distance |
discount_man |
discount_jian |
is_man_jian |
0 |
4129537 |
450 |
9983 |
0.833333 |
1.0 |
20160712 |
2 |
12 |
12 |
30 |
5 |
1 |
1 |
6949378 |
1300 |
3429 |
0.833333 |
NaN |
20160706 |
3 |
6 |
6 |
30 |
5 |
1 |
2 |
2166529 |
7113 |
6928 |
0.900000 |
5.0 |
20160727 |
3 |
27 |
27 |
200 |
20 |
1 |
3 |
2166529 |
7113 |
1808 |
0.900000 |
5.0 |
20160727 |
3 |
27 |
27 |
100 |
10 |
1 |
4 |
6172162 |
7605 |
6500 |
0.966667 |
2.0 |
20160708 |
5 |
8 |
8 |
30 |
1 |
1 |
# coupon_count: popularity of each coupon (times it appears in dataset3).
d = dataset3[['coupon_id']].copy()
d['coupon_count'] = 1
d = d.groupby('coupon_id', as_index=False).count()
d.head(2)
|
coupon_id |
coupon_count |
0 |
3 |
16 |
1 |
7 |
1 |
# Attach per-coupon popularity and persist the dataset3 coupon features
# (left join keeps every reception row).
dataset3 = pd.merge(dataset3, d, on='coupon_id', how='left')
dataset3.to_csv('coupon3_feature.csv', index=None)
dataset3.head()
|
user_id |
merchant_id |
coupon_id |
discount_rate |
distance |
date_received |
day_of_week |
day_of_month |
days_distance |
discount_man |
discount_jian |
is_man_jian |
coupon_count |
0 |
4129537 |
450 |
9983 |
0.833333 |
1.0 |
20160712 |
2 |
12 |
12 |
30 |
5 |
1 |
11586 |
1 |
6949378 |
1300 |
3429 |
0.833333 |
NaN |
20160706 |
3 |
6 |
6 |
30 |
5 |
1 |
4887 |
2 |
2166529 |
7113 |
6928 |
0.900000 |
5.0 |
20160727 |
3 |
27 |
27 |
200 |
20 |
1 |
205 |
3 |
2166529 |
7113 |
1808 |
0.900000 |
5.0 |
20160727 |
3 |
27 |
27 |
100 |
10 |
1 |
205 |
4 |
6172162 |
7605 |
6500 |
0.966667 |
2.0 |
20160708 |
5 |
8 |
8 |
30 |
1 |
1 |
132 |
dataset2.head()
|
user_id |
merchant_id |
coupon_id |
discount_rate |
distance |
date_received |
date |
1 |
1439408 |
4663 |
11002.0 |
150:20 |
1.0 |
20160528.0 |
NaN |
4 |
1439408 |
2632 |
8591.0 |
20:1 |
0.0 |
20160613.0 |
NaN |
6 |
1439408 |
2632 |
8591.0 |
20:1 |
0.0 |
20160516.0 |
20160613.0 |
9 |
2029232 |
450 |
1532.0 |
30:5 |
0.0 |
20160530.0 |
NaN |
10 |
2029232 |
6459 |
12737.0 |
20:1 |
0.0 |
20160519.0 |
NaN |
dataset2
# Coupon-context features for dataset2.  dataset2 is a row-slice of
# off_train, so materialize an explicit copy before assigning new columns:
# the original assigned straight into the slice, which raised the
# SettingWithCopyWarning spam seen in the notebook output and made the
# writes ambiguous (possibly propagating into off_train).
dataset2 = dataset2.copy()
# date_received is float-typed here (e.g. 20160528.0); str slicing still
# yields yyyy / mm / dd at [0:4], [4:6], [6:8].
dataset2['day_of_week'] = dataset2['date_received'].astype('str').apply(
    lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday()+1)
dataset2['day_of_month'] = dataset2.date_received.astype(
    'str').apply(lambda x: int(x[6:8]))
# days since 2016-05-14, the end of feature window 2
dataset2['days_distance'] = dataset2.date_received.astype('str').apply(
    lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))-date(2016, 5, 14)).days)
dataset2['discount_man'] = dataset2.discount_rate.apply(get_discount_man)
dataset2['discount_jian'] = dataset2.discount_rate.apply(get_discount_jian)
dataset2['is_man_jian'] = dataset2.discount_rate.apply(is_man_jian)
# normalize the rate last, after man/jian were parsed from the raw string
dataset2['discount_rate'] = dataset2.discount_rate.apply(calc_discount_rate)
dataset2.head()
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import sys
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:9: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
if __name__ == '__main__':
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:10: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Remove the CWD from sys.path while we load stuff.
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:11: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# This is added back by InteractiveShellApp.init_path()
|
user_id |
merchant_id |
coupon_id |
discount_rate |
distance |
date_received |
date |
day_of_week |
day_of_month |
days_distance |
discount_man |
discount_jian |
is_man_jian |
1 |
1439408 |
4663 |
11002.0 |
0.866667 |
1.0 |
20160528.0 |
NaN |
6 |
28 |
14 |
150 |
20 |
1 |
4 |
1439408 |
2632 |
8591.0 |
0.950000 |
0.0 |
20160613.0 |
NaN |
1 |
13 |
30 |
20 |
1 |
1 |
6 |
1439408 |
2632 |
8591.0 |
0.950000 |
0.0 |
20160516.0 |
20160613.0 |
1 |
16 |
2 |
20 |
1 |
1 |
9 |
2029232 |
450 |
1532.0 |
0.833333 |
0.0 |
20160530.0 |
NaN |
1 |
30 |
16 |
30 |
5 |
1 |
10 |
2029232 |
6459 |
12737.0 |
0.950000 |
0.0 |
20160519.0 |
NaN |
4 |
19 |
5 |
20 |
1 |
1 |
# coupon_count: popularity of each coupon (times it appears in dataset2).
d = dataset2[['coupon_id']].copy()
d['coupon_count'] = 1
d = d.groupby('coupon_id', as_index=False).count()
d.head(2)
|
coupon_id |
coupon_count |
0 |
1.0 |
4 |
1 |
3.0 |
12 |
# Attach per-coupon popularity and persist the dataset2 coupon features
# (left join keeps every reception row).
dataset2 = pd.merge(dataset2, d, on='coupon_id', how='left')
dataset2.to_csv('coupon2_feature.csv', index=None)
dataset2.head()
|
user_id |
merchant_id |
coupon_id |
discount_rate |
distance |
date_received |
date |
day_of_week |
day_of_month |
days_distance |
discount_man |
discount_jian |
is_man_jian |
coupon_count |
0 |
1439408 |
4663 |
11002.0 |
0.866667 |
1.0 |
20160528.0 |
NaN |
6 |
28 |
14 |
150 |
20 |
1 |
7730 |
1 |
1439408 |
2632 |
8591.0 |
0.950000 |
0.0 |
20160613.0 |
NaN |
1 |
13 |
30 |
20 |
1 |
1 |
5 |
2 |
1439408 |
2632 |
8591.0 |
0.950000 |
0.0 |
20160516.0 |
20160613.0 |
1 |
16 |
2 |
20 |
1 |
1 |
5 |
3 |
2029232 |
450 |
1532.0 |
0.833333 |
0.0 |
20160530.0 |
NaN |
1 |
30 |
16 |
30 |
5 |
1 |
11728 |
4 |
2029232 |
6459 |
12737.0 |
0.950000 |
0.0 |
20160519.0 |
NaN |
4 |
19 |
5 |
20 |
1 |
1 |
16 |
dataset1
# Coupon-context features for dataset1.  dataset1 is a row-slice of
# off_train, so materialize an explicit copy before assigning new columns:
# the original assigned straight into the slice, which raised the
# SettingWithCopyWarning spam seen in the notebook output and made the
# writes ambiguous (possibly propagating into off_train).
dataset1 = dataset1.copy()
# date_received is float-typed here (e.g. 20160429.0); str slicing still
# yields yyyy / mm / dd at [0:4], [4:6], [6:8].
dataset1['day_of_week'] = dataset1.date_received.astype(
    'str').apply(lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday()+1)
dataset1['day_of_month'] = dataset1.date_received.astype(
    'str').apply(lambda x: int(x[6:8]))
# days since 2016-04-13, the end of feature window 1
dataset1['days_distance'] = dataset1.date_received.astype('str').apply(
    lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))-date(2016, 4, 13)).days)
dataset1['discount_man'] = dataset1.discount_rate.apply(get_discount_man)
dataset1['discount_jian'] = dataset1.discount_rate.apply(get_discount_jian)
dataset1['is_man_jian'] = dataset1.discount_rate.apply(is_man_jian)
# normalize the rate last, after man/jian were parsed from the raw string
dataset1['discount_rate'] = dataset1.discount_rate.apply(calc_discount_rate)
dataset1.head()
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
after removing the cwd from sys.path.
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import sys
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:9: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
if __name__ == '__main__':
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:10: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Remove the CWD from sys.path while we load stuff.
|
user_id |
merchant_id |
coupon_id |
discount_rate |
distance |
date_received |
date |
day_of_week |
day_of_month |
days_distance |
discount_man |
discount_jian |
is_man_jian |
7 |
1832624 |
3381 |
7610.0 |
0.900000 |
0.0 |
20160429.0 |
NaN |
5 |
29 |
16 |
200 |
20 |
1 |
18 |
163606 |
1569 |
5054.0 |
0.850000 |
10.0 |
20160421.0 |
NaN |
4 |
21 |
8 |
200 |
30 |
1 |
43 |
4061024 |
3381 |
7610.0 |
0.900000 |
10.0 |
20160426.0 |
NaN |
2 |
26 |
13 |
200 |
20 |
1 |
52 |
106443 |
450 |
3732.0 |
0.833333 |
NaN |
20160429.0 |
NaN |
5 |
29 |
16 |
30 |
5 |
1 |
96 |
114747 |
1569 |
5054.0 |
0.850000 |
9.0 |
20160426.0 |
NaN |
2 |
26 |
13 |
200 |
30 |
1 |
# Number of rows (issued coupons) per coupon_id in dataset1.
d = dataset1[['coupon_id']].copy()
d['coupon_count'] = 1
d = d.groupby('coupon_id').count().reset_index()
d.head(2)
|
coupon_id |
coupon_count |
0 |
1.0 |
1 |
1 |
2.0 |
2 |
# Attach the per-coupon counts and persist the coupon-level feature table.
dataset1 = dataset1.merge(d, how='left', on='coupon_id')
dataset1.to_csv('coupon1_feature.csv', index=None)
dataset1.head(2)
|
user_id |
merchant_id |
coupon_id |
discount_rate |
distance |
date_received |
date |
day_of_week |
day_of_month |
days_distance |
discount_man |
discount_jian |
is_man_jian |
coupon_count |
0 |
1832624 |
3381 |
7610.0 |
0.90 |
0.0 |
20160429.0 |
NaN |
5 |
29 |
16 |
200 |
20 |
1 |
22281 |
1 |
163606 |
1569 |
5054.0 |
0.85 |
10.0 |
20160421.0 |
NaN |
4 |
21 |
8 |
200 |
30 |
1 |
11893 |
Merchant相关特征
column |
含义 |
total_sales |
商家被消费次数 |
sales_use_coupon |
商家被领券后消费的次数 |
total_coupon |
商家被领券次数 |
coupon_rate |
sales_use_coupon/total_sales |
transfer_rate |
sales_use_coupon/total_coupon |
merchant_min_distance |
最小距离 |
merchant_max_distance |
最大距离 |
merchant_min_distance |
最小距离 |
merchant_max_distance |
最大距离 |
dataset3
# Merchant-related columns of feature3, plus the distinct merchant list
# that every aggregate below is joined onto.
keep_cols = ['merchant_id', 'coupon_id', 'distance', 'date_received', 'date']
merchant3 = feature3[keep_cols].copy()
t = merchant3[['merchant_id']].drop_duplicates().copy()
t.head(2)
|
merchant_id |
1 |
4663 |
3 |
2632 |
# total_sales: number of purchase rows (date present) per merchant.
t1 = merchant3.loc[merchant3.date.notnull(), ['merchant_id']].copy()
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id').count().reset_index()
t1.head(2)
|
merchant_id |
total_sales |
0 |
1 |
10 |
1 |
2 |
4 |
# sales_use_coupon: purchases where a coupon had been received.
redeemed = merchant3.date.notnull() & merchant3.coupon_id.notnull()
t2 = merchant3.loc[redeemed, ['merchant_id']].copy()
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id').count().reset_index()
t2.head(2)
|
merchant_id |
sales_use_coupon |
0 |
3 |
1 |
1 |
4 |
5 |
# total_coupon: number of coupons issued per merchant.
t3 = merchant3.loc[merchant3.coupon_id.notnull(), ['merchant_id']].copy()
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id').count().reset_index()
t3.head(2)
|
merchant_id |
total_coupon |
0 |
2 |
7 |
1 |
3 |
10 |
# Distances of coupon-redeemed purchases, input to the stats below.
redeemed = merchant3.date.notnull() & merchant3.coupon_id.notnull()
t4 = merchant3.loc[redeemed, ['merchant_id', 'distance']].copy()
t4.head(2)
|
merchant_id |
distance |
6 |
2632 |
0.0 |
33 |
1361 |
0.0 |
# Min / max / mean / median distance per merchant over redeemed purchases.
grouped = t4.groupby('merchant_id', as_index=False)
t5 = grouped.min().rename(columns={'distance': 'merchant_min_distance'})
t6 = grouped.max().rename(columns={'distance': 'merchant_max_distance'})
t7 = grouped.mean().rename(columns={'distance': 'merchant_mean_distance'})
t8 = grouped.median().rename(columns={'distance': 'merchant_median_distance'})
t8.head(2)
|
merchant_id |
merchant_median_distance |
0 |
3 |
0.0 |
1 |
4 |
0.0 |
# Assemble the per-merchant feature table for dataset3 by left-joining the
# aggregates computed above onto the distinct merchant list ``t``.
merchant3_feature = pd.merge(t, t1, on='merchant_id', how='left')
merchant3_feature = pd.merge(
    merchant3_feature, t2, on='merchant_id', how='left')
merchant3_feature = pd.merge(
    merchant3_feature, t3, on='merchant_id', how='left')
merchant3_feature = pd.merge(
    merchant3_feature, t5, on='merchant_id', how='left')
merchant3_feature = pd.merge(
    merchant3_feature, t6, on='merchant_id', how='left')
merchant3_feature = pd.merge(
    merchant3_feature, t7, on='merchant_id', how='left')
merchant3_feature = pd.merge(
    merchant3_feature, t8, on='merchant_id', how='left')
# Merchants whose coupons were never redeemed get 0, not NaN.
merchant3_feature.sales_use_coupon = merchant3_feature.sales_use_coupon.replace(
    np.nan, 0)
# Explicit float cast added for consistency with the merchant2/merchant1
# cells (true division already returns float, so values are unchanged).
merchant3_feature['merchant_coupon_transfer_rate'] = merchant3_feature.sales_use_coupon.astype(
    'float') / merchant3_feature.total_coupon
merchant3_feature['coupon_rate'] = merchant3_feature.sales_use_coupon.astype(
    'float') / merchant3_feature.total_sales
# Fill the count columns only AFTER the rates are computed, so a missing
# denominator yields a NaN rate instead of a division by zero.
merchant3_feature.total_sales = merchant3_feature.total_sales.replace(np.nan, 0)
merchant3_feature.total_coupon = merchant3_feature.total_coupon.replace(
    np.nan, 0)
merchant3_feature.to_csv('merchant3_feature.csv', index=None)
merchant3_feature.head()
|
merchant_id |
total_sales |
sales_use_coupon |
total_coupon |
merchant_min_distance |
merchant_max_distance |
merchant_mean_distance |
merchant_median_distance |
merchant_coupon_transfer_rate |
coupon_rate |
0 |
4663 |
393.0 |
39.0 |
11445.0 |
0.0 |
10.0 |
1.911765 |
1.0 |
0.003408 |
0.099237 |
1 |
2632 |
8.0 |
3.0 |
23.0 |
0.0 |
1.0 |
0.666667 |
1.0 |
0.130435 |
0.375000 |
2 |
3381 |
9579.0 |
416.0 |
46729.0 |
0.0 |
10.0 |
1.713580 |
1.0 |
0.008902 |
0.043428 |
3 |
450 |
7757.0 |
1239.0 |
35933.0 |
0.0 |
10.0 |
0.894068 |
0.0 |
0.034481 |
0.159727 |
4 |
6459 |
20.0 |
0.0 |
16.0 |
NaN |
NaN |
NaN |
NaN |
0.000000 |
0.000000 |
# Inspect column dtypes and non-null counts of the merchant feature table.
merchant3_feature.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8293 entries, 0 to 8292
Data columns (total 10 columns):
merchant_id 8293 non-null int64
total_sales 8293 non-null float64
sales_use_coupon 8293 non-null float64
total_coupon 8293 non-null float64
merchant_min_distance 3552 non-null float64
merchant_max_distance 3552 non-null float64
merchant_mean_distance 3552 non-null float64
merchant_median_distance 3552 non-null float64
merchant_coupon_transfer_rate 5363 non-null float64
coupon_rate 8194 non-null float64
dtypes: float64(9), int64(1)
memory usage: 712.7 KB
dataset2
# Merchant-related columns of feature2, plus the distinct merchant list.
cols = ['merchant_id', 'coupon_id', 'distance', 'date_received', 'date']
merchant2 = feature2[cols].copy()
t = merchant2[['merchant_id']].drop_duplicates().copy()
t.head(2)
|
merchant_id |
0 |
2632 |
7 |
3381 |
# total_sales: purchase rows per merchant (sum of a constant-1 column).
t1 = merchant2.loc[merchant2.date.notnull(), ['merchant_id']].copy()
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id', as_index=False).sum()
t1.head(2)
|
merchant_id |
total_sales |
0 |
1 |
3 |
1 |
3 |
8 |
# sales_use_coupon: coupon-backed purchases per merchant.
mask = merchant2.date.notnull() & merchant2.coupon_id.notnull()
t2 = merchant2.loc[mask, ['merchant_id']].copy()
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id', as_index=False).sum()
t2.head(2)
|
merchant_id |
sales_use_coupon |
0 |
15 |
11 |
1 |
17 |
1 |
# total_coupon: coupons issued per merchant.
t3 = merchant2.loc[merchant2.coupon_id.notnull(), ['merchant_id']].copy()
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id', as_index=False).sum()
t3.head(2)
|
merchant_id |
total_coupon |
0 |
5 |
2 |
1 |
8 |
2 |
# Distance stats per merchant over coupon-redeemed purchases.
mask = merchant2.date.notnull() & merchant2.coupon_id.notnull()
t4 = merchant2.loc[mask, ['merchant_id', 'distance']].copy()
g = t4.groupby('merchant_id', as_index=False)
t5 = g.min().rename(columns={'distance': 'merchant_min_distance'})
t6 = g.max().rename(columns={'distance': 'merchant_max_distance'})
t7 = g.mean().rename(columns={'distance': 'merchant_mean_distance'})
t8 = g.median().rename(columns={'distance': 'merchant_median_distance'})
t8.head(2)
|
merchant_id |
merchant_median_distance |
0 |
15 |
0.0 |
1 |
17 |
1.0 |
# Join all merchant aggregates onto the distinct merchant list, derive the
# two rate features, then persist. Count columns are NaN-filled only after
# the rates are computed so missing denominators keep NaN rates.
merchant2_feature = t
for part in (t1, t2, t3, t5, t6, t7, t8):
    merchant2_feature = pd.merge(
        merchant2_feature, part, on='merchant_id', how='left')
merchant2_feature.sales_use_coupon = merchant2_feature.sales_use_coupon.fillna(0)
merchant2_feature['merchant_coupon_transfer_rate'] = merchant2_feature.sales_use_coupon.astype(
    'float') / merchant2_feature.total_coupon
merchant2_feature['coupon_rate'] = merchant2_feature.sales_use_coupon.astype(
    'float') / merchant2_feature.total_sales
merchant2_feature.total_sales = merchant2_feature.total_sales.fillna(0)
merchant2_feature.total_coupon = merchant2_feature.total_coupon.fillna(0)
merchant2_feature.to_csv('merchant2_feature.csv', index=None)
merchant2_feature.head()
|
merchant_id |
total_sales |
sales_use_coupon |
total_coupon |
merchant_min_distance |
merchant_max_distance |
merchant_mean_distance |
merchant_median_distance |
merchant_coupon_transfer_rate |
coupon_rate |
0 |
2632 |
14.0 |
3.0 |
31.0 |
1.0 |
1.0 |
1.000000 |
1.0 |
0.096774 |
0.214286 |
1 |
3381 |
11537.0 |
1980.0 |
56499.0 |
0.0 |
10.0 |
1.697002 |
1.0 |
0.035045 |
0.171622 |
2 |
2099 |
5711.0 |
1497.0 |
12215.0 |
0.0 |
10.0 |
1.004814 |
0.0 |
0.122554 |
0.262126 |
3 |
1569 |
493.0 |
91.0 |
25173.0 |
0.0 |
10.0 |
2.388235 |
1.0 |
0.003615 |
0.184584 |
4 |
8390 |
1207.0 |
133.0 |
690.0 |
0.0 |
10.0 |
0.865079 |
0.0 |
0.192754 |
0.110191 |
dataset1
# Merchant-related columns of feature1, plus the distinct merchant list.
merchant1 = feature1[['merchant_id', 'coupon_id',
                      'distance', 'date_received', 'date']].copy()
t = merchant1[['merchant_id']].drop_duplicates().copy()
t.head(2)
|
merchant_id |
0 |
2632 |
8 |
3381 |
# total_sales: purchase rows per merchant.
t1 = merchant1.loc[merchant1.date.notnull(), ['merchant_id']].copy()
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id', as_index=False).sum()
t1.head(2)
|
merchant_id |
total_sales |
0 |
1 |
4 |
1 |
4 |
12 |
# sales_use_coupon: coupon-backed purchases per merchant.
mask = merchant1.date.notnull() & merchant1.coupon_id.notnull()
t2 = merchant1.loc[mask, ['merchant_id']].copy()
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id', as_index=False).sum()
t2.head(2)
|
merchant_id |
sales_use_coupon |
0 |
13 |
1 |
1 |
14 |
1 |
# total_coupon: coupons issued per merchant.
t3 = merchant1.loc[merchant1.coupon_id.notnull(), ['merchant_id']].copy()
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id', as_index=False).sum()
t3.head(2)
|
merchant_id |
total_coupon |
0 |
8 |
2 |
1 |
13 |
3 |
# Distance stats per merchant over coupon-redeemed purchases.
# .copy() added (the merchant2/merchant3 cells already do this) so any
# later write to t4 cannot raise SettingWithCopyWarning on a view.
t4 = merchant1[(merchant1.date.notnull()) & (
    merchant1.coupon_id.notnull())][['merchant_id', 'distance']].copy()
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance': 'merchant_min_distance'}, inplace=True)
t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance': 'merchant_max_distance'}, inplace=True)
t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance': 'merchant_mean_distance'}, inplace=True)
t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance': 'merchant_median_distance'}, inplace=True)
t8.head(2)
|
merchant_id |
merchant_median_distance |
0 |
13 |
0.0 |
1 |
14 |
0.0 |
# Join the merchant aggregates onto the distinct merchant list and derive
# the two rate features; counts are NaN-filled only after the division so
# merchants with a missing denominator keep NaN rates.
merchant1_feature = t
for part in (t1, t2, t3, t5, t6, t7, t8):
    merchant1_feature = pd.merge(
        merchant1_feature, part, on='merchant_id', how='left')
merchant1_feature.sales_use_coupon = merchant1_feature.sales_use_coupon.fillna(0)
merchant1_feature['merchant_coupon_transfer_rate'] = merchant1_feature.sales_use_coupon.astype(
    'float') / merchant1_feature.total_coupon
merchant1_feature['coupon_rate'] = merchant1_feature.sales_use_coupon.astype(
    'float') / merchant1_feature.total_sales
merchant1_feature.total_sales = merchant1_feature.total_sales.fillna(0)
merchant1_feature.total_coupon = merchant1_feature.total_coupon.fillna(0)
merchant1_feature.to_csv('merchant1_feature.csv', index=None)
merchant1_feature.head()
|
merchant_id |
total_sales |
sales_use_coupon |
total_coupon |
merchant_min_distance |
merchant_max_distance |
merchant_mean_distance |
merchant_median_distance |
merchant_coupon_transfer_rate |
coupon_rate |
0 |
2632 |
14.0 |
1.0 |
28.0 |
1.0 |
1.0 |
1.000000 |
1.0 |
0.035714 |
0.071429 |
1 |
3381 |
14962.0 |
2168.0 |
100369.0 |
0.0 |
10.0 |
1.650457 |
1.0 |
0.021600 |
0.144900 |
2 |
2099 |
5783.0 |
1705.0 |
16824.0 |
0.0 |
10.0 |
0.968072 |
0.0 |
0.101343 |
0.294830 |
3 |
4833 |
522.0 |
116.0 |
8321.0 |
0.0 |
10.0 |
3.037736 |
2.0 |
0.013941 |
0.222222 |
4 |
8390 |
1056.0 |
133.0 |
690.0 |
0.0 |
10.0 |
0.865079 |
0.0 |
0.192754 |
0.125947 |
User相关特征
column |
含义 |
count_merchant |
每个用户在多少商家产生消费 |
user_%_distance |
距离特征 |
buy_use_coupon |
领券又消费的数量 |
buy_total |
消费数量 |
coupon_received |
领券数量 |
%_user_date_datereceived_gap |
领券和消费的时间差的三个特征 |
buy_use_coupon_rate |
- |
user_coupon_transfer_rate |
- |
dataset3
# User-related columns of feature3, plus the distinct user list.
user3 = feature3[['user_id', 'merchant_id', 'coupon_id',
                  'discount_rate', 'distance', 'date_received', 'date']].copy()
t = user3[['user_id']].drop_duplicates().copy()
t.head(2)
|
user_id |
1 |
1439408 |
7 |
1832624 |
# count_merchant: number of distinct merchants each user bought from.
t1 = user3.loc[user3.date.notnull(), ['user_id', 'merchant_id']].copy()
t1 = t1.drop_duplicates().groupby('user_id').count().reset_index()
t1 = t1.rename(columns={'merchant_id': 'count_merchant'})
t1.head(2)
|
user_id |
count_merchant |
0 |
165 |
2 |
1 |
209 |
1 |
# Distance stats per user over coupon-redeemed purchases.
bought_with_coupon = user3.date.notnull() & user3.coupon_id.notnull()
t2 = user3.loc[bought_with_coupon, ['user_id', 'distance']].copy()
g = t2.groupby('user_id')
t3 = g.min().reset_index().rename(columns={'distance': 'user_min_distance'})
t4 = g.max().reset_index().rename(columns={'distance': 'user_max_distance'})
t5 = g.mean().reset_index().rename(columns={'distance': 'user_mean_distance'})
t6 = g.median().reset_index().rename(columns={'distance': 'user_median_distance'})
t6.head(2)
|
user_id |
user_median_distance |
0 |
417 |
0.0 |
1 |
687 |
NaN |
# buy_use_coupon: purchases made after receiving a coupon, per user.
mask = user3.date.notnull() & user3.coupon_id.notnull()
t7 = user3.loc[mask, ['user_id']].copy()
t7['buy_use_coupon'] = 1
t7 = t7.groupby('user_id', as_index=False).count()
t7.head(2)
|
user_id |
buy_use_coupon |
0 |
417 |
1 |
1 |
687 |
1 |
# buy_total: total purchases per user.
t8 = user3.loc[user3.date.notnull(), ['user_id']].copy()
t8['buy_total'] = 1
t8 = t8.groupby('user_id', as_index=False).count()
t8.head(2)
|
user_id |
buy_total |
0 |
165 |
12 |
1 |
209 |
1 |
# coupon_received: coupons received per user.
t9 = user3.loc[user3.coupon_id.notnull(), ['user_id']].copy()
t9['coupon_received'] = 1
t9 = t9.groupby('user_id', as_index=False).count()
t9.head(2)
|
user_id |
coupon_received |
0 |
4 |
1 |
1 |
165 |
2 |
def get_user_date_datereceived_gap(s):
    """Return the day gap for a 'YYYYMMDD:YYYYMMDD' pair.

    The string holds the consumption date, a ':', then the coupon receipt
    date; the result is consumption minus receipt, in days.
    """
    parts = s.split(':')

    def _to_date(ymd):
        return date(int(ymd[0:4]), int(ymd[4:6]), int(ymd[6:8]))

    return (_to_date(parts[0]) - _to_date(parts[1])).days
# Rows where the coupon was both received and used; compute the gap in
# days between receipt and consumption.
t10 = user3[(user3.date_received.notnull()) & (
    user3.date.notnull())][['user_id', 'date_received', 'date']].copy()
# Cast through int first (as the dataset2/dataset1 cells do) so float
# values such as 20160429.0 become the clean string '20160429'; the
# slicing parser reads the same digits either way, so results are equal.
t10[['date_received', 'date']] = t10[[
    'date_received', 'date']].astype(int).astype(str)
t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(
    get_user_date_datereceived_gap)
t10 = t10[['user_id', 'user_date_datereceived_gap']]
t10.head(2)
|
user_id |
user_date_datereceived_gap |
6 |
1439408 |
28 |
33 |
1113008 |
6 |
# Mean / min / max receipt-to-consumption gap per user.
g = t10.groupby('user_id')
t11 = g.mean().reset_index().rename(
    columns={'user_date_datereceived_gap': 'avg_user_date_datereceived_gap'})
t12 = g.min().reset_index().rename(
    columns={'user_date_datereceived_gap': 'min_user_date_datereceived_gap'})
t13 = g.max().reset_index().rename(
    columns={'user_date_datereceived_gap': 'max_user_date_datereceived_gap'})
t13.head(2)
|
user_id |
max_user_date_datereceived_gap |
0 |
417 |
14 |
1 |
687 |
5 |
# Join all user aggregates onto the distinct user list, derive the rate
# features, persist, and show null counts plus a preview. Rates are
# computed before the count columns are filled so missing denominators
# keep NaN rates.
user3_feature = t
for part in (t1, t3, t4, t5, t6, t7, t8, t9, t11, t12, t13):
    user3_feature = pd.merge(user3_feature, part, on='user_id', how='left')
user3_feature.buy_use_coupon = user3_feature.buy_use_coupon.fillna(0)
user3_feature.count_merchant = user3_feature.count_merchant.fillna(0)
user3_feature['buy_use_coupon_rate'] = user3_feature.buy_use_coupon / \
    user3_feature.buy_total
user3_feature['user_coupon_transfer_rate'] = user3_feature.buy_use_coupon / \
    user3_feature.coupon_received
user3_feature.buy_total = user3_feature.buy_total.fillna(0)
user3_feature.coupon_received = user3_feature.coupon_received.fillna(0)
user3_feature.to_csv('user3_feature.csv', index=None)
user3_feature.isnull().sum()
user3_feature.head()
user_id 0
count_merchant 0
user_min_distance 320676
user_max_distance 320676
user_mean_distance 320676
user_median_distance 320676
buy_use_coupon 0
buy_total 0
coupon_received 0
avg_user_date_datereceived_gap 316780
min_user_date_datereceived_gap 316780
max_user_date_datereceived_gap 316780
buy_use_coupon_rate 170895
user_coupon_transfer_rate 38354
dtype: int64
|
user_id |
count_merchant |
user_min_distance |
user_max_distance |
user_mean_distance |
user_median_distance |
buy_use_coupon |
buy_total |
coupon_received |
avg_user_date_datereceived_gap |
min_user_date_datereceived_gap |
max_user_date_datereceived_gap |
buy_use_coupon_rate |
user_coupon_transfer_rate |
0 |
1439408 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
2.0 |
4.0 |
28.0 |
28.0 |
28.0 |
0.5 |
0.25 |
1 |
1832624 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.00 |
2 |
2029232 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
2.0 |
2.0 |
NaN |
NaN |
NaN |
0.0 |
0.00 |
3 |
2747744 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.00 |
4 |
196342 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
1.0 |
1.0 |
NaN |
NaN |
NaN |
0.0 |
0.00 |
dataset2
# User-related columns of feature2, plus the distinct user list.
user2 = feature2[['user_id', 'merchant_id', 'coupon_id',
                  'discount_rate', 'distance', 'date_received', 'date']].copy()
t = user2[['user_id']].drop_duplicates().copy()
t.head(2)
|
user_id |
0 |
1439408 |
7 |
1832624 |
# count_merchant: number of distinct merchants each user bought from.
t1 = user2.loc[user2.date.notnull(), ['user_id', 'merchant_id']].copy()
t1 = t1.drop_duplicates().groupby('user_id').count().reset_index()
t1 = t1.rename(columns={'merchant_id': 'count_merchant'})
t1.head(2)
|
user_id |
count_merchant |
0 |
165 |
2 |
1 |
184 |
1 |
# Distance stats per user over coupon-redeemed purchases.
mask = user2.date.notnull() & user2.coupon_id.notnull()
t2 = user2.loc[mask, ['user_id', 'distance']].copy()
g = t2.groupby('user_id')
t3 = g.min().reset_index().rename(columns={'distance': 'user_min_distance'})
t4 = g.max().reset_index().rename(columns={'distance': 'user_max_distance'})
t5 = g.mean().reset_index().rename(columns={'distance': 'user_mean_distance'})
t6 = g.median().reset_index().rename(columns={'distance': 'user_median_distance'})
t6.head(2)
|
user_id |
user_median_distance |
0 |
184 |
0.0 |
1 |
417 |
0.0 |
# buy_use_coupon: coupon-backed purchases per user.
mask = user2.date.notnull() & user2.coupon_id.notnull()
t7 = user2.loc[mask, ['user_id']].copy()
t7['buy_use_coupon'] = 1
t7 = t7.groupby('user_id', as_index=False).count()
t7.head(2)
|
user_id |
buy_use_coupon |
0 |
184 |
1 |
1 |
417 |
1 |
# buy_total: total purchases per user.
t8 = user2.loc[user2.date.notnull(), ['user_id']].copy()
t8['buy_total'] = 1
t8 = t8.groupby('user_id', as_index=False).count()
t8.head(2)
|
user_id |
buy_total |
0 |
165 |
8 |
1 |
184 |
1 |
# coupon_received: coupons received per user.
t9 = user2.loc[user2.coupon_id.notnull(), ['user_id']].copy()
t9['coupon_received'] = 1
t9 = t9.groupby('user_id', as_index=False).count()
t9.head(2)
|
user_id |
coupon_received |
0 |
4 |
1 |
1 |
144 |
1 |
def get_user_date_datereceived_gap(s):
    """Days between consumption and coupon receipt ('YYYYMMDD:YYYYMMDD').

    NOTE(review): identical in behavior to the function of the same name
    defined in the dataset3 section above; the notebook redefines it here.
    """
    fields = s.split(':')
    consumed = date(int(fields[0][:4]), int(fields[0][4:6]), int(fields[0][6:8]))
    received = date(int(fields[1][:4]), int(fields[1][4:6]), int(fields[1][6:8]))
    return (consumed - received).days
# Receipt-to-consumption pairs for users who redeemed; gap in days.
t10 = user2.loc[user2.date_received.notnull() & user2.date.notnull(),
                ['user_id', 'date_received', 'date']].copy()
t10[['date_received', 'date']] = t10[[
    'date_received', 'date']].astype(int).astype(str)
t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
t10['user_date_datereceived_gap'] = t10['user_date_datereceived_gap'].apply(
    get_user_date_datereceived_gap)
t10 = t10[['user_id', 'user_date_datereceived_gap']]
t10.head(2)
|
user_id |
user_date_datereceived_gap |
38 |
2881376 |
8 |
75 |
114747 |
25 |
# Mean / min / max receipt-to-consumption gap per user.
g = t10.groupby('user_id')
t11 = g.mean().reset_index().rename(
    columns={'user_date_datereceived_gap': 'avg_user_date_datereceived_gap'})
t12 = g.min().reset_index().rename(
    columns={'user_date_datereceived_gap': 'min_user_date_datereceived_gap'})
t13 = g.max().reset_index().rename(
    columns={'user_date_datereceived_gap': 'max_user_date_datereceived_gap'})
t13.head(2)
|
user_id |
max_user_date_datereceived_gap |
0 |
184 |
30 |
1 |
417 |
14 |
# Join all user aggregates, derive the rate features, and persist. The
# count columns are NaN-filled only after the division so missing
# denominators keep NaN rates.
user2_feature = t
for part in (t1, t3, t4, t5, t6, t7, t8, t9, t11, t12, t13):
    user2_feature = pd.merge(user2_feature, part, on='user_id', how='left')
user2_feature.buy_use_coupon = user2_feature.buy_use_coupon.fillna(0)
user2_feature.count_merchant = user2_feature.count_merchant.fillna(0)
user2_feature['buy_use_coupon_rate'] = user2_feature.buy_use_coupon / \
    user2_feature.buy_total
user2_feature['user_coupon_transfer_rate'] = user2_feature.buy_use_coupon / \
    user2_feature.coupon_received
user2_feature.buy_total = user2_feature.buy_total.fillna(0)
user2_feature.coupon_received = user2_feature.coupon_received.fillna(0)
user2_feature.to_csv('user2_feature.csv', index=None)
user2_feature.head()
|
user_id |
count_merchant |
user_min_distance |
user_max_distance |
user_mean_distance |
user_median_distance |
buy_use_coupon |
buy_total |
coupon_received |
avg_user_date_datereceived_gap |
min_user_date_datereceived_gap |
max_user_date_datereceived_gap |
buy_use_coupon_rate |
user_coupon_transfer_rate |
0 |
1439408 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
1.0 |
2.0 |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1 |
1832624 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
2 |
73611 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
3 |
163606 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
4 |
94107 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
dataset1
# User-related columns of feature1, plus the distinct user list.
user1 = feature1[['user_id', 'merchant_id', 'coupon_id',
                  'discount_rate', 'distance', 'date_received', 'date']].copy()
t = user1[['user_id']].drop_duplicates().copy()
t.head(2)
|
user_id |
0 |
1439408 |
8 |
2029232 |
# count_merchant: number of distinct merchants each user bought from.
t1 = user1.loc[user1.date.notnull(), ['user_id', 'merchant_id']].copy()
t1 = t1.drop_duplicates().groupby('user_id').count().reset_index()
t1 = t1.rename(columns={'merchant_id': 'count_merchant'})
t1.head(2)
|
user_id |
count_merchant |
0 |
165 |
2 |
1 |
184 |
1 |
# Distance stats per user over coupon-redeemed purchases.
mask = user1.date.notnull() & user1.coupon_id.notnull()
t2 = user1.loc[mask, ['user_id', 'distance']].copy()
g = t2.groupby('user_id')
t3 = g.min().reset_index().rename(columns={'distance': 'user_min_distance'})
t4 = g.max().reset_index().rename(columns={'distance': 'user_max_distance'})
t5 = g.mean().reset_index().rename(columns={'distance': 'user_mean_distance'})
t6 = g.median().reset_index().rename(columns={'distance': 'user_median_distance'})
t6.head(2)
|
user_id |
user_median_distance |
0 |
184 |
0.0 |
1 |
417 |
0.0 |
# buy_use_coupon: coupon-backed purchases per user. Use count() to match
# the dataset3/dataset2 cells instead of sum(); the column is a constant 1
# with no NaNs, so both aggregations produce the same values.
t7 = user1[(user1.date.notnull()) & (
    user1.coupon_id.notnull())][['user_id']].copy()
t7['buy_use_coupon'] = 1
t7 = t7.groupby('user_id').count().reset_index()
t7.head(2)
|
user_id |
buy_use_coupon |
0 |
184 |
1 |
1 |
417 |
1 |
# buy_total: total purchases per user.
t8 = user1.loc[user1.date.notnull(), ['user_id']].copy()
t8['buy_total'] = 1
t8 = t8.groupby('user_id', as_index=False).count()
t8.head(2)
|
user_id |
buy_total |
0 |
165 |
6 |
1 |
184 |
1 |
# coupon_received: coupons received per user.
t9 = user1.loc[user1.coupon_id.notnull(), ['user_id']].copy()
t9['coupon_received'] = 1
t9 = t9.groupby('user_id', as_index=False).count()
t9.head(2)
|
user_id |
coupon_received |
0 |
4 |
1 |
1 |
35 |
4 |
# Receipt-to-consumption pairs for users who redeemed; gap in days.
t10 = user1.loc[user1.date_received.notnull() & user1.date.notnull(),
                ['user_id', 'date_received', 'date']].copy()
t10[['date_received', 'date']] = t10[[
    'date_received', 'date']].astype(int).astype(str)
t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
t10['user_date_datereceived_gap'] = t10['user_date_datereceived_gap'].apply(
    get_user_date_datereceived_gap)
t10 = t10[['user_id', 'user_date_datereceived_gap']]
t10.head(2)
|
user_id |
user_date_datereceived_gap |
38 |
2881376 |
8 |
75 |
114747 |
25 |
# Mean / min / max receipt-to-consumption gap per user.
g = t10.groupby('user_id')
t11 = g.mean().reset_index().rename(
    columns={'user_date_datereceived_gap': 'avg_user_date_datereceived_gap'})
t12 = g.min().reset_index().rename(
    columns={'user_date_datereceived_gap': 'min_user_date_datereceived_gap'})
t13 = g.max().reset_index().rename(
    columns={'user_date_datereceived_gap': 'max_user_date_datereceived_gap'})
t13.head(2)
|
user_id |
max_user_date_datereceived_gap |
0 |
184 |
30 |
1 |
417 |
14 |
# Join all user aggregates, derive the rate features, and persist. Count
# columns are NaN-filled only after the division so missing denominators
# keep NaN rates.
user1_feature = t
for part in (t1, t3, t4, t5, t6, t7, t8, t9, t11, t12, t13):
    user1_feature = pd.merge(user1_feature, part, on='user_id', how='left')
user1_feature.buy_use_coupon = user1_feature.buy_use_coupon.fillna(0)
user1_feature.count_merchant = user1_feature.count_merchant.fillna(0)
user1_feature['buy_use_coupon_rate'] = user1_feature.buy_use_coupon / \
    user1_feature.buy_total
user1_feature['user_coupon_transfer_rate'] = user1_feature.buy_use_coupon / \
    user1_feature.coupon_received
user1_feature.buy_total = user1_feature.buy_total.fillna(0)
user1_feature.coupon_received = user1_feature.coupon_received.fillna(0)
user1_feature.to_csv('user1_feature.csv', index=None)
user1_feature.head()
|
user_id |
count_merchant |
user_min_distance |
user_max_distance |
user_mean_distance |
user_median_distance |
buy_use_coupon |
buy_total |
coupon_received |
avg_user_date_datereceived_gap |
min_user_date_datereceived_gap |
max_user_date_datereceived_gap |
buy_use_coupon_rate |
user_coupon_transfer_rate |
0 |
1439408 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
1.0 |
2.0 |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1 |
2029232 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
2 |
2223968 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
3 |
73611 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
4 |
3273056 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
user_merchant交互特征
column |
含义 |
user_merchant_buy_total |
消费的用户-商家行数 |
user_merchant_received |
领券的用户-商家行数 |
user_merchant_buy_use_coupon |
领券且消费的行数 |
user_merchant_any |
用户-商家全部行数 |
user_merchant_buy_common |
没领券且消费的用户-商家行数 |
dataset3
# All distinct (user, merchant) pairs seen in feature3.
all_user_merchant = feature3[['user_id', 'merchant_id']].copy()
all_user_merchant.drop_duplicates(inplace=True)
# user_merchant_buy_total: purchase rows per (user, merchant) pair.
# .copy() added after the chained selection: assigning into the slice
# without it raises SettingWithCopyWarning (the same warning this file
# shows for earlier cells).
t = feature3[['user_id', 'merchant_id', 'date']].copy()
t = t[t.date.notnull()][['user_id', 'merchant_id']].copy()
t['user_merchant_buy_total'] = 1
t = t.groupby(['user_id', 'merchant_id']).count().reset_index()
t.head(2)
|
user_id |
merchant_id |
user_merchant_buy_total |
0 |
165 |
2934 |
6 |
1 |
165 |
4195 |
6 |
# user_merchant_received: coupon-receipt rows per (user, merchant) pair.
# .copy() added so the column assignment below does not raise
# SettingWithCopyWarning on a chained-indexing slice.
t1 = feature3[['user_id', 'merchant_id', 'coupon_id']].copy()
t1 = t1[t1.coupon_id.notnull()][['user_id', 'merchant_id']].copy()
t1['user_merchant_received'] = 1
t1 = t1.groupby(['user_id', 'merchant_id']).count().reset_index()
t1.head(2)
|
user_id |
merchant_id |
user_merchant_received |
0 |
4 |
1469 |
1 |
1 |
165 |
2934 |
1 |
# user_merchant_buy_use_coupon: rows where a coupon was received AND the
# purchase happened, per (user, merchant) pair. .copy() added so the
# column assignment does not raise SettingWithCopyWarning.
t2 = feature3[['user_id', 'merchant_id', 'date', 'date_received']].copy()
t2 = t2[(t2.date.notnull()) & (t2.date_received.notnull())][['user_id', 'merchant_id']].copy()
t2['user_merchant_buy_use_coupon'] = 1
t2 = t2.groupby(['user_id', 'merchant_id']).count().reset_index()
t2.head(2)
|
user_id |
merchant_id |
user_merchant_buy_use_coupon |
0 |
417 |
775 |
1 |
1 |
687 |
8594 |
1 |
# user_merchant_any: all interaction rows per (user, merchant) pair.
t3 = feature3[['user_id', 'merchant_id']].copy()
t3['user_merchant_any'] = 1
t3 = t3.groupby(['user_id', 'merchant_id'], as_index=False).count()
t3.head(2)
|
user_id |
merchant_id |
user_merchant_any |
0 |
4 |
1469 |
1 |
1 |
165 |
2934 |
7 |
# user_merchant_buy_common: purchases made WITHOUT a coupon, per
# (user, merchant) pair. .copy() added so the column assignment does not
# raise SettingWithCopyWarning on a chained-indexing slice.
t4 = feature3[['user_id', 'merchant_id', 'date', 'coupon_id']].copy()
t4 = t4[(t4.date.notnull()) & (t4.coupon_id.isnull())][['user_id', 'merchant_id']].copy()
t4['user_merchant_buy_common'] = 1
t4 = t4.groupby(['user_id', 'merchant_id']).count().reset_index()
t4.head(2)
|
user_id |
merchant_id |
user_merchant_buy_common |
0 |
165 |
2934 |
6 |
1 |
165 |
4195 |
6 |
# Join the user-merchant aggregates onto every distinct pair and derive
# the interaction-rate features; missing counts are filled only after the
# division so rates keep NaN where the denominator is absent.
user_merchant3 = all_user_merchant
for part in (t, t1, t2, t3, t4):
    user_merchant3 = pd.merge(user_merchant3, part, on=[
        'user_id', 'merchant_id'], how='left')
user_merchant3.user_merchant_buy_use_coupon = user_merchant3.user_merchant_buy_use_coupon.fillna(0)
user_merchant3.user_merchant_buy_common = user_merchant3.user_merchant_buy_common.fillna(0)
user_merchant3['user_merchant_coupon_transfer_rate'] = user_merchant3.user_merchant_buy_use_coupon / \
    user_merchant3.user_merchant_received
user_merchant3['user_merchant_coupon_buy_rate'] = user_merchant3.user_merchant_buy_use_coupon / \
    user_merchant3.user_merchant_buy_total
user_merchant3['user_merchant_rate'] = user_merchant3.user_merchant_buy_total / \
    user_merchant3.user_merchant_any
user_merchant3['user_merchant_common_buy_rate'] = user_merchant3.user_merchant_buy_common / \
    user_merchant3.user_merchant_buy_total
user_merchant3.user_merchant_received = user_merchant3.user_merchant_received.fillna(0)
user_merchant3.user_merchant_buy_total = user_merchant3.user_merchant_buy_total.fillna(0)
user_merchant3.user_merchant_any = user_merchant3.user_merchant_any.fillna(0)
user_merchant3.to_csv('user_merchant3.csv', index=None)
dataset2
# Every distinct (user, merchant) pair seen in the feature2 window.
all_user_merchant = feature2[['user_id', 'merchant_id']].drop_duplicates()
# Total purchases (any consumption) per pair.
t = (feature2[feature2.date.notnull()]
     .groupby(['user_id', 'merchant_id'])
     .size().rename('user_merchant_buy_total').reset_index())
t.head(2)
|
user_id |
merchant_id |
user_merchant_buy_total |
0 |
165 |
2934 |
4 |
1 |
165 |
4195 |
4 |
# Coupons received per (user, merchant) pair in the feature2 window.
t1 = (feature2[feature2.coupon_id.notnull()]
      .groupby(['user_id', 'merchant_id'])
      .size().rename('user_merchant_received').reset_index())
t1.head(2)
|
user_id |
merchant_id |
user_merchant_received |
0 |
4 |
1433 |
1 |
1 |
144 |
1553 |
1 |
# Coupon-backed purchases per (user, merchant) pair.
t2 = (feature2[feature2.date.notnull() & feature2.date_received.notnull()]
      .groupby(['user_id', 'merchant_id'])
      .size().rename('user_merchant_buy_use_coupon').reset_index())
t2.head(2)
|
user_id |
merchant_id |
user_merchant_buy_use_coupon |
0 |
184 |
3381 |
1 |
1 |
417 |
775 |
1 |
# Any recorded interaction per (user, merchant) pair.
t3 = (feature2.groupby(['user_id', 'merchant_id'])
      .size().rename('user_merchant_any').reset_index())
t3.head(2)
|
user_id |
merchant_id |
user_merchant_any |
0 |
4 |
1433 |
1 |
1 |
144 |
1553 |
1 |
# Purchases made WITHOUT a coupon per (user, merchant) pair.
t4 = (feature2[feature2.date.notnull() & feature2.coupon_id.isnull()]
      .groupby(['user_id', 'merchant_id'])
      .size().rename('user_merchant_buy_common').reset_index())
t4.head(2)
|
user_id |
merchant_id |
user_merchant_buy_common |
0 |
165 |
2934 |
4 |
1 |
165 |
4195 |
4 |
# Assemble every user–merchant count onto the full pair list, then derive rates.
user_merchant2 = all_user_merchant
for part in (t, t1, t2, t3, t4):
    user_merchant2 = pd.merge(user_merchant2, part,
                              on=['user_id', 'merchant_id'], how='left')
# A missing count after the left joins means the event never happened -> 0.
user_merchant2['user_merchant_buy_use_coupon'] = \
    user_merchant2['user_merchant_buy_use_coupon'].fillna(0)
user_merchant2['user_merchant_buy_common'] = \
    user_merchant2['user_merchant_buy_common'].fillna(0)
# Rates are computed BEFORE filling the denominators so pairs with no
# received/bought/any record keep NaN rates rather than dividing by zero.
user_merchant2['user_merchant_coupon_transfer_rate'] = (
    user_merchant2['user_merchant_buy_use_coupon']
    / user_merchant2['user_merchant_received'])
user_merchant2['user_merchant_coupon_buy_rate'] = (
    user_merchant2['user_merchant_buy_use_coupon']
    / user_merchant2['user_merchant_buy_total'])
user_merchant2['user_merchant_rate'] = (
    user_merchant2['user_merchant_buy_total']
    / user_merchant2['user_merchant_any'])
user_merchant2['user_merchant_common_buy_rate'] = (
    user_merchant2['user_merchant_buy_common']
    / user_merchant2['user_merchant_buy_total'])
for col in ('user_merchant_received', 'user_merchant_buy_total',
            'user_merchant_any'):
    user_merchant2[col] = user_merchant2[col].fillna(0)
user_merchant2.to_csv('user_merchant2.csv', index=None)
dataset1
# Every distinct (user, merchant) pair seen in the feature1 window.
all_user_merchant = feature1[['user_id', 'merchant_id']].drop_duplicates()
# Total purchases (any consumption) per pair.
t = (feature1[feature1.date.notnull()]
     .groupby(['user_id', 'merchant_id'])
     .size().rename('user_merchant_buy_total').reset_index())
t.head(2)
|
user_id |
merchant_id |
user_merchant_buy_total |
0 |
165 |
2934 |
4 |
1 |
165 |
4195 |
2 |
# Coupons received per (user, merchant) pair in the feature1 window.
t1 = (feature1[feature1.coupon_id.notnull()]
      .groupby(['user_id', 'merchant_id'])
      .size().rename('user_merchant_received').reset_index())
t1.head(2)
|
user_id |
merchant_id |
user_merchant_received |
0 |
4 |
1433 |
1 |
1 |
35 |
3381 |
4 |
# Coupon-backed purchases per (user, merchant) pair.
t2 = (feature1[feature1.date.notnull() & feature1.date_received.notnull()]
      .groupby(['user_id', 'merchant_id'])
      .size().rename('user_merchant_buy_use_coupon').reset_index())
t2.head(2)
|
user_id |
merchant_id |
user_merchant_buy_use_coupon |
0 |
184 |
3381 |
1 |
1 |
417 |
775 |
1 |
# Any recorded interaction per (user, merchant) pair.
t3 = (feature1.groupby(['user_id', 'merchant_id'])
      .size().rename('user_merchant_any').reset_index())
t3.head(2)
|
user_id |
merchant_id |
user_merchant_any |
0 |
4 |
1433 |
1 |
1 |
35 |
3381 |
4 |
# Purchases made WITHOUT a coupon per (user, merchant) pair.
t4 = (feature1[feature1.date.notnull() & feature1.coupon_id.isnull()]
      .groupby(['user_id', 'merchant_id'])
      .size().rename('user_merchant_buy_common').reset_index())
t4.head(2)
|
user_id |
merchant_id |
user_merchant_buy_common |
0 |
165 |
2934 |
4 |
1 |
165 |
4195 |
2 |
# Assemble every user–merchant count onto the full pair list for the
# feature1 window, then derive interaction rates (mirrors the
# user_merchant2 / user_merchant3 sections).
user_merchant1 = pd.merge(all_user_merchant, t, on=[
    'user_id', 'merchant_id'], how='left')
user_merchant1 = pd.merge(user_merchant1, t1, on=[
    'user_id', 'merchant_id'], how='left')
user_merchant1 = pd.merge(user_merchant1, t2, on=[
    'user_id', 'merchant_id'], how='left')
user_merchant1 = pd.merge(user_merchant1, t3, on=[
    'user_id', 'merchant_id'], how='left')
user_merchant1 = pd.merge(user_merchant1, t4, on=[
    'user_id', 'merchant_id'], how='left')
# A missing count after the left joins means the event never happened -> 0.
user_merchant1.user_merchant_buy_use_coupon = user_merchant1.user_merchant_buy_use_coupon.replace(
    np.nan, 0)
user_merchant1.user_merchant_buy_common = user_merchant1.user_merchant_buy_common.replace(
    np.nan, 0)
# Rates computed before filling the denominators, so pairs with no
# received/bought/any record keep NaN rates instead of dividing by zero.
user_merchant1['user_merchant_coupon_transfer_rate'] = user_merchant1.user_merchant_buy_use_coupon / \
    user_merchant1.user_merchant_received
user_merchant1['user_merchant_coupon_buy_rate'] = user_merchant1.user_merchant_buy_use_coupon / \
    user_merchant1.user_merchant_buy_total
user_merchant1['user_merchant_rate'] = user_merchant1.user_merchant_buy_total / \
    user_merchant1.user_merchant_any
user_merchant1['user_merchant_common_buy_rate'] = user_merchant1.user_merchant_buy_common / \
    user_merchant1.user_merchant_buy_total
# BUG FIX: these three fills previously read from user_merchant2 (a
# copy-paste slip), which overwrote dataset1's columns with dataset2's
# values (different row count/order). They must fill user_merchant1's
# own columns.
user_merchant1.user_merchant_received = user_merchant1.user_merchant_received.fillna(0)
user_merchant1.user_merchant_buy_total = user_merchant1.user_merchant_buy_total.fillna(0)
user_merchant1.user_merchant_any = user_merchant1.user_merchant_any.fillna(0)
user_merchant1.to_csv('user_merchant1.csv', index=None)
生成训练集和测试集
def trans_date(s):
    """Render a numeric date field as a string.

    NaN becomes the literal 'nan'; a float such as 20160601.0 becomes
    '20160601' (drops the trailing '.0' so labels concatenate cleanly).
    """
    # NaN is the only value not equal to itself.
    return str(s) if s != s else str(int(s))
def get_label(s):
    """Label a 'consume_date:receive_date' string (yyyymmdd or 'nan').

    Returns 1 when the coupon was consumed within 15 days of receipt,
    0 when it was never consumed or consumed too late.
    """
    used, received = s.split(':')
    if used == 'nan':
        # Coupon received but never consumed.
        return 0
    if received == 'nan':
        # Consumption without a receive date; counted as positive here.
        # NOTE(review): in the assembled datasets date_received is never
        # missing, so this branch looks unreachable — confirm upstream.
        return 1
    gap = (date(int(used[:4]), int(used[4:6]), int(used[6:8]))
           - date(int(received[:4]), int(received[4:6]), int(received[6:8]))).days
    return 1 if gap <= 15 else 0
测试集dataset3
# Load the pre-computed feature tables for the test window (dataset3):
# coupon-, merchant-, user-, user×merchant- and leakage ("other") features.
coupon3 = pd.read_csv('coupon3_feature.csv')
merchant3 = pd.read_csv('merchant3_feature.csv')
user3 = pd.read_csv('user3_feature.csv')
user_merchant3 = pd.read_csv('user_merchant3.csv')
other_feature3 = pd.read_csv('other_feature3.csv')
coupon3.shape
(113640, 13)
# Left-join every feature table onto the coupon-level rows of the test set.
dataset3 = (coupon3
            .merge(merchant3, on='merchant_id', how='left')
            .merge(user3, on='user_id', how='left')
            .merge(user_merchant3, on=['user_id', 'merchant_id'], how='left')
            .merge(other_feature3,
                   on=['user_id', 'coupon_id', 'date_received'], how='left'))
dataset3.shape
(116204, 50)
# Missing counts after the joins mean the activity never occurred -> 0.
# (Rate columns are deliberately left NaN.)
for col in ('total_sales', 'sales_use_coupon', 'total_coupon',
            'user_merchant_buy_total', 'user_merchant_received',
            'user_merchant_buy_use_coupon', 'user_merchant_any',
            'user_merchant_buy_common', 'count_merchant',
            'buy_use_coupon', 'buy_total', 'coupon_received'):
    dataset3[col] = dataset3[col].fillna(0)
dataset3.isnull().sum()
user_id 0
merchant_id 0
coupon_id 0
discount_rate 0
distance 12230
date_received 0
day_of_week 0
day_of_month 0
days_distance 0
discount_man 0
discount_jian 0
is_man_jian 0
coupon_count 0
total_sales 0
sales_use_coupon 0
total_coupon 0
merchant_min_distance 5096
merchant_max_distance 5096
merchant_mean_distance 5096
merchant_median_distance 5096
merchant_coupon_transfer_rate 898
coupon_rate 55
count_merchant 0
user_min_distance 97093
user_max_distance 97093
user_mean_distance 97093
user_median_distance 97093
buy_use_coupon 0
buy_total 0
coupon_received 0
avg_user_date_datereceived_gap 95450
min_user_date_datereceived_gap 95450
max_user_date_datereceived_gap 95450
buy_use_coupon_rate 47959
user_coupon_transfer_rate 21231
user_merchant_buy_total 0
user_merchant_received 0
user_merchant_buy_use_coupon 0
user_merchant_any 0
user_merchant_buy_common 0
user_merchant_coupon_transfer_rate 85190
user_merchant_coupon_buy_rate 90569
user_merchant_rate 90569
user_merchant_common_buy_rate 90569
this_month_user_receive_same_coupon_count 0
this_month_user_receive_all_coupon_count 0
this_month_user_receive_same_coupon_lastone 0
this_month_user_receive_same_coupon_firstone 0
this_day_user_receive_all_coupon_count 0
this_day_user_receive_same_coupon_count 0
dtype: int64
# Weekend flag plus one-hot weekday columns; drop ids/helpers, persist.
dataset3['is_weekend'] = dataset3.day_of_week.apply(lambda d: int(d in (6, 7)))
weekday_dummies = pd.get_dummies(dataset3.day_of_week)
weekday_dummies.columns = ['weekday%d' % (i + 1)
                           for i in range(weekday_dummies.shape[1])]
dataset3 = pd.concat([dataset3, weekday_dummies], axis=1)
dataset3.drop(['merchant_id', 'day_of_week', 'coupon_count'],
              axis=1, inplace=True)
dataset3.head()
dataset3.to_csv('dataset3.csv', index=None)
|
user_id |
coupon_id |
discount_rate |
distance |
date_received |
day_of_month |
days_distance |
discount_man |
discount_jian |
is_man_jian |
total_sales |
sales_use_coupon |
total_coupon |
merchant_min_distance |
merchant_max_distance |
merchant_mean_distance |
merchant_median_distance |
merchant_coupon_transfer_rate |
coupon_rate |
count_merchant |
user_min_distance |
user_max_distance |
user_mean_distance |
user_median_distance |
buy_use_coupon |
buy_total |
coupon_received |
avg_user_date_datereceived_gap |
min_user_date_datereceived_gap |
max_user_date_datereceived_gap |
buy_use_coupon_rate |
user_coupon_transfer_rate |
user_merchant_buy_total |
user_merchant_received |
user_merchant_buy_use_coupon |
user_merchant_any |
user_merchant_buy_common |
user_merchant_coupon_transfer_rate |
user_merchant_coupon_buy_rate |
user_merchant_rate |
user_merchant_common_buy_rate |
this_month_user_receive_same_coupon_count |
this_month_user_receive_all_coupon_count |
this_month_user_receive_same_coupon_lastone |
this_month_user_receive_same_coupon_firstone |
this_day_user_receive_all_coupon_count |
this_day_user_receive_same_coupon_count |
is_weekend |
weekday1 |
weekday2 |
weekday3 |
weekday4 |
weekday5 |
weekday6 |
weekday7 |
0 |
4129537 |
9983 |
0.833333 |
1.0 |
20160712 |
12 |
12 |
30 |
5 |
1 |
7757.0 |
1239.0 |
35933.0 |
0.0 |
10.0 |
0.894068 |
0.0 |
0.034481 |
0.159727 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
2.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
0.0 |
1.0 |
0.0 |
0.0 |
NaN |
NaN |
NaN |
1 |
1 |
-1 |
-1 |
1 |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
6949378 |
3429 |
0.833333 |
NaN |
20160706 |
6 |
6 |
30 |
5 |
1 |
119.0 |
17.0 |
100.0 |
0.0 |
8.0 |
0.666667 |
0.0 |
0.170000 |
0.142857 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
2.0 |
1.0 |
3.0 |
3.0 |
3.0 |
0.5 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
NaN |
NaN |
NaN |
NaN |
1 |
1 |
-1 |
-1 |
1 |
1 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
2 |
2166529 |
6928 |
0.900000 |
5.0 |
20160727 |
27 |
27 |
200 |
20 |
1 |
2280.0 |
100.0 |
3529.0 |
0.0 |
10.0 |
2.592593 |
1.0 |
0.028337 |
0.043860 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
NaN |
NaN |
NaN |
NaN |
1 |
2 |
-1 |
-1 |
2 |
1 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
3 |
2166529 |
1808 |
0.900000 |
5.0 |
20160727 |
27 |
27 |
100 |
10 |
1 |
2280.0 |
100.0 |
3529.0 |
0.0 |
10.0 |
2.592593 |
1.0 |
0.028337 |
0.043860 |
0.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
NaN |
NaN |
NaN |
NaN |
1 |
2 |
-1 |
-1 |
2 |
1 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
4 |
6172162 |
6500 |
0.966667 |
2.0 |
20160708 |
8 |
8 |
30 |
1 |
1 |
1065.0 |
182.0 |
795.0 |
0.0 |
10.0 |
0.783439 |
0.0 |
0.228931 |
0.170892 |
1.0 |
NaN |
NaN |
NaN |
NaN |
0.0 |
1.0 |
1.0 |
NaN |
NaN |
NaN |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
NaN |
NaN |
NaN |
NaN |
1 |
1 |
-1 |
-1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
dataset2
# Load the pre-computed feature tables for training window 1 (dataset2).
coupon2 = pd.read_csv('coupon2_feature.csv')
merchant2 = pd.read_csv('merchant2_feature.csv')
user2 = pd.read_csv('user2_feature.csv')
user_merchant2 = pd.read_csv('user_merchant2.csv')
other_feature2 = pd.read_csv('other_feature2.csv')
coupon2.shape
(258446, 14)
# Left-join every feature table onto the coupon-level training rows.
dataset2 = (coupon2
            .merge(merchant2, on='merchant_id', how='left')
            .merge(user2, on='user_id', how='left')
            .merge(user_merchant2, on=['user_id', 'merchant_id'], how='left')
            .merge(other_feature2,
                   on=['user_id', 'coupon_id', 'date_received'], how='left'))
dataset2.shape
(262240, 51)
# Stringify both date columns ('nan' for missing) so they can later be
# concatenated into "date:date_received" for get_label.
dataset2['date']=dataset2['date'].apply(trans_date)
dataset2['date_received'] = dataset2['date_received'].apply(trans_date)
dataset2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 262240 entries, 0 to 262239
Data columns (total 51 columns):
user_id 262240 non-null int64
merchant_id 262240 non-null int64
coupon_id 262240 non-null float64
discount_rate 262240 non-null float64
distance 231874 non-null float64
date_received 262240 non-null object
date 262240 non-null object
day_of_week 262240 non-null int64
day_of_month 262240 non-null int64
days_distance 262240 non-null int64
discount_man 262240 non-null int64
discount_jian 262240 non-null int64
is_man_jian 262240 non-null int64
coupon_count 262240 non-null int64
total_sales 259460 non-null float64
sales_use_coupon 259460 non-null float64
total_coupon 259460 non-null float64
merchant_min_distance 217418 non-null float64
merchant_max_distance 217418 non-null float64
merchant_mean_distance 217418 non-null float64
merchant_median_distance 217418 non-null float64
merchant_coupon_transfer_rate 240217 non-null float64
coupon_rate 258655 non-null float64
count_merchant 146649 non-null float64
user_min_distance 21545 non-null float64
user_max_distance 21545 non-null float64
user_mean_distance 21545 non-null float64
user_median_distance 21545 non-null float64
buy_use_coupon 146649 non-null float64
buy_total 146649 non-null float64
coupon_received 146649 non-null float64
avg_user_date_datereceived_gap 23316 non-null float64
min_user_date_datereceived_gap 23316 non-null float64
max_user_date_datereceived_gap 23316 non-null float64
buy_use_coupon_rate 112596 non-null float64
user_coupon_transfer_rate 112753 non-null float64
user_merchant_buy_total 73321 non-null float64
user_merchant_received 73321 non-null float64
user_merchant_buy_use_coupon 73321 non-null float64
user_merchant_any 73321 non-null float64
user_merchant_buy_common 73321 non-null float64
user_merchant_coupon_transfer_rate 36972 non-null float64
user_merchant_coupon_buy_rate 64125 non-null float64
user_merchant_rate 64125 non-null float64
user_merchant_common_buy_rate 64125 non-null float64
this_month_user_receive_same_coupon_count 262240 non-null int64
this_month_user_receive_all_coupon_count 262240 non-null int64
this_month_user_receive_same_coupon_lastone 262240 non-null int64
this_month_user_receive_same_coupon_firstone 262240 non-null int64
this_day_user_receive_all_coupon_count 262240 non-null int64
this_day_user_receive_same_coupon_count 262240 non-null int64
dtypes: float64(34), int64(15), object(2)
memory usage: 104.0+ MB
# Missing counts after the joins mean the activity never occurred -> 0.
# (Rate columns are deliberately left NaN.)
for col in ('total_sales', 'sales_use_coupon', 'total_coupon',
            'user_merchant_buy_total', 'user_merchant_received',
            'user_merchant_buy_use_coupon', 'user_merchant_any',
            'user_merchant_buy_common', 'count_merchant',
            'buy_use_coupon', 'buy_total', 'coupon_received'):
    dataset2[col] = dataset2[col].fillna(0)
dataset2.isnull().sum()
user_id 0
merchant_id 0
coupon_id 0
discount_rate 0
distance 30366
date_received 0
date 0
day_of_week 0
day_of_month 0
days_distance 0
discount_man 0
discount_jian 0
is_man_jian 0
coupon_count 0
total_sales 0
sales_use_coupon 0
total_coupon 0
merchant_min_distance 44822
merchant_max_distance 44822
merchant_mean_distance 44822
merchant_median_distance 44822
merchant_coupon_transfer_rate 22023
coupon_rate 3585
count_merchant 0
user_min_distance 240695
user_max_distance 240695
user_mean_distance 240695
user_median_distance 240695
buy_use_coupon 0
buy_total 0
coupon_received 0
avg_user_date_datereceived_gap 238924
min_user_date_datereceived_gap 238924
max_user_date_datereceived_gap 238924
buy_use_coupon_rate 149644
user_coupon_transfer_rate 149487
user_merchant_buy_total 0
user_merchant_received 0
user_merchant_buy_use_coupon 0
user_merchant_any 0
user_merchant_buy_common 0
user_merchant_coupon_transfer_rate 225268
user_merchant_coupon_buy_rate 198115
user_merchant_rate 198115
user_merchant_common_buy_rate 198115
this_month_user_receive_same_coupon_count 0
this_month_user_receive_all_coupon_count 0
this_month_user_receive_same_coupon_lastone 0
this_month_user_receive_same_coupon_firstone 0
this_day_user_receive_all_coupon_count 0
this_day_user_receive_same_coupon_count 0
dtype: int64
# Weekend flag, one-hot weekdays, then the supervision label:
# 1 iff the coupon was consumed within 15 days of receipt.
dataset2['is_weekend'] = dataset2.day_of_week.apply(lambda d: int(d in (6, 7)))
weekday_dummies = pd.get_dummies(dataset2.day_of_week)
weekday_dummies.columns = ['weekday%d' % (i + 1)
                           for i in range(weekday_dummies.shape[1])]
dataset2 = pd.concat([dataset2, weekday_dummies], axis=1)
dataset2['label'] = (dataset2.date + ':' + dataset2.date_received).apply(get_label)
dataset2[['date_received','date','label']].head(30)
|
date_received |
date |
label |
0 |
20160528 |
nan |
0 |
1 |
20160613 |
nan |
0 |
2 |
20160516 |
20160613 |
0 |
3 |
20160530 |
nan |
0 |
4 |
20160519 |
nan |
0 |
5 |
20160606 |
nan |
0 |
6 |
20160606 |
nan |
0 |
7 |
20160518 |
nan |
0 |
8 |
20160528 |
nan |
0 |
9 |
20160524 |
nan |
0 |
10 |
20160523 |
nan |
0 |
11 |
20160515 |
20160521 |
1 |
12 |
20160521 |
nan |
0 |
13 |
20160523 |
nan |
0 |
14 |
20160602 |
nan |
0 |
15 |
20160523 |
20160605 |
1 |
16 |
20160605 |
nan |
0 |
17 |
20160607 |
nan |
0 |
18 |
20160605 |
nan |
0 |
19 |
20160601 |
nan |
0 |
20 |
20160530 |
nan |
0 |
21 |
20160522 |
nan |
0 |
22 |
20160611 |
nan |
0 |
23 |
20160524 |
nan |
0 |
24 |
20160526 |
nan |
0 |
25 |
20160520 |
nan |
0 |
26 |
20160527 |
nan |
0 |
27 |
20160602 |
nan |
0 |
28 |
20160526 |
nan |
0 |
29 |
20160521 |
nan |
0 |
# Remove raw ids/dates so only model features plus the label are persisted.
cols_to_drop = ['merchant_id', 'day_of_week', 'date', 'date_received',
                'coupon_id', 'coupon_count']
dataset2 = dataset2.drop(cols_to_drop, axis=1)
dataset2.to_csv('dataset2.csv', index=None)
dataset1
# Load the pre-computed feature tables for training window 2 (dataset1)
# and left-join them onto the coupon-level rows.
coupon1 = pd.read_csv('coupon1_feature.csv')
merchant1 = pd.read_csv('merchant1_feature.csv')
user1 = pd.read_csv('user1_feature.csv')
user_merchant1 = pd.read_csv('user_merchant1.csv')
other_feature1 = pd.read_csv('other_feature1.csv')
dataset1 = pd.merge(coupon1, merchant1, on='merchant_id', how='left')
dataset1 = pd.merge(dataset1, user1, on='user_id', how='left')
dataset1 = pd.merge(dataset1, user_merchant1, on=[
    'user_id', 'merchant_id'], how='left')
dataset1 = pd.merge(dataset1, other_feature1, on=[
    'user_id', 'coupon_id', 'date_received'], how='left')
dataset1.shape
(139785, 51)
# Missing counts after the joins mean the activity never occurred -> 0.
# (Rate columns are deliberately left NaN.)
for col in ('total_sales', 'sales_use_coupon', 'total_coupon',
            'user_merchant_buy_total', 'user_merchant_received',
            'user_merchant_buy_use_coupon', 'user_merchant_any',
            'user_merchant_buy_common', 'count_merchant',
            'buy_use_coupon', 'buy_total', 'coupon_received'):
    dataset1[col] = dataset1[col].fillna(0)
dataset1.isnull().sum()
user_id 0
merchant_id 0
coupon_id 0
discount_rate 0
distance 15492
date_received 0
date 127747
day_of_week 0
day_of_month 0
days_distance 0
discount_man 0
discount_jian 0
is_man_jian 0
coupon_count 0
total_sales 0
sales_use_coupon 0
total_coupon 0
merchant_min_distance 33627
merchant_max_distance 33627
merchant_mean_distance 33627
merchant_median_distance 33627
merchant_coupon_transfer_rate 25176
coupon_rate 3950
count_merchant 0
user_min_distance 131176
user_max_distance 131176
user_mean_distance 131176
user_median_distance 131176
buy_use_coupon 0
buy_total 0
coupon_received 0
avg_user_date_datereceived_gap 130508
min_user_date_datereceived_gap 130508
max_user_date_datereceived_gap 130508
buy_use_coupon_rate 84347
user_coupon_transfer_rate 86167
user_merchant_buy_total 0
user_merchant_received 0
user_merchant_buy_use_coupon 0
user_merchant_any 0
user_merchant_buy_common 0
user_merchant_coupon_transfer_rate 123040
user_merchant_coupon_buy_rate 103555
user_merchant_rate 103555
user_merchant_common_buy_rate 103555
this_month_user_receive_same_coupon_count 0
this_month_user_receive_all_coupon_count 0
this_month_user_receive_same_coupon_lastone 0
this_month_user_receive_same_coupon_firstone 0
this_day_user_receive_all_coupon_count 0
this_day_user_receive_same_coupon_count 0
dtype: int64
# Weekend flag, one-hot weekdays, the 15-day label, then persist features.
dataset1['is_weekend'] = dataset1.day_of_week.apply(lambda d: int(d in (6, 7)))
weekday_dummies = pd.get_dummies(dataset1.day_of_week)
weekday_dummies.columns = ['weekday%d' % (i + 1)
                           for i in range(weekday_dummies.shape[1])]
dataset1 = pd.concat([dataset1, weekday_dummies], axis=1)
# Stringify dates so they can be concatenated for get_label.
dataset1['date'] = dataset1['date'].apply(trans_date)
dataset1['date_received'] = dataset1['date_received'].apply(trans_date)
dataset1['label'] = (dataset1.date + ':' + dataset1.date_received).apply(get_label)
cols_to_drop = ['merchant_id', 'day_of_week', 'date', 'date_received',
                'coupon_id', 'coupon_count']
dataset1 = dataset1.drop(cols_to_drop, axis=1)
dataset1.to_csv('dataset1.csv', index=None)
模型建立与预测
from IPython.core.interactiveshell import InteractiveShell
from jupyterthemes import jtplot
import os
from datetime import date
import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
jtplot.style(theme='oceans16')
InteractiveShell.ast_node_interactivity = "all"
# Reload the assembled datasets from disk for modeling.
dataset1 = pd.read_csv('dataset1.csv')
dataset2 = pd.read_csv('dataset2.csv')
dataset3 = pd.read_csv('dataset3.csv')
# Disabled experiment below: de-duplicating rows (kept as a string on
# purpose so it is not executed).
'''dataset1.shape,dataset2.shape,dataset3.shape
dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset3.drop_duplicates(inplace=True)
dataset1.shape,dataset2.shape,dataset3.shape'''
'dataset1.shape,dataset2.shape,dataset3.shape\ndataset1.drop_duplicates(inplace=True)\ndataset2.drop_duplicates(inplace=True)\ndataset3.drop_duplicates(inplace=True)\ndataset1.shape,dataset2.shape,dataset3.shape'
# Stack both training windows; split labels from features for each set.
dataset12 = pd.concat([dataset1, dataset2], axis=0)
dataset1_y = dataset1.label
dataset1_x = dataset1.drop(
    ['user_id', 'label'], axis=1)
dataset2_y = dataset2.label
dataset2_x = dataset2.drop(
    ['user_id', 'label'], axis=1)
dataset12_y = dataset12.label
dataset12_x = dataset12.drop(
    ['user_id', 'label'], axis=1)
# BUG FIX: take a .copy() so the later `dataset3_preds['label'] = ...`
# writes into an owned frame instead of a view of dataset3 (avoids
# SettingWithCopyWarning and a potentially lost assignment).
dataset3_preds = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset3_x = dataset3.drop(
    ['user_id', 'coupon_id', 'date_received'], axis=1)
dataset1_x.shape, dataset2_x.shape, dataset3_x.shape
((139785, 52), (262240, 52), (116204, 52))
# Wrap features (and labels, where available) in XGBoost's DMatrix format;
# the test window dataset3 has no labels.
dataset1 = xgb.DMatrix(dataset1_x,label=dataset1_y)
dataset2 = xgb.DMatrix(dataset2_x,label=dataset2_y)
dataset12 = xgb.DMatrix(dataset12_x,label=dataset12_y)
dataset3 = xgb.DMatrix(dataset3_x)
D:\anaconda3\lib\site-packages\xgboost\core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version
if getattr(data, 'base', None) is not None and \
D:\anaconda3\lib\site-packages\xgboost\core.py:588: FutureWarning: Series.base is deprecated and will be removed in a future version
data.base is not None and isinstance(data, np.ndarray) \
# XGBoost hyper-parameters: pairwise ranking objective evaluated by AUC.
params={'booster':'gbtree',
'objective': 'rank:pairwise',  # rank coupons by likelihood of use
'eval_metric':'auc',
'gamma':0.1,  # min loss reduction required to make a split
'min_child_weight':1.1,
'max_depth':5,
'lambda':10,  # L2 regularization weight
'subsample':0.7,  # row sampling per tree
'colsample_bytree':0.7,  # column sampling per tree / per level
'colsample_bylevel':0.7,
'eta': 0.01,  # small learning rate, paired with many boosting rounds
'tree_method':'exact',
'seed':0,
'nthread':12
}
# Train on the combined windows (train AUC only — no held-out eval set),
# then score the test coupons.
watchlist = [(dataset12,'train')]
model = xgb.train(params,dataset12,num_boost_round=3500,evals=watchlist)
dataset3_preds['label'] = model.predict(dataset3)
# rank:pairwise scores are unbounded; min-max scale them into [0, 1].
dataset3_preds.label = MinMaxScaler().fit_transform(
dataset3_preds.label.values.reshape(-1, 1))
dataset3_preds.sort_values(by=['coupon_id', 'label'], inplace=True)
dataset3_preds.to_csv("xgb_preds.csv", index=None, header=None)
dataset3_preds.describe()
|
user_id |
coupon_id |
date_received |
label |
count |
1.162040e+05 |
116204.000000 |
1.162040e+05 |
116204.000000 |
mean |
3.678026e+06 |
9014.669702 |
2.016072e+07 |
0.310783 |
std |
2.122548e+06 |
4136.488170 |
9.016543e+00 |
0.129064 |
min |
2.090000e+02 |
3.000000 |
2.016070e+07 |
0.000000 |
25% |
1.858042e+06 |
5143.000000 |
2.016071e+07 |
0.229695 |
50% |
3.666466e+06 |
9983.000000 |
2.016072e+07 |
0.290300 |
75% |
5.516537e+06 |
13602.000000 |
2.016072e+07 |
0.373554 |
max |
7.361024e+06 |
14045.000000 |
2.016073e+07 |
1.000000 |