-
Notifications
You must be signed in to change notification settings - Fork 0
/
gen_feat.py
94 lines (85 loc) · 4.14 KB
/
gen_feat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from datetime import datetime,timedelta
import pandas as pd
import os,copy,math,time
import numpy as np
np.random.seed(2017)
from sklearn import preprocessing
from sklearn.utils import shuffle
from config import config
from utils import *
# 划分训练数据集合测试数据集
def split_train_test():
action = pd.read_csv(config.action_data_file,sep="\t")
split_date = datetime.strptime(config.split_date, config.date_format )
fun = lambda x: datetime.strptime(x,config.date_format)
action["FirstDate"] = action["FirstDate"].map( fun )
train_df = action[ action["FirstDate"]<split_date ].reset_index(drop=True)
train_df["FirstDate"] = train_df["FirstDate"].map( lambda x: x.strftime(config.date_format) )
train_df.to_csv(config.train_data_file,index=False,index_label=False)
test_df = action[ action["FirstDate"]>=split_date ].reset_index(drop=True)
test_df["FirstDate"] = test_df["FirstDate"].map( lambda x: x.strftime(config.date_format) )
test_df.to_csv(config.test_data_file,index=False,index_label=False)
# 获取label=1的用户
def get_label_1_users(action):
user_df = action[["CustomerID","label"]].groupby(["CustomerID"],as_index=False).sum().reset_index(drop=True)
user_df = user_df[user_df["label"]>=1].reset_index(drop=True)
user_df["label"] = 1.0
return user_df
# groupby CustomerID 前操作
def gen_feat1( data_file ):
print( "begin to gen_feat1 ......" )
action = pd.read_csv(data_file)
# 去除掉退货的数据: Trans=-1的列
action = action[action["Trans"]==1].reset_index(drop=True)
fun1 = lambda x: datetime.strptime(x, config.date_format) - datetime.strptime(config.begin_date, config.date_format)
fun2 = lambda x: fun1(x).days #时间差转换成距离历史开始的天数
action["FirstDate"] = action["FirstDate"].map( fun2 )
action["OrderDate"] = action["OrderDate"].map( fun2 )
# 标注label
action["label"] = 0.0
label_1_idx = action[ (action["OrderDate"]-action["FirstDate"]<=45)&(action["OrderDate"]-action["FirstDate"]>=1)].index
action.ix[list(label_1_idx),"label"] = 1.0
user_df = get_label_1_users(action)
del action["label"]
action = pd.merge(action,user_df,on="CustomerID",how="left")
action = action.replace(np.nan,0.0)
# 获取用户age
fun1 = lambda x: datetime.strptime(config.now_date, config.date_format) - datetime.strptime(x, config.date_format)
fun2 = lambda x: int(fun1(x).days/365) # 转换为用户的age属性
action["Birthday"] = action["Birthday"].map( fun2 )
action.rename(columns={'Birthday' : 'age'}, inplace=True)
################## ProductCategory 列one-hot处理 ###################
category_df = pd.get_dummies(action["ProductCategory"], prefix="")
action = pd.concat([action,category_df],axis=1)
# 转换product为int值
action["Product"] = action["Product"].map( lambda x: int(x[3:]) )
del action["ProductCategory"],action["FirstDate"],action["OrderDate"],action["Trans"]
action = action.groupby(["CustomerID"],as_index=False).first().reset_index(drop=True)
return action
# 获得train_data或者vali_data
def get_train_data():
print( "begin to get_train_data ..." )
user_df = gen_feat1(config.train_data_file)
user_df = shuffle( user_df )
train_column = ["age","Product","Items","UnitPrice","_Category1","_Category2","_Category3"]
train_x = user_df[train_column].values
train_y = user_df["label"].values
return train_x, train_y
def get_test_data():
print( "begin to get_test_data ..." )
user_df = gen_feat1(config.test_data_file)
user_df = user_df[user_df["label"]==0].reset_index(drop=0)
test_column = ["age","Product","Items","UnitPrice","_Category1","_Category2","_Category3"]
test_x = user_df[test_column].values
user_id = user_df["CustomerID"].values
return user_id, test_x
if __name__ == '__main__':
split_train_test()
# action = pd.read_csv(config.action_data_file,sep="\t")
# a = action[["CustomerID"]].drop_duplicates(["CustomerID"])
# print len(a.index)
# split_train_test()
# train_x, train_y = get_train_data()
# user_id, test_x = get_test_data()