-
Notifications
You must be signed in to change notification settings - Fork 15
/
base_add_feature.py
117 lines (70 loc) · 3.29 KB
/
base_add_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# coding: utf-8
# In[1]:
import pandas as pd
from tqdm import tqdm
import jieba, os, Levenshtein, time
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from utility import read_file, lcseque_lens, lcsubstr_lens, find_longest_prefix, printlog
from sklearn import preprocessing
import numpy as np
from xpinyin import Pinyin
# In[2]:
print('run base_add_feature')
# 配置信息
is_print_output = True
all_start_time = time.time()
# In[3]:
since = time.time()
# 读入数据
train_data = read_file('Demo/DataSets/oppo_data_ronud2_20181107/data_train.txt')
val_data = read_file('Demo/DataSets/oppo_data_ronud2_20181107/data_vali.txt')
test_data = read_file('Demo/DataSets/oppo_round2_test_B/oppo_round2_test_B.txt', True)
# 拼接数据一起做特征
not_zip_all_data = pd.concat((train_data, val_data, test_data), axis = 0, ignore_index = True, sort = False)
time_elapsed = time.time() - since
print('complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)) # 打印出来时间
# In[4]:
since = time.time()
# 修正为空的query_prediction
not_zip_all_data.loc[not_zip_all_data.query_prediction == '', 'query_prediction'] = '{}'
# 修正label为int
not_zip_all_data['label'] = not_zip_all_data.label.astype('int')
# 保存要丢弃掉的列
drop_feature = []
time_elapsed = time.time() - since
print('complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)) # 打印出来时间
# In[5]:
since = time.time()
# 根据prefix query_prediction title tag来merge, query_prediction需要编码处理
encoder = preprocessing.LabelEncoder()
not_zip_all_data['diction_label'] = encoder.fit_transform(not_zip_all_data.query_prediction)
# 去除重复算非统计量特征
all_data = not_zip_all_data.drop('label', axis = 1).drop_duplicates().reset_index(drop = True)
drop_feature.append('diction_label')
time_elapsed = time.time() - since
print('complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)) # 打印出来时间
# In[9]:
all_data['prefix_contains_tag'] = all_data.groupby('prefix').tag.transform(lambda x: " ".join(x))
# In[12]:
since = time.time()
# 拼接回原数据
all_data = all_data.drop('query_prediction', axis = 1)
not_zip_all_data = pd.merge(not_zip_all_data, all_data, how = 'left', on = ['prefix', 'title', 'tag', 'diction_label'])
time_elapsed = time.time() - since
print('complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)) # 打印出来时间
# In[14]:
since = time.time()
cv = CountVectorizer() #, max_df = 0.03
prefix_contains_tag_matrix = cv.fit_transform(not_zip_all_data.prefix_contains_tag)
time_elapsed = time.time() - since
print('complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)) # 打印出来时间
# In[17]:
since = time.time()
# 保存数据
sparse.save_npz('Demo/Cases/prefix_contains_tag.npz', prefix_contains_tag_matrix)
time_elapsed = time.time() - since
print('complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)) # 打印出来时间
# In[18]:
time_elapsed = time.time() - all_start_time
print('final complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)) # 打印出来时间