# Author: Alastair Hamilton
# Date: May/June 2018
# Title: Data processing for the Home Depot Kaggle competition
# Imports
## Data Wrangling
import numpy as np
import pandas as pd
## Misc
import os
import re
from pprint import pprint as pp
import time
## Plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
## NLP
import spacy
## ML
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Config
# # Silence pandas' chained-assignment warnings
pd.options.mode.chained_assignment = None
# # Set path to data
data_path = "./data/"
# # Create spaCy NLP tagger (the 'en' shortcut is the spaCy 2.x name;
# # under spaCy 3.x this would be spacy.load('en_core_web_sm', ...))
nlp_tag = spacy.load('en', disable=['parser', 'ner'])
# # Text features to be run through the spaCy pipeline
proc_feat = ['search_term', 'product_title', 'product_description', 'attributes']
# Functions
# Remove punctuation tokens from a spaCy-processed column of a dataframe
# (currently unused helper)
def rmv_punc(df, col):
    df.loc[:, col] = df.loc[:, col].apply(lambda x: tuple(filter(lambda y: not y.is_punct, x)))
    return df
# Remove stop words and punctuation from a Series of spaCy docs
# (currently unused helper)
def rmv_stoppunc(s):
    return s.apply(lambda x: tuple(filter(lambda y: not (y.is_stop or y.is_punct), x)))
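# A minimal illustration of the two helpers above (kept as comments so the
# script's behavior is unchanged; the example phrase is arbitrary):
#   doc = nlp_tag('angle bracket, rust-proof')
#   rmv_stoppunc(pd.Series([doc]))              # drops punctuation and stop words
#   rmv_punc(pd.DataFrame({'c': [doc]}), 'c')   # drops punctuation only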
# Apply a two-argument function across the rows of a two-column dataframe
def func_row(df, func):
    return df.apply(lambda row: func(row[0], row[1]), axis=1)
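# For example, func_row(data[['search_term', 'product_title']], f) calls
# f(search_term, product_title) once per row and returns a Series.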
# Count how many words of one document (doc1) appear in another (doc2).
# A substring test on the lemmas is used rather than an exact lemma match,
# so partial matches such as plural forms are also credited; each word in
# doc1 is counted at most once.
def common_words_doc(doc1, doc2):
    tot = 0
    for w1 in doc1:
        for w2 in doc2:
            if w2.lemma_.find(w1.lemma_) >= 0:
                tot += 1
                break
    return tot
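# A rough illustration (actual lemmas depend on the loaded model):
#   common_words_doc(nlp_tag('bath faucet'), nlp_tag('bathtub faucets'))
# would return 2, since 'bath' is a substring of 'bathtub' and the lemma
# of 'faucets' contains 'faucet'.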
# Get starting time for script
stime = time.time()
# -------------------------
print('\n-------------------------')
# Import data
print('Importing data...')
# # Get all zipped files in data path
zips = [f for f in os.listdir(data_path) if re.search(r"\.zip$", f)]
# # Unzip all files and put into dictionary, keyed by file stem
data_dict = {}
for zipped in zips:
    print('- Importing {}...'.format(zipped))
    data_dict[zipped.split('.')[0]] = pd.read_csv(data_path+zipped, compression='zip', encoding='latin1')
# # Pull the individual dataframes out of the data dictionary
train_df = data_dict['train']
prod_desc = data_dict['product_descriptions']
attributes = data_dict['attributes']
# # Clean up
del data_dict
# # Time
print('Took {:.2f} minutes.'.format((time.time()-stime)/60))
# -------------------------
print('-------------------------')
# Time
itime = time.time()
# Process data
print("Processing data...")
# # Process attributes data
print("- Handling attributes data...")
# # # Deal with N/As in attributes data (drop empty records and fill in name and values with empty string)
attr = attributes.dropna(how='all')
attr[['name','value']] = attr[['name','value']].fillna('')
# # # Ensure UID is an integer
attr['product_uid'] = attr['product_uid'].astype(int)
# # # If "bullet" in attribute name then asserting name is meaningless - make an empty string
attr['name'] = attr['name'].apply(lambda x: '' if "Bullet" in x else x)
# # # Group name and value in attributes into single column, separated by a tab and ending in newline (for grouping stage next)
attr['attributes'] = attr['name'] + '\t' + attr['value'] + '\n'
# # # Drop name and values, groupby UID and sum grouped values, reset index...
# # # ...(ie. all attributes in single cell now, separated by newlines as set up above)
attr = attr.drop(['name','value'], axis=1).groupby('product_uid').sum().reset_index()
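# Summing strings concatenates them, so the groupby above effectively joins
# every name/value pair for a product into one cell. Roughly:
#   pd.Series(['a\tb\n', 'c\td\n']).sum()  ->  'a\tb\nc\td\n'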
# # Create master data frame
print("- Creating master data frame...")
# # # Merge descriptions and attributes onto the training data on UID,
# # # drop the per-row id, and fill any remaining NAs with empty strings
data = (pd.merge(train_df, prod_desc, how='left', on='product_uid')
        .drop('id', axis=1)
        .merge(attr, on='product_uid', how='left')
        .fillna(''))
# # # Finally create a master index column, used to reference individual search terms
data = data.reset_index(drop=True).reset_index()
# # Clean up
del train_df
del prod_desc
del attr
# # Time
print('Took {:.2f} minutes.'.format((time.time()-itime)/60))
# -------------------------
print("-------------------------")
# Time
itime = time.time()
# Feature Generation
print("Generating features...")
# # NLP
print("- Applying spaCy NLP processor to all string features...")
for feat in proc_feat:
    print('-- Applying to {}'.format(feat))
    data[feat] = data[feat].apply(nlp_tag)
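# Note: for large frames, spaCy's nlp.pipe batches documents and is usually
# faster than a row-wise apply; inside the loop above one could instead
# write (a sketch, untested here):
#   data[feat] = list(nlp_tag.pipe(data[feat].astype(str)))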
# # Len of query
print('- Creating length of query column...')
data['q_len'] = data['search_term'].apply(len)
# # Get common words between query and returned product title
print('- Creating query-title common words column...')
data['com_title'] = func_row(data[['search_term', 'product_title']], common_words_doc)
# # Get common words between query and returned product description
print('- Creating query-description common words column...')
data['com_desc'] = func_row(data[['search_term', 'product_description']], common_words_doc)
# # Get common words between query and returned product attributes
print('- Creating query-attributes common words column...')
data['com_attr'] = func_row(data[['search_term', 'attributes']], common_words_doc)
# # Clean up
data = data.drop(proc_feat, axis=1)
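# At this point the frame holds only: index, product_uid, relevance,
# q_len, com_title, com_desc and com_attr.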
# # Write data to file
print('- Writing data to file...')
data.to_csv(data_path+'features.csv', index=False)  # the explicit 'index' column is the row key
# # Time
print('Took {:.2f} minutes.'.format((time.time()-itime)/60))
# Finish up
pp(data.head())
print('Script took a total of {:.2f} minutes'.format((time.time()-stime)/60))
print('Done!')