-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_wrangling.py
61 lines (53 loc) · 1.94 KB
/
data_wrangling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 16:06:40 2020
@author: adtor97
"""
#%%
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
#%%
# Load the raw customers table; the input CSV is ";"-separated.
path_to_file = "C:\\Users\\USUARIO\\Desktop\\Python\\Repo\\customers_clustering\\inputs\\customers.csv"
# Read, shuffle the rows, then renumber them 0..n-1 (the pre-shuffle index
# is discarded rather than kept as a column).
df = shuffle(pd.read_csv(path_to_file, sep=";")).reset_index(drop=True)
# Names of every column except the customer identifier.
original_columns = df.columns.drop("ID_customer").to_list()
print(df.head(), df.columns, df.shape)
#%%
# Descriptive statistics of the loaded table.
print(df.describe())
#%%
# Analyze data distribution: one panel per feature (every column except the
# customer id), filled column by column on a grid that is 5 rows tall.
features = df.drop("ID_customer", axis=1).columns
grid_rows = 5
# Keep the original 5x5 layout as the minimum and add columns only when there
# are more than 25 features; a fixed 5x5 grid raised an IndexError in that case.
grid_cols = max(5, -(-len(features) // grid_rows))  # ceil division, no import needed
f, axes = plt.subplots(
    grid_rows, grid_cols, figsize=(4 * grid_cols, 20), sharex=False
)
for i, feature in enumerate(features):
    # Infinite values are replaced with 0 before plotting.
    x = df[feature].replace([np.inf, -np.inf], 0)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases
    # (histplot/displot are the documented replacements); kept here so the
    # cell still runs on the seaborn version this script was written for.
    sns.distplot(x, color="skyblue", ax=axes[i % grid_rows, i // grid_rows])
#%%
# Apply proportion transformation for behavioural variables.
# Each inner list is a group of related columns; every column in a group is
# re-expressed as its share of a denominator and stored in a new
# "prop_<column>" column.
columns_associated = [
    ['web_purchases', 'app_purchases'],
    ['mon_thur_purchases', 'frid_sund_purchases'],
    ['lunch_purchases', 'evening_purchases'],
    ['low_ticket', 'medium_ticket', 'big_ticket'],
]
for columns in columns_associated:
    if len(columns) == 1:
        # A lone column has no siblings to sum with, so it is divided by the
        # "purchases" column instead.
        denominator = df["purchases"]
    else:
        # Row-wise total of the group. It does not change while the group's
        # prop_ columns are being added, so it is computed once per group
        # instead of once per column.
        denominator = df[columns].sum(axis=1)
    for column in columns:
        # NaN results (e.g. 0 / 0 for rows where the denominator is 0) are
        # stored as a proportion of 0.
        df["prop_" + column] = (df[column] / denominator).fillna(0)
# Every column whose name contains "prop_".
prop_columns = [col for col in df.columns if ('prop_' in col)]
print(len(df.columns))
#%%
# Apply log transformation for skewed data.
# Every column except the customer id gets a "converted_<column>" twin.
# log1p (log(1 + x)) is used instead of log: with plain log, log(0) = -inf had
# to be overwritten with 0, which is also log(1) -- so inputs of 0 and 1 became
# indistinguishable, and 0 ended up ranked above every value in (0, 1).
# log1p maps 0 to 0 and keeps the ordering of non-negative values intact.
for column in df.drop(["ID_customer"], axis=1).columns:
    new_column = "converted_" + column
    converted = np.log1p(df[column])
    # Guard kept from the original: any remaining +/-inf (e.g. log1p(-1) or an
    # infinite input) is stored as 0 so the output has no infinities.
    df[new_column] = converted.replace([np.inf, -np.inf], 0)
# Every column whose name contains "converted_".
converted_columns = [col for col in df.columns if ('converted_' in col)]
print(len(df.columns))
#%%
# Write the wrangled table to disk as CSV, leaving the row index out.
# NOTE(review): no `sep` is passed here, whereas the input file is read with
# sep=";" -- confirm downstream readers expect the default separator.
path_to_output = "C:\\Users\\USUARIO\\Desktop\\Python\\Repo\\customers_clustering\\outputs\\customers_wrangled.csv"
df.to_csv(path_to_output, index=False)