#Team 2A: Wei Wang, Zhang Haoran,Shane Reid
#import all the library we need
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.formula.api import ols
#import the WineSales dataset
Sales=pd.read_csv("WineSales.csv")
Sales
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
sales |
nps |
marketing_spend |
products |
week |
quarter |
0 |
$1,068 |
4.8 |
$442 |
wine 2 |
2015/1/4 |
20151 |
1 |
$992 |
4.7 |
$442 |
wine 2 |
2015/1/11 |
20151 |
2 |
$1,025 |
5.5 |
$442 |
wine 2 |
2015/1/18 |
20151 |
3 |
$1,030 |
5.0 |
$442 |
wine 2 |
2015/1/25 |
20151 |
4 |
$850 |
4.6 |
$442 |
wine 2 |
2015/2/1 |
20151 |
... |
... |
... |
... |
... |
... |
... |
243 |
$2,917 |
8.2 |
$234 |
wine 3 |
2019/9/1 |
20193 |
244 |
$3,218 |
8.3 |
$234 |
wine 3 |
2019/9/8 |
20193 |
245 |
$2,847 |
6.8 |
$234 |
wine 3 |
2019/9/15 |
20193 |
246 |
$3,292 |
7.8 |
$234 |
wine 3 |
2019/9/22 |
20193 |
247 |
$2,999 |
7.8 |
$234 |
wine 3 |
2019/9/29 |
20193 |
248 rows × 6 columns
#GOAL 1.1 Impute missing values
#There are 34 missing values in nps
print("Missing value before interpolate\n",Sales.isnull().sum())
#We use linear interpolate method to impute missing values
Sales['nps']=Sales['nps'].interpolate()
print("\nNo Missing value after interpolate\n",Sales.isnull().sum())
#We can see from below that all missing values are imputed.
Missing value before interpolate
sales 0
nps 34
marketing_spend 0
products 0
week 0
quarter 0
dtype: int64
No Missing value after interpolate
sales 0
nps 0
marketing_spend 0
products 0
week 0
quarter 0
dtype: int64
#GOAL 1.2 one-hot encode products category
Sales.replace("wine 1,wine 3,wine 2","wine 1,wine 2,wine 3",inplace=True)
Sales.replace("wine 2,wine 1,wine 3","wine 1,wine 2,wine 3",inplace=True)
Sales.replace("wine 3,wine 1","wine 1,wine 3",inplace=True)
Sales.replace("wine 2","wine2",inplace=True)
#Use get_dummies method to one-hot encode the products category.
OneHotEncode= pd.get_dummies(Sales.products)
## delete the previous products column
Sales.drop('products',inplace=True,axis=1)
## add the onehotencode to the table
Sales=Sales.join(OneHotEncode)
Sales
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
sales |
nps |
marketing_spend |
week |
quarter |
wine 1,wine 2,wine 3 |
wine 1,wine 3 |
wine 2,wine 1 |
wine 2,wine 3 |
wine 3 |
wine2 |
0 |
$1,068 |
4.8 |
$442 |
2015/1/4 |
20151 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
$992 |
4.7 |
$442 |
2015/1/11 |
20151 |
0 |
0 |
0 |
0 |
0 |
1 |
2 |
$1,025 |
5.5 |
$442 |
2015/1/18 |
20151 |
0 |
0 |
0 |
0 |
0 |
1 |
3 |
$1,030 |
5.0 |
$442 |
2015/1/25 |
20151 |
0 |
0 |
0 |
0 |
0 |
1 |
4 |
$850 |
4.6 |
$442 |
2015/2/1 |
20151 |
0 |
0 |
0 |
0 |
0 |
1 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
243 |
$2,917 |
8.2 |
$234 |
2019/9/1 |
20193 |
0 |
0 |
0 |
0 |
1 |
0 |
244 |
$3,218 |
8.3 |
$234 |
2019/9/8 |
20193 |
0 |
0 |
0 |
0 |
1 |
0 |
245 |
$2,847 |
6.8 |
$234 |
2019/9/15 |
20193 |
0 |
0 |
0 |
0 |
1 |
0 |
246 |
$3,292 |
7.8 |
$234 |
2019/9/22 |
20193 |
0 |
0 |
0 |
0 |
1 |
0 |
247 |
$2,999 |
7.8 |
$234 |
2019/9/29 |
20193 |
0 |
0 |
0 |
0 |
1 |
0 |
248 rows × 11 columns
# GOAL 2.1 Aggregate features by quarter
Sales['sales'] = Sales['sales'].map(lambda x: str(x)[1:])
Sales['sales'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
Sales=Sales.astype({"sales":"float"})
Sales['marketing_spend'] = Sales['marketing_spend'].map(lambda x: str(x)[1:])
Sales['marketing_spend'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
Sales=Sales.astype({"marketing_spend":"float"})
#Aggregate features by quarter and then we get total quarter sales, nps, marketing_spend, and wines products
Sales = Sales.groupby(['quarter']).agg({"sales":"sum",'nps':'sum','marketing_spend':'sum','wine 1,wine 2,wine 3':'sum','wine 1,wine 3':'sum','wine2':'sum','wine 2,wine 1':'sum','wine 2,wine 3':'sum','wine 3':'sum'})
Sales
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
sales |
nps |
marketing_spend |
wine 1,wine 2,wine 3 |
wine 1,wine 3 |
wine2 |
wine 2,wine 1 |
wine 2,wine 3 |
wine 3 |
quarter |
|
|
|
|
|
|
|
|
|
20151 |
13330.0 |
65.35 |
5746.0 |
0 |
0 |
13 |
0 |
0 |
0 |
20152 |
46022.0 |
64.15 |
1664.0 |
0 |
13 |
0 |
0 |
0 |
0 |
20153 |
35934.0 |
66.30 |
2119.0 |
0 |
0 |
0 |
0 |
13 |
0 |
20154 |
28126.0 |
63.20 |
3874.0 |
0 |
0 |
13 |
0 |
0 |
0 |
20161 |
35362.0 |
59.60 |
4992.0 |
13 |
0 |
0 |
0 |
0 |
0 |
20162 |
44154.0 |
64.65 |
481.0 |
0 |
13 |
0 |
0 |
0 |
0 |
20163 |
34924.0 |
64.10 |
273.0 |
0 |
0 |
0 |
0 |
13 |
0 |
20164 |
36365.0 |
63.60 |
5135.0 |
13 |
0 |
0 |
0 |
0 |
0 |
20171 |
30554.0 |
65.95 |
845.0 |
0 |
0 |
0 |
13 |
0 |
0 |
20172 |
39667.0 |
70.25 |
7306.0 |
13 |
0 |
0 |
0 |
0 |
0 |
20173 |
47186.0 |
78.65 |
845.0 |
0 |
13 |
0 |
0 |
0 |
0 |
20174 |
41857.0 |
105.35 |
3696.0 |
14 |
0 |
0 |
0 |
0 |
0 |
20181 |
36054.0 |
95.25 |
6648.0 |
0 |
12 |
0 |
0 |
0 |
0 |
20182 |
51559.0 |
105.00 |
7631.0 |
13 |
0 |
0 |
0 |
0 |
0 |
20183 |
54924.0 |
110.50 |
1344.0 |
14 |
0 |
0 |
0 |
0 |
0 |
20184 |
41654.0 |
105.10 |
4147.0 |
13 |
0 |
0 |
0 |
0 |
0 |
20191 |
39984.0 |
101.90 |
4888.0 |
13 |
0 |
0 |
0 |
0 |
0 |
20192 |
49520.0 |
102.90 |
5265.0 |
0 |
13 |
0 |
0 |
0 |
0 |
20193 |
45107.0 |
101.45 |
3042.0 |
0 |
0 |
0 |
0 |
0 |
13 |
# GOAL 2.2
#shift these features with quarter sales
Sales=Sales.assign(nps=lambda x: x.nps.shift(1).values,
marketing_spend=lambda x: x.marketing_spend.shift(1).values)
Sales
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
sales |
nps |
marketing_spend |
wine 1,wine 2,wine 3 |
wine 1,wine 3 |
wine2 |
wine 2,wine 1 |
wine 2,wine 3 |
wine 3 |
quarter |
|
|
|
|
|
|
|
|
|
20151 |
13330.0 |
NaN |
NaN |
0 |
0 |
13 |
0 |
0 |
0 |
20152 |
46022.0 |
65.35 |
5746.0 |
0 |
13 |
0 |
0 |
0 |
0 |
20153 |
35934.0 |
64.15 |
1664.0 |
0 |
0 |
0 |
0 |
13 |
0 |
20154 |
28126.0 |
66.30 |
2119.0 |
0 |
0 |
13 |
0 |
0 |
0 |
20161 |
35362.0 |
63.20 |
3874.0 |
13 |
0 |
0 |
0 |
0 |
0 |
20162 |
44154.0 |
59.60 |
4992.0 |
0 |
13 |
0 |
0 |
0 |
0 |
20163 |
34924.0 |
64.65 |
481.0 |
0 |
0 |
0 |
0 |
13 |
0 |
20164 |
36365.0 |
64.10 |
273.0 |
13 |
0 |
0 |
0 |
0 |
0 |
20171 |
30554.0 |
63.60 |
5135.0 |
0 |
0 |
0 |
13 |
0 |
0 |
20172 |
39667.0 |
65.95 |
845.0 |
13 |
0 |
0 |
0 |
0 |
0 |
20173 |
47186.0 |
70.25 |
7306.0 |
0 |
13 |
0 |
0 |
0 |
0 |
20174 |
41857.0 |
78.65 |
845.0 |
14 |
0 |
0 |
0 |
0 |
0 |
20181 |
36054.0 |
105.35 |
3696.0 |
0 |
12 |
0 |
0 |
0 |
0 |
20182 |
51559.0 |
95.25 |
6648.0 |
13 |
0 |
0 |
0 |
0 |
0 |
20183 |
54924.0 |
105.00 |
7631.0 |
14 |
0 |
0 |
0 |
0 |
0 |
20184 |
41654.0 |
110.50 |
1344.0 |
13 |
0 |
0 |
0 |
0 |
0 |
20191 |
39984.0 |
105.10 |
4147.0 |
13 |
0 |
0 |
0 |
0 |
0 |
20192 |
49520.0 |
101.90 |
4888.0 |
0 |
13 |
0 |
0 |
0 |
0 |
20193 |
45107.0 |
102.90 |
5265.0 |
0 |
0 |
0 |
0 |
0 |
13 |
# GOAL 2.3 Find correlation between features
Sales.corr()
#nps,marketing_spend and wine 2 has a correlation absolute value > 0.4
#We use these three variables to fit the multi-regression model
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
sales |
nps |
marketing_spend |
wine 1,wine 2,wine 3 |
wine 1,wine 3 |
wine2 |
wine 2,wine 1 |
wine 2,wine 3 |
wine 3 |
sales |
1.000000 |
0.487498 |
0.633842 |
0.294349 |
0.332046 |
-0.701769 |
-0.231075 |
-0.154923 |
0.140926 |
nps |
0.487498 |
1.000000 |
0.284025 |
0.255818 |
-0.018123 |
-0.183782 |
-0.218348 |
-0.303365 |
0.284777 |
marketing_spend |
0.633842 |
0.284025 |
1.000000 |
-0.188336 |
0.430298 |
-0.164598 |
0.146133 |
-0.397111 |
0.159527 |
wine 1,wine 2,wine 3 |
0.294349 |
0.255818 |
-0.188336 |
1.000000 |
-0.508841 |
-0.292240 |
-0.200822 |
-0.292240 |
-0.200822 |
wine 1,wine 3 |
0.332046 |
-0.018123 |
0.430298 |
-0.508841 |
1.000000 |
-0.204844 |
-0.140766 |
-0.204844 |
-0.140766 |
wine2 |
-0.701769 |
-0.183782 |
-0.164598 |
-0.292240 |
-0.204844 |
1.000000 |
-0.080845 |
-0.117647 |
-0.080845 |
wine 2,wine 1 |
-0.231075 |
-0.218348 |
0.146133 |
-0.200822 |
-0.140766 |
-0.080845 |
1.000000 |
-0.080845 |
-0.055556 |
wine 2,wine 3 |
-0.154923 |
-0.303365 |
-0.397111 |
-0.292240 |
-0.204844 |
-0.117647 |
-0.080845 |
1.000000 |
-0.080845 |
wine 3 |
0.140926 |
0.284777 |
0.159527 |
-0.200822 |
-0.140766 |
-0.080845 |
-0.055556 |
-0.080845 |
1.000000 |
# GOAL 3.1 build multi-regression model using nps,marketing_spend and wine 2 as the independent variables, sales as dependent variable.
model=ols("sales~nps+marketing_spend+wine2", Sales).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: sales R-squared: 0.597
Model: OLS Adj. R-squared: 0.510
Method: Least Squares F-statistic: 6.908
Date: Fri, 01 Apr 2022 Prob (F-statistic): 0.00438
Time: 13:07:25 Log-Likelihood: -176.79
No. Observations: 18 AIC: 361.6
Df Residuals: 14 BIC: 365.1
Df Model: 3
Covariance Type: nonrobust
===================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------
Intercept 2.742e+04 5391.733 5.086 0.000 1.59e+04 3.9e+04
nps 106.7628 66.295 1.610 0.130 -35.427 248.952
marketing_spend 1.4937 0.532 2.809 0.014 0.353 2.634
wine2 -733.6809 409.996 -1.789 0.095 -1613.034 145.673
==============================================================================
Omnibus: 5.641 Durbin-Watson: 2.775
Prob(Omnibus): 0.060 Jarque-Bera (JB): 3.624
Skew: -1.085 Prob(JB): 0.163
Kurtosis: 3.353 Cond. No. 1.99e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.99e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
C:\Users\Lenovo\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=18
warnings.warn("kurtosistest only valid for n>=20 ... continuing "
# GOAL 3.2 Predict 2019 Q4 sales
q4nps=sum(Sales['nps'][16:19])/3
q4mspend=sum(Sales['marketing_spend'][16:19])/3
q4wine2=sum(Sales['wine2'][16:19])/3
results=model.predict({"nps":q4nps, "marketing_spend":q4mspend, "wine2":q4wine2})
#The final prediction for Q4 sales is:
results
0 45568.866671
dtype: float64