Skip to content

Latest commit

 

History

History
1196 lines (1122 loc) · 23.8 KB

File metadata and controls

1196 lines (1122 loc) · 23.8 KB
#Team 2A: Wei Wang, Zhang Haoran,Shane Reid
#import all the library we need
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.formula.api import ols
#import the WineSales dataset
Sales=pd.read_csv("WineSales.csv")
Sales
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
sales nps marketing_spend products week quarter
0 $1,068 4.8 $442 wine 2 2015/1/4 20151
1 $992 4.7 $442 wine 2 2015/1/11 20151
2 $1,025 5.5 $442 wine 2 2015/1/18 20151
3 $1,030 5.0 $442 wine 2 2015/1/25 20151
4 $850 4.6 $442 wine 2 2015/2/1 20151
... ... ... ... ... ... ...
243 $2,917 8.2 $234 wine 3 2019/9/1 20193
244 $3,218 8.3 $234 wine 3 2019/9/8 20193
245 $2,847 6.8 $234 wine 3 2019/9/15 20193
246 $3,292 7.8 $234 wine 3 2019/9/22 20193
247 $2,999 7.8 $234 wine 3 2019/9/29 20193

248 rows × 6 columns

#GOAL 1.1 Impute missing values
#There are 34 missing values in nps
print("Missing value before interpolate\n",Sales.isnull().sum())
#We use linear interpolate method to impute missing values
Sales['nps']=Sales['nps'].interpolate()
print("\nNo Missing value after interpolate\n",Sales.isnull().sum())
#We can see from below that all missing values are imputed.
Missing value before interpolate
 sales               0
nps                34
marketing_spend     0
products            0
week                0
quarter             0
dtype: int64

No Missing value after interpolate
 sales              0
nps                0
marketing_spend    0
products           0
week               0
quarter            0
dtype: int64
#GOAL 1.2 one-hot encode products category
Sales.replace("wine 1,wine 3,wine 2","wine 1,wine 2,wine 3",inplace=True)
Sales.replace("wine 2,wine 1,wine 3","wine 1,wine 2,wine 3",inplace=True)
Sales.replace("wine 3,wine 1","wine 1,wine 3",inplace=True)
Sales.replace("wine 2","wine2",inplace=True)

#Use get_dummies method to one-hot encode the products category.
OneHotEncode= pd.get_dummies(Sales.products)
## delete the previous products column
Sales.drop('products',inplace=True,axis=1)
## add the onehotencode to the table
Sales=Sales.join(OneHotEncode)
Sales
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
sales nps marketing_spend week quarter wine 1,wine 2,wine 3 wine 1,wine 3 wine 2,wine 1 wine 2,wine 3 wine 3 wine2
0 $1,068 4.8 $442 2015/1/4 20151 0 0 0 0 0 1
1 $992 4.7 $442 2015/1/11 20151 0 0 0 0 0 1
2 $1,025 5.5 $442 2015/1/18 20151 0 0 0 0 0 1
3 $1,030 5.0 $442 2015/1/25 20151 0 0 0 0 0 1
4 $850 4.6 $442 2015/2/1 20151 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ...
243 $2,917 8.2 $234 2019/9/1 20193 0 0 0 0 1 0
244 $3,218 8.3 $234 2019/9/8 20193 0 0 0 0 1 0
245 $2,847 6.8 $234 2019/9/15 20193 0 0 0 0 1 0
246 $3,292 7.8 $234 2019/9/22 20193 0 0 0 0 1 0
247 $2,999 7.8 $234 2019/9/29 20193 0 0 0 0 1 0

248 rows × 11 columns

# GOAL 2.1 Aggregate features by quarter
Sales['sales'] = Sales['sales'].map(lambda x: str(x)[1:])
Sales['sales'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
Sales=Sales.astype({"sales":"float"})
Sales['marketing_spend'] = Sales['marketing_spend'].map(lambda x: str(x)[1:])
Sales['marketing_spend'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
Sales=Sales.astype({"marketing_spend":"float"})

#Aggregate features by quarter and then we get total quarter sales, nps, marketing_spend, and wines products
Sales = Sales.groupby(['quarter']).agg({"sales":"sum",'nps':'sum','marketing_spend':'sum','wine 1,wine 2,wine 3':'sum','wine 1,wine 3':'sum','wine2':'sum','wine 2,wine 1':'sum','wine 2,wine 3':'sum','wine 3':'sum'})
Sales
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
sales nps marketing_spend wine 1,wine 2,wine 3 wine 1,wine 3 wine2 wine 2,wine 1 wine 2,wine 3 wine 3
quarter
20151 13330.0 65.35 5746.0 0 0 13 0 0 0
20152 46022.0 64.15 1664.0 0 13 0 0 0 0
20153 35934.0 66.30 2119.0 0 0 0 0 13 0
20154 28126.0 63.20 3874.0 0 0 13 0 0 0
20161 35362.0 59.60 4992.0 13 0 0 0 0 0
20162 44154.0 64.65 481.0 0 13 0 0 0 0
20163 34924.0 64.10 273.0 0 0 0 0 13 0
20164 36365.0 63.60 5135.0 13 0 0 0 0 0
20171 30554.0 65.95 845.0 0 0 0 13 0 0
20172 39667.0 70.25 7306.0 13 0 0 0 0 0
20173 47186.0 78.65 845.0 0 13 0 0 0 0
20174 41857.0 105.35 3696.0 14 0 0 0 0 0
20181 36054.0 95.25 6648.0 0 12 0 0 0 0
20182 51559.0 105.00 7631.0 13 0 0 0 0 0
20183 54924.0 110.50 1344.0 14 0 0 0 0 0
20184 41654.0 105.10 4147.0 13 0 0 0 0 0
20191 39984.0 101.90 4888.0 13 0 0 0 0 0
20192 49520.0 102.90 5265.0 0 13 0 0 0 0
20193 45107.0 101.45 3042.0 0 0 0 0 0 13
# GOAL 2.2
#shift these features with quarter sales
Sales=Sales.assign(nps=lambda x: x.nps.shift(1).values,
marketing_spend=lambda x: x.marketing_spend.shift(1).values)
Sales
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
sales nps marketing_spend wine 1,wine 2,wine 3 wine 1,wine 3 wine2 wine 2,wine 1 wine 2,wine 3 wine 3
quarter
20151 13330.0 NaN NaN 0 0 13 0 0 0
20152 46022.0 65.35 5746.0 0 13 0 0 0 0
20153 35934.0 64.15 1664.0 0 0 0 0 13 0
20154 28126.0 66.30 2119.0 0 0 13 0 0 0
20161 35362.0 63.20 3874.0 13 0 0 0 0 0
20162 44154.0 59.60 4992.0 0 13 0 0 0 0
20163 34924.0 64.65 481.0 0 0 0 0 13 0
20164 36365.0 64.10 273.0 13 0 0 0 0 0
20171 30554.0 63.60 5135.0 0 0 0 13 0 0
20172 39667.0 65.95 845.0 13 0 0 0 0 0
20173 47186.0 70.25 7306.0 0 13 0 0 0 0
20174 41857.0 78.65 845.0 14 0 0 0 0 0
20181 36054.0 105.35 3696.0 0 12 0 0 0 0
20182 51559.0 95.25 6648.0 13 0 0 0 0 0
20183 54924.0 105.00 7631.0 14 0 0 0 0 0
20184 41654.0 110.50 1344.0 13 0 0 0 0 0
20191 39984.0 105.10 4147.0 13 0 0 0 0 0
20192 49520.0 101.90 4888.0 0 13 0 0 0 0
20193 45107.0 102.90 5265.0 0 0 0 0 0 13
# GOAL 2.3 Find correlation between features
Sales.corr()
#nps,marketing_spend and wine 2 has a correlation absolute value > 0.4
#We use these three variables to fit the multi-regression model
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
sales nps marketing_spend wine 1,wine 2,wine 3 wine 1,wine 3 wine2 wine 2,wine 1 wine 2,wine 3 wine 3
sales 1.000000 0.487498 0.633842 0.294349 0.332046 -0.701769 -0.231075 -0.154923 0.140926
nps 0.487498 1.000000 0.284025 0.255818 -0.018123 -0.183782 -0.218348 -0.303365 0.284777
marketing_spend 0.633842 0.284025 1.000000 -0.188336 0.430298 -0.164598 0.146133 -0.397111 0.159527
wine 1,wine 2,wine 3 0.294349 0.255818 -0.188336 1.000000 -0.508841 -0.292240 -0.200822 -0.292240 -0.200822
wine 1,wine 3 0.332046 -0.018123 0.430298 -0.508841 1.000000 -0.204844 -0.140766 -0.204844 -0.140766
wine2 -0.701769 -0.183782 -0.164598 -0.292240 -0.204844 1.000000 -0.080845 -0.117647 -0.080845
wine 2,wine 1 -0.231075 -0.218348 0.146133 -0.200822 -0.140766 -0.080845 1.000000 -0.080845 -0.055556
wine 2,wine 3 -0.154923 -0.303365 -0.397111 -0.292240 -0.204844 -0.117647 -0.080845 1.000000 -0.080845
wine 3 0.140926 0.284777 0.159527 -0.200822 -0.140766 -0.080845 -0.055556 -0.080845 1.000000
# GOAL 3.1 build multi-regression model using nps,marketing_spend and wine 2 as the independent variables, sales as dependent variable.
model=ols("sales~nps+marketing_spend+wine2", Sales).fit()
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  sales   R-squared:                       0.597
Model:                            OLS   Adj. R-squared:                  0.510
Method:                 Least Squares   F-statistic:                     6.908
Date:                Fri, 01 Apr 2022   Prob (F-statistic):            0.00438
Time:                        13:07:25   Log-Likelihood:                -176.79
No. Observations:                  18   AIC:                             361.6
Df Residuals:                      14   BIC:                             365.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept        2.742e+04   5391.733      5.086      0.000    1.59e+04     3.9e+04
nps               106.7628     66.295      1.610      0.130     -35.427     248.952
marketing_spend     1.4937      0.532      2.809      0.014       0.353       2.634
wine2            -733.6809    409.996     -1.789      0.095   -1613.034     145.673
==============================================================================
Omnibus:                        5.641   Durbin-Watson:                   2.775
Prob(Omnibus):                  0.060   Jarque-Bera (JB):                3.624
Skew:                          -1.085   Prob(JB):                        0.163
Kurtosis:                       3.353   Cond. No.                     1.99e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.99e+04. This might indicate that there are
strong multicollinearity or other numerical problems.


C:\Users\Lenovo\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=18
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "
# GOAL 3.2 Predict 2019 Q4 sales
q4nps=sum(Sales['nps'][16:19])/3
q4mspend=sum(Sales['marketing_spend'][16:19])/3
q4wine2=sum(Sales['wine2'][16:19])/3
results=model.predict({"nps":q4nps, "marketing_spend":q4mspend, "wine2":q4wine2})
#The final prediction for Q4 sales is:
results
0    45568.866671
dtype: float64