#Team 2A: Wei Wang, Zhang Haoran,Shane Reid

#import all the library we need
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.formula.api import ols
#import the WineSales dataset
Sales=pd.read_csv("WineSales.csv")
Sales

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	sales	nps	marketing_spend	products	week	quarter
0	$1,068	4.8	$442	wine 2	2015/1/4	20151
1	$992	4.7	$442	wine 2	2015/1/11	20151
2	$1,025	5.5	$442	wine 2	2015/1/18	20151
3	$1,030	5.0	$442	wine 2	2015/1/25	20151
4	$850	4.6	$442	wine 2	2015/2/1	20151
...	...	...	...	...	...	...
243	$2,917	8.2	$234	wine 3	2019/9/1	20193
244	$3,218	8.3	$234	wine 3	2019/9/8	20193
245	$2,847	6.8	$234	wine 3	2019/9/15	20193
246	$3,292	7.8	$234	wine 3	2019/9/22	20193
247	$2,999	7.8	$234	wine 3	2019/9/29	20193

248 rows × 6 columns

#GOAL 1.1 Impute missing values
#There are 34 missing values in nps
print("Missing value before interpolate\n",Sales.isnull().sum())
#We use linear interpolate method to impute missing values
Sales['nps']=Sales['nps'].interpolate()
print("\nNo Missing value after interpolate\n",Sales.isnull().sum())
#We can see from below that all missing values are imputed.

Missing value before interpolate
 sales               0
nps                34
marketing_spend     0
products            0
week                0
quarter             0
dtype: int64

No Missing value after interpolate
 sales              0
nps                0
marketing_spend    0
products           0
week               0
quarter            0
dtype: int64

#GOAL 1.2 one-hot encode products category
Sales.replace("wine 1,wine 3,wine 2","wine 1,wine 2,wine 3",inplace=True)
Sales.replace("wine 2,wine 1,wine 3","wine 1,wine 2,wine 3",inplace=True)
Sales.replace("wine 3,wine 1","wine 1,wine 3",inplace=True)
Sales.replace("wine 2","wine2",inplace=True)

#Use get_dummies method to one-hot encode the products category.
OneHotEncode= pd.get_dummies(Sales.products)
## delete the previous products column
Sales.drop('products',inplace=True,axis=1)
## add the onehotencode to the table
Sales=Sales.join(OneHotEncode)
Sales

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	sales	nps	marketing_spend	week	quarter	wine 1,wine 2,wine 3	wine 1,wine 3	wine 2,wine 1	wine 2,wine 3	wine 3	wine2
0	$1,068	4.8	$442	2015/1/4	20151	0	0	0	0	0	1
1	$992	4.7	$442	2015/1/11	20151	0	0	0	0	0	1
2	$1,025	5.5	$442	2015/1/18	20151	0	0	0	0	0	1
3	$1,030	5.0	$442	2015/1/25	20151	0	0	0	0	0	1
4	$850	4.6	$442	2015/2/1	20151	0	0	0	0	0	1
...	...	...	...	...	...	...	...	...	...	...	...
243	$2,917	8.2	$234	2019/9/1	20193	0	0	0	0	1	0
244	$3,218	8.3	$234	2019/9/8	20193	0	0	0	0	1	0
245	$2,847	6.8	$234	2019/9/15	20193	0	0	0	0	1	0
246	$3,292	7.8	$234	2019/9/22	20193	0	0	0	0	1	0
247	$2,999	7.8	$234	2019/9/29	20193	0	0	0	0	1	0

248 rows × 11 columns

# GOAL 2.1 Aggregate features by quarter
Sales['sales'] = Sales['sales'].map(lambda x: str(x)[1:])
Sales['sales'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
Sales=Sales.astype({"sales":"float"})
Sales['marketing_spend'] = Sales['marketing_spend'].map(lambda x: str(x)[1:])
Sales['marketing_spend'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
Sales=Sales.astype({"marketing_spend":"float"})

#Aggregate features by quarter and then we get total quarter sales, nps, marketing_spend, and wines products
Sales = Sales.groupby(['quarter']).agg({"sales":"sum",'nps':'sum','marketing_spend':'sum','wine 1,wine 2,wine 3':'sum','wine 1,wine 3':'sum','wine2':'sum','wine 2,wine 1':'sum','wine 2,wine 3':'sum','wine 3':'sum'})
Sales

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	sales	nps	marketing_spend	wine 1,wine 2,wine 3	wine 1,wine 3	wine2	wine 2,wine 1	wine 2,wine 3	wine 3
quarter
20151	13330.0	65.35	5746.0	0	0	13	0	0	0
20152	46022.0	64.15	1664.0	0	13	0	0	0	0
20153	35934.0	66.30	2119.0	0	0	0	0	13	0
20154	28126.0	63.20	3874.0	0	0	13	0	0	0
20161	35362.0	59.60	4992.0	13	0	0	0	0	0
20162	44154.0	64.65	481.0	0	13	0	0	0	0
20163	34924.0	64.10	273.0	0	0	0	0	13	0
20164	36365.0	63.60	5135.0	13	0	0	0	0	0
20171	30554.0	65.95	845.0	0	0	0	13	0	0
20172	39667.0	70.25	7306.0	13	0	0	0	0	0
20173	47186.0	78.65	845.0	0	13	0	0	0	0
20174	41857.0	105.35	3696.0	14	0	0	0	0	0
20181	36054.0	95.25	6648.0	0	12	0	0	0	0
20182	51559.0	105.00	7631.0	13	0	0	0	0	0
20183	54924.0	110.50	1344.0	14	0	0	0	0	0
20184	41654.0	105.10	4147.0	13	0	0	0	0	0
20191	39984.0	101.90	4888.0	13	0	0	0	0	0
20192	49520.0	102.90	5265.0	0	13	0	0	0	0
20193	45107.0	101.45	3042.0	0	0	0	0	0	13

# GOAL 2.2
#shift these features with quarter sales
Sales=Sales.assign(nps=lambda x: x.nps.shift(1).values,
marketing_spend=lambda x: x.marketing_spend.shift(1).values)
Sales

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	sales	nps	marketing_spend	wine 1,wine 2,wine 3	wine 1,wine 3	wine2	wine 2,wine 1	wine 2,wine 3	wine 3
quarter
20151	13330.0	NaN	NaN	0	0	13	0	0	0
20152	46022.0	65.35	5746.0	0	13	0	0	0	0
20153	35934.0	64.15	1664.0	0	0	0	0	13	0
20154	28126.0	66.30	2119.0	0	0	13	0	0	0
20161	35362.0	63.20	3874.0	13	0	0	0	0	0
20162	44154.0	59.60	4992.0	0	13	0	0	0	0
20163	34924.0	64.65	481.0	0	0	0	0	13	0
20164	36365.0	64.10	273.0	13	0	0	0	0	0
20171	30554.0	63.60	5135.0	0	0	0	13	0	0
20172	39667.0	65.95	845.0	13	0	0	0	0	0
20173	47186.0	70.25	7306.0	0	13	0	0	0	0
20174	41857.0	78.65	845.0	14	0	0	0	0	0
20181	36054.0	105.35	3696.0	0	12	0	0	0	0
20182	51559.0	95.25	6648.0	13	0	0	0	0	0
20183	54924.0	105.00	7631.0	14	0	0	0	0	0
20184	41654.0	110.50	1344.0	13	0	0	0	0	0
20191	39984.0	105.10	4147.0	13	0	0	0	0	0
20192	49520.0	101.90	4888.0	0	13	0	0	0	0
20193	45107.0	102.90	5265.0	0	0	0	0	0	13

# GOAL 2.3 Find correlation between features
Sales.corr()
#nps,marketing_spend and wine 2 has a correlation absolute value > 0.4
#We use these three variables to fit the multi-regression model

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	sales	nps	marketing_spend	wine 1,wine 2,wine 3	wine 1,wine 3	wine2	wine 2,wine 1	wine 2,wine 3	wine 3
sales	1.000000	0.487498	0.633842	0.294349	0.332046	-0.701769	-0.231075	-0.154923	0.140926
nps	0.487498	1.000000	0.284025	0.255818	-0.018123	-0.183782	-0.218348	-0.303365	0.284777
marketing_spend	0.633842	0.284025	1.000000	-0.188336	0.430298	-0.164598	0.146133	-0.397111	0.159527
wine 1,wine 2,wine 3	0.294349	0.255818	-0.188336	1.000000	-0.508841	-0.292240	-0.200822	-0.292240	-0.200822
wine 1,wine 3	0.332046	-0.018123	0.430298	-0.508841	1.000000	-0.204844	-0.140766	-0.204844	-0.140766
wine2	-0.701769	-0.183782	-0.164598	-0.292240	-0.204844	1.000000	-0.080845	-0.117647	-0.080845
wine 2,wine 1	-0.231075	-0.218348	0.146133	-0.200822	-0.140766	-0.080845	1.000000	-0.080845	-0.055556
wine 2,wine 3	-0.154923	-0.303365	-0.397111	-0.292240	-0.204844	-0.117647	-0.080845	1.000000	-0.080845
wine 3	0.140926	0.284777	0.159527	-0.200822	-0.140766	-0.080845	-0.055556	-0.080845	1.000000

# GOAL 3.1 build multi-regression model using nps,marketing_spend and wine 2 as the independent variables, sales as dependent variable.
model=ols("sales~nps+marketing_spend+wine2", Sales).fit()
print(model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  sales   R-squared:                       0.597
Model:                            OLS   Adj. R-squared:                  0.510
Method:                 Least Squares   F-statistic:                     6.908
Date:                Fri, 01 Apr 2022   Prob (F-statistic):            0.00438
Time:                        13:07:25   Log-Likelihood:                -176.79
No. Observations:                  18   AIC:                             361.6
Df Residuals:                      14   BIC:                             365.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept        2.742e+04   5391.733      5.086      0.000    1.59e+04     3.9e+04
nps               106.7628     66.295      1.610      0.130     -35.427     248.952
marketing_spend     1.4937      0.532      2.809      0.014       0.353       2.634
wine2            -733.6809    409.996     -1.789      0.095   -1613.034     145.673
==============================================================================
Omnibus:                        5.641   Durbin-Watson:                   2.775
Prob(Omnibus):                  0.060   Jarque-Bera (JB):                3.624
Skew:                          -1.085   Prob(JB):                        0.163
Kurtosis:                       3.353   Cond. No.                     1.99e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.99e+04. This might indicate that there are
strong multicollinearity or other numerical problems.


C:\Users\Lenovo\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=18
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "

# GOAL 3.2 Predict 2019 Q4 sales
q4nps=sum(Sales['nps'][16:19])/3
q4mspend=sum(Sales['marketing_spend'][16:19])/3
q4wine2=sum(Sales['wine2'][16:19])/3
results=model.predict({"nps":q4nps, "marketing_spend":q4mspend, "wine2":q4wine2})

#The final prediction for Q4 sales is:
results

0    45568.866671
dtype: float64

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

ds-2A.md

ds-2A.md

Files

ds-2A.md

Latest commit

History

ds-2A.md

File metadata and controls